# part 1/2 of motif heatmap
# process homer known motif files
## one-step homer known-motif files to final merged tsv

In [None]:
import glob,sys,os
from shutil import copyfile
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot

# copy each sample's knownResults.txt from its own folder into the target folder, renaming it to <sample>_knownResults.txt

In [None]:
# Glob pattern matching every sample's HOMER knownResults.txt; the directory
# that directly contains each file is used as the sample name.
homer_motif_files_path = '/Users/jplab/Desktop/DAILY_CODE_DATA/2022-4/data/4-12_shnc.ATAC.Motif/one_strand/sizeDefault_narrowpeak/*/knownResults.txt'
# Directory that collects the renamed copies (<sample>_knownResults.txt).
target_path = '/Users/jplab/Desktop/DAILY_CODE_DATA/2022-4/data/4-13_ATAC_SHNC_motif_heatmap/know_motifs'


homer_motif_files = glob.glob(homer_motif_files_path)

# copyfile() does not create missing directories, so make sure the
# destination exists before the first copy (only when there is work to do).
if homer_motif_files:
    os.makedirs(target_path, exist_ok=True)

for hmf in homer_motif_files:
    # .../<sample>/knownResults.txt  ->  <sample>_knownResults.txt
    sample_dir, surfix = os.path.split(hmf)
    prefix = os.path.basename(sample_dir)
    new_file_name = prefix + '_' + surfix

    copyfile(hmf, os.path.join(target_path, new_file_name))



## process raw motif txt generated by homer

In [None]:
# Renamed HOMER tables gathered under target_path (<sample>_knownResults.txt).
known_motif_path = target_path+'/*_knownResults.txt'

# Directory that receives one processed <sample>_motif.tsv per input table.
processed_motif_csv = '/Users/jplab/Desktop/DAILY_CODE_DATA/2022-4/data/4-13_ATAC_shnc_csaw/processed_motif_csv'

# makedirs also creates missing parent directories and tolerates an existing
# directory, unlike the exists()/mkdir() pair.
os.makedirs(processed_motif_csv, exist_ok=True)


def _process_known_results(df):
    """Build the per-motif summary table for one HOMER knownResults table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table as read from a knownResults.txt file. It must provide the
        columns 'Motif Name', 'P-value', '% of Target Sequences with Motif'
        and '% of Background Sequences with Motif'; the two percentage
        columns are expected to hold strings such as '12.34%'.

    Returns
    -------
    pandas.DataFrame
        Indexed by short motif name, with the columns 'motif',
        'log10(p_value)', 'fold_enrichment', 'target' and 'background'.
        'log10(p_value)' holds |log10(p)| for motifs that pass all filters
        and 0 for every other motif.
    """
    # shorter column names
    df = df.rename(columns={'P-value':'p-value','% of Target Sequences with Motif':'target','% of Background Sequences with Motif':'background'})

    # keep only the part of the motif name in front of the first '/'
    motifs = [name.split('/')[0] for name in df['Motif Name']]
    target = [float(value.replace('%', '')) for value in df['target']]
    background = [float(value.replace('%', '')) for value in df['background']]
    # the 0.001 pseudo-count avoids dividing by a 0% background
    fold_enrichment = [t / (b + 0.001) for t, b in zip(target, background)]

    # motif filters (a motif failing any of them gets the value 0):
    #   p <= -2     : p-value <= 0.01, i.e. log10(p-value) <= -2
    #   fold >= 1.5 : target / background >= 1.5
    #   t >= 2      : target percentage >= 2%
    log10_p = []
    for p_value, fold, t in zip(df['p-value'], fold_enrichment, target):
        # extremely small p-values (e.g. 1e-7490) become 0.0 as floats;
        # adding sys.float_info.min keeps log10 finite
        p = np.log10(float(p_value) + sys.float_info.min)
        if p <= -2 and fold >= 1.5 and t >= 2:
            log10_p.append(abs(p))
        else:
            log10_p.append(0)

    return pd.DataFrame(
        {
            'motif': motifs,
            'log10(p_value)': log10_p,
            'fold_enrichment': fold_enrichment,
            'target': target,
            'background': background,
        },
        index=motifs,
    )


# process homer known motif results
known_motifs = glob.glob(known_motif_path)
for known_file in known_motifs:

    sample_name = os.path.basename(known_file).replace('_knownResults.txt','')
    print(sample_name)

    df = pd.read_csv(known_file, header=0, sep='\t')
    data = _process_known_results(df)

    # sort by motif name (descending) and save to a tsv file
    data = data.sort_index(ascending=False)
    data.to_csv(os.path.join(processed_motif_csv, '%s_motif.tsv' % sample_name),index=False,header=True,sep='\t')
    # print(data.columns.tolist())
    
    # bar plot part, comment if need
    # # plot and save barplot of each motif target and background percentage
    # ax=data[['target','background','fold_enrichment','log10(p_value)']].plot(kind='bar',title='motif about %s' % sample_name,align="center", figsize=(200,12),legend=True,fontsize=25)
    # ax.set_ylabel('peaks percent(%)',fontsize=20)
    # plot.xticks(fontsize=10,rotation=20)
    # plot.savefig(target_path+'/%s_motif_peaksPercent.png' % sample_name) 

    # # plot and save barplot of each motif fold enrichment
    # ax=data[['fold_enrichment']].plot(kind='bar',title='motif about %s' % sample_name,align="center", figsize=(100,12),legend=True,fontsize=25)
    # ax.set_ylabel('fold enrichment(target/background)',fontsize=20)
    # plot.xticks(fontsize=10,rotation=20)
    # plot.savefig(target_path+'/%s_motif_foldEnrichment.png' % sample_name) 

    # # plot and save barplot of each motif log10(p-value)
    # ax=data[['log10(p_value)']].plot(kind='bar',title='motif about %s' % sample_name,align="center", figsize=(100,12),legend=True,fontsize=25)
    # ax.set_ylabel('log10(p_value)',fontsize=20)
    # plot.xticks(fontsize=10,rotation=20)
    # plot.savefig(target_path+'/%s_motif_log10(p).png' % sample_name) 

## merge motif to one dataframe

In [None]:
# Read every sample's processed motif table, drop duplicated motif names and
# collect the filtered 'log10(p_value)' column of each sample in `dic`,
# keyed by sample name and indexed by motif name.
processed_motif_path = processed_motif_csv+'/*.tsv'
save_to_path = target_path

# 'motif' followed by every sample name, in processing order
conds = ['motif']

# glob returns files in arbitrary order; sort for a reproducible sample order
processed_motifs_files = sorted(glob.glob(processed_motif_path))
# print('processed_motifs_files:',processed_motifs_files)
if not processed_motifs_files:
    # without this check the code below would fail later with a confusing error
    raise FileNotFoundError('no processed motif table matches %s' % processed_motif_path)

dic = {}

for processed_motif in processed_motifs_files:
    sample_name = os.path.basename(processed_motif).replace('.tsv','')
    # print('sample name: {0}'.format(sample_name))
    conds.append(sample_name)

    df = pd.read_csv(processed_motif,header=0, sep='\t')
    # remove duplicated motif names (first occurrence wins) and sort by name
    df = df.drop_duplicates(subset=['motif']).sort_values(by=['motif'])

    # Index the values by motif name so that samples are aligned on the motif
    # itself rather than on the row position left over from each file.
    dic[sample_name] = df.set_index('motif')['log10(p_value)'].rename_axis(None)

# Union of the motif names of all samples, sorted by name.
motif_names = sorted(set().union(*(values.index for values in dic.values())))

# A motif missing from a sample's table gets 0, the value this pipeline
# already uses for motifs that did not pass the filters.
for sample_name in list(dic):
    dic[sample_name] = dic[sample_name].reindex(motif_names, fill_value=0)

# motif name column, aligned with the per-sample columns
dic['motif'] = pd.Series(motif_names, index=motif_names)
print(dic)
print(conds)

## re-order and rename dataframe columns, save to tsv

In [None]:
# Assemble the merged table from `dic`: the 'motif' column plus one
# 'log10(p_value)' column per sample.
data=pd.DataFrame(dic)

print(data.columns.tolist())

# Final left-to-right order of the sample columns, using the short names
# that are written to the output file.
sample_order = [
    'NC-B1', 'NC-B2',
    'NC-D1', 'NC-D2',
    'NC-T1', 'NC-T2',
    'F-B1', 'F-B2',
    'F-D1', 'F-D2',
    'F-T1', 'F-T2',
]
# Suffix carried by every sample column of the merged table; the full column
# names and the rename mapping are both derived from it so they cannot drift
# apart.
column_suffix = '.posStrand.narrowPeak.sizedefault_motif'

# Select and re-order the columns; pandas raises KeyError if a listed sample
# column is absent from the merged table.
new_order = ['motif'] + [sample + column_suffix for sample in sample_order]
data=data[new_order]

# strip the suffix so that only the short sample names remain
data=data.rename(columns={sample + column_suffix: sample for sample in sample_order})
# print('2',data.columns.tolist())

data.to_csv(os.path.join(save_to_path, 'merged_known_motifs_pvalue.tsv'),index=False,header=True,sep='\t')