# part 1/2 of motif heatmap: process homer known motif files
## one-step homer known-motif files
* Only fold change, no p-value
* Fold change is not transformed; the original fold value is kept

In [1]:
import glob,sys,os
from shutil import copyfile
import numpy as np
import pandas as pd


# copy known-motif files from their separate folders into the target folder, renaming each one with its source folder name

In [2]:
# Collect every HOMER knownResults.txt matched by the glob patterns below and
# copy each one into a single target folder. Each copy is renamed to
# "<parent folder name>_knownResults.txt" so files from different result
# folders can sit side by side.

# knownResults path (glob patterns; '*' matches each per-sample result folder)
homer_motif_files_path = [
    '/Users/jplab/Desktop/snakepipes_Data/ATAC_seq_shFOXA1_snakepipes_results/MOTIF/MOTIF_navieOverlap/*/knownResults.txt',
    '/Users/jplab/Desktop/snakepipes_Data/ATAC_seq_shFOXA1_snakepipes_results/MOTIF/MOTIF_ncFOXA1_BDT/*/knownResults.txt',
    '/Users/jplab/Desktop/snakepipes_Data/ATAC_seq_shFOXA1_snakepipes_results/MOTIF/MOTIF_shFOXA1_BDT/*/knownResults.txt',
    ]

# expand every pattern into one flat list of existing files
target_files = []
for p in homer_motif_files_path:
    target_files.extend(glob.glob(p))

if not target_files:
    # a string argument makes sys.exit print the message to stderr and
    # finish with a non-zero status, so the failure is visible to callers
    sys.exit('No known motif files found')

# target path to save knownResults
target_path = '/Users/jplab/Desktop/DAILY_CODE_DATA/2022-5/data/5-25_profileheatmap_motifHeatmap'

# copyfile does not create missing folders, so make sure the target exists
os.makedirs(target_path, exist_ok=True)

homer_motif_files = target_files

# names already written during this run, used to report accidental overwrites
copied_names = set()

for hmf in homer_motif_files:
    # name of the folder that holds the file, e.g. the sample/comparison name
    prefix = os.path.basename(os.path.dirname(hmf))
    # the file name itself (knownResults.txt)
    surfix = os.path.basename(hmf)
    new_file_name = prefix+'_'+surfix

    # all source folders are flattened into one target folder: two result
    # folders with the same name would silently replace each other
    if new_file_name in copied_names:
        print('Warning: %s overwrites an earlier file with the same name' % new_file_name)
    copied_names.add(new_file_name)

    copyfile(hmf, os.path.join(target_path, new_file_name))



## process raw motif txt generated by homer

In [3]:
# Convert every copied "<sample>_knownResults.txt" into a compact per-sample
# table with the columns: motif, log10(p_value), fold_enrichment, target,
# background. One TSV per sample is written into `processed_motif_csv`.
known_motif_path = target_path+'/*_knownResults.txt'

processed_motif_csv = target_path + '/processed_motif_csv'

# creates missing parent folders too and is a no-op when the folder exists
os.makedirs(processed_motif_csv, exist_ok=True)

# process homer known motif results
known_motifs = glob.glob(known_motif_path)
for file in known_motifs:

    sample_name = os.path.basename(file).replace('_knownResults.txt','')
    print(sample_name)

    # rename column name for simple
    df = pd.read_csv(file,header=0, sep='\t')
    df = df.rename(columns={'P-value':'p-value','% of Target Sequences with Motif':'target','% of Background Sequences with Motif':'background'})

    # percentages are stored as text such as "12.34%": strip the sign and
    # convert the whole column at once
    t = df['target'].str.replace('%', '', regex=False).astype(float)
    b = df['background'].str.replace('%', '', regex=False).astype(float)

    # a p-value too small for a float (e.g. 1e-7490) is read as 0 and
    # log10(0) is -inf; adding the smallest positive normal float keeps the
    # result finite
    p = np.log10(df['p-value'].astype(float) + sys.float_info.min)

    # the 0.001 added to the background keeps the ratio defined when the
    # background percentage is 0
    fold = t/(b+0.001)

    # No filter here: every motif row is kept
    data = pd.DataFrame({
        # keep only the part of the HOMER motif name before the first '/'
        'motif': df['Motif Name'].str.split('/').str[0],
        # stored as a positive number (absolute value of log10(p))
        'log10(p_value)': p.abs(),
        'fold_enrichment': fold,
        'target': t,
        'background': b,
    })

    # sort by motif name (descending) and save to tsv file.
    # A stable sort keeps rows that share the same shortened motif name in
    # their original file order, so the output is reproducible and a later
    # "keep the first duplicate" step always picks the same row.
    data = data.sort_values(by='motif', ascending=False, kind='mergesort')
    data.to_csv(processed_motif_csv+'/%s_motif_onlyFCnoPvalue.tsv' % sample_name,index=False,header=True,sep='\t')

ncFOXA1_BDT_TGFmostEnrich
shFOXA1.final_common_overlap
shFOXA1_BDT_TGFmostEnrich
F_T_navieOverlap
ncFOXA1.BDT_B14UniqOpen
NC_T_navieOverlap
ncFOXA1.final_common_overlap
shFOXA1_BDT_B14UniqOpen
F_B_navieOverlap
shFOXA1_BDT_TGFUniqOpen
NC_D_navieOverlap
shFOXA1_BDT_B14mostEnrich
NC_B_navieOverlap
ncFOXA1.BDT_TGFUniqOpen
ncFOXA1_BDT_B14mostEnrich
F_D_navieOverlap


## merge motif to one dataframe

In [None]:
# Read each sample's processed motif table, drop duplicated motif names and
# collect every sample's log10(p_value) column in `dic`, keyed by column name.
# Every column is indexed by motif name, so building a DataFrame from `dic`
# lines the samples up by motif rather than by row position.
# NOTE(review): the per-sample files are named "*_onlyFCnoPvalue" but the
# value merged here is 'log10(p_value)' - confirm that the p-value column
# (and not 'fold_enrichment') is the one intended.
processed_motif_path = processed_motif_csv+'/*.tsv'
save_to_path = target_path

# add sample name into this list to re-order dataframe column order later
conds = ['motif']

# the per-sample files are written as "<sample>_motif_onlyFCnoPvalue.tsv",
# while the columns selected afterwards are named "<sample>_motif"; strip the
# whole trailing part so both agree
processed_suffix = '_onlyFCnoPvalue.tsv'

# need to check path is correct; sorted so the column order is reproducible
processed_motifs_files = sorted(glob.glob(processed_motif_path))
if not processed_motifs_files:
    sys.exit('No processed motif files found: ' + processed_motif_path)

dic = {}
# union of the motif names seen in all samples
motif_index = None

for processed_motif in processed_motifs_files:
    file_name = os.path.basename(processed_motif)
    if file_name.endswith(processed_suffix):
        sample_name = file_name[:-len(processed_suffix)]
    else:
        # file without the expected suffix: fall back to dropping '.tsv' only
        sample_name = file_name.replace('.tsv','')
    conds.append(sample_name)

    df = pd.read_csv(processed_motif,header=0, sep='\t')
    # remove dup motif name (first occurrence wins) and sort by name
    df = df.drop_duplicates(subset=['motif']).sort_values(by=['motif'])

    # add each sample p-value column, labelled by motif name
    sample_values = df.set_index('motif')['log10(p_value)']
    dic[sample_name] = sample_values

    if motif_index is None:
        motif_index = sample_values.index
    else:
        motif_index = motif_index.union(sample_values.index)

# motif column built from every sample's motif names; a motif missing from
# one sample gives an empty (NaN) value in that sample's column instead of
# shifting the remaining rows
dic['motif'] = motif_index.to_series()
print(dic)
print(conds)

## re-order and rename dataframe, save to tsv

In [None]:
# Build the merged table from `dic`, put the sample columns in plotting order,
# give them short display names and write the result as a TSV file.

# (column name in the merged table, display name), listed in output order
column_plan = [
    ('ncFOXA1.final_common_overlap_motif', 'ncBDTCommon'),
    ('shFOXA1.final_common_overlap_motif', 'shBDTCommon'),
    ('NC_T_navieOverlap_motif', 'ncFOXA1.TGF.navieOverlap'),
    ('F_T_navieOverlap_motif', 'shFOXA1.TGF.navieOverlap'),
    ('ncFOXA1_BDT_TGFmostEnrich_motif', 'ncTGFMostEnrich'),
    ('ncFOXA1.BDT_TGFUniqOpen_motif', 'ncTGFUniqOpen'),
    ('shFOXA1_BDT_TGFmostEnrich_motif', 'shTGFMostEnrich'),
    ('shFOXA1_BDT_TGFUniqOpen_motif', 'shTGFUniqOpen'),
    ('NC_D_navieOverlap_motif', 'ncFOXA1.DMSO.navieOverlap'),
    ('F_D_navieOverlap_motif', 'shFOXA1.DMSO.navieOverlap'),
    ('NC_B_navieOverlap_motif', 'ncFOXA1.B14.navieOverlap'),
    ('F_B_navieOverlap_motif', 'shFOXA1.B14.navieOverlap'),
    ('ncFOXA1_BDT_B14mostEnrich_motif', 'ncB14MostEnrich'),
    ('ncFOXA1.BDT_B14UniqOpen_motif', 'ncB14UniqOpen'),
    ('shFOXA1_BDT_B14mostEnrich_motif', 'shB14MostEnrich'),
    ('shFOXA1_BDT_B14UniqOpen_motif', 'shB14UniqOpen'),
]

data = pd.DataFrame(dic)

# show the available columns so a mismatch with the plan is easy to spot
print(data.columns.tolist())

# 'motif' first, then the sample columns in the order of the plan
new_order = ['motif'] + [source_name for source_name, _ in column_plan]
data = data[new_order]

# replace the long file-derived names by the short display names
data = data.rename(columns=dict(column_plan))

# save to file (no index column, tab separated, header row kept)
output_file = save_to_path+'/merged_known_motifs_pvalue.tsv'
data.to_csv(output_file, index=False, header=True, sep='\t')