In [66]:
import pandas as pd
import numpy as np
import os
from collections import Counter
from matplotlib import pyplot as plt
import ast
from scipy.stats import ttest_ind

In [200]:
DATA_DIR = '/Users/Yuan/Documents/BLab/Predict_target_genes/data'
motifs = pd.read_csv(os.path.join(DATA_DIR, 'intermediate/Mon/motif_list.txt'),sep='\t', header=None)
motifs = list(motifs[0])

In [479]:
def lists_to_binary(df_loci_motif, motifs):
    df_loci_motif.columns = [['loci','motifs']]
    binary_df = pd.DataFrame([[0] * len(motifs)] * len(df_loci_motif))
    binary_df.index = df_loci_motif['loci']
    binary_df.columns = motifs
    del binary_df.index.name
    
    for t in xrange(len(df_loci_motif)):
        eLoci = df_loci_motif.iloc[t]
        binary_df.loc[eLoci['loci'],ast.literal_eval(eLoci['motifs'])] = 1
        
    return binary_df

In [491]:
def compute_genes_TFMB(true_df, false_df, motifs):
    
    true_SNP_motif = lists_to_binary(true_df[['SNP','motif_SNP']], motifs)
    false_SNP_motif = lists_to_binary(false_df[['SNP','motif_SNP']], motifs)
    true_gene_motif = lists_to_binary(true_df[['GENE','motif_gene']], motifs)
    false_gene_motif = lists_to_binary(false_df[['GENE','motif_gene']], motifs)
    
    ### make sure the SNPs' motif align
    temp = list(np.sum(true_SNP_motif,axis=1)).index(np.max(np.sum(true_SNP_motif,axis=1)))
    temp_SNP = true_SNP_motif.index[temp]
    assert list(true[true['SNP']==temp_SNP]['motif_SNP'])  == list(false[false['SNP']==temp_SNP]['motif_SNP'])
    
    return true_gene_motif, false_gene_motif

In [252]:
def compare_proportions(true_gene_motif, false_gene_motif):
    '''
    The proportions of genes with 0/1/2... TF motif binding sites
    '''
    true_statistics = pd.DataFrame.from_dict(Counter(list(np.sum(true_gene_motif, axis=1))),orient='index')
    false_statistics = pd.DataFrame.from_dict(Counter(list(np.sum(false_gene_motif, axis=1))),orient='index')
    statistics = true_statistics.merge(false_statistics,left_index=True, right_index=True, how='outer').fillna(0)
    statistics.columns = ['true', 'false']
    proportions = statistics.apply(lambda x: np.array(x)*1.0 / np.sum(x),axis=0)
    return proportions


In [507]:
zero_percent, one_percent, two_percent = [], [], []
SNP_ATAC, gene_ATAC = [], []
celltype = 'Mon'

for chr in xrange(22):
    chr = chr+1
    try:
        true = pd.read_csv(os.path.join(DATA_DIR, 'eQTL/FairFax/%s_cis_%i_fdr05_50kb_annotated.csv' % (celltype,chr)),sep='\t')
        false = pd.read_csv(os.path.join(DATA_DIR, 'eQTL/FairFax/%s_cis_%i_fdr05_50kb_negative_annotated.csv' % (celltype,chr)),sep='\t')
        true = true.fillna('[]')
        false = false.fillna('[]')
        true.ix[true['ATAC_gene'] == 0,'motif_gene'] = '[]'
        false.ix[false['ATAC_gene'] == 0,'motif_gene'] = '[]'
        
        SNP_ATAC.append([Counter(true['ATAC_SNP'])[1]/float(len(true)), Counter(false['ATAC_SNP'])[1]/float(len(false))])
        gene_ATAC.append([Counter(true['ATAC_gene'])[1]/float(len(true)), Counter(false['ATAC_gene'])[1]/float(len(false))])
        
        true_gene_motif, false_gene_motif = compute_genes_TFMB(true, false, motifs)
        zero_percent.append([Counter(true_gene_motif.sum(axis=1))[0] / float(len(true)), Counter(false_gene_motif.sum(axis=1))[0] / float(len(false))])
        one_percent.append([Counter(true_gene_motif.sum(axis=1))[1] / float(len(true)), Counter(false_gene_motif.sum(axis=1))[1] / float(len(false))])
        two_percent.append([Counter(true_gene_motif.sum(axis=1))[2] / float(len(true)), Counter(false_gene_motif.sum(axis=1))[2] / float(len(false))])

    except:
        print "chr",chr
        

print ttest_ind([x[0] for x in gene_ATAC], [x[1] for x in gene_ATAC])
print ttest_ind([x[0] for x in zero_percent], [x[1] for x in zero_percent])
print ttest_ind([x[0] for x in one_percent], [x[1] for x in one_percent])
print ttest_ind([x[0] for x in two_percent], [x[1] for x in two_percent])



chr 6
chr 12
chr 15
chr 19
(array(0.046361558239191546), 0.96327830949771243)
(array(-0.7882154276922746), 0.43603206836865083)
(array(0.8228939915242323), 0.41630270169106909)
(array(0.3721679642529832), 0.71207751294069366)


In [508]:
gene_ATAC

[[0.05142857142857143, 0.05084745762711865],
 [0.14613778705636743, 0.16099773242630386],
 [0.11351351351351352, 0.09117647058823529],
 [0.10596026490066225, 0.13028169014084506],
 [0.08793969849246232, 0.08238636363636363],
 [0.0997229916897507, 0.15454545454545454],
 [0.2246376811594203, 0.19047619047619047],
 [0.191044776119403, 0.14381270903010032],
 [0.22608695652173913, 0.2161290322580645],
 [0.24836601307189543, 0.2384428223844282],
 [0.19753086419753085, 0.24503311258278146],
 [0.17279411764705882, 0.1328125],
 [0.22560975609756098, 0.1935483870967742],
 [0.1365079365079365, 0.1649122807017544],
 [0.25333333333333335, 0.18681318681318682],
 [0.3358208955223881, 0.2992125984251969],
 [0.18326693227091634, 0.22477064220183487],
 [0.23931623931623933, 0.37037037037037035],
 [0.2804878048780488, 0.2214765100671141]]

In [498]:
### plots


plt.figure()
plt.plot([x[0] for x in zero_percent],label="True eGene")
plt.plot([x[1] for x in zero_percent],label="Random gene")
plt.legend()
plt.title("Proportion of genes with zero TF motifs in the promoter regions (2KB)")
plt.savefig("plots/motifs_gene_zero_ATAC.png")



plt.figure()
plt.plot([x[0] for x in one_percent],label="True eGene")
plt.plot([x[1] for x in one_percent],label="Random gene")
plt.legend()
plt.title("Proportion of genes with one TF motifs in the promoter regions (2KB)")
plt.savefig("plots/motifs_gene_one_ATAC.png")



plt.figure()
plt.plot([x[0] for x in two_percent],label="True eGene")
plt.plot([x[1] for x in two_percent],label="Random gene")
plt.legend()
plt.title("Proportion of genes with two TF motifs in the promoter regions (2KB)")
plt.savefig("plots/motifs_gene_two_ATAC.png")

plt.figure()
plt.plot([x[0] for x in open_chromosome],label="True eGene")
plt.plot([x[1] for x in open_chromosome],label="Random gene")
plt.legend()
plt.savefig("plots/genes_ATAC.png")


plt.close()



In [312]:
true_tfs = pd.DataFrame.from_dict(dict(Counter(true['motif_gene'])),orient='index')
false_tfs = pd.DataFrame.from_dict(dict(Counter(false['motif_gene'])),orient='index')
tfs = true_tfs.merge(false_tfs,how='outer',left_index=True, right_index=True)
tfs.columns=['true','false']

tfs = tfs.apply(lambda x: np.array(x) * 1.0/np.nansum(x), axis=0)
tfs = tfs.transpose()
tfs = tfs.fillna(0)

In [284]:
plt.figure()
plt.scatter(tfs.loc['true'],tfs.loc['false'])
plt.plot(np.array(range(0,10))/10.0, np.array(range(0,10))/10.0)
plt.xlabel("True")
plt.ylabel("False")
plt.show()
plt.close()