In [2]:
import numpy as np
import pandas as pd

In [3]:
# Input tables for the gene x enhancer matrix. Per the original note, Gene_TSS
# (and hic_dist) were obtained from EnhancerPredictions.txt in the
# EnhancerPredictions directory.
# NOTE(review): the second file read here is Enhancer_list.bed, not hic_dist;
# confirm where it comes from.
gene_tss_path = "../EnhancerPredictions/Gene_TSS.txt"
enhancer_list_path = "../EnhancerPredictions/Enhancer_list.bed"

# Both files are read as tab-separated.
gene_tss = pd.read_csv(gene_tss_path, sep="\t")
enhancer_list = pd.read_csv(enhancer_list_path, sep="\t")

## Showing format of files 

In [4]:
# Preview the first five rows to show the layout of the gene/TSS table.
gene_tss.head(n=5)

Unnamed: 0,TargetGene,TargetGeneTSS
0,CCT8L2,17073700
1,CCT8L2,17073700
2,CCT8L2,17073700
3,TPTEP1,17082800
4,TPTEP1,17082800


In [41]:
# Preview the first five rows to show the layout of the enhancer table.
enhancer_list.head(n=5)

Unnamed: 0,chr,start,end
0,chr22,17080380,17081156
1,chr22,17083849,17085052
2,chr22,17081312,17081812
3,chr22,17080380,17081156
4,chr22,17085387,17085887


In [6]:
# Assumption carried over from the original analysis: repeated rows of the same
# gene have a similar TSS, so keeping one entry per gene is sufficient.
# Keep only the first occurrence of each gene name (original row labels kept).
is_first_occurrence = ~gene_tss['TargetGene'].duplicated()
unique = gene_tss.loc[is_first_occurrence, 'TargetGene']

# Same gene names, renumbered 0..n-1.
unique_1 = unique.reset_index(drop=True)

# One-column frame of gene names; the column is labelled 'Gene/TSS'.
dict_unique_genes = {'Gene/TSS': unique_1}
genes = pd.DataFrame(data=dict_unique_genes)
genes.head()

Unnamed: 0,Gene/TSS
0,CCT8L2
1,TPTEP1
2,ANKRD62P1-PARP4P3
3,IL17RA
4,HDHD5


In [8]:
# Creating enhancer DF
# Build one "chr,start,end" label per row of enhancer_list and use the labels
# as the column names of an empty frame.

# astype(str) gives the same string form as the element-wise applymap(str),
# without relying on applymap, which recent pandas versions deprecate.
enh_list = enhancer_list.astype(str)

# Vectorised concatenation instead of a row-wise apply/join. The previous
# notnull() filter never removed anything, because the values were already
# converted to strings (missing values had become the text 'nan').
col = enh_list['chr'] + ',' + enh_list['start'] + ',' + enh_list['end']

# NOTE: labels are not de-duplicated here, so an enhancer listed on several
# rows yields repeated column names in df.
df = pd.DataFrame(columns=col)
df.head()

Unnamed: 0,"chr22,17080380,17081156","chr22,17083849,17085052","chr22,17081312,17081812","chr22,17080380,17081156.1","chr22,17085387,17085887","chr22,17083849,17085052.1","chr22,17081312,17081812.1","chr22,17169502,17170002","chr22,17162848,17163348","chr22,17083849,17085052.2",...,"chr22,50965194,50965694","chr22,50977588,50978449","chr22,50936544,50937403","chr22,50965194,50965694.1","chr22,51051582,51052575","chr22,50557000,50557500","chr22,50977588,50978449.1","chr22,50201667,50202291","chr22,50936544,50937403.1","chr22,50965194,50965694.2"


In [9]:
# Zero-filled block with one row per gene and one column per enhancer label.
# Building it through a dict keeps a single entry for each distinct label,
# even when df carries the same label more than once.
n_genes = len(genes)
df_2 = {}
for enhancer_label in df.columns:
    df_2[enhancer_label] = [0] * n_genes

df_3 = pd.DataFrame(data=df_2)
df_3.head()

Unnamed: 0,"chr22,17080380,17081156","chr22,17083849,17085052","chr22,17081312,17081812","chr22,17085387,17085887","chr22,17169502,17170002","chr22,17162848,17163348","chr22,17651972,17653076","chr22,17568542,17569042","chr22,18484010,18484860","chr22,18115642,18116537",...,"chr22,50744278,50744778","chr22,50742949,50743449","chr22,50948185,50948685","chr22,50908215,50908715","chr22,50891626,50892126","chr22,50965194,50965694","chr22,50972567,50973216","chr22,50971768,50972268","chr22,51007093,51007593","chr22,51051582,51052575"
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Empty gene x enhancer matrix: place the gene names beside the zero block
# (aligned on the shared row index), then promote the names to the row index.
new_df = pd.concat([genes, df_3], axis=1).set_index('Gene/TSS')
new_df.head()

Unnamed: 0_level_0,"chr22,17080380,17081156","chr22,17083849,17085052","chr22,17081312,17081812","chr22,17085387,17085887","chr22,17169502,17170002","chr22,17162848,17163348","chr22,17651972,17653076","chr22,17568542,17569042","chr22,18484010,18484860","chr22,18115642,18116537",...,"chr22,50744278,50744778","chr22,50742949,50743449","chr22,50948185,50948685","chr22,50908215,50908715","chr22,50891626,50892126","chr22,50965194,50965694","chr22,50972567,50973216","chr22,50971768,50972268","chr22,51007093,51007593","chr22,51051582,51052575"
Gene/TSS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CCT8L2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TPTEP1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ANKRD62P1-PARP4P3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
IL17RA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HDHD5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Load the full predictions table. Despite the .txt extension it is read as
# comma-separated.
predictions_path = "../EnhancerPredictions/EnhancerPredictions.txt"
enh_predictions = pd.read_csv(predictions_path, sep=",")

In [12]:
# Preview the first five rows of the predictions table.
enh_predictions.head(n=5)

Unnamed: 0,chr,start,end,DHS.ENCFF030DCL.bam.readCount,DHS.ENCFF030DCL.bam.RPM,DHS.ENCFF030DCL.bam.readCount.quantile,DHS.ENCFF030DCL.bam.RPM.quantile,DHS.ENCFF030DCL.bam.RPKM,DHS.ENCFF030DCL.bam.RPKM.quantile,DHS.RPM,...,hic.distance.unscaled,hic.rowmax.unscaled,hic.distance.adj,hic_adjustment,estimatedCP,estimatedCP.adj,ABC.Score.Numerator,ABC.Score,powerlaw.Score.Numerator,powerlaw.Score
0,chr22,17080380,17081156,96,5.6886,0.85,0.85,7.3306,0.8192,5.6886,...,0.026,0.0574,45.8664,0.4975,0.4143,41.4285,130.9569,0.0502,118.2857,0.0707
1,chr22,17083849,17085052,84,4.9775,0.8286,0.8286,4.1376,0.6709,4.9775,...,0.026,0.0574,45.8664,0.4975,0.3174,31.743,415.4143,0.1592,287.498,0.1719
2,chr22,17081312,17081812,32,1.8962,0.6038,0.6038,3.7924,0.6439,1.8962,...,0.026,0.0574,45.8664,0.4975,0.3887,38.8712,112.5083,0.0431,95.3493,0.057
3,chr22,17080380,17081156,96,5.6886,0.85,0.85,7.3306,0.8192,5.6886,...,0.0592,0.0592,100.0,0.4975,0.7109,71.0934,285.5181,0.0671,202.9846,0.0688
4,chr22,17085387,17085887,25,1.4814,0.5189,0.5189,2.9628,0.5501,1.4814,...,0.0435,0.0592,73.8763,0.4975,0.6379,63.7918,125.3966,0.0295,108.2793,0.0367


In [13]:
# Convert every column of the predictions table to its string form.
# astype(str) produces the same result as the element-wise applymap(str)
# without relying on applymap, which recent pandas versions deprecate.
# NOTE: after this cell numeric columns such as 'ABC.Score' hold text, so
# they must be parsed back to numbers before any arithmetic.
enh_predictions = enh_predictions.astype(str)

In [14]:
# List every column name of the predictions table (e.g. to locate
# 'TargetGene' and 'ABC.Score', which are used further down).
enh_predictions.columns

Index(['chr', 'start', 'end', 'DHS.ENCFF030DCL.bam.readCount',
       'DHS.ENCFF030DCL.bam.RPM', 'DHS.ENCFF030DCL.bam.readCount.quantile',
       'DHS.ENCFF030DCL.bam.RPM.quantile', 'DHS.ENCFF030DCL.bam.RPKM',
       'DHS.ENCFF030DCL.bam.RPKM.quantile', 'DHS.RPM', 'DHS.RPM.quantile',
       'DHS.RPKM', 'DHS.RPKM.quantile', 'H3K27ac.ENCFF384ZZM.bam.readCount',
       'H3K27ac.ENCFF384ZZM.bam.RPM',
       'H3K27ac.ENCFF384ZZM.bam.readCount.quantile',
       'H3K27ac.ENCFF384ZZM.bam.RPM.quantile', 'H3K27ac.ENCFF384ZZM.bam.RPKM',
       'H3K27ac.ENCFF384ZZM.bam.RPKM.quantile', 'H3K27ac.RPM',
       'H3K27ac.RPM.quantile', 'H3K27ac.RPKM', 'H3K27ac.RPKM.quantile',
       'normalized_h3K27ac', 'normalized_dhs', 'activity_base', 'cellType',
       'class', 'isPromoterElement', 'isGenicElement', 'isIntergenicElement',
       'enhancerSymbol', 'name', 'distance', 'isSelfPromoter', 'TargetGene',
       'TargetGeneTSS', 'TargetGeneExpression',
       'TargetGenePromoterActivityQuantile', 'hic.dist

In [16]:
# FILLING IN ENHANCER_GENE MATRIX
# For every gene, look up its rows in the predictions table and write each
# ABC score into the matching (gene row, enhancer column) cell of new_df.
# If a gene/enhancer pair occurs on several rows, the last row wins.

# ABC scores are fractional while the matrix was initialised with integer
# zeros, so switch the whole matrix to float before writing into it.
new_df = new_df.astype(float)

# Enhancer labels that have a column in the matrix. Labels without a column
# are skipped instead of silently adding a new column.
known_enhancers = set(new_df.columns)

for i in unique:
    matches = enh_predictions.loc[enh_predictions['TargetGene'] == i]

    # enh_predictions is converted to strings earlier in the notebook, so parse
    # the scores back to numbers rather than storing text in the matrix.
    scores = pd.to_numeric(matches['ABC.Score'])

    # Build labels in the same "chr,start,end" format as the matrix columns.
    coords = matches[['chr', 'start', 'end']].astype(str)
    genomic_locations = coords['chr'] + ',' + coords['start'] + ',' + coords['end']

    # Labels and scores come from the same rows, so they pair up one-to-one.
    for j, score in zip(genomic_locations, scores):
        if j in known_enhancers:
            new_df.at[i, j] = score
    


In [17]:
# Persist the gene x enhancer matrix as a pickle file in the current
# working directory.
output_path = "Enh_Gene_Matrix_ABC_Score"
new_df.to_pickle(output_path)