# 04__preprocess_mpranalyze_compare

in this notebook, i re-shape the counts data to run MPRAnalyze comparison mode. importantly, i also include the negative controls for comparison mode that I made in the previous notebook (01). 

In [1]:
import warnings
warnings.filterwarnings('ignore')

import itertools
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import sys

from scipy.stats import spearmanr

# import utils
sys.path.append("../../../utils")
from plotting_utils import *

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.autolayout'] = False

In [2]:
sns.set(**PAPER_PRESET)
fontsize = PAPER_FONTSIZE

In [3]:
np.random.seed(2019)

## functions

In [4]:
def ctrl_status(row):
    if "CONTROL" in row.comp_id:
        return True
    else:
        return False

## variables

In [5]:
mpranalyze_dir = "../../../data/02__mpra/01__counts/mpranalyze_files"

In [6]:
dna_counts_f = "%s/dna_counts.mpranalyze.for_quantification.txt" % mpranalyze_dir
rna_counts_f = "%s/rna_counts.mpranalyze.for_quantification.txt" % mpranalyze_dir

In [7]:
data_dir = "../../../data/02__mpra/02__activs"

In [8]:
# human_max_f = "%s/human_TSS_vals.max_tile.txt" % data_dir
# mouse_max_f = "%s/mouse_TSS_vals.max_tile.txt" % data_dir

In [9]:
tss_map_f = "../../../data/01__design/01__mpra_list/mpra_tss.with_ids.RECLASSIFIED_WITH_MAX.txt"

In [10]:
dna_col_ann_f = "%s/dna_col_ann.mpranalyze.for_quantification.txt" % mpranalyze_dir
rna_col_ann_f = "%s/rna_col_ann.mpranalyze.for_quantification.txt" % mpranalyze_dir

In [11]:
human_vals_f = "%s/human_TSS_vals.both_tiles.txt" % data_dir
mouse_vals_f = "%s/mouse_TSS_vals.both_tiles.txt" % data_dir

## 1. import data

In [12]:
dna_counts = pd.read_table(dna_counts_f)
dna_counts.head()

Unnamed: 0,element,samp:dna_1__barc:1,samp:dna_1__barc:10,samp:dna_1__barc:11,samp:dna_1__barc:12,samp:dna_1__barc:13,samp:dna_1__barc:2,samp:dna_1__barc:3,samp:dna_1__barc:4,samp:dna_1__barc:5,samp:dna_1__barc:6,samp:dna_1__barc:7,samp:dna_1__barc:8,samp:dna_1__barc:9
0,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,614.0,126.0,94.0,2024.0,968.0,806.0,592.0,78.0,0.0,224.0,478.0,32.0,320.0
1,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,64.0,12.0,52.0,0.0,16.0,94.0,44.0,128.0,178.0,44.0,0.0,20.0,340.0
2,AAAAAAAAAAAAGAGGAGAAATAGATTGTTACCTTATATTATTTAA...,218.0,54.0,170.0,22.0,66.0,182.0,116.0,8.0,28.0,0.0,72.0,116.0,0.0
3,AAAAAAAAAACCGGCAAAATGTCCTTTTCCTTGTTTTGAAAAGACT...,460.0,346.0,626.0,448.0,324.0,502.0,86.0,162.0,210.0,414.0,352.0,524.0,468.0
4,AAAAAAAAAGGCCACGCTCAAAACCCCAGACTAGTTTTCCTCACCA...,596.0,242.0,456.0,180.0,0.0,828.0,226.0,42.0,302.0,446.0,750.0,540.0,650.0


In [13]:
rna_counts = pd.read_table(rna_counts_f)
rna_counts.head()

Unnamed: 0,element,samp:HUES64_rep1__barc:1,samp:HUES64_rep1__barc:10,samp:HUES64_rep1__barc:11,samp:HUES64_rep1__barc:12,samp:HUES64_rep1__barc:13,samp:HUES64_rep1__barc:2,samp:HUES64_rep1__barc:3,samp:HUES64_rep1__barc:4,samp:HUES64_rep1__barc:5,...,samp:mESC_rep3__barc:12,samp:mESC_rep3__barc:13,samp:mESC_rep3__barc:2,samp:mESC_rep3__barc:3,samp:mESC_rep3__barc:4,samp:mESC_rep3__barc:5,samp:mESC_rep3__barc:6,samp:mESC_rep3__barc:7,samp:mESC_rep3__barc:8,samp:mESC_rep3__barc:9
0,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,6863.0,2060.0,40.0,22028.0,7696.0,9308.0,4931.0,519.0,0.0,...,7989.0,2737.0,2496.0,1980.0,232.0,0.0,530.0,1062.0,74.0,1393.0
1,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,9.0,1.0,2.0,0.0,3.0,29.0,151.0,136.0,229.0,...,0.0,4.0,133.0,1.0,31.0,334.0,0.0,0.0,0.0,422.0
2,AAAAAAAAAAAAGAGGAGAAATAGATTGTTACCTTATATTATTTAA...,28.0,264.0,119.0,1.0,7.0,272.0,20.0,15.0,0.0,...,0.0,0.0,117.0,3.0,0.0,3.0,0.0,100.0,0.0,0.0
3,AAAAAAAAAACCGGCAAAATGTCCTTTTCCTTGTTTTGAAAAGACT...,175.0,16.0,789.0,755.0,661.0,415.0,50.0,7.0,600.0,...,266.0,0.0,30.0,10.0,100.0,291.0,222.0,57.0,128.0,68.0
4,AAAAAAAAAGGCCACGCTCAAAACCCCAGACTAGTTTTCCTCACCA...,1045.0,176.0,733.0,97.0,0.0,2295.0,619.0,0.0,1659.0,...,18.0,0.0,327.0,119.0,2.0,326.0,228.0,611.0,553.0,541.0


In [14]:
human_vals = pd.read_table(human_vals_f)
mouse_vals = pd.read_table(mouse_vals_f)
human_vals.head()

Unnamed: 0,HUES64,mESC,HUES64_pval,mESC_pval,HUES64_padj,mESC_padj,element,tile_type,element_id,name,...,dupe_info,HUES64_log,mESC_log,tss_id,species,tss_tile_num,hg19_id,minimal_biotype_hg19,stem_exp_hg19,orig_species
0,1.326796,1.348824,0.145102,0.232651,0.331993,0.545526,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,WILDTYPE,HUMAN_EVO_TSS__h.1133__tile1:129255098-1292552...,HUMAN_EVO_TSS__h.1133__tile1,...,7281.1.0.0,0.122804,0.129955,h.1133,HUMAN,tile1,h.1133,eRNA,0.20306,human
1,1.202075,1.503351,0.242922,0.12895,0.488166,0.3645,CTACTCCGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGGGTGG...,WILDTYPE,HUMAN_EVO_TSS__h.1133__tile2:129255212-1292553...,HUMAN_EVO_TSS__h.1133__tile2,...,7282.1.0.0,0.079932,0.177061,h.1133,HUMAN,tile2,h.1133,eRNA,0.20306,human
2,1.323999,1.457128,0.146955,0.155924,0.335623,0.417211,AAAAAAAAAGGCCACGCTCAAAACCCCAGACTAGTTTTCCTCACCA...,WILDTYPE,HUMAN_EVO_TSS__h.1436__tile1:157192113-1571922...,HUMAN_EVO_TSS__h.1436__tile1,...,4134.1.0.0,0.121888,0.163498,h.1436,HUMAN,tile1,h.1436,eRNA,0.0390438,human
3,0.899162,0.755892,0.571115,0.790854,0.76329,0.837563,TTAAACTCGTTTAAACCAATTTTGAATAAGCTACTTCTTAGGCTAG...,WILDTYPE,HUMAN_EVO_TSS__h.1436__tile2:157191999-1571921...,HUMAN_EVO_TSS__h.1436__tile2,...,4133.1.0.0,-0.046162,-0.12154,h.1436,HUMAN,tile2,h.1436,eRNA,0.0390438,human
4,1.137333,1.305637,0.305141,0.268285,0.564281,0.591663,AAAAAAAAGAAGAAGTGGGGCTTTCCAGAGGTGATTAAGTCATGAG...,WILDTYPE,HUMAN_EVO_TSS__h.1951__tile1:208526987-2085271...,HUMAN_EVO_TSS__h.1951__tile1,...,4272.1.0.0,0.055888,0.115822,h.1951,HUMAN,tile1,h.1951,lncRNA,0.0288889,human


In [15]:
tss_map = pd.read_table(tss_map_f, sep="\t")
tss_map.head()

Unnamed: 0,hg19_id,mm9_id,cage_id_hg19,cage_id_mm9,name_peak_hg19,name_peak_mm9,biotype_hg19,biotype_mm9,minimal_biotype_hg19,minimal_biotype_mm9,...,stem_exp_mm9,max_cage_hg19,max_cage_mm9,orig_species,har,tss_tile_num_max_hg19,tss_tile_num_max_mm9,n_tiles_hg19,n_tiles_mm9,tile_match
0,h.0,m.0,"chr1:2984976..2984989,-","chr4:154011655..154011673,+",ENSG00000177133.6,ENSMUSG00000085069.2,divergent,divergent,lncRNA,lncRNA,...,0.0175,367.0,2286.0,human,False,,,,,
1,h.1,m.0,"chr1:2984997..2985037,-","chr4:154011655..154011673,+",ENSG00000177133.6,ENSMUSG00000085069.2,divergent,divergent,lncRNA,lncRNA,...,0.0175,367.0,2286.0,human,False,,,,,
2,h.2,m.1,"chr1:2985420..2985438,-","chr4:154011250..154011257,+",ENSG00000177133.6,ENSMUSG00000085069.2,divergent,divergent,lncRNA,lncRNA,...,0.0,6966.0,1361.0,human,False,tile1,tile1,2.0,2.0,tile1:tile1
3,h.3,m.2,"chr1:8086546..8086571,+","chr4:150229039..150229050,-",ENSG00000238290.1,ENSMUSG00000078492.3,divergent,antisense,lncRNA,lncRNA,...,0.0,960.0,123.0,human,False,tile1,tile1,1.0,1.0,tile1:tile1
4,h.4,m.3,"chr1:26498321..26498327,-","chr4:133799669..133799683,+",ENSG00000236782.1,ENSMUSG00000086322.7,antisense,protein_coding,other,mRNA,...,0.0,753.0,770.0,human,False,tile1,tile1,2.0,2.0,tile1:tile1


In [16]:
old_dna_col_ann = pd.read_table(dna_col_ann_f, index_col=0)
old_dna_col_ann.head()

Unnamed: 0,sample,condition,barcode
samp:dna_1__barc:1,1,dna,1
samp:dna_1__barc:10,1,dna,10
samp:dna_1__barc:11,1,dna,11
samp:dna_1__barc:12,1,dna,12
samp:dna_1__barc:13,1,dna,13


In [17]:
old_rna_col_ann = pd.read_table(rna_col_ann_f, index_col=0)
old_rna_col_ann.head()

Unnamed: 0,sample,condition,barcode
samp:HUES64_rep1__barc:1,rep1,HUES64,1
samp:HUES64_rep1__barc:10,rep1,HUES64,10
samp:HUES64_rep1__barc:11,rep1,HUES64,11
samp:HUES64_rep1__barc:12,rep1,HUES64,12
samp:HUES64_rep1__barc:13,rep1,HUES64,13


## 2. remove any sequences in TSS map that we removed at initial MPRAnalyze (low counts)

In [18]:
# filter out any elements we removed at initial steps (low dna counts)
human_vals = human_vals[human_vals["element"].isin(dna_counts["element"])]
mouse_vals = mouse_vals[mouse_vals["element"].isin(dna_counts["element"])]

## 3. get positive ctrl dna/rna counts

In [19]:
dna_counts_ctrl = dna_counts[dna_counts["element"].str.contains("samp")]
print(len(dna_counts_ctrl))
rna_counts_ctrl = rna_counts[rna_counts["element"].str.contains("samp")]
print(len(rna_counts_ctrl))

400
400


# first make files needed for seq. comparison (native and cis effects)

## 1. merge ortholog pairs w/ counts
old:::: this time, always pair tile1 with tile1 and tile2 with tile2
new:::: pair tile1 with tile1 unless maximum is tile2 in both species

In [20]:
tss_max = tss_map[["hg19_id", "mm9_id", "tile_match"]]
tss_max.head()

Unnamed: 0,hg19_id,mm9_id,tile_match
0,h.0,m.0,
1,h.1,m.0,
2,h.2,m.1,tile1:tile1
3,h.3,m.2,tile1:tile1
4,h.4,m.3,tile1:tile1


In [21]:
human_vals.head()

Unnamed: 0,HUES64,mESC,HUES64_pval,mESC_pval,HUES64_padj,mESC_padj,element,tile_type,element_id,name,...,dupe_info,HUES64_log,mESC_log,tss_id,species,tss_tile_num,hg19_id,minimal_biotype_hg19,stem_exp_hg19,orig_species
0,1.326796,1.348824,0.145102,0.232651,0.331993,0.545526,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,WILDTYPE,HUMAN_EVO_TSS__h.1133__tile1:129255098-1292552...,HUMAN_EVO_TSS__h.1133__tile1,...,7281.1.0.0,0.122804,0.129955,h.1133,HUMAN,tile1,h.1133,eRNA,0.20306,human
1,1.202075,1.503351,0.242922,0.12895,0.488166,0.3645,CTACTCCGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGGGTGG...,WILDTYPE,HUMAN_EVO_TSS__h.1133__tile2:129255212-1292553...,HUMAN_EVO_TSS__h.1133__tile2,...,7282.1.0.0,0.079932,0.177061,h.1133,HUMAN,tile2,h.1133,eRNA,0.20306,human
2,1.323999,1.457128,0.146955,0.155924,0.335623,0.417211,AAAAAAAAAGGCCACGCTCAAAACCCCAGACTAGTTTTCCTCACCA...,WILDTYPE,HUMAN_EVO_TSS__h.1436__tile1:157192113-1571922...,HUMAN_EVO_TSS__h.1436__tile1,...,4134.1.0.0,0.121888,0.163498,h.1436,HUMAN,tile1,h.1436,eRNA,0.0390438,human
3,0.899162,0.755892,0.571115,0.790854,0.76329,0.837563,TTAAACTCGTTTAAACCAATTTTGAATAAGCTACTTCTTAGGCTAG...,WILDTYPE,HUMAN_EVO_TSS__h.1436__tile2:157191999-1571921...,HUMAN_EVO_TSS__h.1436__tile2,...,4133.1.0.0,-0.046162,-0.12154,h.1436,HUMAN,tile2,h.1436,eRNA,0.0390438,human
4,1.137333,1.305637,0.305141,0.268285,0.564281,0.591663,AAAAAAAAGAAGAAGTGGGGCTTTCCAGAGGTGATTAAGTCATGAG...,WILDTYPE,HUMAN_EVO_TSS__h.1951__tile1:208526987-2085271...,HUMAN_EVO_TSS__h.1951__tile1,...,4272.1.0.0,0.055888,0.115822,h.1951,HUMAN,tile1,h.1951,lncRNA,0.0288889,human


In [22]:
dna_counts_human_all = human_vals[["element", "tss_id", "tss_tile_num"]].merge(dna_counts, on="element").drop_duplicates()
dna_counts_mouse_all = mouse_vals[["element", "tss_id", "tss_tile_num"]].merge(dna_counts, on="element").drop_duplicates()
dna_counts_human_all.head()

Unnamed: 0,element,tss_id,tss_tile_num,samp:dna_1__barc:1,samp:dna_1__barc:10,samp:dna_1__barc:11,samp:dna_1__barc:12,samp:dna_1__barc:13,samp:dna_1__barc:2,samp:dna_1__barc:3,samp:dna_1__barc:4,samp:dna_1__barc:5,samp:dna_1__barc:6,samp:dna_1__barc:7,samp:dna_1__barc:8,samp:dna_1__barc:9
0,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,h.1133,tile1,64.0,12.0,52.0,0.0,16.0,94.0,44.0,128.0,178.0,44.0,0.0,20.0,340.0
1,CTACTCCGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGGGTGG...,h.1133,tile2,94.0,38.0,102.0,0.0,188.0,372.0,0.0,0.0,0.0,94.0,136.0,18.0,14.0
2,AAAAAAAAAGGCCACGCTCAAAACCCCAGACTAGTTTTCCTCACCA...,h.1436,tile1,596.0,242.0,456.0,180.0,0.0,828.0,226.0,42.0,302.0,446.0,750.0,540.0,650.0
3,TTAAACTCGTTTAAACCAATTTTGAATAAGCTACTTCTTAGGCTAG...,h.1436,tile2,148.0,58.0,26.0,42.0,42.0,180.0,0.0,40.0,150.0,8.0,0.0,206.0,280.0
4,AAAAAAAAGAAGAAGTGGGGCTTTCCAGAGGTGATTAAGTCATGAG...,h.1951,tile1,1540.0,3692.0,824.0,1232.0,60.0,538.0,32.0,1460.0,158.0,64.0,706.0,198.0,756.0


In [23]:
print(len(dna_counts_human_all))
print(len(dna_counts_mouse_all))

5964
5897


In [24]:
rna_counts_human_all = human_vals[["element", "tss_id", "tss_tile_num"]].merge(rna_counts, on="element").drop_duplicates()
rna_counts_mouse_all = mouse_vals[["element", "tss_id", "tss_tile_num"]].merge(rna_counts, on="element").drop_duplicates()
rna_counts_human_all.head()

Unnamed: 0,element,tss_id,tss_tile_num,samp:HUES64_rep1__barc:1,samp:HUES64_rep1__barc:10,samp:HUES64_rep1__barc:11,samp:HUES64_rep1__barc:12,samp:HUES64_rep1__barc:13,samp:HUES64_rep1__barc:2,samp:HUES64_rep1__barc:3,...,samp:mESC_rep3__barc:12,samp:mESC_rep3__barc:13,samp:mESC_rep3__barc:2,samp:mESC_rep3__barc:3,samp:mESC_rep3__barc:4,samp:mESC_rep3__barc:5,samp:mESC_rep3__barc:6,samp:mESC_rep3__barc:7,samp:mESC_rep3__barc:8,samp:mESC_rep3__barc:9
0,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,h.1133,tile1,9.0,1.0,2.0,0.0,3.0,29.0,151.0,...,0.0,4.0,133.0,1.0,31.0,334.0,0.0,0.0,0.0,422.0
1,CTACTCCGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGGGTGG...,h.1133,tile2,220.0,0.0,26.0,0.0,76.0,638.0,0.0,...,0.0,81.0,130.0,0.0,0.0,0.0,62.0,26.0,0.0,0.0
2,AAAAAAAAAGGCCACGCTCAAAACCCCAGACTAGTTTTCCTCACCA...,h.1436,tile1,1045.0,176.0,733.0,97.0,0.0,2295.0,619.0,...,18.0,0.0,327.0,119.0,2.0,326.0,228.0,611.0,553.0,541.0
3,TTAAACTCGTTTAAACCAATTTTGAATAAGCTACTTCTTAGGCTAG...,h.1436,tile2,27.0,0.0,69.0,223.0,121.0,76.0,0.0,...,35.0,3.0,38.0,0.0,0.0,7.0,0.0,0.0,78.0,68.0
4,AAAAAAAAGAAGAAGTGGGGCTTTCCAGAGGTGATTAAGTCATGAG...,h.1951,tile1,1995.0,2523.0,1476.0,759.0,4.0,751.0,5.0,...,383.0,0.0,398.0,1.0,1088.0,0.0,28.0,600.0,127.0,272.0


In [25]:
print(len(rna_counts_human_all))
print(len(rna_counts_mouse_all))

5964
5897


## 2. merge human/mouse counts into 1 dataframe

new: merge tile1 with tile1 unless maximum is tile2 in both species (in which case use tile2); only consider tiles where both are avail in both species

In [26]:
dna_counts_human_tile1 = dna_counts_human_all[dna_counts_human_all["tss_tile_num"] == "tile1"]
dna_counts_human_tile2 = dna_counts_human_all[dna_counts_human_all["tss_tile_num"] == "tile2"]
print(len(dna_counts_human_tile1))
print(len(dna_counts_human_tile2))

2943
3021


In [27]:
rna_counts_human_tile1 = rna_counts_human_all[rna_counts_human_all["tss_tile_num"] == "tile1"]
rna_counts_human_tile2 = rna_counts_human_all[rna_counts_human_all["tss_tile_num"] == "tile2"]
print(len(rna_counts_human_tile1))
print(len(rna_counts_human_tile2))

2943
3021


In [28]:
dna_counts_mouse_tile1 = dna_counts_mouse_all[dna_counts_mouse_all["tss_tile_num"] == "tile1"]
dna_counts_mouse_tile2 = dna_counts_mouse_all[dna_counts_mouse_all["tss_tile_num"] == "tile2"]
print(len(dna_counts_mouse_tile1))
print(len(dna_counts_mouse_tile2))

2917
2980


In [29]:
rna_counts_mouse_tile1 = rna_counts_mouse_all[rna_counts_mouse_all["tss_tile_num"] == "tile1"]
rna_counts_mouse_tile2 = rna_counts_mouse_all[rna_counts_mouse_all["tss_tile_num"] == "tile2"]
print(len(rna_counts_mouse_tile1))
print(len(rna_counts_mouse_tile2))

2917
2980


In [30]:
#both_tile_ids = tss_map[(tss_map["n_tiles_hg19"] >= 2) & (tss_map["n_tiles_mm9"] >= 2)]
both_tile_ids = tss_map[(~pd.isnull(tss_map["n_tiles_hg19"]) & ~(pd.isnull(tss_map["n_tiles_mm9"])))]
len(both_tile_ids)

3312

In [31]:
tile1_ids = both_tile_ids[(both_tile_ids["tile_match"] == "tile1:tile1") | 
                          (both_tile_ids["tile_match"] == "tile1:tile2")][["hg19_id", "mm9_id"]].drop_duplicates()
len(tile1_ids)

2693

In [32]:
tile2_ids = both_tile_ids[(both_tile_ids["tile_match"] == "tile2:tile2")][["hg19_id", "mm9_id"]].drop_duplicates()
len(tile2_ids)

598

In [33]:
tss_map_mpra_tile1 = tile1_ids.merge(tss_map, on=["hg19_id", "mm9_id"])
tss_map_mpra_tile1 = tss_map_mpra_tile1.merge(rna_counts_human_tile1, left_on="hg19_id", 
                                              right_on="tss_id").merge(rna_counts_mouse_tile1, left_on="mm9_id", 
                                                                       right_on="tss_id",
                                                                       suffixes=("___seq:human", "___seq:mouse"))
tss_map_mpra_tile1.drop_duplicates(inplace=True)
print(len(tss_map_mpra_tile1))
tss_map_mpra_tile1.head(5)

2352


Unnamed: 0,hg19_id,mm9_id,cage_id_hg19,cage_id_mm9,name_peak_hg19,name_peak_mm9,biotype_hg19,biotype_mm9,minimal_biotype_hg19,minimal_biotype_mm9,...,samp:mESC_rep3__barc:12___seq:mouse,samp:mESC_rep3__barc:13___seq:mouse,samp:mESC_rep3__barc:2___seq:mouse,samp:mESC_rep3__barc:3___seq:mouse,samp:mESC_rep3__barc:4___seq:mouse,samp:mESC_rep3__barc:5___seq:mouse,samp:mESC_rep3__barc:6___seq:mouse,samp:mESC_rep3__barc:7___seq:mouse,samp:mESC_rep3__barc:8___seq:mouse,samp:mESC_rep3__barc:9___seq:mouse
0,h.2,m.1,"chr1:2985420..2985438,-","chr4:154011250..154011257,+",ENSG00000177133.6,ENSMUSG00000085069.2,divergent,divergent,lncRNA,lncRNA,...,733.0,1.0,349.0,875.0,40.0,22.0,146.0,118.0,284.0,694.0
1,h.3,m.2,"chr1:8086546..8086571,+","chr4:150229039..150229050,-",ENSG00000238290.1,ENSMUSG00000078492.3,divergent,antisense,lncRNA,lncRNA,...,0.0,2105.0,197.0,4166.0,157.0,978.0,286.0,268.0,0.0,0.0
2,h.4,m.3,"chr1:26498321..26498327,-","chr4:133799669..133799683,+",ENSG00000236782.1,ENSMUSG00000086322.7,antisense,protein_coding,other,mRNA,...,428.0,115.0,86.0,77.0,534.0,104.0,240.0,983.0,251.0,0.0
3,h.5,m.4,"chr1:65533390..65533443,-","chr4:101029310..101029334,-",ENSG00000231485.1,ENSMUSG00000086782.2,intergenic,intergenic,lncRNA,other,...,1447.0,304.0,0.0,578.0,154.0,279.0,271.0,979.0,208.0,0.0
4,h.6,m.4,"chr1:65533457..65533465,-","chr4:101029310..101029334,-",ENSG00000231485.1,ENSMUSG00000086782.2,intergenic,intergenic,lncRNA,other,...,1447.0,304.0,0.0,578.0,154.0,279.0,271.0,979.0,208.0,0.0


In [34]:
tss_map_mpra_tile2 = tile2_ids.merge(tss_map, on=["hg19_id", "mm9_id"])
tss_map_mpra_tile2 = tss_map_mpra_tile2.merge(rna_counts_human_tile2, left_on="hg19_id", 
                                              right_on="tss_id").merge(rna_counts_mouse_tile2, left_on="mm9_id", 
                                                                       right_on="tss_id",
                                                                       suffixes=("___seq:human", "___seq:mouse"))
tss_map_mpra_tile2.drop_duplicates(inplace=True)
print(len(tss_map_mpra_tile2))
tss_map_mpra_tile2.head(5)

600


Unnamed: 0,hg19_id,mm9_id,cage_id_hg19,cage_id_mm9,name_peak_hg19,name_peak_mm9,biotype_hg19,biotype_mm9,minimal_biotype_hg19,minimal_biotype_mm9,...,samp:mESC_rep3__barc:12___seq:mouse,samp:mESC_rep3__barc:13___seq:mouse,samp:mESC_rep3__barc:2___seq:mouse,samp:mESC_rep3__barc:3___seq:mouse,samp:mESC_rep3__barc:4___seq:mouse,samp:mESC_rep3__barc:5___seq:mouse,samp:mESC_rep3__barc:6___seq:mouse,samp:mESC_rep3__barc:7___seq:mouse,samp:mESC_rep3__barc:8___seq:mouse,samp:mESC_rep3__barc:9___seq:mouse
0,h.15,m.11,"chr1:151763294..151763320,+","chr3:94216809..94216812,-",ENSG00000203288.3,ENSMUSG00000097515.2,divergent,divergent,lncRNA,lncRNA,...,1245.0,2.0,3.0,24.0,1765.0,0.0,276.0,530.0,0.0,177.0
1,h.16,m.11,"chr1:151763321..151763339,+","chr3:94216809..94216812,-",ENSG00000203288.3,ENSMUSG00000097515.2,divergent,divergent,lncRNA,lncRNA,...,1245.0,2.0,3.0,24.0,1765.0,0.0,276.0,530.0,0.0,177.0
2,h.19,m.14,"chr1:179851611..179851688,-","chr1:157883318..157883388,+",ENSG00000272906.1,ENSMUSG00000050565.16,divergent,protein_coding,lncRNA,mRNA,...,0.0,3247.0,0.0,7257.0,7310.0,3933.0,6078.0,955.0,706.0,2118.0
3,h.25,m.20,"chr1:208042503..208042527,-","chr1:196802536..196802585,+",ENSG00000203709.5,ENSMUSG00000096929.7,intergenic,intergenic,other,other,...,0.0,239.0,17.0,443.0,0.0,244.0,92.0,21.0,19.0,123.0
4,h.27,m.22,"chr1:209602671..209602695,+","chr1:195336318..195336327,-",ENSG00000230937.5,ENSMUSG00000097850.2,intergenic,intergenic,other,other,...,2644.0,2591.0,66.0,896.0,321.0,834.0,826.0,967.0,508.0,2800.0


In [35]:
tss_map_dna_tile1 = tile1_ids.merge(tss_map, on=["hg19_id", "mm9_id"])
tss_map_dna_tile1 = tss_map_dna_tile1.merge(dna_counts_human_tile1, left_on="hg19_id", 
                                              right_on="tss_id").merge(dna_counts_mouse_tile1, left_on="mm9_id", 
                                                                       right_on="tss_id",
                                                                       suffixes=("___seq:human", "___seq:mouse"))
tss_map_dna_tile1.drop_duplicates(inplace=True)
print(len(tss_map_dna_tile1))
tss_map_dna_tile1.head(5)

2352


Unnamed: 0,hg19_id,mm9_id,cage_id_hg19,cage_id_mm9,name_peak_hg19,name_peak_mm9,biotype_hg19,biotype_mm9,minimal_biotype_hg19,minimal_biotype_mm9,...,samp:dna_1__barc:12___seq:mouse,samp:dna_1__barc:13___seq:mouse,samp:dna_1__barc:2___seq:mouse,samp:dna_1__barc:3___seq:mouse,samp:dna_1__barc:4___seq:mouse,samp:dna_1__barc:5___seq:mouse,samp:dna_1__barc:6___seq:mouse,samp:dna_1__barc:7___seq:mouse,samp:dna_1__barc:8___seq:mouse,samp:dna_1__barc:9___seq:mouse
0,h.2,m.1,"chr1:2985420..2985438,-","chr4:154011250..154011257,+",ENSG00000177133.6,ENSMUSG00000085069.2,divergent,divergent,lncRNA,lncRNA,...,2336.0,126.0,1098.0,1518.0,226.0,176.0,150.0,372.0,550.0,2208.0
1,h.3,m.2,"chr1:8086546..8086571,+","chr4:150229039..150229050,-",ENSG00000238290.1,ENSMUSG00000078492.3,divergent,antisense,lncRNA,lncRNA,...,0.0,1212.0,200.0,2244.0,138.0,590.0,422.0,206.0,74.0,38.0
2,h.4,m.3,"chr1:26498321..26498327,-","chr4:133799669..133799683,+",ENSG00000236782.1,ENSMUSG00000086322.7,antisense,protein_coding,other,mRNA,...,442.0,296.0,288.0,330.0,1914.0,228.0,446.0,1158.0,358.0,190.0
3,h.5,m.4,"chr1:65533390..65533443,-","chr4:101029310..101029334,-",ENSG00000231485.1,ENSMUSG00000086782.2,intergenic,intergenic,lncRNA,other,...,1060.0,256.0,0.0,250.0,316.0,408.0,352.0,1152.0,310.0,54.0
4,h.6,m.4,"chr1:65533457..65533465,-","chr4:101029310..101029334,-",ENSG00000231485.1,ENSMUSG00000086782.2,intergenic,intergenic,lncRNA,other,...,1060.0,256.0,0.0,250.0,316.0,408.0,352.0,1152.0,310.0,54.0


In [36]:
tss_map_dna_tile2 = tile2_ids.merge(tss_map, on=["hg19_id", "mm9_id"])
tss_map_dna_tile2 = tss_map_dna_tile2.merge(dna_counts_human_tile2, left_on="hg19_id", 
                                              right_on="tss_id").merge(dna_counts_mouse_tile2, left_on="mm9_id", 
                                                                       right_on="tss_id",
                                                                       suffixes=("___seq:human", "___seq:mouse"))
tss_map_dna_tile2.drop_duplicates(inplace=True)
print(len(tss_map_dna_tile2))
tss_map_dna_tile2.head(5)

600


Unnamed: 0,hg19_id,mm9_id,cage_id_hg19,cage_id_mm9,name_peak_hg19,name_peak_mm9,biotype_hg19,biotype_mm9,minimal_biotype_hg19,minimal_biotype_mm9,...,samp:dna_1__barc:12___seq:mouse,samp:dna_1__barc:13___seq:mouse,samp:dna_1__barc:2___seq:mouse,samp:dna_1__barc:3___seq:mouse,samp:dna_1__barc:4___seq:mouse,samp:dna_1__barc:5___seq:mouse,samp:dna_1__barc:6___seq:mouse,samp:dna_1__barc:7___seq:mouse,samp:dna_1__barc:8___seq:mouse,samp:dna_1__barc:9___seq:mouse
0,h.15,m.11,"chr1:151763294..151763320,+","chr3:94216809..94216812,-",ENSG00000203288.3,ENSMUSG00000097515.2,divergent,divergent,lncRNA,lncRNA,...,1648.0,92.0,144.0,132.0,1462.0,92.0,716.0,764.0,20.0,714.0
1,h.16,m.11,"chr1:151763321..151763339,+","chr3:94216809..94216812,-",ENSG00000203288.3,ENSMUSG00000097515.2,divergent,divergent,lncRNA,lncRNA,...,1648.0,92.0,144.0,132.0,1462.0,92.0,716.0,764.0,20.0,714.0
2,h.19,m.14,"chr1:179851611..179851688,-","chr1:157883318..157883388,+",ENSG00000272906.1,ENSMUSG00000050565.16,divergent,protein_coding,lncRNA,mRNA,...,0.0,870.0,0.0,3476.0,3782.0,1682.0,2454.0,288.0,402.0,838.0
3,h.25,m.20,"chr1:208042503..208042527,-","chr1:196802536..196802585,+",ENSG00000203709.5,ENSMUSG00000096929.7,intergenic,intergenic,other,other,...,0.0,740.0,92.0,890.0,54.0,264.0,600.0,198.0,296.0,318.0
4,h.27,m.22,"chr1:209602671..209602695,+","chr1:195336318..195336327,-",ENSG00000230937.5,ENSMUSG00000097850.2,intergenic,intergenic,other,other,...,1800.0,924.0,158.0,454.0,266.0,700.0,608.0,694.0,178.0,1284.0


old: merge tile1 with tile1 and tile2 with tile2 always

In [37]:
# tss_map_mpra_tile1 = tss_map.merge(rna_counts_human_tile1, left_on="hg19_id", 
#                                    right_on="tss_id").merge(rna_counts_mouse_tile1, left_on="mm9_id", right_on="tss_id",
#                                                             suffixes=("___seq:human", "___seq:mouse"))
# tss_map_mpra_tile1.drop_duplicates(inplace=True)
# print(len(tss_map_mpra_tile1))
# tss_map_mpra_tile1.head(5)

In [38]:
# tss_map_mpra_tile2 = tss_map.merge(rna_counts_human_tile2, left_on="hg19_id", 
#                                    right_on="tss_id").merge(rna_counts_mouse_tile2, left_on="mm9_id", right_on="tss_id",
#                                                             suffixes=("___seq:human", "___seq:mouse"))
# tss_map_mpra_tile2.drop_duplicates(inplace=True)
# print(len(tss_map_mpra_tile2))
# tss_map_mpra_tile2.head(5)

In [39]:
# tss_map_dna_tile1 = tss_map.merge(dna_counts_human_tile1, left_on="hg19_id", 
#                                   right_on="tss_id").merge(dna_counts_mouse_tile1, left_on="mm9_id", right_on="tss_id",
#                                                            suffixes=("___seq:human", "___seq:mouse"))
# tss_map_dna_tile1.drop_duplicates(inplace=True)
# print(len(tss_map_dna_tile1))
# tss_map_dna_tile1.head(5)

In [40]:
# tss_map_dna_tile2 = tss_map.merge(dna_counts_human_tile2, left_on="hg19_id", 
#                                   right_on="tss_id").merge(dna_counts_mouse_tile2, left_on="mm9_id", right_on="tss_id",
#                                                            suffixes=("___seq:human", "___seq:mouse"))
# tss_map_dna_tile2.drop_duplicates(inplace=True)
# print(len(tss_map_dna_tile2))
# tss_map_dna_tile2.head(5)

## 3. assign each pair an ID

In [41]:
HUES64_rna_cols = [x for x in tss_map_mpra_tile1.columns if "samp:HUES64" in x]
mESC_rna_cols = [x for x in tss_map_mpra_tile1.columns if "samp:mESC" in x]
all_dna_cols = [x for x in tss_map_dna_tile1.columns if "samp:dna" in x]

human_cols = ["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9"]
human_cols.extend(HUES64_rna_cols)

mouse_cols = ["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9"]
mouse_cols.extend(mESC_rna_cols)

dna_cols = ["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9"]
dna_cols.extend(all_dna_cols)

tss_map_mpra_human_tile1 = tss_map_mpra_tile1[human_cols]
tss_map_mpra_mouse_tile1 = tss_map_mpra_tile1[mouse_cols]

tss_map_mpra_human_tile2 = tss_map_mpra_tile2[human_cols]
tss_map_mpra_mouse_tile2 = tss_map_mpra_tile2[mouse_cols]

tss_map_dna_tile1 = tss_map_dna_tile1[dna_cols]
tss_map_dna_tile2 = tss_map_dna_tile2[dna_cols]

tss_map_mpra_human_tile1.head()

Unnamed: 0,hg19_id,biotype_hg19,mm9_id,biotype_mm9,samp:HUES64_rep1__barc:1___seq:human,samp:HUES64_rep1__barc:10___seq:human,samp:HUES64_rep1__barc:11___seq:human,samp:HUES64_rep1__barc:12___seq:human,samp:HUES64_rep1__barc:13___seq:human,samp:HUES64_rep1__barc:2___seq:human,...,samp:HUES64_rep3__barc:12___seq:mouse,samp:HUES64_rep3__barc:13___seq:mouse,samp:HUES64_rep3__barc:2___seq:mouse,samp:HUES64_rep3__barc:3___seq:mouse,samp:HUES64_rep3__barc:4___seq:mouse,samp:HUES64_rep3__barc:5___seq:mouse,samp:HUES64_rep3__barc:6___seq:mouse,samp:HUES64_rep3__barc:7___seq:mouse,samp:HUES64_rep3__barc:8___seq:mouse,samp:HUES64_rep3__barc:9___seq:mouse
0,h.2,divergent,m.1,divergent,8374.0,3343.0,6548.0,436.0,2117.0,2659.0,...,4038.0,207.0,1929.0,2188.0,199.0,146.0,116.0,612.0,722.0,2938.0
1,h.3,divergent,m.2,antisense,1.0,0.0,18.0,0.0,149.0,1.0,...,0.0,1973.0,681.0,4594.0,250.0,1068.0,543.0,462.0,0.0,43.0
2,h.4,antisense,m.3,protein_coding,1.0,1029.0,141.0,1729.0,2549.0,995.0,...,263.0,406.0,302.0,354.0,2587.0,502.0,351.0,2488.0,840.0,0.0
3,h.5,intergenic,m.4,intergenic,3487.0,433.0,3121.0,351.0,11277.0,1028.0,...,4111.0,811.0,0.0,1075.0,1158.0,1241.0,929.0,4320.0,775.0,77.0
4,h.6,intergenic,m.4,intergenic,14595.0,25119.0,9055.0,6041.0,10936.0,13086.0,...,4111.0,811.0,0.0,1075.0,1158.0,1241.0,929.0,4320.0,775.0,77.0


In [42]:
tss_map_mpra_human_tile1["tile_num"] = "tile1"
tss_map_mpra_mouse_tile1["tile_num"] = "tile1"
tss_map_mpra_human_tile2["tile_num"] = "tile2"
tss_map_mpra_mouse_tile2["tile_num"] = "tile2"
tss_map_dna_tile1["tile_num"] = "tile1"
tss_map_dna_tile2["tile_num"] = "tile2"

In [43]:
# all tile 1s
tss_map_mpra_human_tile1["comp_id"] = tss_map_mpra_human_tile1["hg19_id"] + "__" + tss_map_mpra_human_tile1["biotype_hg19"] + "__" + tss_map_mpra_human_tile1["mm9_id"] + "__" + tss_map_mpra_human_tile1["biotype_mm9"] + "__" + tss_map_mpra_human_tile1["tile_num"] 
tss_map_mpra_mouse_tile1["comp_id"] = tss_map_mpra_mouse_tile1["hg19_id"] + "__" + tss_map_mpra_mouse_tile1["biotype_hg19"] + "__" + tss_map_mpra_mouse_tile1["mm9_id"] + "__" + tss_map_mpra_mouse_tile1["biotype_mm9"] + "__" + tss_map_mpra_mouse_tile1["tile_num"]
tss_map_dna_tile1["comp_id"] = tss_map_dna_tile1["hg19_id"] + "__" + tss_map_dna_tile1["biotype_hg19"] + "__" + tss_map_dna_tile1["mm9_id"] + "__" + tss_map_dna_tile1["biotype_mm9"] + "__" + tss_map_dna_tile1["tile_num"]

# all tile 2s
tss_map_mpra_human_tile2["comp_id"] = tss_map_mpra_human_tile2["hg19_id"] + "__" + tss_map_mpra_human_tile2["biotype_hg19"] + "__" + tss_map_mpra_human_tile2["mm9_id"] + "__" + tss_map_mpra_human_tile2["biotype_mm9"] + "__" + tss_map_mpra_human_tile2["tile_num"] 
tss_map_mpra_mouse_tile2["comp_id"] = tss_map_mpra_mouse_tile2["hg19_id"] + "__" + tss_map_mpra_mouse_tile2["biotype_hg19"] + "__" + tss_map_mpra_mouse_tile2["mm9_id"] + "__" + tss_map_mpra_mouse_tile2["biotype_mm9"] + "__" + tss_map_mpra_mouse_tile2["tile_num"]
tss_map_dna_tile2["comp_id"] = tss_map_dna_tile2["hg19_id"] + "__" + tss_map_dna_tile2["biotype_hg19"] + "__" + tss_map_dna_tile2["mm9_id"] + "__" + tss_map_dna_tile2["biotype_mm9"] + "__" + tss_map_dna_tile2["tile_num"]

# drop redundant tiles
tss_map_mpra_human_tile1.drop(["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9", "tile_num"], axis=1, inplace=True)
tss_map_mpra_mouse_tile1.drop(["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9", "tile_num"], axis=1, inplace=True)
tss_map_dna_tile1.drop(["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9", "tile_num"], axis=1, inplace=True)
tss_map_mpra_human_tile2.drop(["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9", "tile_num"], axis=1, inplace=True)
tss_map_mpra_mouse_tile2.drop(["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9", "tile_num"], axis=1, inplace=True)
tss_map_dna_tile2.drop(["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9", "tile_num"], axis=1, inplace=True)

human_cols = ["comp_id"]
human_cols.extend(HUES64_rna_cols)

mouse_cols = ["comp_id"]
mouse_cols.extend(mESC_rna_cols)

dna_cols = ["comp_id"]
dna_cols.extend(all_dna_cols)

tss_map_mpra_human_tile1 = tss_map_mpra_human_tile1[human_cols]
tss_map_mpra_human_tile2 = tss_map_mpra_human_tile2[human_cols]
tss_map_mpra_mouse_tile1 = tss_map_mpra_mouse_tile1[mouse_cols]
tss_map_mpra_mouse_tile2 = tss_map_mpra_mouse_tile2[mouse_cols]
tss_map_dna_tile1 = tss_map_dna_tile1[dna_cols]
tss_map_dna_tile2 = tss_map_dna_tile2[dna_cols]

tss_map_mpra_human_tile1.head()

Unnamed: 0,comp_id,samp:HUES64_rep1__barc:1___seq:human,samp:HUES64_rep1__barc:10___seq:human,samp:HUES64_rep1__barc:11___seq:human,samp:HUES64_rep1__barc:12___seq:human,samp:HUES64_rep1__barc:13___seq:human,samp:HUES64_rep1__barc:2___seq:human,samp:HUES64_rep1__barc:3___seq:human,samp:HUES64_rep1__barc:4___seq:human,samp:HUES64_rep1__barc:5___seq:human,...,samp:HUES64_rep3__barc:12___seq:mouse,samp:HUES64_rep3__barc:13___seq:mouse,samp:HUES64_rep3__barc:2___seq:mouse,samp:HUES64_rep3__barc:3___seq:mouse,samp:HUES64_rep3__barc:4___seq:mouse,samp:HUES64_rep3__barc:5___seq:mouse,samp:HUES64_rep3__barc:6___seq:mouse,samp:HUES64_rep3__barc:7___seq:mouse,samp:HUES64_rep3__barc:8___seq:mouse,samp:HUES64_rep3__barc:9___seq:mouse
0,h.2__divergent__m.1__divergent__tile1,8374.0,3343.0,6548.0,436.0,2117.0,2659.0,0.0,48.0,347.0,...,4038.0,207.0,1929.0,2188.0,199.0,146.0,116.0,612.0,722.0,2938.0
1,h.3__divergent__m.2__antisense__tile1,1.0,0.0,18.0,0.0,149.0,1.0,5.0,0.0,278.0,...,0.0,1973.0,681.0,4594.0,250.0,1068.0,543.0,462.0,0.0,43.0
2,h.4__antisense__m.3__protein_coding__tile1,1.0,1029.0,141.0,1729.0,2549.0,995.0,0.0,139.0,71.0,...,263.0,406.0,302.0,354.0,2587.0,502.0,351.0,2488.0,840.0,0.0
3,h.5__intergenic__m.4__intergenic__tile1,3487.0,433.0,3121.0,351.0,11277.0,1028.0,10653.0,2691.0,1426.0,...,4111.0,811.0,0.0,1075.0,1158.0,1241.0,929.0,4320.0,775.0,77.0
4,h.6__intergenic__m.4__intergenic__tile1,14595.0,25119.0,9055.0,6041.0,10936.0,13086.0,9383.0,3051.0,1248.0,...,4111.0,811.0,0.0,1075.0,1158.0,1241.0,929.0,4320.0,775.0,77.0


In [44]:
# append tile 1 and tile2
tss_map_mpra_human = tss_map_mpra_human_tile1.append(tss_map_mpra_human_tile2).drop_duplicates()
tss_map_mpra_mouse = tss_map_mpra_mouse_tile1.append(tss_map_mpra_mouse_tile2).drop_duplicates()
tss_map_dna = tss_map_dna_tile1.append(tss_map_dna_tile2).drop_duplicates()
print(len(tss_map_mpra_human))
print(len(tss_map_mpra_mouse))
print(len(tss_map_dna))
tss_map_mpra_human.sample(5)

2934
2934
2934


Unnamed: 0,comp_id,samp:HUES64_rep1__barc:1___seq:human,samp:HUES64_rep1__barc:10___seq:human,samp:HUES64_rep1__barc:11___seq:human,samp:HUES64_rep1__barc:12___seq:human,samp:HUES64_rep1__barc:13___seq:human,samp:HUES64_rep1__barc:2___seq:human,samp:HUES64_rep1__barc:3___seq:human,samp:HUES64_rep1__barc:4___seq:human,samp:HUES64_rep1__barc:5___seq:human,...,samp:HUES64_rep3__barc:12___seq:mouse,samp:HUES64_rep3__barc:13___seq:mouse,samp:HUES64_rep3__barc:2___seq:mouse,samp:HUES64_rep3__barc:3___seq:mouse,samp:HUES64_rep3__barc:4___seq:mouse,samp:HUES64_rep3__barc:5___seq:mouse,samp:HUES64_rep3__barc:6___seq:mouse,samp:HUES64_rep3__barc:7___seq:mouse,samp:HUES64_rep3__barc:8___seq:mouse,samp:HUES64_rep3__barc:9___seq:mouse
1976,h.3060__unassigned__m.2872__protein_coding__tile1,2302.0,687.0,1947.0,2.0,7157.0,538.0,604.0,3090.0,3849.0,...,6085.0,2322.0,2889.0,848.0,4739.0,3167.0,6065.0,461.0,0.0,461.0
1606,h.2564__enhancer__m.2376__no cage activity__tile1,1809.0,399.0,2023.0,1642.0,0.0,639.0,732.0,10.0,194.0,...,219.0,0.0,281.0,340.0,33.0,72.0,229.0,1.0,11.0,330.0
877,h.1345__enhancer__m.1214__enhancer__tile1,991.0,3305.0,299.0,1658.0,4014.0,937.0,1286.0,164.0,5726.0,...,1646.0,148.0,1683.0,1195.0,1752.0,2030.0,2013.0,1757.0,410.0,1349.0
2318,h.3562__no cage activity__m.3372__enhancer__tile1,124.0,1057.0,450.0,515.0,266.0,1.0,612.0,318.0,1757.0,...,124.0,144.0,0.0,597.0,0.0,810.0,178.0,505.0,710.0,524.0
2286,h.3520__no cage activity__m.3330__enhancer__tile1,1092.0,1293.0,742.0,18.0,569.0,1192.0,39.0,1221.0,4.0,...,319.0,2458.0,5139.0,4279.0,1664.0,1708.0,164.0,315.0,1075.0,1441.0


In [45]:
# merge human and mouse so both cols in 1 df
tss_map_mpra = tss_map_mpra_human.merge(tss_map_mpra_mouse, on="comp_id")
len(tss_map_mpra)

2934

In [46]:
# also add dataframe for native comparisons
native_cols = ["comp_id"]
native_human_cols = [x for x in tss_map_mpra.columns if "HUES64" in x and "human" in x]
native_mouse_cols = [x for x in tss_map_mpra.columns if "mESC" in x and "mouse" in x]
native_cols.extend(native_human_cols)
native_cols.extend(native_mouse_cols)
tss_map_mpra_native = tss_map_mpra[native_cols]
tss_map_mpra_native.head()

Unnamed: 0,comp_id,samp:HUES64_rep1__barc:1___seq:human,samp:HUES64_rep1__barc:10___seq:human,samp:HUES64_rep1__barc:11___seq:human,samp:HUES64_rep1__barc:12___seq:human,samp:HUES64_rep1__barc:13___seq:human,samp:HUES64_rep1__barc:2___seq:human,samp:HUES64_rep1__barc:3___seq:human,samp:HUES64_rep1__barc:4___seq:human,samp:HUES64_rep1__barc:5___seq:human,...,samp:mESC_rep3__barc:12___seq:mouse,samp:mESC_rep3__barc:13___seq:mouse,samp:mESC_rep3__barc:2___seq:mouse,samp:mESC_rep3__barc:3___seq:mouse,samp:mESC_rep3__barc:4___seq:mouse,samp:mESC_rep3__barc:5___seq:mouse,samp:mESC_rep3__barc:6___seq:mouse,samp:mESC_rep3__barc:7___seq:mouse,samp:mESC_rep3__barc:8___seq:mouse,samp:mESC_rep3__barc:9___seq:mouse
0,h.2__divergent__m.1__divergent__tile1,8374.0,3343.0,6548.0,436.0,2117.0,2659.0,0.0,48.0,347.0,...,733.0,1.0,349.0,875.0,40.0,22.0,146.0,118.0,284.0,694.0
1,h.3__divergent__m.2__antisense__tile1,1.0,0.0,18.0,0.0,149.0,1.0,5.0,0.0,278.0,...,0.0,2105.0,197.0,4166.0,157.0,978.0,286.0,268.0,0.0,0.0
2,h.4__antisense__m.3__protein_coding__tile1,1.0,1029.0,141.0,1729.0,2549.0,995.0,0.0,139.0,71.0,...,428.0,115.0,86.0,77.0,534.0,104.0,240.0,983.0,251.0,0.0
3,h.5__intergenic__m.4__intergenic__tile1,3487.0,433.0,3121.0,351.0,11277.0,1028.0,10653.0,2691.0,1426.0,...,1447.0,304.0,0.0,578.0,154.0,279.0,271.0,979.0,208.0,0.0
4,h.6__intergenic__m.4__intergenic__tile1,14595.0,25119.0,9055.0,6041.0,10936.0,13086.0,9383.0,3051.0,1248.0,...,1447.0,304.0,0.0,578.0,154.0,279.0,271.0,979.0,208.0,0.0


In [47]:
# remove duplicates
tss_map_dna.drop_duplicates(inplace=True)
print(len(tss_map_dna))
print(len(tss_map_dna["comp_id"].unique()))

tss_map_mpra_human.drop_duplicates(inplace=True)
print(len(tss_map_mpra_human))
print(len(tss_map_mpra_human["comp_id"].unique()))

tss_map_mpra_mouse.drop_duplicates(inplace=True)
print(len(tss_map_mpra_mouse))
print(len(tss_map_mpra_mouse["comp_id"].unique()))

tss_map_mpra_native.drop_duplicates(inplace=True)
print(len(tss_map_mpra_native))
print(len(tss_map_mpra_native["comp_id"].unique()))

2934
2934
2934
2934
2934
2934
2934
2934


## 4. pair positive controls together to serve as negative controls
for each down-sampled control element (there are 4), randomly choose 100 pairs to serve as human/mouse

In [48]:
ctrl_ids = rna_counts_ctrl.element.unique()
ctrl_ids[0:5]

array([ 'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp1',
       'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp2',
       'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp3',
       'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp4',
       'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp5'], dtype=object)

In [49]:
ctrl_seqs = set([x.split("__")[0] for x in ctrl_ids])
samp_ids = set([x.split("__")[1] for x in ctrl_ids])

In [50]:
all_samp_id_pairs = list(itertools.combinations(samp_ids, 2))
all_samp_id_pairs_str = ["%s__%s" % (x[0], x[1]) for x in all_samp_id_pairs]
all_samp_id_pairs_str[0:5]

['samp76__samp12',
 'samp76__samp63',
 'samp76__samp58',
 'samp76__samp52',
 'samp76__samp81']

In [51]:
sampled_samp_id_pairs = np.random.choice(all_samp_id_pairs_str, size=100)
sampled_samp_id_pairs[0:5]

array(['samp69__samp38', 'samp83__samp20', 'samp61__samp39',
       'samp100__samp54', 'samp5__samp13'],
      dtype='<U15')

In [52]:
neg_ctrls_dna = pd.DataFrame()
neg_ctrls_human = pd.DataFrame()
neg_ctrls_mouse = pd.DataFrame()
neg_ctrls_native = pd.DataFrame()

for i, seq in enumerate(ctrl_seqs):
    print("ctrl #: %s" % (i+1))
    
    for j, samp_id_pair in enumerate(sampled_samp_id_pairs):
        if j % 50 == 0:
            print("...samp pair #: %s" % (j+1))
            
        samp1 = samp_id_pair.split("__")[0] # arbitrarily call 'human' seq
        samp2 = samp_id_pair.split("__")[1] # arbitrarily call 'mouse' seq
        
        human_elem = "%s__%s" % (seq, samp1)
        mouse_elem = "%s__%s" % (seq, samp2)
        
        human_sub_dna = dna_counts_ctrl[dna_counts_ctrl["element"] == human_elem]
        mouse_sub_dna = dna_counts_ctrl[dna_counts_ctrl["element"] == mouse_elem]
        
        human_sub_rna = rna_counts_ctrl[rna_counts_ctrl["element"] == human_elem]
        mouse_sub_rna = rna_counts_ctrl[rna_counts_ctrl["element"] == mouse_elem]
        
        # re-name columns w/ species name
        human_dna_cols = ["element"]
        mouse_dna_cols = ["element"]
        human_rna_cols = ["element"]
        mouse_rna_cols = ["element"]
        
        human_dna_cols.extend(["%s___seq:human" % x for x in human_sub_dna.columns if x != "element"])
        mouse_dna_cols.extend(["%s___seq:mouse" % x for x in mouse_sub_dna.columns if x != "element"])
        
        human_rna_cols.extend(["%s___seq:human" % x for x in human_sub_rna.columns if x != "element"])
        mouse_rna_cols.extend(["%s___seq:mouse" % x for x in mouse_sub_rna.columns if x != "element"])
        
        human_sub_dna.columns = human_dna_cols
        mouse_sub_dna.columns = mouse_dna_cols
        human_sub_rna.columns = human_rna_cols
        mouse_sub_rna.columns = mouse_rna_cols
        
        # add comp_id to each df
        comp_id = "CONTROL:%s__SAMP_PAIR:%s" % ((i+1), (j+1))
        human_sub_dna["comp_id"] = comp_id
        mouse_sub_dna["comp_id"] = comp_id
        human_sub_rna["comp_id"] = comp_id
        mouse_sub_rna["comp_id"] = comp_id
        
        # merge each df into 1
        human_sub_dna.drop("element", axis=1, inplace=True)
        mouse_sub_dna.drop("element", axis=1, inplace=True)
        human_sub_rna.drop("element", axis=1, inplace=True)
        mouse_sub_rna.drop("element", axis=1, inplace=True)
        
        sub_dna = human_sub_dna.merge(mouse_sub_dna, on="comp_id")
        sub_rna = human_sub_rna.merge(mouse_sub_rna, on="comp_id")
        
        # subset rna appropriately into each negative control bucket
        sub_rna_human_cols = [x for x in sub_rna.columns if x == "comp_id" or "HUES64" in x]
        sub_rna_mouse_cols = [x for x in sub_rna.columns if x == "comp_id" or "mESC" in x]
        sub_rna_native_cols = [x for x in sub_rna.columns if x == "comp_id" or ("HUES64" in x and "human" in x) or ("mESC" in x and "mouse" in x)]
        
        sub_rna_human = sub_rna[sub_rna_human_cols]
        sub_rna_mouse = sub_rna[sub_rna_mouse_cols]
        sub_rna_native = sub_rna[sub_rna_native_cols]
        
        # append
        neg_ctrls_dna = neg_ctrls_dna.append(sub_dna)
        neg_ctrls_human = neg_ctrls_human.append(sub_rna_human)
        neg_ctrls_mouse = neg_ctrls_mouse.append(sub_rna_mouse)
        neg_ctrls_native = neg_ctrls_native.append(sub_rna_native)

ctrl #: 1
...samp pair #: 1
...samp pair #: 51
ctrl #: 2
...samp pair #: 1
...samp pair #: 51
ctrl #: 3
...samp pair #: 1
...samp pair #: 51
ctrl #: 4
...samp pair #: 1
...samp pair #: 51


In [53]:
all_dna = tss_map_dna.append(neg_ctrls_dna)
all_dna.set_index("comp_id", inplace=True)
len(all_dna)

3334

In [54]:
all_rna_human = tss_map_mpra_human.append(neg_ctrls_human)
all_rna_human.set_index("comp_id", inplace=True)
len(all_rna_human)

3334

In [55]:
all_rna_mouse = tss_map_mpra_mouse.append(neg_ctrls_mouse)
all_rna_mouse.set_index("comp_id", inplace=True)
len(all_rna_mouse)

3334

In [56]:
all_rna_native = tss_map_mpra_native.append(neg_ctrls_native)
all_rna_native.set_index("comp_id", inplace=True)
len(all_rna_native)

3334

In [57]:
# also make file w/ everything together to test interactions!
tmp_human = all_rna_human.reset_index()
tmp_mouse = all_rna_mouse.reset_index()
all_rna = tmp_human.merge(tmp_mouse, on="comp_id")
all_cols = all_rna.columns
all_rna.set_index("comp_id", inplace=True)
len(all_rna)

3334

## 5. make annotation files

In [58]:
dna_col_ann = {}
human_col_ann = {}
mouse_col_ann = {}
native_col_ann = {}
all_col_ann = {}

for cols, ann in zip([all_dna_cols, human_cols, mouse_cols, native_cols, all_cols], 
                     [dna_col_ann, human_col_ann, mouse_col_ann, native_col_ann, all_col_ann]):
    for col in cols:
        if col == "comp_id":
            continue
        cond = col.split(":")[1].split("_")[0]
        barc = col.split(":")[2].split("_")[0]
        seq = col.split(":")[-1]
        ann[col] = {"condition": cond, "barcode": barc, "seq": seq}

dna_col_ann = pd.DataFrame.from_dict(dna_col_ann, orient="index")
human_col_ann = pd.DataFrame.from_dict(human_col_ann, orient="index")
mouse_col_ann = pd.DataFrame.from_dict(mouse_col_ann, orient="index")
native_col_ann = pd.DataFrame.from_dict(native_col_ann, orient="index")
all_col_ann = pd.DataFrame.from_dict(all_col_ann, orient="index")
native_col_ann.sample(5)

Unnamed: 0,condition,barcode,seq
samp:mESC_rep3__barc:7___seq:mouse,mESC,7,mouse
samp:mESC_rep1__barc:5___seq:mouse,mESC,5,mouse
samp:HUES64_rep2__barc:11___seq:human,HUES64,11,human
samp:mESC_rep3__barc:13___seq:mouse,mESC,13,mouse
samp:mESC_rep1__barc:6___seq:mouse,mESC,6,mouse


In [59]:
# merge w/ older annotations: first reset index
human_col_ann.reset_index(inplace=True)
mouse_col_ann.reset_index(inplace=True)
native_col_ann.reset_index(inplace=True)
all_col_ann.reset_index(inplace=True)

human_col_ann["colname"] = human_col_ann["index"]
mouse_col_ann["colname"] = mouse_col_ann["index"]
native_col_ann["colname"] = native_col_ann["index"]
all_col_ann["colname"] = all_col_ann["index"]

In [60]:
# reset index on old annots and turn barcode into str
old_rna_col_ann.reset_index(inplace=True)
old_rna_col_ann["barcode"] = old_rna_col_ann["barcode"].astype(str)

In [61]:
# merge
human_col_ann.sample(5)

Unnamed: 0,index,condition,barcode,seq,colname
48,samp:HUES64_rep2__barc:8___seq:human,HUES64,8,human,samp:HUES64_rep2__barc:8___seq:human
45,samp:HUES64_rep2__barc:6___seq:mouse,HUES64,6,mouse,samp:HUES64_rep2__barc:6___seq:mouse
56,samp:HUES64_rep3__barc:12___seq:human,HUES64,12,human,samp:HUES64_rep3__barc:12___seq:human
16,samp:HUES64_rep1__barc:5___seq:human,HUES64,5,human,samp:HUES64_rep1__barc:5___seq:human
10,samp:HUES64_rep1__barc:2___seq:human,HUES64,2,human,samp:HUES64_rep1__barc:2___seq:human


In [62]:
all_col_ann.sample(5)

Unnamed: 0,index,condition,barcode,seq,colname
133,samp:mESC_rep3__barc:11___seq:mouse,mESC,11,mouse,samp:mESC_rep3__barc:11___seq:mouse
92,samp:mESC_rep1__barc:4___seq:human,mESC,4,human,samp:mESC_rep1__barc:4___seq:human
0,samp:HUES64_rep1__barc:10___seq:human,HUES64,10,human,samp:HUES64_rep1__barc:10___seq:human
150,samp:mESC_rep3__barc:7___seq:human,mESC,7,human,samp:mESC_rep3__barc:7___seq:human
51,samp:HUES64_rep2__barc:9___seq:mouse,HUES64,9,mouse,samp:HUES64_rep2__barc:9___seq:mouse


In [63]:
# reset index
human_col_ann.set_index("colname", inplace=True)
mouse_col_ann.set_index("colname", inplace=True)
native_col_ann.set_index("colname", inplace=True)
all_col_ann.set_index("colname", inplace=True)

In [64]:
del human_col_ann.index.name
del mouse_col_ann.index.name
del native_col_ann.index.name
del all_col_ann.index.name

In [65]:
# human_col_ann.drop("index", axis=1, inplace=True)
# mouse_col_ann.drop("index", axis=1, inplace=True)
# native_col_ann.drop("index", axis=1, inplace=True)
# all_col_ann.drop("index", axis=1, inplace=True)

In [66]:
all_col_ann.head()

Unnamed: 0,index,condition,barcode,seq
samp:HUES64_rep1__barc:10___seq:human,samp:HUES64_rep1__barc:10___seq:human,HUES64,10,human
samp:HUES64_rep1__barc:10___seq:mouse,samp:HUES64_rep1__barc:10___seq:mouse,HUES64,10,mouse
samp:HUES64_rep1__barc:11___seq:human,samp:HUES64_rep1__barc:11___seq:human,HUES64,11,human
samp:HUES64_rep1__barc:11___seq:mouse,samp:HUES64_rep1__barc:11___seq:mouse,HUES64,11,mouse
samp:HUES64_rep1__barc:12___seq:human,samp:HUES64_rep1__barc:12___seq:human,HUES64,12,human


In [67]:
all_col_ann.tail()

Unnamed: 0,index,condition,barcode,seq
samp:mESC_rep3__barc:7___seq:mouse,samp:mESC_rep3__barc:7___seq:mouse,mESC,7,mouse
samp:mESC_rep3__barc:8___seq:human,samp:mESC_rep3__barc:8___seq:human,mESC,8,human
samp:mESC_rep3__barc:8___seq:mouse,samp:mESC_rep3__barc:8___seq:mouse,mESC,8,mouse
samp:mESC_rep3__barc:9___seq:human,samp:mESC_rep3__barc:9___seq:human,mESC,9,human
samp:mESC_rep3__barc:9___seq:mouse,samp:mESC_rep3__barc:9___seq:mouse,mESC,9,mouse


## 6. make control ID files

In [68]:
ctrls = all_rna.reset_index()[["comp_id", "samp:HUES64_rep1__barc:10___seq:human"]]
ctrls["ctrl_status"] = ctrls.apply(ctrl_status, axis=1)
ctrls.drop("samp:HUES64_rep1__barc:10___seq:human", axis=1, inplace=True)
ctrls.ctrl_status.value_counts()

False    2934
True      400
Name: ctrl_status, dtype: int64

In [69]:
ctrls.head()

Unnamed: 0,comp_id,ctrl_status
0,h.2__divergent__m.1__divergent__tile1,False
1,h.3__divergent__m.2__antisense__tile1,False
2,h.4__antisense__m.3__protein_coding__tile1,False
3,h.5__intergenic__m.4__intergenic__tile1,False
4,h.6__intergenic__m.4__intergenic__tile1,False


## 7. write seq comparison files

In [70]:
dna_col_ann.to_csv("%s/dna_col_ann.all_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
human_col_ann.to_csv("%s/HUES64_col_ann.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
mouse_col_ann.to_csv("%s/mESC_col_ann.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
native_col_ann.to_csv("%s/native_col_ann.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
all_col_ann.to_csv("%s/all_col_ann.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")

ctrls.to_csv("%s/ctrl_status.all_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=False)

all_dna.to_csv("%s/dna_counts.all_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
all_rna_human.to_csv("%s/HUES64_rna_counts.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
all_rna_mouse.to_csv("%s/mESC_rna_counts.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
all_rna_native.to_csv("%s/native_rna_counts.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
all_rna.to_csv("%s/all_rna_counts.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)

# then make files for cell line comparisons (trans effects)

## 1. run trans effects separately for human seqs & mouse seqs, so subset counts dataframe

In [71]:
human_columns = [x for x in all_rna.columns if "seq:human" in x]
mouse_columns = [x for x in all_rna.columns if "seq:mouse" in x]

In [72]:
human_trans = all_rna[human_columns]
mouse_trans = all_rna[mouse_columns]

In [73]:
print(len(human_trans))

3334


In [74]:
print(len(mouse_trans))

3334


## 2. subset annotation dataframe

In [75]:
tmp = all_col_ann
tmp.head()

Unnamed: 0,index,condition,barcode,seq
samp:HUES64_rep1__barc:10___seq:human,samp:HUES64_rep1__barc:10___seq:human,HUES64,10,human
samp:HUES64_rep1__barc:10___seq:mouse,samp:HUES64_rep1__barc:10___seq:mouse,HUES64,10,mouse
samp:HUES64_rep1__barc:11___seq:human,samp:HUES64_rep1__barc:11___seq:human,HUES64,11,human
samp:HUES64_rep1__barc:11___seq:mouse,samp:HUES64_rep1__barc:11___seq:mouse,HUES64,11,mouse
samp:HUES64_rep1__barc:12___seq:human,samp:HUES64_rep1__barc:12___seq:human,HUES64,12,human


In [76]:
human_trans_col_ann = tmp[tmp["index"].isin(human_columns)].set_index("index")
del human_trans_col_ann.index.name
human_trans_col_ann.sample(5)

Unnamed: 0,condition,barcode,seq
samp:mESC_rep2__barc:7___seq:human,mESC,7,human
samp:HUES64_rep2__barc:5___seq:human,HUES64,5,human
samp:mESC_rep1__barc:11___seq:human,mESC,11,human
samp:HUES64_rep2__barc:8___seq:human,HUES64,8,human
samp:HUES64_rep2__barc:12___seq:human,HUES64,12,human


In [77]:
mouse_trans_col_ann = tmp[tmp["index"].isin(mouse_columns)].set_index("index")
del mouse_trans_col_ann.index.name
mouse_trans_col_ann.sample(5)

Unnamed: 0,condition,barcode,seq
samp:HUES64_rep1__barc:12___seq:mouse,HUES64,12,mouse
samp:HUES64_rep1__barc:1___seq:mouse,HUES64,1,mouse
samp:mESC_rep2__barc:3___seq:mouse,mESC,3,mouse
samp:HUES64_rep1__barc:7___seq:mouse,HUES64,7,mouse
samp:mESC_rep1__barc:7___seq:mouse,mESC,7,mouse


In [78]:
print(len(human_columns))
print(len(human_trans_col_ann))
print(len(mouse_columns))
print(len(mouse_trans_col_ann))

78
78
78
78


## 3. write cell comparison files

In [79]:
human_trans_col_ann.to_csv("%s/human_col_ann.cell_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
mouse_trans_col_ann.to_csv("%s/mouse_col_ann.cell_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")

human_trans.to_csv("%s/human_rna_counts.cell_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
mouse_trans.to_csv("%s/mouse_rna_counts.cell_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)