# 00__make_files

In this notebook, I make the files necessary for finding CAGE reads that intersect our regions of interest (orthologous TSSs between human and mouse). The final files are BED files with a 50 bp buffer surrounding each TSS (in both human and mouse).

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import sys

from scipy.stats import spearmanr

# import utils
sys.path.append("../../../utils")
from plotting_utils import *

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# NOTE(review): `mpl` is not imported explicitly in this cell; presumably it is
# exposed by the star import from plotting_utils above — confirm.
mpl.rcParams['figure.autolayout'] = False

## variables

In [2]:
# gzipped, tab-separated master TSS lists, one per genome build
# (mm9 = mouse, hg19 = human); both are read in section 1 below
mouse_master_f = "../../../data/01__design/00__genome_list/mm9.master_list.txt.gz"
human_master_f = "../../../data/01__design/00__genome_list/hg19.master_list.txt.gz"

## 1. import data

In [3]:
# load the human (hg19) master list as a tab-delimited table and preview it
human_master = pd.read_csv(human_master_f, sep="\t")
human_master.head(n=5)

Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,name_tss_hg19,score_tss_hg19,strand_tss_hg19,cage_id_hg19,biotype_hg19,seq_orth,cage_orth,...,score_peak_mm9,cage_id_mm9,biotype_mm9,stem_exp_mm9,avg_exp_mm9,har,repeat_hg19,repeat_mm9,repeat_status,remap_status
0,chr1,564588,564589,"ENSG00000225972.1,ENSG00000225630.1",2398,+,"chr1:564571..564600,+",multi-mapped,False,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,,,no repeats,remap fail
1,chr1,564645,564646,"ENSG00000225972.1,ENSG00000225630.1",220,+,"chr1:564639..564649,+",multi-mapped,False,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,,,no repeats,remap fail
2,chr1,565269,565270,"ENSG00000225972.1,ENSG00000225630.1",535,+,"chr1:565266..565278,+",multi-mapped,False,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,,,no repeats,remap fail
3,chr1,565480,565481,"ENSG00000237973.1,ENSG00000225630.1",106,+,"chr1:565478..565483,+",multi-mapped,False,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,,,no repeats,remap fail
4,chr1,565523,565524,"ENSG00000237973.1,ENSG00000225630.1",3594,+,"chr1:565509..565541,+",multi-mapped,False,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,,,no repeats,remap fail


In [4]:
# load the mouse (mm9) master list as a tab-delimited table and preview it
mouse_master = pd.read_csv(mouse_master_f, sep="\t")
mouse_master.head(n=5)

Unnamed: 0,chr_tss_mm9,start_tss_mm9,end_tss_mm9,name_tss_mm9,score_tss_mm9,strand_tss_mm9,cage_id_mm9,biotype_mm9,seq_orth,cage_orth,...,score_peak_hg19,cage_id_hg19,biotype_hg19,stem_exp_hg19,avg_exp_hg19,har,repeat_mm9,repeat_hg19,repeat_status,remap_status
0,chr1,3309585,3309586,unassigned,153,-,"chr1:3309585..3309588,-",unassigned,False,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,,,no repeats,
1,chr1,3367867,3367868,ENSMUSG00000104017.1,569,-,"chr1:3367867..3367870,-",other_noncoding,False,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,,,no repeats,
2,chr1,3479231,3479232,unassigned,163,-,"chr1:3479230..3479234,-",unassigned,False,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,Lx2A1,,repeat,
3,chr1,3644977,3644978,unassigned,493,-,"chr1:3644976..3644980,-",unassigned,True,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,,,no repeats,
4,chr1,3657916,3657917,unassigned,904,-,"chr1:3657915..3657919,-",unassigned,False,False,...,no cage activity,no cage activity,no cage activity,no cage activity,no cage activity,False,AT_rich,,repeat,


## 2. filter to seq orths only

In [5]:
# keep only the human rows whose seq_orth flag is True, then report how many remain
human_is_seq_orth = human_master["seq_orth"]
human_master_filt = human_master.loc[human_is_seq_orth]
len(human_master_filt)

195110

In [6]:
# keep only the mouse rows whose seq_orth flag is True, then report how many remain
mouse_is_seq_orth = mouse_master["seq_orth"]
mouse_master_filt = mouse_master.loc[mouse_is_seq_orth]
len(mouse_master_filt)

163131

## 3. find TSS coords for human/mouse paired regions
Do this for both the "human" file (which starts from human TSSs) and the "mouse" file (which starts from mouse TSSs).

In [7]:
# human-derived TSSs in hg19 coordinates: pick the six BED-style columns
# (chrom, start, end, name, score, strand) and drop repeated rows
human_hg19_cols = [
    "chr_tss_hg19",
    "start_tss_hg19",
    "end_tss_hg19",
    "cage_id_hg19",
    "score_tss_hg19",
    "strand_tss_hg19",
]
human_bed_hg19 = human_master_filt.loc[:, human_hg19_cols].drop_duplicates()
print(len(human_bed_hg19))
human_bed_hg19.head()

193830


Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,cage_id_hg19,score_tss_hg19,strand_tss_hg19
35,chr1,869211,869212,"chr1:869192..869225,+",331,+
36,chr1,869324,869325,"chr1:869322..869329,+",284,+
48,chr1,894635,894636,"chr1:894620..894659,-",160511,-
49,chr1,895911,895912,"chr1:895904..895925,-",1196,-
50,chr1,896012,896013,"chr1:895930..896033,+",21903,+


In [8]:
# human-derived TSSs in mm9 coordinates: position and strand come from the
# *_mm9 columns, while the id and score stay those of the human CAGE peak
human_mm9_cols = [
    "chr_tss_mm9",
    "start_tss_mm9",
    "end_tss_mm9",
    "cage_id_hg19",
    "score_tss_hg19",
    "strand_tss_mm9",
]
human_bed_mm9 = human_master_filt.loc[:, human_mm9_cols].drop_duplicates()
print(len(human_bed_mm9))
human_bed_mm9.head()

192046


Unnamed: 0,chr_tss_mm9,start_tss_mm9,end_tss_mm9,cage_id_hg19,score_tss_hg19,strand_tss_mm9
35,chr4,155629557,155629558,"chr1:869192..869225,+",331,-
36,chr4,155629445,155629446,"chr1:869322..869329,+",284,-
48,chr4,155610134,155610135,"chr1:894620..894659,-",160511,+
49,chr4,155608996,155608997,"chr1:895904..895925,-",1196,+
50,chr4,155608903,155608904,"chr1:895930..896033,+",21903,-


In [9]:
# spot-check: show the mm9 row(s) recorded for one specific human CAGE peak
query_cage_id = "chr1:203273760..203273784,-"
human_bed_mm9.loc[human_bed_mm9["cage_id_hg19"] == query_cage_id]

Unnamed: 0,chr_tss_mm9,start_tss_mm9,end_tss_mm9,cage_id_hg19,score_tss_hg19,strand_tss_mm9
330682,chr1,135976545,135976546,"chr1:203273760..203273784,-",851,+


In [10]:
# mouse-derived TSSs in mm9 coordinates: pick the six BED-style columns
# (chrom, start, end, name, score, strand) and drop repeated rows
mouse_mm9_cols = [
    "chr_tss_mm9",
    "start_tss_mm9",
    "end_tss_mm9",
    "cage_id_mm9",
    "score_tss_mm9",
    "strand_tss_mm9",
]
mouse_bed_mm9 = mouse_master_filt.loc[:, mouse_mm9_cols].drop_duplicates()
print(len(mouse_bed_mm9))
mouse_bed_mm9.head()

162047


Unnamed: 0,chr_tss_mm9,start_tss_mm9,end_tss_mm9,cage_id_mm9,score_tss_mm9,strand_tss_mm9
3,chr1,3644977,3644978,"chr1:3644976..3644980,-",493,-
5,chr1,3661803,3661804,"chr1:3661752..3661814,-",4638,-
6,chr1,3661874,3661875,"chr1:3661851..3661889,-",2432,-
7,chr1,3664691,3664692,"chr1:3664689..3664693,+",578,+
9,chr1,4350322,4350323,"chr1:4350318..4350327,-",413,-


In [11]:
# mouse-derived TSSs in hg19 coordinates: position and strand come from the
# *_hg19 columns, while the id and score stay those of the mouse CAGE peak
mouse_hg19_cols = [
    "chr_tss_hg19",
    "start_tss_hg19",
    "end_tss_hg19",
    "cage_id_mm9",
    "score_tss_mm9",
    "strand_tss_hg19",
]
mouse_bed_hg19 = mouse_master_filt.loc[:, mouse_hg19_cols].drop_duplicates()
print(len(mouse_bed_hg19))
mouse_bed_hg19.head()

159952


Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,cage_id_mm9,score_tss_mm9,strand_tss_hg19
3,chr8,56032920,56032921,"chr1:3644976..3644980,-",493,+
5,chr8,56014691,56014692,"chr1:3661752..3661814,-",4638,+
6,chr8,56014587,56014588,"chr1:3661851..3661889,-",2432,+
7,chr8,56011954,56011955,"chr1:3664689..3664693,+",578,-
9,chr8,55528655,55528656,"chr1:4350318..4350327,-",413,+


## 4. group hg19/mm9 files together for bed intersect

In [12]:
# prefix ids and scores with their species of origin so human- and
# mouse-derived rows stay distinguishable once the frames are stacked
human_bed_hg19 = human_bed_hg19.assign(
    cage_id="HUMAN_CAGE_ID__" + human_bed_hg19["cage_id_hg19"],
    score="HUMAN_SCORE__" + human_bed_hg19["score_tss_hg19"].astype(str),
)
mouse_bed_hg19 = mouse_bed_hg19.assign(
    cage_id="MOUSE_CAGE_ID__" + mouse_bed_hg19["cage_id_mm9"],
    score="MOUSE_SCORE__" + mouse_bed_hg19["score_tss_mm9"].astype(str),
)
human_bed_hg19.head()

Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,cage_id_hg19,score_tss_hg19,strand_tss_hg19,cage_id,score
35,chr1,869211,869212,"chr1:869192..869225,+",331,+,"HUMAN_CAGE_ID__chr1:869192..869225,+",HUMAN_SCORE__331
36,chr1,869324,869325,"chr1:869322..869329,+",284,+,"HUMAN_CAGE_ID__chr1:869322..869329,+",HUMAN_SCORE__284
48,chr1,894635,894636,"chr1:894620..894659,-",160511,-,"HUMAN_CAGE_ID__chr1:894620..894659,-",HUMAN_SCORE__160511
49,chr1,895911,895912,"chr1:895904..895925,-",1196,-,"HUMAN_CAGE_ID__chr1:895904..895925,-",HUMAN_SCORE__1196
50,chr1,896012,896013,"chr1:895930..896033,+",21903,+,"HUMAN_CAGE_ID__chr1:895930..896033,+",HUMAN_SCORE__21903


In [13]:
# Prefix ids and scores with their species of origin (mm9-side frames).
#
# The scores are read from each frame's OWN score column. pandas aligns a
# Series on index labels when it is assigned as a column, and the *_hg19 frames
# were de-duplicated independently, so they hold a different set of row labels
# (e.g. mouse_bed_mm9 has more rows than mouse_bed_hg19). Borrowing the score
# column from the *_hg19 frames therefore leaves NaN in every row whose label
# was dropped on the hg19 side.
human_bed_mm9["cage_id"] = "HUMAN_CAGE_ID__" + human_bed_mm9["cage_id_hg19"]
mouse_bed_mm9["cage_id"] = "MOUSE_CAGE_ID__" + mouse_bed_mm9["cage_id_mm9"]
human_bed_mm9["score"] = "HUMAN_SCORE__" + human_bed_mm9["score_tss_hg19"].astype(str)
mouse_bed_mm9["score"] = "MOUSE_SCORE__" + mouse_bed_mm9["score_tss_mm9"].astype(str)
human_bed_mm9.head()

Unnamed: 0,chr_tss_mm9,start_tss_mm9,end_tss_mm9,cage_id_hg19,score_tss_hg19,strand_tss_mm9,cage_id,score
35,chr4,155629557,155629558,"chr1:869192..869225,+",331,-,"HUMAN_CAGE_ID__chr1:869192..869225,+",HUMAN_SCORE__331
36,chr4,155629445,155629446,"chr1:869322..869329,+",284,-,"HUMAN_CAGE_ID__chr1:869322..869329,+",HUMAN_SCORE__284
48,chr4,155610134,155610135,"chr1:894620..894659,-",160511,+,"HUMAN_CAGE_ID__chr1:894620..894659,-",HUMAN_SCORE__160511
49,chr4,155608996,155608997,"chr1:895904..895925,-",1196,+,"HUMAN_CAGE_ID__chr1:895904..895925,-",HUMAN_SCORE__1196
50,chr4,155608903,155608904,"chr1:895930..896033,+",21903,-,"HUMAN_CAGE_ID__chr1:895930..896033,+",HUMAN_SCORE__21903


In [14]:
# Stack the human- and mouse-derived TSSs that live in hg19 coordinates into a
# single BED-ordered frame (chrom, start, end, name, score, strand).
# pd.concat is used because DataFrame.append was removed in pandas 2.0; like
# append, it keeps the original index labels and row order. Chaining
# drop_duplicates() (rather than inplace=True) avoids mutating a column slice.
hg19_bed_cols = ["chr_tss_hg19", "start_tss_hg19", "end_tss_hg19", "cage_id", "score", "strand_tss_hg19"]
hg19_bed = pd.concat([human_bed_hg19[hg19_bed_cols], mouse_bed_hg19[hg19_bed_cols]]).drop_duplicates()
print(len(hg19_bed))
hg19_bed.sample(5)

353782


Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,cage_id,score,strand_tss_hg19
136417,chr4,6474373,6474374,"HUMAN_CAGE_ID__chr4:6474368..6474379,-",HUMAN_SCORE__1153,-
148693,chr15,55652852,55652853,"MOUSE_CAGE_ID__chr9:72859919..72859940,+",MOUSE_SCORE__49,-
113720,chr20,9438222,9438223,"HUMAN_CAGE_ID__chr20:9438212..9438239,+",HUMAN_SCORE__913,+
70011,chr16,56459257,56459258,"HUMAN_CAGE_ID__chr16:56459252..56459263,-",HUMAN_SCORE__638,-
54670,chr16,15833971,15833972,"MOUSE_CAGE_ID__chr16:14218991..14219005,-",MOUSE_SCORE__256,-


In [15]:
# Stack the human- and mouse-derived TSSs that live in mm9 coordinates into a
# single BED-ordered frame (chrom, start, end, name, score, strand).
# pd.concat is used because DataFrame.append was removed in pandas 2.0; like
# append, it keeps the original index labels and row order. Chaining
# drop_duplicates() (rather than inplace=True) avoids mutating a column slice.
mm9_bed_cols = ["chr_tss_mm9", "start_tss_mm9", "end_tss_mm9", "cage_id", "score", "strand_tss_mm9"]
mm9_bed = pd.concat([human_bed_mm9[mm9_bed_cols], mouse_bed_mm9[mm9_bed_cols]]).drop_duplicates()
print(len(mm9_bed))
mm9_bed.sample(5)

354093


Unnamed: 0,chr_tss_mm9,start_tss_mm9,end_tss_mm9,cage_id,score,strand_tss_mm9
25016,chr11,87587973,87587974,"MOUSE_CAGE_ID__chr11:87587973..87587974,+",MOUSE_SCORE__542,+
189222,chr5,138338654,138338655,MOUSE_CAGE_ID__chr5:138338507-138338657,MOUSE_SCORE__2,-
148070,chr13,93564772,93564773,"HUMAN_CAGE_ID__chr5:79331164..79331177,+",HUMAN_SCORE__18819,-
279299,chr5,148768278,148768279,HUMAN_CAGE_ID__chr13:29393233-29393586,HUMAN_SCORE__3,+
52083,chr15,88971114,88971115,"MOUSE_CAGE_ID__chr15:88971094..88971126,-",MOUSE_SCORE__2162,-


## 5. add buffer of +/- 50 bp

In [16]:
# Widen every hg19 interval: 49 bp are subtracted from the start and 50 bp are
# added to the end, so a 1-bp TSS interval becomes a 100-bp window.
# The start is clipped at 0 because a BED start coordinate cannot be negative
# (a TSS within 49 bp of a chromosome start would otherwise produce one).
# The score column is replaced by a constant 0 placeholder.
# NOTE: this cell is not idempotent -- re-running it pads the coordinates again;
# re-run the cell that builds hg19_bed first.
HG19_START_PAD = 49
HG19_END_PAD = 50
hg19_bed = hg19_bed.assign(
    start_tss_hg19=(hg19_bed["start_tss_hg19"].astype(int) - HG19_START_PAD).clip(lower=0),
    end_tss_hg19=hg19_bed["end_tss_hg19"].astype(int) + HG19_END_PAD,
    score=0,
)
hg19_bed.head()

Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,cage_id,score,strand_tss_hg19
35,chr1,869162,869262,"HUMAN_CAGE_ID__chr1:869192..869225,+",0,+
36,chr1,869275,869375,"HUMAN_CAGE_ID__chr1:869322..869329,+",0,+
48,chr1,894586,894686,"HUMAN_CAGE_ID__chr1:894620..894659,-",0,-
49,chr1,895862,895962,"HUMAN_CAGE_ID__chr1:895904..895925,-",0,-
50,chr1,895963,896063,"HUMAN_CAGE_ID__chr1:895930..896033,+",0,+


In [17]:
# Widen every mm9 interval: 49 bp are subtracted from the start and 50 bp are
# added to the end, so a 1-bp TSS interval becomes a 100-bp window.
# The start is clipped at 0 because a BED start coordinate cannot be negative
# (a TSS within 49 bp of a chromosome start would otherwise produce one).
# The score column is replaced by a constant 0 placeholder.
# NOTE: this cell is not idempotent -- re-running it pads the coordinates again;
# re-run the cell that builds mm9_bed first.
MM9_START_PAD = 49
MM9_END_PAD = 50
mm9_bed = mm9_bed.assign(
    start_tss_mm9=(mm9_bed["start_tss_mm9"].astype(int) - MM9_START_PAD).clip(lower=0),
    end_tss_mm9=mm9_bed["end_tss_mm9"].astype(int) + MM9_END_PAD,
    score=0,
)
mm9_bed.head()

Unnamed: 0,chr_tss_mm9,start_tss_mm9,end_tss_mm9,cage_id,score,strand_tss_mm9
35,chr4,155629508,155629608,"HUMAN_CAGE_ID__chr1:869192..869225,+",0,-
36,chr4,155629396,155629496,"HUMAN_CAGE_ID__chr1:869322..869329,+",0,-
48,chr4,155610085,155610185,"HUMAN_CAGE_ID__chr1:894620..894659,-",0,+
49,chr4,155608947,155609047,"HUMAN_CAGE_ID__chr1:895904..895925,-",0,+
50,chr4,155608854,155608954,"HUMAN_CAGE_ID__chr1:895930..896033,+",0,-


## 6. write files

In [18]:
# write each buffered frame as a headerless, index-free, tab-separated BED file
bed_outputs = [
    (hg19_bed, "../../../data/01__design/00__genome_list/hg19_master.50buff.bed"),
    (mm9_bed, "../../../data/01__design/00__genome_list/mm9_master.50buff.bed"),
]
for bed_frame, bed_path in bed_outputs:
    bed_frame.to_csv(bed_path, sep="\t", header=False, index=False)