# 01__conservation

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import sys

from scipy.stats import spearmanr

# import utils
sys.path.append("../../../utils")
from plotting_utils import *

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# `mpl` is not imported anywhere in this cell; it previously resolved only as a
# side effect of `from plotting_utils import *`. Import it explicitly so this
# setting does not depend on what that module happens to export.
import matplotlib as mpl
# turn off matplotlib's automatic tight-layout adjustment for new figures
mpl.rcParams['figure.autolayout'] = False

In [2]:
# apply the shared seaborn style; PAPER_PRESET and PAPER_FONTSIZE are not defined
# in this notebook -- presumably they come from `from plotting_utils import *`
sns.set(**PAPER_PRESET)
# NOTE(review): `fontsize` is not referenced again in the visible cells
fontsize = PAPER_FONTSIZE

In [3]:
# seed NumPy's global random state so random draws made without an explicit
# random_state (e.g. the `.sample(5)` preview further down) repeat across runs
np.random.seed(2019)

## functions

In [4]:
def cleaner_biotype(row, biotype_col):
    """Map the raw biotype found in ``row[biotype_col]`` to a simplified label.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Object indexable by ``biotype_col``.
    biotype_col : str
        Key/column holding the raw biotype string.

    Returns
    -------
    str
        ``"mRNA"`` for "protein_coding"/"div_pc", ``"lncRNA"`` for
        "intergenic"/"antisense"/"div_lnc", ``"eRNA"`` for "enhancer",
        ``"no CAGE activity"`` for "no cage activity", and ``"other"``
        for any value not listed.
    """
    raw_biotype = row[biotype_col]

    # (raw values, simplified label) pairs, checked in order
    label_groups = (
        (("protein_coding", "div_pc"), "mRNA"),
        (("intergenic", "antisense", "div_lnc"), "lncRNA"),
        (("enhancer",), "eRNA"),
        (("no cage activity",), "no CAGE activity"),
    )
    for raw_values, label in label_groups:
        if raw_biotype in raw_values:
            return label

    # anything unrecognised falls through to the catch-all label
    return "other"

In [5]:
def get_perc(row):
    """Return the matched count in ``row`` as a percentage of the total count.

    The row is expected to carry three count fields:

    * ``cage_id_x`` -- the denominator (total count),
    * ``cage_id``   -- used as the numerator when it is not null,
    * ``cage_id_y`` -- used as the numerator when ``cage_id`` is null.

    Returns ``numerator / cage_id_x * 100``.
    """
    # prefer "cage_id"; only fall back to "cage_id_y" when "cage_id" is missing
    matched = row["cage_id_y"] if pd.isnull(row["cage_id"]) else row["cage_id"]
    return (matched / row["cage_id_x"]) * 100

## variables

In [6]:
# gzipped PEAK_STATUS tables, one per genome build (hg19 / mm9);
# paths are relative, so they resolve against the notebook's working directory
human_list_f = "../../../data/01__design/00__genome_list/hg19.PEAK_STATUS.txt.gz"
mouse_list_f = "../../../data/01__design/00__genome_list/mm9.PEAK_STATUS.txt.gz"

## 1. import data

In [7]:
# load the hg19 table (read_table defaults to tab-separated input) and preview it
human_list = pd.read_table(human_list_f)
human_list.head()

Unnamed: 0,cage_id,biotype,seq_ortholog,cage_ortholog,other_sp_biotype
0,"chr10:100013403..100013414,-",other,1,1,other
1,chr10:100019738-100019959,eRNA,2,0,no CAGE activity
2,chr10:100019738-100019959,eRNA,2,0,no CAGE activity
3,chr10:100020230-100020246,eRNA,0,0,no CAGE activity
4,chr10:100020230-100020246,eRNA,0,0,no CAGE activity


In [8]:
# load the mm9 table (read_table defaults to tab-separated input) and preview it
mouse_list = pd.read_table(mouse_list_f)
mouse_list.head()

Unnamed: 0,cage_id,biotype,seq_ortholog,cage_ortholog,other_sp_biotype
0,"chr10:100007025..100007067,+",other,1,0,no CAGE activity
1,"chr10:100051744..100051765,+",other,1,0,no CAGE activity
2,"chr10:100051839..100051903,-",other,1,1,other
3,"chr10:100053047..100053060,-",mRNA,0,0,no CAGE activity
4,"chr10:100053061..100053066,-",mRNA,1,0,no CAGE activity


## 2. clean biotype counts

In [9]:
# number of rows per biotype in the human table
human_list.biotype.value_counts()

other     140996
eRNA      130814
mRNA       53595
lncRNA      6048
Name: biotype, dtype: int64

In [10]:
# number of rows per biotype in the mouse table
mouse_list.biotype.value_counts()

other     102302
eRNA       88886
mRNA       52547
lncRNA      3468
Name: biotype, dtype: int64

In [11]:
# spot-check five random human rows whose seq_ortholog value is 0
human_list[human_list["seq_ortholog"] == 0].sample(5)

Unnamed: 0,cage_id,biotype,seq_ortholog,cage_ortholog,other_sp_biotype
270462,chr6:3386855-3386871,eRNA,0,0,no CAGE activity
125363,"chr19:55954631..55954676,-",mRNA,0,0,no CAGE activity
35713,"chr12:121716436..121716441,+",other,0,0,no CAGE activity
179520,chr2:113380153-113380493,eRNA,0,0,no CAGE activity
185286,chr2:168858571-168859104,eRNA,0,0,no CAGE activity


## 3. find % sequence conservation

In [12]:
# denominator: number of rows per biotype in the human table
human_tots = human_list.groupby("biotype")["cage_id"].count().reset_index()

# non-enhancer rows count as having a sequence ortholog when seq_ortholog is exactly 1
human_tsss = human_list.loc[human_list["biotype"].ne("eRNA")
                            & human_list["seq_ortholog"].eq(1)]
human_tss_seqs = human_tsss.groupby("biotype")["cage_id"].count().reset_index()

# enhancers need seq_ortholog >= 2, i.e. both TSSs of the enhancer must map
human_enhs = human_list.loc[human_list["biotype"].eq("eRNA")
                            & human_list["seq_ortholog"].ge(2)]
human_enh_seqs = human_enhs.groupby("biotype")["cage_id"].count().reset_index()

# the first left-merge turns the two clashing "cage_id" columns into
# cage_id_x (totals) and cage_id_y (TSS counts); the second adds the enhancer
# counts as plain "cage_id" -- these are the column names get_perc reads
human_seq_perc = human_tots.merge(human_tss_seqs, on="biotype", how="left")
human_seq_perc = human_seq_perc.merge(human_enh_seqs, on="biotype", how="left")
human_seq_perc["perc"] = human_seq_perc.apply(get_perc, axis=1)
human_seq_perc.head()

Unnamed: 0,biotype,cage_id_x,cage_id_y,cage_id,perc
0,eRNA,130814,,54178.0,41.416056
1,lncRNA,6048,3552.0,,58.730159
2,mRNA,53595,43942.0,,81.988992
3,other,140996,92130.0,,65.342279


In [13]:
# denominator: number of rows per biotype in the mouse table
mouse_tots = mouse_list.groupby("biotype")["cage_id"].count().reset_index()

# non-enhancer rows count as having a sequence ortholog when seq_ortholog is exactly 1
mouse_tsss = mouse_list.loc[mouse_list["biotype"].ne("eRNA")
                            & mouse_list["seq_ortholog"].eq(1)]
mouse_tss_seqs = mouse_tsss.groupby("biotype")["cage_id"].count().reset_index()

# enhancers need seq_ortholog >= 2, i.e. both TSSs of the enhancer must map
mouse_enhs = mouse_list.loc[mouse_list["biotype"].eq("eRNA")
                            & mouse_list["seq_ortholog"].ge(2)]
mouse_enh_seqs = mouse_enhs.groupby("biotype")["cage_id"].count().reset_index()

# the first left-merge turns the two clashing "cage_id" columns into
# cage_id_x (totals) and cage_id_y (TSS counts); the second adds the enhancer
# counts as plain "cage_id" -- these are the column names get_perc reads
mouse_seq_perc = mouse_tots.merge(mouse_tss_seqs, on="biotype", how="left")
mouse_seq_perc = mouse_seq_perc.merge(mouse_enh_seqs, on="biotype", how="left")
mouse_seq_perc["perc"] = mouse_seq_perc.apply(get_perc, axis=1)
mouse_seq_perc.head()

Unnamed: 0,biotype,cage_id_x,cage_id_y,cage_id,perc
0,eRNA,88886,,43004.0,48.381072
1,lncRNA,3468,2382.0,,68.685121
2,mRNA,52547,45695.0,,86.960245
3,other,102302,70936.0,,69.339798


## 4. find % CAGE conservation

In [14]:
# NOTE: the intermediate names assigned here (human_tots, human_tsss,
# human_tss_seqs, human_enhs, human_enh_seqs) are the same ones used in the
# sequence-conservation cell and are overwritten with CAGE-based selections

# denominator: number of rows per biotype in the human table
human_tots = human_list.groupby("biotype")["cage_id"].count().reset_index()

# non-enhancer rows count as conserved when cage_ortholog is exactly 1
human_tsss = human_list.loc[human_list["biotype"].ne("eRNA")
                            & human_list["cage_ortholog"].eq(1)]
human_tss_seqs = human_tsss.groupby("biotype")["cage_id"].count().reset_index()

# enhancers need cage_ortholog >= 2, i.e. both TSSs of the enhancer must map
human_enhs = human_list.loc[human_list["biotype"].eq("eRNA")
                            & human_list["cage_ortholog"].ge(2)]
human_enh_seqs = human_enhs.groupby("biotype")["cage_id"].count().reset_index()

# the first left-merge turns the two clashing "cage_id" columns into
# cage_id_x (totals) and cage_id_y (TSS counts); the second adds the enhancer
# counts as plain "cage_id" -- these are the column names get_perc reads
human_cage_perc = human_tots.merge(human_tss_seqs, on="biotype", how="left")
human_cage_perc = human_cage_perc.merge(human_enh_seqs, on="biotype", how="left")
human_cage_perc["perc"] = human_cage_perc.apply(get_perc, axis=1)
human_cage_perc.head()

Unnamed: 0,biotype,cage_id_x,cage_id_y,cage_id,perc
0,eRNA,130814,,9368.0,7.161313
1,lncRNA,6048,1905.0,,31.498016
2,mRNA,53595,40492.0,,75.551824
3,other,140996,61341.0,,43.50549


In [15]:
# NOTE: the intermediate names assigned here (mouse_tots, mouse_tsss,
# mouse_tss_seqs, mouse_enhs, mouse_enh_seqs) are the same ones used in the
# sequence-conservation cell and are overwritten with CAGE-based selections

# denominator: number of rows per biotype in the mouse table
mouse_tots = mouse_list.groupby("biotype")["cage_id"].count().reset_index()

# non-enhancer rows count as conserved when cage_ortholog is exactly 1
mouse_tsss = mouse_list.loc[mouse_list["biotype"].ne("eRNA")
                            & mouse_list["cage_ortholog"].eq(1)]
mouse_tss_seqs = mouse_tsss.groupby("biotype")["cage_id"].count().reset_index()

# enhancers need cage_ortholog >= 2, i.e. both TSSs of the enhancer must map
mouse_enhs = mouse_list.loc[mouse_list["biotype"].eq("eRNA")
                            & mouse_list["cage_ortholog"].ge(2)]
mouse_enh_seqs = mouse_enhs.groupby("biotype")["cage_id"].count().reset_index()

# the first left-merge turns the two clashing "cage_id" columns into
# cage_id_x (totals) and cage_id_y (TSS counts); the second adds the enhancer
# counts as plain "cage_id" -- these are the column names get_perc reads
mouse_cage_perc = mouse_tots.merge(mouse_tss_seqs, on="biotype", how="left")
mouse_cage_perc = mouse_cage_perc.merge(mouse_enh_seqs, on="biotype", how="left")
mouse_cage_perc["perc"] = mouse_cage_perc.apply(get_perc, axis=1)
mouse_cage_perc.head()

Unnamed: 0,biotype,cage_id_x,cage_id_y,cage_id,perc
0,eRNA,88886,,10428.0,11.731881
1,lncRNA,3468,1261.0,,36.361015
2,mRNA,52547,42670.0,,81.203494
3,other,102302,47951.0,,46.872006


## 5. make plots

In [None]:
# biotypes drawn in the bar plots, in left-to-right order; the "other" rows
# present in the percentage tables are not listed here
order = ["eRNA", "lncRNA", "mRNA"]

In [None]:
# two stacked panels sharing both axes:
# top = % with a sequence ortholog, bottom = % with a conserved CAGE peak
fig, axarr = plt.subplots(nrows=2, ncols=1, figsize=(1.4, 2), sharex=True, sharey=True)

for ax, perc_df, ylabel in zip(axarr,
                               [human_seq_perc, human_cage_perc],
                               ["% sequence\northologs", "% conserved"]):
    sns.barplot(data=perc_df, x="biotype", y="perc",
                order=order, color=sns.color_palette("Set2")[1], ax=ax)
    ax.set(xlabel="", ylabel=ylabel, ylim=(0, 100))

# `ax` is now the bottom panel; only that one gets the rotated biotype labels
ax.set_xticklabels(order, rotation=50, ha='right', va='top')
fig.savefig("human_orth_percents.pdf", dpi="figure", bbox_inches="tight")

In [None]:
# two stacked panels sharing both axes:
# top = % with a sequence ortholog, bottom = % with a conserved CAGE peak
fig, axarr = plt.subplots(nrows=2, ncols=1, figsize=(1.4, 2), sharex=True, sharey=True)

for ax, perc_df, ylabel in zip(axarr,
                               [mouse_seq_perc, mouse_cage_perc],
                               ["% sequence\northologs", "% conserved"]):
    sns.barplot(data=perc_df, x="biotype", y="perc",
                order=order, color=sns.color_palette("Set2")[0], ax=ax)
    ax.set(xlabel="", ylabel=ylabel, ylim=(0, 100))

# `ax` is now the bottom panel; only that one gets the rotated biotype labels
ax.set_xticklabels(order, rotation=50, ha='right', va='top')
fig.savefig("mouse_orth_percents.pdf", dpi="figure", bbox_inches="tight")