# 1. Library 

In [1]:
## Parameters specific to where your folders are and your data
parameter_file = 'params/params.yaml'
import yaml
import sys

# safe_load only builds plain Python containers/scalars. A bare yaml.load(f)
# has no Loader argument, which newer PyYAML releases refuse outright and
# older ones treat as "may construct arbitrary objects".
with open(parameter_file, 'r') as f:
    doc = yaml.safe_load(f)

# Unpack the entries of the parameter file that the notebook uses.
data_folder = doc['data_folder']
tissues = doc['tissues'].split(',')  # comma-separated string -> list of names
sys.path.append(doc['pipeline_path'])  # make the pipeline modules importable
ref_fa = doc['ref_fa']
anno_gff = doc['annotation']
mRNA_peak_file = doc["mRNA_peak_file"]

tss_annotation = doc['tss_annotation']


# Imports and kernel configuration for the whole notebook.
import os
import sys
import pandas as pd
import matplotlib
import seaborn as sns
import pickle
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from itertools import product
import glob
import re
from matplotlib_venn import venn2
from matplotlib import rcParams
import inspect
# Render figures inline, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sys.setrecursionlimit(3000)
# Reload edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2
# Default figure size (width, height) in inches.
rcParams['figure.figsize'] = 8, 6
import tqdm

from os.path import basename
##mpl.use('Agg')
#mpl.style.use('ggplot')
#mpl.style.use('fivethirtyeight')
# NOTE(review): the star import brings every public name of the project-local
# Homer module into the notebook namespace, so names used below may originate
# there -- confirm which ones are actually needed.
from Homer import *
import plot_tss_results
# `tissues` comes from the parameter file loaded at the top of this cell.
print('Number of tissues: ',len(tissues))

from scipy.stats import zscore
from scipy.stats import iqr

('Number of tissues: ', 13)


In [None]:
# Output folder for the figures and tables written by this notebook.
save_dir = "Results/Figures/Silenced_CHO"
# makedirs (not mkdir) so the intermediate "Results/Figures" folders are
# created as well when the notebook runs in a fresh checkout.
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# 2. Load and Preprocessing

## Read in expression matrix

In [None]:
# Read the tab-separated CHO RNA table named in the parameter file, discard
# its first data column and convert the remaining values to log10(x + 1).
cho_rna_file = doc["cho_rna_file"]
cho_rna = (
    pd.read_csv(cho_rna_file, sep="\t", index_col=0)
    .iloc[:, 1:]
    .pipe(lambda values: np.log10(values + 1))
)
cho_rna

In [None]:
# Tab-separated mRNA peak table; its "gene_id" and "gene" columns are used
# further down to translate gene ids into gene names.
# NOTE(review): this is an absolute, machine-specific path. The parameter file
# also provides `mRNA_peak_file` -- confirm whether that entry should be used.
old_mrna = pd.read_csv(
    "/data/isshamie/genome/picr_final/mRNA_final.peak",
    delimiter="\t",
)
old_mrna

In [None]:
# Collapse rows of cho_rna that map to the same gene name (gene_id -> gene
# lookup taken from old_mrna), keeping the per-column maximum of each group.
cho_rna_gnames = cho_rna.groupby(
    old_mrna.set_index("gene_id")["gene"].to_dict()
).max()

In [None]:
# Display the gene-name-level CHO expression matrix.
cho_rna_gnames

In [3]:
# Merged per-peak expression matrix (one column per tissue); the first file
# column becomes the index. Only the first rows are displayed.
tissues_expr = pd.read_csv(
    "Results/merged/tissues.merge.peaksexpression.log10",
    delimiter="\t",
    index_col=0,
)
tissues_expr.head(5)

Unnamed: 0,Heart,Spleen,MiscOrgans,FemaleReproductive,CHO,BMDM1hKLA,Brain,Lung,Liver,BMDMwt,Muscle,Kidney,Pancreas
0,0.213256,0.0,0.191009,0.177054,1.379905,1.93044,0.192445,0.292457,0.0,0.0,0.192803,0.289548,0.0
1,0.0,0.569562,0.658306,0.704879,0.0,0.0,0.365353,0.875756,0.211156,0.0,0.160073,0.206016,0.0
2,0.0,0.272654,0.191009,0.650337,1.26945,0.0,0.285237,0.440272,0.0,0.823474,0.252575,0.377071,0.299942
3,0.213256,0.583265,0.592983,0.556972,0.347363,0.0,0.633209,0.256951,0.0,0.537819,0.300486,0.216445,0.380588
4,0.858835,1.743575,1.998797,1.218955,0.698374,0.0,1.707621,0.72908,0.945703,0.642465,1.198361,0.838893,1.141651


In [None]:
# Build a gene x tissue matrix: for every gene, keep the row of prom_tissue
# belonging to the TSS whose summed value across all tissues is the largest.
# NOTE(review): `tss_meta` and `prom_tissue` are not defined in any cell shown
# above -- confirm where they are loaded so the notebook runs on a fresh kernel.
gene_tissue = pd.DataFrame(
    index=tss_meta["Gene"].unique(),
    columns=prom_tissue.columns,
    dtype=float,
)
for gene, gene_tss in tqdm.tqdm_notebook(tss_meta.groupby("Gene")):
    tss_totals = prom_tissue.loc[gene_tss.index].sum(axis=1)
    gene_tissue.loc[gene] = prom_tissue.loc[tss_totals.idxmax()]

gene_tissue

# 3. Filter to genes of interest: DNA repair genes

In [None]:
# Read the DNA-repair gene list: one gene per line, trailing whitespace
# (including the newline) removed.
repair_genes = []
with open("../supplemental/dna_repair_genes/03_DNA_repair_genes.hamster.final.txt", "r") as f:
    for line in f:
        repair_genes.append(line.rstrip())

repair_genes

In [None]:
# Compare the repair-gene list with the genes present in the TSS annotation.
annotated_gene_values = tss_meta["Gene"].values
tss_meta_dna_repair = tss_meta.loc[tss_meta["Gene"].isin(repair_genes), "Gene"].unique()

print("Number of repair genes in annotation", len(tss_meta_dna_repair))
print("Number of overall repair genes", len(np.unique(np.array(repair_genes))))

# Every list entry that is absent from the annotation (repeated entries in
# the list are counted each time they occur).
not_found_in_anno = [gene for gene in repair_genes if gene not in annotated_gene_values]

print("Number of repair genes not found in annotation", len(not_found_in_anno))


In [None]:
# Restrict the gene x tissue matrix to the DNA-repair genes.
is_repair_gene = gene_tissue.index.isin(repair_genes)
gene_tissue = gene_tissue.loc[is_repair_gene]

# 4. Extract silenced Genes

## Calculate Z-score over tissues

In [None]:
# Standardise every gene (row) across the tissue columns with scipy's zscore.
# result_type='broadcast' keeps the original index and columns of gene_tissue.
gene_tissue_z = gene_tissue.apply(zscore,axis=1,result_type='broadcast')
# Displayed for inspection: the per-gene sum of the z-scores.
gene_tissue_z.sum(axis=1)

## A. CHO genes more than 2 SD below the mean across tissues

In [None]:
# Genes whose CHO z-score is below -2 (all z-score columns are kept).
silenced_cho_2sd = gene_tissue_z.loc[gene_tissue_z["CHO"].lt(-2)]
silenced_cho_2sd

## Get CHO less than 1 TPM

In [None]:
# The matrix holds log10(TPM + pseudocount), so "below 1 TPM" translates to
# "below log10(1 + pseudocount)".
pseudocount = 1  # added to TPM before the log transform
silenced_cho_1tpm = gene_tissue.index[gene_tissue["CHO"] < np.log10(pseudocount + 1)]
print("Number of CHO genes less than 1TPM:", len(silenced_cho_1tpm))
silenced_cho_1tpm

## B and C. More than 75% / 90% / 100% of the other tissues are on

In [None]:
# For each gene, count in how many non-CHO tissues it falls below the 1 TPM
# threshold, then keep the genes where that count stays under a given share.
n_samples = gene_tissue.shape[1] - 1  # number of tissues without CHO
gene_tissue_silenced = gene_tissue.drop(["CHO"], axis=1) < np.log10(1 + pseudocount)
num_tissues_silenced = gene_tissue_silenced.sum(axis=1)

# Key = percentage of the other tissues that must be "on".
cutoffs = dict()
cutoffs[75] = num_tissues_silenced.index[num_tissues_silenced < (1 - 0.75) * n_samples]
cutoffs[90] = num_tissues_silenced.index[num_tissues_silenced < (1 - 0.9) * n_samples]
cutoffs[100] = num_tissues_silenced.index[num_tissues_silenced == 0]

# Genes below 1 TPM in CHO that are also "on" in enough other tissues.
cho_below_1tpm = set(silenced_cho_1tpm)
cho_75 = cho_below_1tpm.intersection(cutoffs[75])
cho_90 = cho_below_1tpm.intersection(cutoffs[90])
cho_100 = cho_below_1tpm.intersection(cutoffs[100])

print("75th percentile number of silenced genes:", len(cho_75))
print("90th percentile number of silenced genes:", len(cho_90))
print("100th percentile number of silenced genes:", len(cho_100))

## D. CHO IQR outlier 

In [None]:
# Low-outlier rule per gene: CHO is flagged when it lies below
# Q1 - 1.5 * IQR, with both statistics computed over all columns of the row.
lower_quart = np.percentile(gene_tissue, 25, axis=1)
gene_iqr = iqr(gene_tissue, axis=1)
iqr_outlier_thresh = lower_quart - 1.5 * gene_iqr

# NOTE(review): despite the "1tpm" in its name, this selection applies only
# the IQR rule -- confirm that no 1 TPM filter is intended here.
cho_is_low_outlier = (gene_tissue["CHO"] < iqr_outlier_thresh).values
silenced_cho_1tpm_iqr_outlier = gene_tissue.index[cho_is_low_outlier]
print("Number of genes: ", len(silenced_cho_1tpm_iqr_outlier))
silenced_cho_1tpm_iqr_outlier

In [None]:
# Summary table with one column per thresholding method and two rows:
# "Genes" (the set of selected gene names) and "Number of Genes" (its size).
outlier_methods_df = pd.DataFrame(
    index=["Genes", "Number of Genes"],
    columns=[
        "IQR", "1TPM", "1TPM and 75 percent tissues on",
        "1TPM and 90 percent tissues on", "1TPM and 100 percent tissues on",
        "Zscore -2"
    ])

# The values below must stay in the same order as the columns above.
outlier_methods_df.loc["Number of Genes"] = [
    len(silenced_cho_1tpm_iqr_outlier),
    len(silenced_cho_1tpm),
    len(cho_75),
    len(cho_90),
    len(cho_100), 
    len(silenced_cho_2sd)
]
# Each cell of the "Genes" row stores a Python set of gene names.
outlier_methods_df.loc["Genes"] = [
    set(silenced_cho_1tpm_iqr_outlier),
    set(silenced_cho_1tpm), cho_75, cho_90, cho_100, set(silenced_cho_2sd.index.values)
]
outlier_methods_df

# 5. Plot silenced genes values

In [None]:
# For every thresholding method draw one scatter row per selected gene:
# the CHO value in green, the values of all other tissues in the default
# colour. The figure is saved as <name>.png and the plotted values as <name>.txt.
for col in outlier_methods_df.columns.values:
    # .loc needs a list-like indexer; recent pandas releases reject a set.
    method_genes = list(outlier_methods_df.loc["Genes", col])

    # Order the genes by their summed value over all tissues (lowest first).
    # sum(axis=1) also copes with a method that selected no gene at all.
    curr_order = gene_tissue.loc[method_genes].sum(axis=1).sort_values().index
    curr_genes = gene_tissue.loc[curr_order]

    # Construct scatter values: y is the row position of the gene, x its value.
    n_tissues = curr_genes.shape[1] - 1
    other_tissues = curr_genes.drop("CHO", axis=1)  # computed once per method
    cho_scatter_y = list(range(len(curr_genes)))
    cho_scatter_x = curr_genes["CHO"].tolist()
    tissue_scatter_y = [row for row in cho_scatter_y for _ in range(n_tissues)]
    tissue_scatter_x = other_tissues.values.ravel().tolist()  # row by row

    f, ax = plt.subplots()
    ax.scatter(cho_scatter_x, cho_scatter_y, color="g", label="CHO")
    ax.scatter(tissue_scatter_x, tissue_scatter_y, label="Other")

    ax.set_ylabel("Gene")
    ax.set_xlabel("Log10(TPM+1)")
    ax.set_yticks(range(len(cho_scatter_y)))
    ax.set_yticklabels(curr_genes.index.values)
    ax.set_title(col)
    ax.legend()

    name = os.path.join(save_dir, "repair_genes_method_" + col.replace(" ", ""))
    f.savefig(name + ".png")
    # Table of the plotted values, genes in the same order as in the figure.
    curr_genes.to_csv(name + ".txt")

# 6. Overlap of genes across the different thresholding methods

## Plot the two