### requires python 3 and TSS conda environment (needed for parallelization)

### Motivation: 
TSS: Focused and dispersed. 
Within each TSS there are variations to where the start site occurs across different tissues. 
When calling our peaks, we choose the CHO peak if it occurs, otherwise pick the tissue with the maximum p-value.

One question that arises when we call the TSS is how different the read alignments are in each of these cases: a) the TSSs align between CHO and the tissues, b) the TSSs are shifted between CHO and the tissues, c) no CHO peak is seen, only tissue peaks, and d) only a CHO TSS is seen.

For each of these cases, how many TSSs are seen? How many tissues have a TSS when they all agree with CHO, and how many when they are offset from CHO?

When they are offset versus aligned, what is the distribution of tags underneath (the density)


In the same vein, we want to make sure that the CHO GRO-Cap and CHO csRNA peaks both show similar buildup profiles, and that the histograms don't just look good because the GRO-Cap reads are aligned on the GRO-Cap peaks and the csRNA reads on the csRNA peaks. Therefore, each data set should also be plotted against the peak calls of the other.

In [None]:
## Parameters specific to where your folders are and your data
import sys

import yaml

parameter_file = '../params/params.yaml'

# safe_load is used instead of yaml.load: calling yaml.load without an explicit
# Loader is deprecated since PyYAML 5.1 and raises a TypeError in PyYAML 6.
# safe_load also refuses to construct arbitrary Python objects from the file.
with open(parameter_file, 'r') as f:
    doc = yaml.safe_load(f)

#p = dic2obj(**doc)

# Unpack the configuration entries used by this notebook.
data_folder = doc['data_folder']
tissues = doc['tissues'].split(',')  # stored in the YAML as one comma-separated string
sys.path.append(doc['pipeline_path'])  # makes the pipeline modules (Homer, helper, ...) importable
ref_fa = doc['ref_fa']
anno_gff = doc['annotation']
mRNA_peak_file = doc["mRNA_peak_file"]

tss_annotation = doc['tss_annotation']


import os
from os.path import join
import sys
import pandas as pd
import matplotlib
import seaborn as sns
import pickle
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from itertools import product
import glob
import re
from matplotlib_venn import venn2
from matplotlib import rcParams
import inspect
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sys.setrecursionlimit(3000)
%load_ext autoreload
%autoreload 2
rcParams['figure.figsize'] = 8, 6
from tqdm import *
from os.path import basename
##mpl.use('Agg')
#mpl.style.use('ggplot')
#mpl.style.use('fivethirtyeight')
from Homer import *
import helper
import create_output
print('Number of tissues: ',len(tissues))

from collections import Counter


  import sys


In [None]:
from numpanpar import parallel_df as pardf

## Parameters and directories

In [None]:
# Input locations.
sample_peaks_dir = "../Results/tss_annotation_peaks/"
merged_dir = "../Results/merged"

# Output location for everything this notebook writes.
# NOTE(review): unlike the input paths above this one has no "../" prefix, so it
# resolves relative to the current working directory -- confirm this is intended.
save_dir = "Results/histograms/compare_tissues"

# os.makedirs also creates the missing parent folders (os.mkdir raises
# FileNotFoundError when "Results/histograms" does not exist yet), and
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(save_dir, exist_ok=True)

### Load data

In [None]:
# Tab-separated merged-peaks table; its first column becomes the index.
merged_path = "../Results/merged/samples.merge"
merged_df = pd.read_csv(merged_path, sep="\t", index_col=0)
merged_df.head()

In [None]:
meta_path = "../Results/output/TSS1.exp.meta"
bed_path = "../Results/output/TSS1.exp.bed"

# Metadata table (tab-separated, first column used as the index).
meta_df = pd.read_csv(meta_path, sep="\t", index_col=0)
# Rows without a peak ID get the sentinel -1 so the column can be cast to int.
meta_df["ID"] = meta_df["ID"].fillna(-1).astype(int)

# Matching BED table, read with the pipeline's own helper.
bed_df = read_bed_file(bed_path)
meta_df.head()


## Construct peak-by-location data

In [None]:
# Every per-sample peak file in the sample directory.
sample_pattern = os.path.join(sample_peaks_dir, "sample_*")
sample_files = glob.glob(sample_pattern)

# Sample name = the text between "sample_" and ".tsv" in each path.
columns = [path.split("sample_")[1].split(".tsv")[0] for path in sample_files]
files_dict = dict(zip(columns, sample_files))

# Empty peak-ID x sample table; the location values are filled in later.
peak_by_location = pd.DataFrame(index=meta_df["ID"], columns=columns)
peak_by_location.head()

### To start, use all samples in columns (can average over tissues later)

A. Create a dictionary where each sample is a key and their dataframe of peaks is the value  
B. Loop through the meta_df IDs.   
    For each ID:  
    Determine which samples had the ID (use merged to determine where there is no null)  
    Loop through each sample that had the peak:  
        Get the peak from their sample_{sample} file, and get their location and fill it in. 
    

### Create sample dictionary

In [None]:
# Map every sample name to the dataframe read from its peak file.
sample_dict = {}
for name, f in zip(columns, sample_files):
    print(f)
    print(name)
    sample_dict[name] = pd.read_csv(f, sep="\t")

In [None]:
def par_peak_by_location(df, files_dict):
    """Fill a sample-by-peak frame with the midpoint of each sample's peak.

    The frame is sample-by-peak (rather than peak-by-sample) so that it can be
    split by rows for the parallelization.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample name, one column per merged peak ID. It is modified
        in place and also returned.
    files_dict : dict
        Maps each sample name to the path of its peak file. The matching merged
        file is located by replacing "sample" with "merged" in that path.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``floor((Start + End) / 2)`` of the sample's own peak
        written at ``[sample, merged peak ID]``.
    """
    for name in df.index.values:
        sample_file = files_dict[name]
        curr_sample = pd.read_csv(sample_file, sep="\t", index_col=0)
        # NOTE(review): str.replace swaps every occurrence of "sample" in the
        # path, not only the file-name prefix -- confirm no directory or sample
        # name contains that word.
        curr_merge = pd.read_csv(sample_file.replace("sample", "merged"), sep="\t", index_col=0)

        # Keep only the per-sample columns and strip their "f04_peaks/" prefix
        # so that the column names match the sample names.
        is_sample_column = curr_merge.columns.str.contains("f04_peaks/")
        curr_merge = curr_merge.loc[:, is_sample_column]
        curr_merge.columns = [c.split("f04_peaks/")[1] for c in curr_merge.columns.values]

        # Take only the column of the sample. Missing entries are dropped first:
        # a NaN would otherwise break the boolean negation below.
        sample_peaks = curr_merge[name].dropna().astype(str)
        # Remove the comma, indicating multiple peaks, which doesnt happen often
        sample_peaks = sample_peaks[~sample_peaks.str.contains(",", regex=False)]

        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        for merged_id, peak_id in tqdm(sample_peaks.items(), total=len(sample_peaks)):
            midpoint = (curr_sample.loc[peak_id, "Start"] + curr_sample.loc[peak_id, "End"]) / 2
            df.loc[name, merged_id] = np.floor(midpoint)

    return df

In [None]:
# The worker fills a sample-by-peak frame, so transpose before handing the table
# to the parallel helper and transpose back afterwards.
location_by_sample = pardf(
    peak_by_location.transpose(),
    par_peak_by_location,
    func_args=(files_dict,),
    num_processes=24,
)
peak_by_location = location_by_sample.transpose().sort_index()

In [None]:
# Persist the filled peak-by-sample location table as a tab-separated file.
peak_by_location_path = os.path.join(save_dir, "peak_by_location.tsv")
peak_by_location.to_csv(peak_by_location_path, sep="\t")

In [None]:
# for ind, f in enumerate(sample_files):
#     print(f)
#     print(columns[ind])
#     name = columns[ind]
#     curr_sample = pd.read_csv(f, sep="\t", index_col=0)
#     curr_merge = pd.read_csv(f.replace("sample","merged"), sep="\t", index_col=0)
#     curr_merge = curr_merge.loc[:,curr_merge.columns.str.contains("f04_peaks/")]
#     curr_merge.columns = list(map(lambda x: x.split("f04_peaks/")[1], curr_merge.columns.values))
#     curr_merge = curr_merge.loc[~(curr_merge[name].str.contains(",")), name] #Take only the column of the sample

#     for ind2, val in tqdm(curr_merge.iteritems()):
#         peak_by_location.loc[ind2, name] = (curr_sample.loc[val, "Start"] + curr_sample.loc[val, "End"])/2


## Merge the samples together into tissues

In [None]:
# One column per tissue: the floored mean location over all samples whose
# column name contains the tissue name (missing samples are ignored).
tissue_peaks_by_location = pd.DataFrame(index=peak_by_location.index, columns=tissues)
for t in tissues:
    is_tissue_sample = peak_by_location.columns.str.contains(t)
    tissue_mean = peak_by_location.loc[:, is_tissue_sample].mean(axis=1, skipna=True)
    tissue_peaks_by_location[t] = tissue_mean.apply(np.floor)

tissue_table_path = os.path.join(save_dir, "tissue_peak_by_location.tsv")
tissue_peaks_by_location.to_csv(tissue_table_path, sep="\t")

### Get peaks in CHO and not in CHO

In [None]:
tissue = "CHO"

# Peaks for which the CHO column holds a location.
has_cho_peak = tissue_peaks_by_location[tissue].notnull()
peaks_in_cho = set(tissue_peaks_by_location.index[has_cho_peak])
print(len(peaks_in_cho))

# Peaks seen in at least one tissue, minus the ones that CHO has.
has_any_peak = tissue_peaks_by_location.notnull().any(axis=1)
peaks_not_in_cho = set(tissue_peaks_by_location.index[has_any_peak]) - peaks_in_cho
print(len(peaks_not_in_cho))
print(peaks_in_cho)
print(peaks_not_in_cho)

### Determine peaks that are in or not in agreement

In [None]:
def agree_peaks(tissue_peaks_by_location,tol=0, tissue='CHO'):
    """Count, per reference peak, how many other tissues agree or disagree on location.

    Only peaks with a location in the ``tissue`` column are considered. For
    each such peak, every other column that also has a location is compared
    with the reference location.

    Parameters
    ----------
    tissue_peaks_by_location : pandas.DataFrame
        Peak-by-tissue table of peak locations; missing peaks are null.
    tol : number, default 0
        Maximum absolute distance from the reference location that still
        counts as agreeing.
    tissue : str, default 'CHO'
        Name of the reference column.

    Returns
    -------
    (dict, dict)
        ``peaks_cho_agree`` and ``peaks_cho_disagree``: peak ID -> number of
        other tissues within ``tol`` of the reference location, and number of
        other tissues farther away than ``tol``. Tissues without a peak are
        counted in neither.
    """
    reference = tissue_peaks_by_location[tissue]
    has_reference = reference.notnull().values
    others = tissue_peaks_by_location.drop(tissue, axis=1)

    # Work on plain float arrays: all peaks are handled in one vectorised pass
    # instead of two .loc row lookups and two subtractions per peak.
    reference_loc = reference.values.astype(float)[has_reference]
    other_loc = others.values.astype(float)[has_reference]

    # Missing locations are NaN; both comparisons are False for NaN, which is
    # the same as dropping them before counting.
    with np.errstate(invalid="ignore"):
        offset = np.abs(other_loc - reference_loc[:, None])
        n_agree = (offset <= tol).sum(axis=1)
        n_disagree = (offset > tol).sum(axis=1)

    peak_ids = tissue_peaks_by_location.index[has_reference]
    peaks_cho_agree = {p: int(n) for p, n in zip(peak_ids, n_agree)}
    peaks_cho_disagree = {p: int(n) for p, n in zip(peak_ids, n_disagree)}

    return peaks_cho_agree, peaks_cho_disagree

In [None]:
def retrieve_peaks(locations_df, meta_df, peaks_cho_agree, peaks_cho_disagree, n_ag, n_dis,n_tot):
    """Select the metadata rows of peaks that meet the agreement criteria.

    A peak is kept when either
      * its number of agreeing plus disagreeing tissues is at most ``n_tot``, or
      * it has at least ``n_ag`` agreeing and at least ``n_dis`` disagreeing tissues.

    Parameters:
    -----------
    locations_df: peak-by-sample df where each element is the peak location (assumes the same chromosome), or null if no peak detected there. Not used by this function.
    meta_df: dataframe with an "ID" column holding the peak IDs.
    peaks_cho_agree: dict, peak ID -> number of agreeing tissues.
    peaks_cho_disagree: dict, peak ID -> number of disagreeing tissues.
    n_ag: minimum number of agreeing tissues.
    n_dis: minimum number of disagreeing tissues.
    n_tot: peaks with at most this many tissues in total are always kept.

    Returns:
    --------
    The rows of meta_df whose "ID" is one of the kept peaks.
    """
    def _qualifies(peak):
        n_agree = peaks_cho_agree[peak]
        n_disagree = peaks_cho_disagree[peak]
        if n_agree + n_disagree <= n_tot:
            return True
        return n_agree >= n_ag and n_disagree >= n_dis

    peaks_to_keep = {peak for peak in peaks_cho_agree if _qualifies(peak)}
    return meta_df[meta_df["ID"].isin(peaks_to_keep)]

# With a tolerance of 0

In [None]:
peaks_cho_agree, peaks_cho_disagree = agree_peaks(tissue_peaks_by_location, tol=0, tissue='CHO')

# How many peaks have 0, 1, 2, ... agreeing tissues.
agree_histogram = Counter(peaks_cho_agree.values())
f = plt.figure()
labels, values = zip(*agree_histogram.items())
plt.bar(labels, values)
plt.title("For each peak, the number of tissues that are on the same location tol=0")
#pd.Series(Counter(peaks_cho_agree)).plot.bar(color='y')

# How many peaks have 0, 1, 2, ... disagreeing tissues.
disagree_histogram = Counter(peaks_cho_disagree.values())
f = plt.figure()
labels, values = zip(*disagree_histogram.items())
plt.bar(labels, values)
plt.title("For each peak, the number of tissues that are on different location tol=0")

# Per peak: agreeing tissues / all tissues with a peak (peaks with none are skipped).
fraction_agree = [
    n_agree / (n_agree + peaks_cho_disagree[p])
    for p, n_agree in peaks_cho_agree.items()
    if n_agree + peaks_cho_disagree[p] != 0
]

f = plt.figure()
plt.hist(fraction_agree)
plt.title("Fraction of tissues that have a peak that agree tol=0")

### A. TSS only in CHO

In [None]:
# Only CHO has peak:
#peaks_cho_agree, peaks_cho_disagree = agree_peaks(tissue_peaks_by_location,tol=0, tissue='CHO')
#

In [None]:
# n_tot=0 keeps the CHO peaks for which no other tissue has a peak.
# NOTE(review): n_ag=20 / n_dis=20 would additionally keep peaks with >=20 agreeing
# AND >=20 disagreeing tissues; presumably meant to be unreachable -- confirm.
only_in_cho = retrieve_peaks(
    locations_df=tissue_peaks_by_location,
    meta_df=meta_df,
    peaks_cho_agree=peaks_cho_agree,
    peaks_cho_disagree=peaks_cho_disagree,
    n_ag=20,
    n_dis=20,
    n_tot=0,
)
only_in_cho

### B. TSS offset  CHO and other tissues

In [None]:
# n_tot=-1 switches the "few tissues" shortcut off, so only peaks with at least
# 3 disagreeing tissues (and any number of agreeing ones) are kept.
cho_dis3 = retrieve_peaks(
    locations_df=tissue_peaks_by_location,
    meta_df=meta_df,
    peaks_cho_agree=peaks_cho_agree,
    peaks_cho_disagree=peaks_cho_disagree,
    n_ag=0,
    n_dis=3,
    n_tot=-1,
)
cho_dis3

### C. TSS agree CHO and tissues

In [None]:
# n_tot=-1 switches the "few tissues" shortcut off, so only peaks with at least
# 3 agreeing tissues (and any number of disagreeing ones) are kept.
cho_ag3 = retrieve_peaks(
    locations_df=tissue_peaks_by_location,
    meta_df=meta_df,
    peaks_cho_agree=peaks_cho_agree,
    peaks_cho_disagree=peaks_cho_disagree,
    n_ag=3,
    n_dis=0,
    n_tot=-1,
)
cho_ag3

### D. TSS offset from CHO, but take the tissue instead of CHO (using max or next max)

In [None]:
# Explicit copy: the rows are edited below and bed_df itself must stay untouched.
cho_dis3_maxDist_bed = bed_df.loc[cho_dis3.index].copy()

# Walk over the selected rows directly (row label, peak ID). This avoids scanning
# meta_df once per peak and, when several rows share a peak ID, shifts every one
# of them instead of only the first.
for tss_index, p in tqdm(cho_dis3["ID"].items(), total=len(cho_dis3)):
    peak_locations = tissue_peaks_by_location.loc[p]
    cho = peak_locations[tissue]
    # Column whose location lies farthest from the CHO location (nulls are skipped).
    farthest_tissue = np.abs(cho - peak_locations).idxmax()
    new_start = int(peak_locations[farthest_tissue])
    if new_start != 0:
        # Re-centre the BED interval on the new location: 76 before, 75 after.
        cho_dis3_maxDist_bed.loc[tss_index, "Start"] = new_start - 76
        cho_dis3_maxDist_bed.loc[tss_index, "End"] = new_start + 75



#### Save results as bed files

In [None]:
# (table, output file name) pairs, written to save_dir in this order.
bed_outputs = [
    (cho_dis3_maxDist_bed, "cho_disagree3_maxDistance_tol_0.tsv"),
    (bed_df.loc[cho_ag3.index], "cho_agree3_tol_0.tsv"),
    (bed_df.loc[cho_dis3.index], "cho_disagree3_tol_0.tsv"),
    (bed_df.loc[only_in_cho.index], "cho_only_tol_0.tsv"),
    (bed_df.loc[meta_df[meta_df["ID"].isin(peaks_not_in_cho)].index], "not_in_cho.tsv"),
]
for bed_table, file_name in bed_outputs:
    write_bed_file(bed_table, os.path.join(save_dir, file_name))

### E. TSS different across CHO samples

# With a tolerance of 10

In [None]:
peaks_cho_agree, peaks_cho_disagree = agree_peaks(tissue_peaks_by_location, tol=10, tissue='CHO')

# How many peaks have 0, 1, 2, ... agreeing tissues.
agree_histogram = Counter(peaks_cho_agree.values())
f = plt.figure()
labels, values = zip(*agree_histogram.items())
plt.bar(labels, values)
plt.title("For each peak, the number of tissues that are on the same location tol=10")
#pd.Series(Counter(peaks_cho_agree)).plot.bar(color='y')

# How many peaks have 0, 1, 2, ... disagreeing tissues.
disagree_histogram = Counter(peaks_cho_disagree.values())
f = plt.figure()
labels, values = zip(*disagree_histogram.items())
plt.bar(labels, values)
plt.title("For each peak, the number of tissues that are on different location tol=10")

# Per peak: agreeing tissues / all tissues with a peak (peaks with none are skipped).
fraction_agree = [
    n_agree / (n_agree + peaks_cho_disagree[p])
    for p, n_agree in peaks_cho_agree.items()
    if n_agree + peaks_cho_disagree[p] != 0
]

f = plt.figure()
plt.hist(fraction_agree)
plt.title("Fraction of tissues that have a peak that agree tol=10")

### A. TSS only in CHO

In [None]:
# n_tot=0 keeps the CHO peaks for which no other tissue has a peak.
# NOTE(review): n_ag=20 / n_dis=20 would additionally keep peaks with >=20 agreeing
# AND >=20 disagreeing tissues; presumably meant to be unreachable -- confirm.
only_in_cho = retrieve_peaks(
    locations_df=tissue_peaks_by_location,
    meta_df=meta_df,
    peaks_cho_agree=peaks_cho_agree,
    peaks_cho_disagree=peaks_cho_disagree,
    n_ag=20,
    n_dis=20,
    n_tot=0,
)
only_in_cho

### B. TSS offset  CHO and other tissues

In [None]:
# n_tot=-1 switches the "few tissues" shortcut off, so only peaks with at least
# 3 disagreeing tissues (and any number of agreeing ones) are kept.
cho_dis3 = retrieve_peaks(
    locations_df=tissue_peaks_by_location,
    meta_df=meta_df,
    peaks_cho_agree=peaks_cho_agree,
    peaks_cho_disagree=peaks_cho_disagree,
    n_ag=0,
    n_dis=3,
    n_tot=-1,
)
cho_dis3

### C. TSS agree CHO and tissues

In [None]:
# n_tot=-1 switches the "few tissues" shortcut off, so only peaks with at least
# 3 agreeing tissues (and any number of disagreeing ones) are kept.
cho_ag3 = retrieve_peaks(
    locations_df=tissue_peaks_by_location,
    meta_df=meta_df,
    peaks_cho_agree=peaks_cho_agree,
    peaks_cho_disagree=peaks_cho_disagree,
    n_ag=3,
    n_dis=0,
    n_tot=-1,
)
cho_ag3

### D. TSS different across CHO samples

# With a tolerance of 25

In [None]:
peaks_cho_agree, peaks_cho_disagree = agree_peaks(tissue_peaks_by_location, tol=25, tissue='CHO')

# How many peaks have 0, 1, 2, ... agreeing tissues.
agree_histogram = Counter(peaks_cho_agree.values())
f = plt.figure()
labels, values = zip(*agree_histogram.items())
plt.bar(labels, values)
plt.title("For each peak, the number of tissues that are on the same location tol=25")
#pd.Series(Counter(peaks_cho_agree)).plot.bar(color='y')

# How many peaks have 0, 1, 2, ... disagreeing tissues.
disagree_histogram = Counter(peaks_cho_disagree.values())
f = plt.figure()
labels, values = zip(*disagree_histogram.items())
plt.bar(labels, values)
plt.title("For each peak, the number of tissues that are on different location tol=25")

# Per peak: agreeing tissues / all tissues with a peak (peaks with none are skipped).
fraction_agree = [
    n_agree / (n_agree + peaks_cho_disagree[p])
    for p, n_agree in peaks_cho_agree.items()
    if n_agree + peaks_cho_disagree[p] != 0
]

f = plt.figure()
plt.hist(fraction_agree)
plt.title("Fraction of tissues that have a peak that agree tol=25")

### A. TSS only in CHO

In [None]:
# n_tot=0 keeps the CHO peaks for which no other tissue has a peak.
# NOTE(review): n_ag=20 / n_dis=20 would additionally keep peaks with >=20 agreeing
# AND >=20 disagreeing tissues; presumably meant to be unreachable -- confirm.
only_in_cho = retrieve_peaks(
    locations_df=tissue_peaks_by_location,
    meta_df=meta_df,
    peaks_cho_agree=peaks_cho_agree,
    peaks_cho_disagree=peaks_cho_disagree,
    n_ag=20,
    n_dis=20,
    n_tot=0,
)
only_in_cho

### B. TSS offset  CHO and other tissues

In [None]:
# n_tot=-1 switches the "few tissues" shortcut off, so only peaks with at least
# 3 disagreeing tissues (and any number of agreeing ones) are kept.
cho_dis3 = retrieve_peaks(
    locations_df=tissue_peaks_by_location,
    meta_df=meta_df,
    peaks_cho_agree=peaks_cho_agree,
    peaks_cho_disagree=peaks_cho_disagree,
    n_ag=0,
    n_dis=3,
    n_tot=-1,
)
cho_dis3

### C. TSS agree CHO and tissues

In [None]:
# n_tot=-1 switches the "few tissues" shortcut off, so only peaks with at least
# 3 agreeing tissues (and any number of disagreeing ones) are kept.
cho_ag3 = retrieve_peaks(
    locations_df=tissue_peaks_by_location,
    meta_df=meta_df,
    peaks_cho_agree=peaks_cho_agree,
    peaks_cho_disagree=peaks_cho_disagree,
    n_ag=3,
    n_dis=0,
    n_tot=-1,
)
cho_ag3

### D. TSS different across CHO samples

### E. Location is the maximum difference one

In [None]:
# Explicit copy: the rows are edited below and bed_df itself must stay untouched.
cho_dis3_maxDist_bed = bed_df.loc[cho_dis3.index].copy()

# Walk over the selected rows directly (row label, peak ID). This avoids scanning
# meta_df once per peak and, when several rows share a peak ID, shifts every one
# of them instead of only the first.
for tss_index, p in tqdm(cho_dis3["ID"].items(), total=len(cho_dis3)):
    peak_locations = tissue_peaks_by_location.loc[p]
    cho = peak_locations[tissue]
    # Column whose location lies farthest from the CHO location (nulls are skipped).
    farthest_tissue = np.abs(cho - peak_locations).idxmax()
    new_start = int(peak_locations[farthest_tissue])
    if new_start != 0:
        # Re-centre the BED interval on the new location: 76 before, 75 after.
        cho_dis3_maxDist_bed.loc[tss_index, "Start"] = new_start - 76
        cho_dis3_maxDist_bed.loc[tss_index, "End"] = new_start + 75



#### Save results as bed files

In [None]:
# (table, output file name) pairs, written to save_dir in this order.
bed_outputs = [
    (cho_dis3_maxDist_bed, "cho_disagree3_maxDistance_tol_25.tsv"),
    (bed_df.loc[cho_ag3.index], "cho_agree3_tol_25.tsv"),
    (bed_df.loc[cho_dis3.index], "cho_disagree3_tol_25.tsv"),
    (bed_df.loc[only_in_cho.index], "cho_only_tol_25.tsv"),
]
for bed_table, file_name in bed_outputs:
    write_bed_file(bed_table, os.path.join(save_dir, file_name))