In [1]:
import os
import sys
import pandas as pd
import matplotlib
import seaborn as sns
import pickle
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from itertools import product
import glob
import re
from matplotlib_venn import venn2
from matplotlib import rcParams
import inspect
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sys.setrecursionlimit(3000)
sns.set_context('notebook')
%load_ext autoreload
%autoreload 2
rcParams['figure.figsize'] = 8, 6
#rcParams['font.family'] = 'sans-serif'
#rcParams['font.sans-serif'] = ['Tahoma']
import tqdm
### notebook specific configuration ###
from os.path import basename
mpl.style.use('fivethirtyeight')

from Modules.Homer import *

sys.path.append('/home/isshamie/software/NGS-Pipeline')

ref_fa = '/data/genome/hamster/picr/picr.fa'
annotation = '/data/genome/hamster/picr/updated_final_sort.gff3'

mrna_peak = '/data/isshamie/TSS_CHO/mRNA.peak'

tissues = ['BloondnegSpleen','BMDM1hKLA1','BMDMwt','Brain','Brain7neg1',
          'FemaleReproductive','Heart','Intestine','Kidney','Liver','Lung',
           'MiscOrgans','Muscle','Pancreas','Skin','Spleen','Total','CHO']

len(tissues)



This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



18

In [2]:
data_folder = '/data/isshamie/CH_tissue_TSS/'

In [3]:
meta_samples = pd.DataFrame(columns=['Experiment','Tissue','Short'])
for curr_tissue in tissues:
    curr_raw = glob.glob(data_folder + curr_tissue + '/*/')
    for j in curr_raw:
        curr_type = j.split('/')[-2]  
        if not curr_type == 'ATAC':
            count = 1
            for f in  glob.glob(j + '*f04_peaks/merge*bg*peak'):
                meta_samples = meta_samples.append(pd.DataFrame({'Tissue':curr_tissue, 'Experiment':curr_type,
                                                                'Short': curr_tissue + '_' + curr_type},
                                                                index=[f.split(data_folder)[-1]]))
meta_samples 

Unnamed: 0,Experiment,Short,Tissue
BloondnegSpleen/GRO/f04_peaks/merge_bg_2.peak,GRO,BloondnegSpleen_GRO,BloondnegSpleen
BloondnegSpleen/mSTART/f04_peaks/merge_bg_2.peak,mSTART,BloondnegSpleen_mSTART,BloondnegSpleen
BMDM1hKLA1/GRO/f04_peaks/merge_bg_2.peak,GRO,BMDM1hKLA1_GRO,BMDM1hKLA1
BMDMwt/GRO/f04_peaks/merge_bg_2.peak,GRO,BMDMwt_GRO,BMDMwt
Brain/GRO/f04_peaks/merge_bg_2.peak,GRO,Brain_GRO,Brain
Brain/mSTART/f04_peaks/merge_bg_2.peak,mSTART,Brain_mSTART,Brain
Brain7neg1/mSTART/f04_peaks/merge_bg_2.peak,mSTART,Brain7neg1_mSTART,Brain7neg1
FemaleReproductive/mSTART/f04_peaks/merge_bg_2.peak,mSTART,FemaleReproductive_mSTART,FemaleReproductive
Heart/mSTART/f04_peaks/merge_bg_2.peak,mSTART,Heart_mSTART,Heart
Intestine/mSTART/f04_peaks/merge_bg_2.peak,mSTART,Intestine_mSTART,Intestine


In [4]:
meta_samples.to_csv('Results/meta_samples.csv')

## Long meta

In [5]:
long_meta_samples = pd.DataFrame(columns=['Experiment','Short','Tissue'])
for tissue in tissues:
    curr_raw = glob.glob(data_folder + tissue + '/*/')
    for j in curr_raw:
        exp = j.split('/')[-2]  
        if not exp == 'ATAC':
            count = 1
            for f in  glob.glob(j + '*f04_peaks/trim*.peak'):
                print(f)
                short = tissue + '_' + exp
                long_meta_samples = long_meta_samples.append(pd.DataFrame({'Experiment':exp,'Tissue':tissue,'Short':short},
                                                                          index=[f.split(data_folder)[-1]]))

/data/isshamie/CH_tissue_TSS/BloondnegSpleen/GRO/f04_peaks/trim_CHBloondnegSpleen_5GRO_JHS1039_SD_GTCCGC_S54_L003_R1_001_and_trim_CHBloondnegSpleen_GRO_JHS1032_SD_TCCCGA_S47_L003_R1_001_bg_2.peak
/data/isshamie/CH_tissue_TSS/BMDM1hKLA1/GRO/f04_peaks/trim_CHBMDM1hKLA_5GRO_JHS1034_SD_AGTCAA_S49_L003_R1_001_and_trim_CHBMDM1hKLA_GRO_JHS1027_SD_CAAAAG_S42_L003_R1_001_bg_2.peak
/data/isshamie/CH_tissue_TSS/BMDMwt/GRO/f04_peaks/trim_CHBMDMwt_5GRO_JHS1033_SD_TAGCTT_S48_L003_R1_001_and_trim_CHBMDMwt_GRO_JHS1026_SD_ATTCCT_S41_L003_R1_001_bg_2.peak
/data/isshamie/CH_tissue_TSS/Brain/GRO/f04_peaks/trim_CHBrain_5GRO_JHS1036_SD_ATGTCA_S51_L003_R1_001_and_trim_CHBrain_GRO_JHS1029_SD_CACCGG_S44_L003_R1_001_bg_2.peak
/data/isshamie/CH_tissue_TSS/Brain/mSTART/f04_peaks/trim_CHBrain_mSTART_JHS1016_SD_GTAGAG_S28_L002_R1_001_and_trim_CHBrain_mSTARTinput_JHS1078_SD_ATTCCT_S100_L005_R1_001_bg_2.peak
/data/isshamie/CH_tissue_TSS/Brain/mSTART/f04_peaks/trim_CHBrain_mSTART_JHS1016_SD_GTAGAG_S28_L002_R1_001_and_

In [6]:
long_meta_samples.to_csv('Results/long_meta_samples.csv')

In [7]:
long_meta_samples.head()

Unnamed: 0,Experiment,Short,Tissue
BloondnegSpleen/GRO/f04_peaks/trim_CHBloondnegSpleen_5GRO_JHS1039_SD_GTCCGC_S54_L003_R1_001_and_trim_CHBloondnegSpleen_GRO_JHS1032_SD_TCCCGA_S47_L003_R1_001_bg_2.peak,GRO,BloondnegSpleen_GRO,BloondnegSpleen
BMDM1hKLA1/GRO/f04_peaks/trim_CHBMDM1hKLA_5GRO_JHS1034_SD_AGTCAA_S49_L003_R1_001_and_trim_CHBMDM1hKLA_GRO_JHS1027_SD_CAAAAG_S42_L003_R1_001_bg_2.peak,GRO,BMDM1hKLA1_GRO,BMDM1hKLA1
BMDMwt/GRO/f04_peaks/trim_CHBMDMwt_5GRO_JHS1033_SD_TAGCTT_S48_L003_R1_001_and_trim_CHBMDMwt_GRO_JHS1026_SD_ATTCCT_S41_L003_R1_001_bg_2.peak,GRO,BMDMwt_GRO,BMDMwt
Brain/GRO/f04_peaks/trim_CHBrain_5GRO_JHS1036_SD_ATGTCA_S51_L003_R1_001_and_trim_CHBrain_GRO_JHS1029_SD_CACCGG_S44_L003_R1_001_bg_2.peak,GRO,Brain_GRO,Brain
Brain/mSTART/f04_peaks/trim_CHBrain_mSTART_JHS1016_SD_GTAGAG_S28_L002_R1_001_and_trim_CHBrain_mSTARTinput_JHS1078_SD_ATTCCT_S100_L005_R1_001_bg_2.peak,mSTART,Brain_mSTART,Brain


## meta QC 
A. Sequencing reads
B. Number of reads mapped
C. Information entropy (score for duplication sequences)