----------------
## Params and loading packages

In [1]:
## Parameters specific to where your folders are and your data
parameter_file = 'params/params.yaml'
import yaml
import sys

with open(parameter_file,'r') as f:
    doc = yaml.load(f)

#p = dic2obj(**doc)

data_folder = doc['data_folder']
tissues = doc['tissues'].split(',')
sys.path.append(doc['pipeline_path'])
ref_fa = doc['ref_fa']
annotation=doc['annotation']



In [2]:

import os
import sys
import pandas as pd
import matplotlib
import seaborn as sns
import pickle
import numpy as np
import matplotlib as mpl
from matplotlib import pyplot as plt
from itertools import product
import glob
import re
from matplotlib_venn import venn2
from matplotlib import rcParams
import inspect
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sys.setrecursionlimit(3000)
%load_ext autoreload
%autoreload 2
rcParams['figure.figsize'] = 8, 6
import tqdm
### notebook specific configuration ###
from os.path import basename
#mpl.style.use('ggplot')
mpl.style.use('fivethirtyeight')
from cycler import cycler
mpl.rcParams['axes.prop_cycle'] = cycler(color='bgrcmyk')

from Modules.Homer import *

print('Number of tissues: ',len(tissues))

('Number of tissues: ', 16)


### Till here
-------------------


# fastq files

In [3]:
raw_files = np.loadtxt(os.path.join(data_folder,'raw_files.csv'),delimiter='\t',
           dtype='str')

raw_files

array(['CHBloondnegSpleen_GRO_JHS1032_SD_TCCCGA_S47_L003_R1_001.fastq (3).gz',
       'CHBMDM1hKLA1_ATAC_JHS998_SD_TGGGTTTC_S9_L001_R1_001.fastq.gz',
       'CHBMDM1hKLA2_ATAC_JHS1000_SD_AGGTTGGG_S11_L001_R1_001.fastq.gz',
       'CHBMDM1hKLA_GRO_JHS1027_SD_CAAAAG_S42_L003_R1_001.fastq.gz',
       'CHBMDM1hKLA_GRO_JHS1034_SD_AGTCAA_S49_L003_R1_001.fastq.gz',
       'CHBMDMVehic1_ATAC_JHS997_SD_GTGTGGTG_S8_L001_R1_001.fastq.gz',
       'CHBMDMVehic2_ATAC_JHS999_SD_TGGTCACA_S10_L001_R1_001.fastq.gz',
       'CHBMDMwt_GRO_JHS1026_SD_ATTCCT_S41_L003_R1_001.fastq.gz',
       'CHBMDMwt_GRO_JHS1033_SD_TAGCTT_S48_L003_R1_001.fastq.gz',
       'CHBrain7neg1_mSTART_JHS1061_SD_CACGAT_S83_L004_R1_001.fastq.gz',
       'CHBrain_ATAC_JHS992_SD_ACCACTGT_S4_L001_R1_001.fastq.gz',
       'CHBrain_GRO_JHS1029_SD_CACCGG_S44_L003_R1_001.fastq.gz',
       'CHBrain_GRO_JHS1036_SD_ATGTCA_S51_L003_R1_001.fastq.gz',
       'CHBrain_mSTARTinput_JHS1078_SD_ATTCCT_S100_L005_R1_001.fastq.gz',
       'CHBrain_mSTAR

## Keys for meta_sample saving (along with tissues)

In [4]:
types = ['GROCap','GRO','START','STARTinput','ATAC','TotalRNA']
types_dict = {'GROCap':['5GRO'],'GRO':['_GRO_','-GRO-'],
              'START':['_mSTART_'],
              'STARTinput':['mSTARTinput','mSTART_input'],
              'ATAC':['ATAC'],'TotalRNA':['NL_']}


name_exceptions = {'Spleen': ['BloondnegSpleen', 'SpleednegBlood']}

f_out = 'raw_metaSamples.tsv'

# Loop through and fill in the file naming

In [5]:
raw_meta_samples = pd.DataFrame(columns=['Name','Experiment','Tissue','Full'])


for tissue in tissues:
    for s in types:
        '''Get keywords for tissue and experiment type'''
        ## Tissue name is in tissues list and may have an exceptions list 
        if tissue in name_exceptions:
            name_tissue = name_exceptions[tissue]
            name_tissue.append(tissue)
        else:
            name_tissue = [tissue]
        ## types have a list of keywords
        s_keyword = types_dict[s]
             
        ## Collect all files with tissue tissue and type s 
        curr_f = []
        for n in name_tissue:
            for key in s_keyword:
                curr_f.extend(list(raw_files[map(lambda x: (key in x) & (n in x ),raw_files)]))
        curr_f = set(curr_f)
        
        ## Loop through and add it to dataframe
        count = 1
        for f in curr_f:
            name = tissue + '_' + s + str(count)
            raw_meta_samples = raw_meta_samples.append(pd.DataFrame({'Experiment':s,'Tissue':tissue,'Name':name,'Full':f},
                                                                index=[os.path.basename(f)]))
            count += 1

raw_meta_samples = raw_meta_samples.sort_values(['Tissue','Experiment','Name'])
raw_meta_samples = raw_meta_samples[['Tissue','Experiment','Name','Full']]

print('Number of samples: ', raw_meta_samples.shape[0])
print('Saving to filename ' + f_out)
raw_meta_samples.to_csv(f_out,sep='\t')
raw_meta_samples


('Number of samples: ', 64)
Saving to filename raw_metaSamples.tsv


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Tissue,Experiment,Name,Full
CHBMDM1hKLA2_ATAC_JHS1000_SD_AGGTTGGG_S11_L001_R1_001.fastq.gz,BMDM1hKLA,ATAC,BMDM1hKLA_ATAC1,CHBMDM1hKLA2_ATAC_JHS1000_SD_AGGTTGGG_S11_L001...
CHBMDM1hKLA1_ATAC_JHS998_SD_TGGGTTTC_S9_L001_R1_001.fastq.gz,BMDM1hKLA,ATAC,BMDM1hKLA_ATAC2,CHBMDM1hKLA1_ATAC_JHS998_SD_TGGGTTTC_S9_L001_R...
CHBMDM1hKLA_GRO_JHS1027_SD_CAAAAG_S42_L003_R1_001.fastq.gz,BMDM1hKLA,GRO,BMDM1hKLA_GRO1,CHBMDM1hKLA_GRO_JHS1027_SD_CAAAAG_S42_L003_R1_...
CHBMDM1hKLA_GRO_JHS1034_SD_AGTCAA_S49_L003_R1_001.fastq.gz,BMDM1hKLA,GRO,BMDM1hKLA_GRO2,CHBMDM1hKLA_GRO_JHS1034_SD_AGTCAA_S49_L003_R1_...
CHBMDMwt_GRO_JHS1033_SD_TAGCTT_S48_L003_R1_001.fastq.gz,BMDMwt,GRO,BMDMwt_GRO1,CHBMDMwt_GRO_JHS1033_SD_TAGCTT_S48_L003_R1_001...
CHBMDMwt_GRO_JHS1026_SD_ATTCCT_S41_L003_R1_001.fastq.gz,BMDMwt,GRO,BMDMwt_GRO2,CHBMDMwt_GRO_JHS1026_SD_ATTCCT_S41_L003_R1_001...
CHBrain_ATAC_JHS992_SD_ACCACTGT_S4_L001_R1_001.fastq.gz,Brain,ATAC,Brain_ATAC1,CHBrain_ATAC_JHS992_SD_ACCACTGT_S4_L001_R1_001...
CHBrain_GRO_JHS1029_SD_CACCGG_S44_L003_R1_001.fastq.gz,Brain,GRO,Brain_GRO1,CHBrain_GRO_JHS1029_SD_CACCGG_S44_L003_R1_001....
CHBrain_GRO_JHS1036_SD_ATGTCA_S51_L003_R1_001.fastq.gz,Brain,GRO,Brain_GRO2,CHBrain_GRO_JHS1036_SD_ATGTCA_S51_L003_R1_001....
CHBrain_mSTART_JHS1057_SD_ATTCCT_S79_L004_R1_001.fastq.gz,Brain,START,Brain_START1,CHBrain_mSTART_JHS1057_SD_ATTCCT_S79_L004_R1_0...


# Peaks meta_sample

In [3]:
meta_samples = pd.DataFrame(columns=['Experiment','Tissue','Short','Full','Short_with_number'])
for curr_tissue in tissues:
    curr_raw = glob.glob(data_folder + curr_tissue + '/*/')
    for j in curr_raw:
        curr_type = j.split('/')[-2]  
        if not curr_type == 'ATAC':
            count = 1
            for f in  glob.glob(j + '*f04_peaks/merge*bg*peak'):
                short = curr_tissue + '_' + curr_type
                short_count = short + str(count)
                meta_samples = meta_samples.append(pd.DataFrame({'Tissue':curr_tissue, 'Experiment':curr_type,
                                                                'Short': short,'Short_with_number': short_count,
                                                                'Full': f},
                                                                index=[f.split(data_folder)[-1]]))
            count += 1
meta_samples 

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Experiment,Full,Short,Short_with_number,Tissue
BMDM1hKLA/GRO/f04_peaks/merge_bg_2.peak,GRO,/data/isshamie/TSS/Processed/BMDM1hKLA/GRO/f04...,BMDM1hKLA_GRO,BMDM1hKLA_GRO1,BMDM1hKLA
BMDMwt/GRO/f04_peaks/merge_bg_2.peak,GRO,/data/isshamie/TSS/Processed/BMDMwt/GRO/f04_pe...,BMDMwt_GRO,BMDMwt_GRO1,BMDMwt
Brain/GRO/f04_peaks/merge_bg_2.peak,GRO,/data/isshamie/TSS/Processed/Brain/GRO/f04_pea...,Brain_GRO,Brain_GRO1,Brain
Brain/mSTART/f04_peaks/merge_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Brain/mSTART/f04_...,Brain_mSTART,Brain_mSTART1,Brain
Heart/mSTART/f04_peaks/merge_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Heart/mSTART/f04_...,Heart_mSTART,Heart_mSTART1,Heart
Kidney/GRO/f04_peaks/merge_bg_2.peak,GRO,/data/isshamie/TSS/Processed/Kidney/GRO/f04_pe...,Kidney_GRO,Kidney_GRO1,Kidney
Kidney/mSTART/f04_peaks/merge_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Kidney/mSTART/f04...,Kidney_mSTART,Kidney_mSTART1,Kidney
Liver/GRO/f04_peaks/merge_bg_2.peak,GRO,/data/isshamie/TSS/Processed/Liver/GRO/f04_pea...,Liver_GRO,Liver_GRO1,Liver
Liver/mSTART/f04_peaks/merge_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Liver/mSTART/f04_...,Liver_mSTART,Liver_mSTART1,Liver
Lung/GRO/f04_peaks/merge_bg_2.peak,GRO,/data/isshamie/TSS/Processed/Lung/GRO/f04_peak...,Lung_GRO,Lung_GRO1,Lung


In [4]:
meta_samples.to_csv('Results/meta_samples.csv')

## Long meta

In [5]:
long_meta_samples = pd.DataFrame(columns=['Experiment','Short','Tissue','Full'])
for tissue in tissues:
    curr_raw = glob.glob(data_folder + tissue + '/*/')
    for j in curr_raw:
        exp = j.split('/')[-2]  
        if not exp == 'ATAC':
            count = 1
            for f in  glob.glob(j + '*f04_peaks/trim*.peak'):
                print(f)
                short = tissue + '_' + exp
                long_meta_samples = long_meta_samples.append(pd.DataFrame({'Experiment':exp,'Tissue':tissue,'Short':short,'Full':f},
                                                                          index=[f.split(data_folder)[-1]]))


/data/isshamie/TSS/Processed/BMDM1hKLA/GRO/f04_peaks/trim_CHBMDM1hKLA_5GRO_JHS1034_SD_AGTCAA_S49_L003_R1_001.fastq_and_trim_CHBMDM1hKLA_GRO_JHS1027_SD_CAAAAG_S42_L003_R1_001.fastq_bg_2.peak
/data/isshamie/TSS/Processed/BMDMwt/GRO/f04_peaks/trim_CHBMDMwt_5GRO_JHS1033_SD_TAGCTT_S48_L003_R1_001.fastq_and_trim_CHBMDMwt_GRO_JHS1026_SD_ATTCCT_S41_L003_R1_001.fastq_bg_2.peak
/data/isshamie/TSS/Processed/Brain/GRO/f04_peaks/trim_CHBrain_5GRO_JHS1036_SD_ATGTCA_S51_L003_R1_001.fastq_and_trim_CHBrain_GRO_JHS1029_SD_CACCGG_S44_L003_R1_001.fastq_bg_2.peak
/data/isshamie/TSS/Processed/Brain/mSTART/f04_peaks/trim_CHBrain_mSTART_JHS1057_SD_ATTCCT_S79_L004_R1_001.fastq_and_trim_CHBrain_mSTARTinput_JHS1078_SD_ATTCCT_S100_L005_R1_001.fastq_bg_2.peak
/data/isshamie/TSS/Processed/Brain/mSTART/f04_peaks/trim_CHBrain_mSTART_JHS1016_SD_GTAGAG_S28_L002_R1_001.fastq_and_trim_CHBrain_mSTARTinput_JHS1078_SD_ATTCCT_S100_L005_R1_001.fastq_bg_2.peak
/data/isshamie/TSS/Processed/Heart/mSTART/f04_peaks/trim_CHHeart_mS

In [6]:
long_meta_samples

Unnamed: 0,Experiment,Full,Short,Tissue
BMDM1hKLA/GRO/f04_peaks/trim_CHBMDM1hKLA_5GRO_JHS1034_SD_AGTCAA_S49_L003_R1_001.fastq_and_trim_CHBMDM1hKLA_GRO_JHS1027_SD_CAAAAG_S42_L003_R1_001.fastq_bg_2.peak,GRO,/data/isshamie/TSS/Processed/BMDM1hKLA/GRO/f04...,BMDM1hKLA_GRO,BMDM1hKLA
BMDMwt/GRO/f04_peaks/trim_CHBMDMwt_5GRO_JHS1033_SD_TAGCTT_S48_L003_R1_001.fastq_and_trim_CHBMDMwt_GRO_JHS1026_SD_ATTCCT_S41_L003_R1_001.fastq_bg_2.peak,GRO,/data/isshamie/TSS/Processed/BMDMwt/GRO/f04_pe...,BMDMwt_GRO,BMDMwt
Brain/GRO/f04_peaks/trim_CHBrain_5GRO_JHS1036_SD_ATGTCA_S51_L003_R1_001.fastq_and_trim_CHBrain_GRO_JHS1029_SD_CACCGG_S44_L003_R1_001.fastq_bg_2.peak,GRO,/data/isshamie/TSS/Processed/Brain/GRO/f04_pea...,Brain_GRO,Brain
Brain/mSTART/f04_peaks/trim_CHBrain_mSTART_JHS1057_SD_ATTCCT_S79_L004_R1_001.fastq_and_trim_CHBrain_mSTARTinput_JHS1078_SD_ATTCCT_S100_L005_R1_001.fastq_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Brain/mSTART/f04_...,Brain_mSTART,Brain
Brain/mSTART/f04_peaks/trim_CHBrain_mSTART_JHS1016_SD_GTAGAG_S28_L002_R1_001.fastq_and_trim_CHBrain_mSTARTinput_JHS1078_SD_ATTCCT_S100_L005_R1_001.fastq_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Brain/mSTART/f04_...,Brain_mSTART,Brain
Heart/mSTART/f04_peaks/trim_CHHeart_mSTART_JHS1056_SD_ATGAGC_S78_L004_R1_001.fastq_and_trim_CHHeart_mSTARTinput_JHS1077_SD_ATGAGC_S99_L005_R1_001.fastq_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Heart/mSTART/f04_...,Heart_mSTART,Heart
Heart/mSTART/f04_peaks/trim_CHHeart_mSTART_JHS1013_SD_AGTTCC_S25_L002_R1_001.fastq_and_trim_CHHeart_mSTARTinput_JHS1077_SD_ATGAGC_S99_L005_R1_001.fastq_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Heart/mSTART/f04_...,Heart_mSTART,Heart
Kidney/GRO/f04_peaks/trim_CHKidney_5GRO_JHS1035_SD_AGTTCC_S50_L003_R1_001.fastq_and_trim_CHKidney_GRO_JHS1028_SD_CAACTA_S43_L003_R1_001.fastq_bg_2.peak,GRO,/data/isshamie/TSS/Processed/Kidney/GRO/f04_pe...,Kidney_GRO,Kidney
Kidney/mSTART/f04_peaks/trim_CHKidney_mSTART_JHS1022_SD_GAGTGG_S34_L002_R1_001.fastq_and_trim_CHKidney_mSTARTinput_JHS1080_SD_CAACTA_S102_L005_R1_001.fastq_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Kidney/mSTART/f04...,Kidney_mSTART,Kidney
Kidney/mSTART/f04_peaks/trim_CHKidney_mSTART_JHS1059_SD_CAACTA_S81_L004_R1_001.fastq_and_trim_CHKidney_mSTARTinput_JHS1080_SD_CAACTA_S102_L005_R1_001.fastq_bg_2.peak,mSTART,/data/isshamie/TSS/Processed/Kidney/mSTART/f04...,Kidney_mSTART,Kidney


In [7]:
long_meta_samples.to_csv('Results/long_meta_samples.csv')

## meta QC 
A. Sequencing reads
B. Number of reads mapped
C. Information entropy (score for duplication sequences)