In [32]:
import os
import pandas as pd
import sys
sys.path.insert(0, '../..')
import numpy as np
import itertools

from JKBio import Helper as h

from bokeh.plotting import *
from IPython.display import IFrame
import igv

from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture

output_notebook()
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Processing ChipSeq for Cobinding

In [7]:
version="v2"
project="cobinding"

## finding all the relevant files for chipseq processing

In [4]:
a = !gsutil ls gs://amlproject/Chip/fastqs

In [5]:
a

['gs://amlproject/Chip/fastqs/mp100-U937-INPUT-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp101-NOMO1-INPUT-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp102-UT7-INPUT-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp106-MV411-MYB-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp109-M6-CEBPA-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp112-MV411-CEBPA-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp115-MV411-PU1-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp116-MV411-MYB-r2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp117-MV411-POLII-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp118-MV411-SP1-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp120-HL60-INPUT-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp121-UCSDAML1-INPUT-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp122-M6-CEBPA-r2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp123-M6-RUNX1-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp124-M6-MYB-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp125-M6-ETV6-r1.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp127

## looking for duplicates

In [None]:
import re
mps=[]
for i in a:
    res = re.search('mp(\d+)', i)
    if res is not None:
        mps.append(res.group(0))

In [None]:
mps.sort()

In [None]:
mps

In [None]:
duplicates=[ 'gs://amlproject/Chip/fastqs/Sample_mp582-MV411-MEIS1-r1.fastq.gz',
'gs://amlproject/Chip/fastqs/Sample_mp587-MV411-IKZF1-r1.fastq.gz',
,
'gs://amlproject/Chip/fastqs/mp88-MV411-H3K27ac-r2.fastq.gz']

In [None]:
for val in duplicates:
    os.system('gsutil rm '+val)

In [None]:
paired_end_tomove= ['gs://amlproject/Chip/fastqs/mp304-D9-PU1-r2_S1_R2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp305-SKNO1-PU1-r1_S2_R2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp320-MV411-ZEB2-r1_R2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp319-MV411-FLI1-r1_R2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp308-SKNO1-RUNX1-RUNX1T1-r1_S5_R2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp300-MV411-RUNX1-r3_R2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp321-MV411-GFI1-r4_R2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp324-MV411-MEF2D-r1_R2.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp299-MV411-CEBPA-r3_S7_R2.fastq.gz'
 'gs://amlproject/Chip/fastqs/mp325-MV411-SP1-r2_R2.fastq.gz']

In [None]:
for val in paired_end_tomove:
    os.system('gsutil mv '+val+' gs://amlproject/Chip/fastqs/paired_end/')

## renamings

In [None]:
# renaming files
rename1 = {
'gs://amlproject/Chip/fastqs/20180709_1_MP5773_S1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp631.fastq.gz',
 'gs://amlproject/Chip/fastqs/20180709_3_MP5773_S3.fastq.gz': 'gs://amlproject/Chip/fastqs/mp633.fastq.gz',
 'gs://amlproject/Chip/fastqs/20180709_5_MP5773_S5.fastq.gz': 'gs://amlproject/Chip/fastqs/mp635.fastq.gz',
 'gs://amlproject/Chip/fastqs/20180709_6_MP5773_S6.fastq.gz': 'gs://amlproject/Chip/fastqs/mp636.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190718_MP-11_MP7213-2of2_S4_R1_001.fastq.gz': 'gs://amlproject/Chip/fastqs/mp748.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190718_MP-13_MP7213-2of2_S5_R1_001.fastq.gz': 'gs://amlproject/Chip/fastqs/mp750.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190718_MP-14_MP7213-2of2_S6_R1_001.fastq.gz': 'gs://amlproject/Chip/fastqs/mp751.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190718_MP-2_MP7213-1of2_S2_R1_001.fastq.gz': 'gs://amlproject/Chip/fastqs/mp739.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190718_MP-4_MP7213-1of2_S4_R1_001.fastq.gz': 'gs://amlproject/Chip/fastqs/mp741.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190718_MP-5_MP7213-1of2_S5_R1_001.fastq.gz': 'gs://amlproject/Chip/fastqs/mp742.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190718_MP-6_MP7213-1of2_S6_R1_001.fastq.gz': 'gs://amlproject/Chip/fastqs/mp743.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190718_MP-8_MP7213-2of2_S1_R1_001.fastq.gz': 'gs://amlproject/Chip/fastqs/mp745.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190826_10_MB7326-2of2_S5.fastq.gz': 'gs://amlproject/Chip/fastqs/mp761.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190826_11_MB7326-2of2_S6.fastq.gz': 'gs://amlproject/Chip/fastqs/mp762.fastq.gz',
 'gs://amlproject/Chip/fastqs/20190826_9_MB7326-2of2_S4.fastq.gz': 'gs://amlproject/Chip/fastqs/mp760.fastq.gz',
 'gs://amlproject/Chip/fastqs/D0-INPUT.fastq.gz': 'gs://amlproject/Chip/fastqs/mp8.fastq.gz',
 'gs://amlproject/Chip/fastqs/D9-INPUT.fastq.gz': 'gs://amlproject/Chip/fastqs/mp14.fastq.gz',
 'gs://amlproject/Chip/fastqs/D9_CEBPA_m.fastq.gz': 'gs://amlproject/Chip/fastqs/mp27.fastq.gz',
 'gs://amlproject/Chip/fastqs/D9_INPUT.fastq.gz': 'gs://amlproject/Chip/fastqs/mp35.fastq.gz',
 'gs://amlproject/Chip/fastqs/MP29.fastq.gz': 'gs://amlproject/Chip/fastqs/mp4.fastq.gz',
 'gs://amlproject/Chip/fastqs/MP30.fastq.gz': 'gs://amlproject/Chip/fastqs/mp5.fastq.gz',
 'gs://amlproject/Chip/fastqs/MP31.fastq.gz': 'gs://amlproject/Chip/fastqs/mp6.fastq.gz',
 'gs://amlproject/Chip/fastqs/OCI_AML2-INPUT.fastq.gz': 'gs://amlproject/Chip/fastqs/mp24.fastq.gz',
 'gs://amlproject/Chip/fastqs/OCI_AML2_CEBPA_m.fastq.gz': 'gs://amlproject/Chip/fastqs/mp41.fastq.gz',
 'gs://amlproject/Chip/fastqs/OCI_AML2_INPUT.fastq.gz': 'gs://amlproject/Chip/fastqs/mp33.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_10-D8-SP1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp156.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_12-MV411-SPI1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp115.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_15-MV411-SP1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp118.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_18-D8-MED1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp160.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_19-M6-CEBPA.fastq.gz': 'gs://amlproject/Chip/fastqs/mp122.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_20-M6-RUNX1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp123.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_21-M6-MYB.fastq.gz': 'gs://amlproject/Chip/fastqs/mp124.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_22-M6-ETV6.fastq.gz': 'gs://amlproject/Chip/fastqs/mp125.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_24-M6-SPI1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp171.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_25-M6-SP1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp127.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_26-M6-MED1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp128.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_27-MV411-RUNX1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp129.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_27-OCIAML2-SPI1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp168.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_31.M6.INPUT.fastq.gz': 'gs://amlproject/Chip/fastqs/mp98.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_41-M6-IGG.fastq.gz': 'gs://amlproject/Chip/fastqs/mp143.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_6-M6-CEBPA.fastq.gz': 'gs://amlproject/Chip/fastqs/mp109.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_7-D8-CEBPA.fastq.gz': 'gs://amlproject/Chip/fastqs/mp153.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_8-D8-SPI1.fastq.gz': 'gs://amlproject/Chip/fastqs/mp154.fastq.gz',
 'gs://amlproject/Chip/fastqs/Sample_9-D8-MYB.fastq.gz': 'gs://amlproject/Chip/fastqs/mp155.fastq.gz'
}

In [None]:
for k, val in rename1.items():
    os.system('gsutil mv '+k+' '+val)

In [None]:
## IT SHOULD NOT CAUSE ANY PROBLEM TO CP R2 SINCE IT DOES NOT EXIST 
#(will throw an error and skip)
import re
import time
mps=[]
for i in a:
    res = re.search('mp(\d+)', i)
    if res is not None:
        print('gsutil mv '+i+' gs://amlproject/Chip/fastqs/'+res.group(0)+'.fastq.gz')
        time.sleep(2)
        os.system('gsutil mv '+i+' gs://amlproject/Chip/fastqs/'+res.group(0)+'.fastq.gz')

In [None]:
!gsutil ls gs://amlproject/Chip/fastqs/

In [None]:
# EXCEPT these two... TO RETRIEVE
gs://amlproject/Chip/fastqs/mp300-MV411-RUNX1-r3_R2.fastq.gz
gs://amlproject/Chip/fastqs/mp300-MV411-RUNX1-r3_R1.fastq.gz

In [None]:
stored = !gsutil ls -a gs://amlproject/Chip/fastqs/mp300-MV411-RUNX1-r3_R2.fastq.gz
stored=stored[0]
! gsutil cp $stored gs://amlproject/Chip/fastqs/mp300-MV411-RUNX1-r3_R2.fastq.gz

In [None]:
a

In [None]:
#continuing
a[85]

In [None]:
import re
import time
mps=[]
for i in a[86:]:
    res = re.search('mp(\d+)', i)
    if res is not None:
        print('gsutil mv '+i+' gs://amlproject/Chip/fastqs/'+res.group(0)+'.fastq.gz')
        time.sleep(2)
        os.system('gsutil mv '+i+' gs://amlproject/Chip/fastqs/'+res.group(0)+'.fastq.gz')

## adding metadata and creating the design file

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')

In [None]:
url="https://docs.google.com/spreadsheets/d/14onj_lD6WrjSEsE5dWJid6lGdJ81O6qpcetP5_Z0FxQ"
gsheets = sheets.get(url).sheets[1].to_frame()

In [None]:
gsheets

In [None]:
gsheets = gsheets.set_index('Unique sample ID')

In [None]:
a = !gsutil -m ls gs://amlproject/Chip/fastqs/
a

In [None]:
data = pd.DataFrame({'fastq_1':a[:-1]}, index=[i.split('/')[-1].split('.')[0] for i in a[:-1]])

In [None]:
data[['quality','Protein','Replicate No','Cell line ID']] = gsheets[
    ['quality','Protein','Replicate No','Cell line ID']]

In [None]:
data['fastq_2']= np.NaN

In [None]:
data = data.rename(columns={'Protein': 'antibody','Cell line ID': 'group','Replicate No': 'replicate'})

In [None]:
data.loc[data[data.antibody.isin(['INPUT','IGG'])].index,'group'] = \
data[data.antibody.isin(['INPUT','IGG'])].antibody +'_'+\
data[data.antibody.isin(['INPUT','IGG'])].group

In [None]:
data.loc[data[~data.antibody.isin(['INPUT','IGG'])].index,'control'] = \
'INPUT_'+\
data[~data.antibody.isin(['INPUT','IGG'])].group

In [None]:
data

In [None]:
set(data.control)-set(data.group)

In [None]:
withoutinput = ["HEL",
                "SHI1",
                "EOL1",
                "F36P"]

In [None]:
# requesting them and downloading them...

In [None]:
rename = {'gs://amlproject/Chip/fastqs/1_MP5835-1of4_S1_R1_001.fastq.gz':"gs://amlproject/Chip/fastqs/mp643.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_17-HL60-INPUT.fastq.gz':"gs://amlproject/Chip/fastqs/mp120.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_33.U937.INPUT.fastq.gz':"gs://amlproject/Chip/fastqs/mp100.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_34.NOMO1.INPUT.fastq.gz':"gs://amlproject/Chip/fastqs/mp101.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_35.UT7.INPUT.fastq.gz':"gs://amlproject/Chip/fastqs/mp102.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_mp309-KG1-INPUT-r1.fastq.gz':"gs://amlproject/Chip/fastqs/mp309.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_mp310-Kasumi1-INPUT-r1.fastq.gz':"gs://amlproject/Chip/fastqs/mp310.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_mp311-MOLM13-INPUT-r1.fastq.gz':"gs://amlproject/Chip/fastqs/mp311.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_mp312-NB4-INPUT-r1.fastq.gz':"gs://amlproject/Chip/fastqs/mp312.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_mp313-TF1-INPUT-r1.fastq.gz':"gs://amlproject/Chip/fastqs/mp313.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_mp314-OCIAML3-INPUT-r1.fastq.gz':"gs://amlproject/Chip/fastqs/mp314.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_mp315-MONOMAC3-INPUT-r1.fastq.gz':"gs://amlproject/Chip/fastqs/mp315.fastq.gz",
 'gs://amlproject/Chip/fastqs/Sample_mp326-DFAM71927V3-INPUT-r1_R1.fastq.gz':"gs://amlproject/Chip/fastqs/mp326.fastq.gz",
 'gs://amlproject/Chip/fastqs/mp609-MONOMAC1-INPUT-r1_S4.fastq.gz': 'gs://amlproject/Chip/fastqs/mp609.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp610-HEL9217-INPUT-r1_S5.fastq.gz': 'gs://amlproject/Chip/fastqs/mp610.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp611-P31FUJ-INPUT-r1_S6.fastq.gz': 'gs://amlproject/Chip/fastqs/mp611.fastq.gz',
 'gs://amlproject/Chip/fastqs/mp612-PLB985-INPUT-r1_S7.fastq.gz': 'gs://amlproject/Chip/fastqs/mp612.fastq.gz',
          
 'gs://amlproject/Chip/fastqs/1_MP5835-1of4_S1_R2_001.fastq.gz':"gs://amlproject/Chip/fastqs/paired_end/mp643-1_MP5835-1of4_S1_R2_001.fastq.gz",          
'gs://amlproject/Chip/fastqs/Sample_mp326-DFAM71927V3-INPUT-r1_R2.fastq.gz':"gs://amlproject/Chip/fastqs/paired_end/mp326-DFAM71927V3-INPUT-r1_R2.fastq.gz"}

In [None]:
for k,v in rename.items():
    ! gsutil mv $k $v

In [None]:
data = data.drop(data[data.group.isin(withoutinput)].index)

In [None]:
data.loc[data[~data.antibody.isin(['INPUT','IGG'])].index,'group'] = \
data[~data.antibody.isin(['INPUT','IGG'])].group + '_' + \
data[~data.antibody.isin(['INPUT','IGG'])].replicate.astype(str)

In [None]:
data[data.antibody.isin(['INPUT','IGG'])].antibody = np.nan

In [None]:
data['replicate']=1

In [None]:
data #things will be mergfed

In [None]:
data[data.columns[[4,3,0,5,2,6]]].to_csv('../nextflow/design_cobinding_'+version+'.csv', index=False)

In [None]:
! sudo ../nextflow run nf-core/chipseq --paired_end --seq_center 'DFCI' --email 'jkobject@gmail.com' \
--input ../AMLproject/nextflow/additional_degraded_v1_design.csv \
--genome GRCh38 --skip_preseq \ 
--max_cpus 16 -profile docker \
-w work \ #where you want your cached files to be stored
-resume exotic_bartik #when the job failed or was stopped. you can launch it back from a previous version

In [None]:
gsheets[gsheets['Cell line ID'].isin([
    "MOLM13",
"MONOMAC1",
"MONOMAC6",
"NB4",
"UT7",
"KG1",
"U937",
"P31FUJ",
"HL60",
"PLB985",
"OCIAML3",
"HEL9217",
"Kasumi1",
"NOMO1",
"TF1",
"SHI1",
"DFAM71927V3",
"EOL1",
"F36P"
])][gsheets.Protein=='INPUT'].index

## Additional samples

In [None]:
! gsutil mv gs://transfer-amlproject/200827_MP8178_fastq/* gs://transfer-amlproject/Cobinding_additional/

In [27]:
! mkdir ../../data/fastqs/

In [16]:
! gsutil -m cp gs://transfer-amlproject/Cobinding_additional/*.fastq.gz ../../data/fastqs/

Copying gs://transfer-amlproject/Cobinding_additional/20200827_FRA2_MP8178_S55_R1_001.fastq.gz...
Copying gs://transfer-amlproject/Cobinding_additional/20200827_HEX_MP8178_S54_R1_001.fastq.gz...
Copying gs://transfer-amlproject/Cobinding_additional/20200827_PLAGL2_MP8178_S53_R2_001.fastq.gz...
Copying gs://transfer-amlproject/Cobinding_additional/20200827_FRA2_MP8178_S55_R2_001.fastq.gz...
Copying gs://transfer-amlproject/Cobinding_additional/20200827_RARA_MP8178_S56_R1_001.fastq.gz...
Copying gs://transfer-amlproject/Cobinding_additional/20200827_PLAGL2_MP8178_S53_R1_001.fastq.gz...
Copying gs://transfer-amlproject/Cobinding_additional/20200827_HEX_MP8178_S54_R2_001.fastq.gz...
Copying gs://transfer-amlproject/Cobinding_additional/20200827_RARA_MP8178_S56_R2_001.fastq.gz...
Copying gs://transfer-amlproject/Cobinding_additional/20200827_ZFP281_MP8178_S52_R1_001.fastq.gz...
Copying gs://transfer-amlproject/Cobinding_additional/20200827_ZFP281_MP8178_S52_R2_001.fastq.gz...
- [10/10 files

In [20]:
f = ! ls ../../data/fastqs/*
f

['../../data/fastqs/20200827_FRA2_MP8178_S55_R1_001.fastq.gz',
 '../../data/fastqs/20200827_FRA2_MP8178_S55_R2_001.fastq.gz',
 '../../data/fastqs/20200827_HEX_MP8178_S54_R1_001.fastq.gz',
 '../../data/fastqs/20200827_HEX_MP8178_S54_R2_001.fastq.gz',
 '../../data/fastqs/20200827_PLAGL2_MP8178_S53_R1_001.fastq.gz',
 '../../data/fastqs/20200827_PLAGL2_MP8178_S53_R2_001.fastq.gz',
 '../../data/fastqs/20200827_RARA_MP8178_S56_R1_001.fastq.gz',
 '../../data/fastqs/20200827_RARA_MP8178_S56_R2_001.fastq.gz',
 '../../data/fastqs/20200827_ZFP281_MP8178_S52_R1_001.fastq.gz',
 '../../data/fastqs/20200827_ZFP281_MP8178_S52_R2_001.fastq.gz']

In [18]:
rename2 = {
"20200827_ZFP281_MP8178_S52": "mp876-MV411-ZFP281-r1",
"20200827_PLAGL2_MP8178_S53": "mp877-MV411-PLAGL2-r1",
"20200827_HEX_MP8178_S54": "mp878-MV411-HEX-r1",
"20200827_FRA2_MP8178_S55": "mp879-MV411-FOSL2-r1",
"20200827_RARA_MP8178_S56": "mp880-MV411-RARA-r1",
}

In [23]:
for val in f:
    ren = val
    for k, v in rename2.items():
        ren = ren.replace(k,v)
    ! mv $val $ren

In [39]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [40]:
gsheet

Unnamed: 0,id,cell line,replicate,protein,quality,paired_end,matching input name,processed,name,previous name,...,ratio to droso,unique mapped reads(droso),scaling factor,Total QC,folderNarrow,folderCompensated,folderQC,folderBroad,folder Bigwig,folder diffPeaks
0,mp100,U937,1,INPUT,,n,,Y,mp100-U937-INPUT-r1,,...,,,,https://storage.cloud.google.com/amlproject/Ch...,https://console.cloud.google.com/storage/brows...,,https://console.cloud.google.com/storage/brows...,https://console.cloud.google.com/storage/brows...,https://console.cloud.google.com/storage/brows...,
1,mp101,NOMO1,1,INPUT,,n,,Y,mp101-NOMO1-INPUT-r1,,...,,,,,,,,,,
2,mp102,UT7,1,INPUT,,n,,Y,mp102-UT7-INPUT-r1,,...,,,,,,,,,,
3,mp106,MV411,1,MYB,x,n,INPUT_MV411,Y,mp106-MV411-MYB-r1,,...,,,,,,,,,,
4,mp109,M6,1,CEBPA,x,n,INPUT_M6,Y,mp109-M6-CEBPA-r1,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,mp876,MV411,1,ZFP281,,,INPUT_MV412,,mp876-MV411-ZFP281-r1,20200827_ZFP281_MP8178_S52,...,,,,,,,,,,
305,mp877,MV411,1,PLAGL2,,,INPUT_MV413,,mp877-MV411-PLAGL2-r1,20200827_PLAGL2_MP8178_S53,...,,,,,,,,,,
306,mp878,MV411,1,HEX,,,INPUT_MV414,,mp878-MV411-HEX-r1,20200827_HEX_MP8178_S54,...,,,,,,,,,,
307,mp879,MV411,1,FOSL2,,,INPUT_MV415,,mp879-MV411-FOSL2-r1,20200827_FRA2_MP8178_S55,...,,,,,,,,,,


In [41]:
a = ! ls ../../data/fastqs/*

In [42]:
a

['../../data/fastqs/mp876-MV411-ZFP281-r1_R1_001.fastq.gz',
 '../../data/fastqs/mp876-MV411-ZFP281-r1_R2_001.fastq.gz',
 '../../data/fastqs/mp877-MV411-PLAGL2-r1_R1_001.fastq.gz',
 '../../data/fastqs/mp877-MV411-PLAGL2-r1_R2_001.fastq.gz',
 '../../data/fastqs/mp878-MV411-HEX-r1_R1_001.fastq.gz',
 '../../data/fastqs/mp878-MV411-HEX-r1_R2_001.fastq.gz',
 '../../data/fastqs/mp879-MV411-FOSL2-r1_R1_001.fastq.gz',
 '../../data/fastqs/mp879-MV411-FOSL2-r1_R2_001.fastq.gz',
 '../../data/fastqs/mp880-MV411-RARA-r1_R1_001.fastq.gz',
 '../../data/fastqs/mp880-MV411-RARA-r1_R2_001.fastq.gz']

In [45]:
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
for val in h.grouped(a,2):
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('-')[0]]
    df['group'].append("MV4"+row['name'].values[0].split('-r')[0].split('-MV4')[1])
    df['replicate'].append(row['name'].values[0].split('-r')[1])
    df['fastq_1'].append("fastqs/"+val[0])
    df['fastq_2'].append("fastqs/"+val[1])
    df['antibody'].append(row['protein'].values[0])
    df['control'].append("INPUT")
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append('ref/mp845-MV411-INPUT-r2_R1.fastq.gz')
df['fastq_2'].append('ref/mp845-MV411-INPUT-r2_R2.fastq.gz')
df['antibody'].append("")
df['control'].append("")
df = pd.DataFrame(df)

In [62]:
df

Unnamed: 0,fastq_1,fastq_2,antibody,group,replicate,control
0,fastqs/../../data/fastqs/mp876-MV411-ZFP281-r1...,fastqs/../../data/fastqs/mp876-MV411-ZFP281-r1...,ZFP281,MV411-ZFP281,1,INPUT
1,fastqs/../../data/fastqs/mp877-MV411-PLAGL2-r1...,fastqs/../../data/fastqs/mp877-MV411-PLAGL2-r1...,PLAGL2,MV411-PLAGL2,1,INPUT
2,fastqs/../../data/fastqs/mp878-MV411-HEX-r1_R1...,fastqs/../../data/fastqs/mp878-MV411-HEX-r1_R2...,HEX,MV411-HEX,1,INPUT
3,fastqs/../../data/fastqs/mp879-MV411-FOSL2-r1_...,fastqs/../../data/fastqs/mp879-MV411-FOSL2-r1_...,FOSL2,MV411-FOSL2,1,INPUT
4,fastqs/../../data/fastqs/mp880-MV411-RARA-r1_R...,fastqs/../../data/fastqs/mp880-MV411-RARA-r1_R...,RARA,MV411-RARA,1,INPUT
5,ref/mp845-MV411-INPUT-r2_R1.fastq.gz,ref/mp845-MV411-INPUT-r2_R2.fastq.gz,,INPUT,1,


In [47]:
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/additional_cobinding_v1_design.csv',index=False)

In [49]:
! cd ../../data/ && sudo ../nextflow log ## to get access to the previous runs

TIMESTAMP          	DURATION      	RUN NAME              	STATUS	REVISION ID	SESSION ID                          	COMMAND                                                                                                                                                                                                                                                          
2020-02-10 22:39:18	8m 25s        	stupefied_crick       	ERR   	21be314954 	76ea5df0-153c-4e71-a59d-52c6112fda84	nextflow run nf-core/chipseq --paired_end --seq_center DFCI --email jkobject@gmail.com --narrow_peak --input design.csv --genome GRCh38 --skip_preseq --max_cpus 24 -profile docker -w work                                                                      
2020-02-10 22:40:37	1m 6s         	ridiculous_hilbert    	ERR   	21be314954 	75004903-035d-4504-ab80-cab74b5acac4	nextflow run nf-core/chipseq --paired_end --seq_center DFCI --email jkobject@gmail.com --narrow_peak --input design.csv --genome GRCh38 --sk

In [None]:
#process chips
! cd ../../data/ && sudo ../nextflow run nf-core/chipseq --paired_end --seq_center 'DFCI' --email 'jkobject@gmail.com' --input ../AMLproject/nextflow/additional_cobinding_v1_design.csv --genome GRCh38 --skip_preseq --max_cpus 16 -profile docker -w work -resume modest_mendel

In [50]:
!gsutil -m cp -r ../../data/results/* gs://amlproject/Chip/$project/ && sudo rm -r ..data/work

Copying file://../../data/results/Documentation/results_description.html [Content-Type=text/html]...
Copying file://../../data/results/bwa/mergedLibrary/MV411-ZFP281_R1.mLb.clN.sorted.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/MV411-PLAGL2_R1.mLb.clN.sorted.bam [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/MV411-FOSL2_R1.mLb.clN.sorted.bam.bai [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/MV411-ZFP281_R1.mLb.clN.sorted.bam [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam.bai [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configurat

Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/MV411-HEX_R1.mLb.clN.CollectMultipleMetrics.quality_distribution_metrics [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/MV411-RARA_R1.mLb.clN.CollectMultipleMetrics.base_distribution_by_cycle_metrics [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/MV411-RARA_R1.mLb.clN.CollectMultipleMetrics.insert_size_metrics [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/MV411-HEX_R1.mLb.clN.CollectMultipleMetrics.base_distribution_by_cycle_metrics [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/INPUT_R1.mLb.clN.CollectMultipleMetrics.alignment_summary_metrics [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/INPUT_R1.mLb.clN.Col

Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/pdf/MV411-HEX_R1.mLb.clN.CollectMultipleMetrics.quality_distribution.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/pdf/MV411-RARA_R1.mLb.clN.CollectMultipleMetrics.quality_distribution.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/pdf/MV411-ZFP281_R1.mLb.clN.CollectMultipleMetrics.quality_distribution.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/pdf/MV411-FOSL2_R1.mLb.clN.CollectMultipleMetrics.insert_size_histogram.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/pdf/MV411-RARA_R1.mLb.clN.CollectMultipleMetrics.insert_size_histogram.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/bwa/mergedLibrary/picard_metrics/pdf/MV411-HEX_R1.mLb.clN.CollectMultipleMetrics.base_d

Copying file://../../data/results/bwa/mergedLibrary/bigwig/scale/MV411-FOSL2_R1.mLb.clN.scale_factor.txt [Content-Type=text/plain]...
Copying file://../../data/results/bwa/mergedLibrary/phantompeakqualtools/INPUT_R1.spp.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/bwa/mergedLibrary/phantompeakqualtools/MV411-ZFP281_R1_spp_nsc_mqc.tsv [Content-Type=text/tab-separated-values]...
Copying file://../../data/results/bwa/mergedLibrary/phantompeakqualtools/MV411-HEX_R1.spp.out [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/bigwig/scale/MV411-RARA_R1.mLb.clN.scale_factor.txt [Content-Type=text/plain]...
Copying file://../../data/results/bwa/mergedLibrary/phantompeakqualtools/MV411-PLAGL2_R1.spp.out [Content-Type=application/octet-stream]...
Copying file://../../data/results/bwa/mergedLibrary/bigwig/scale/MV411-HEX_R1.mLb.clN.scale_factor.txt [Content-Type=text/plain]...
Copying file://../../data/results/bwa/mergedLibrary

Copying file://../../data/results/fastqc/MV411-FOSL2_R1_T1_1_fastqc.html [Content-Type=text/html]...
Copying file://../../data/results/fastqc/INPUT_R1_T1_1_fastqc.html [Content-Type=text/html]...
Copying file://../../data/results/fastqc/MV411-FOSL2_R1_T1_2_fastqc.html [Content-Type=text/html]...
Copying file://../../data/results/fastqc/MV411-ZFP281_R1_T1_1_fastqc.html [Content-Type=text/html]...
Copying file://../../data/results/fastqc/zips/INPUT_R1_T1_2_fastqc.zip [Content-Type=application/zip]...
Copying file://../../data/results/fastqc/MV411-PLAGL2_R1_T1_1_fastqc.html [Content-Type=text/html]...
Copying file://../../data/results/fastqc/zips/MV411-ZFP281_R1_T1_1_fastqc.zip [Content-Type=application/zip]...
Copying file://../../data/results/fastqc/MV411-RARA_R1_T1_2_fastqc.html [Content-Type=text/html]...
Copying file://../../data/results/fastqc/MV411-HEX_R1_T1_2_fastqc.html [Content-Type=text/html]...
Copying file://../../data/results/fastqc/MV411-HEX_R1_T1_1_fastqc.html [Content-Typ

Copying file://../../data/results/multiqc/broadPeak/multiqc_data/mqc_samtools-idxstats-mapped-reads-plot_Counts.txt [Content-Type=text/plain]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_data/mqc_picard_insert_size_Percentages.txt [Content-Type=text/plain]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_data/mqc_picard_base_distribution_by_cycle__Cytosine.txt [Content-Type=text/plain]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_data/mqc_mqc_mplplot_fuxbrtycvz_1.txt [Content-Type=text/plain]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_data/mqc_fastqc_per_sequence_quality_scores_plot-2_1.txt [Content-Type=text/plain]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_data/multiqc_samtools_idxstats.txt [Content-Type=text/plain]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_data/mqc_picard_deduplication_1.txt [Content-Type=text/plain]...
Copying file://../../data/results/multiqc/broadPeak/mu

Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/svg/mqc_samtools-idxstats-mapped-reads-plot-2_Counts.svg [Content-Type=image/svg+xml]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/svg/mqc_samtools-idxstats-mapped-reads-plot-3_Normalised_Counts.svg [Content-Type=image/svg+xml]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/svg/mqc_fastqc_per_sequence_gc_content_plot-2_Percentages.svg [Content-Type=image/svg+xml]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/svg/mqc_fastqc_sequence_duplication_levels_plot-2_1.svg [Content-Type=image/svg+xml]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/svg/mqc_samtools-idxstats-mapped-reads-plot-3_Counts.svg [Content-Type=image/svg+xml]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/svg/mqc_fastqc_per_sequence_quality_scores_plot_1.svg [Content-Type=image/svg+xml]...
Copying file://../../data/results/multiqc/broadPeak/multiq

Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/pdf/mqc_fastqc_sequence_counts_plot_1_pc.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/pdf/mqc_fastqc_per_sequence_quality_scores_plot-2_1.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/pdf/mqc_picard_base_distribution_by_cycle__Undetermined.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/pdf/mqc_deeptools_fingerprint_plot_1.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/pdf/mqc_samtools-idxstats-mapped-reads-plot_Counts.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/pdf/mqc_fastqc_adapter_content_plot_1.pdf [Content-Type=application/pdf]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/pdf/mqc_samtools_alignment_plot-3_1

Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/png/mqc_fastqc_sequence_counts_plot_1.png [Content-Type=image/png]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/png/mqc_samtools-idxstats-mapped-reads-plot-2_Normalised_Counts.png [Content-Type=image/png]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/png/mqc_samtools_alignment_plot-3_1_pc.png [Content-Type=image/png]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/png/mqc_fastqc_sequence_counts_plot-2_1.png [Content-Type=image/png]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/png/mqc_picard_aligned_reads_1.png [Content-Type=image/png]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/png/mqc_samtools-idxstats-mapped-reads-plot-3_Counts.png [Content-Type=image/png]...
Copying file://../../data/results/multiqc/broadPeak/multiqc_plots/png/mqc_samtools-idxstats-mapped-reads-plot-2_Counts.png [Content-Type=image/png]

\ [562/562 files][ 28.0 GiB/ 28.0 GiB] 100% Done  99.1 MiB/s ETA 00:00:00       
Operation completed over 562 objects/28.0 GiB.                                   
rm: cannot remove '..data/work': No such file or directory


In [58]:
project

'cobinding'

In [52]:
! gsutil -m cp ../../data/fastqs/* gs://amlproject/Chip/fastqs/paired_end/ && rm ../../data/fastqs/*

Copying file://../../data/fastqs/mp876-MV411-ZFP281-r1_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying file://../../data/fastqs/mp876-MV411-ZFP281-r1_R2_001.fastq.gz [Content-Type=application/octet-stream]...
Copying file://../../data/fastqs/mp877-MV411-PLAGL2-r1_R1_001.fastq.gz [Content-Type=application/octet-stream]...
Copying file://../../data/fastqs/mp877-MV411-PLAGL2-r1_R2_001.fastq.gz [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
w

In [61]:
ls ../../data/results/fastqc

INPUT_R1_T1_1_fastqc.html         MV411-PLAGL2_R1_T1_2_fastqc.html
INPUT_R1_T1_2_fastqc.html         MV411-RARA_R1_T1_1_fastqc.html
MV411-FOSL2_R1_T1_1_fastqc.html   MV411-RARA_R1_T1_2_fastqc.html
MV411-FOSL2_R1_T1_2_fastqc.html   MV411-ZFP281_R1_T1_1_fastqc.html
MV411-HEX_R1_T1_1_fastqc.html     MV411-ZFP281_R1_T1_2_fastqc.html
MV411-HEX_R1_T1_2_fastqc.html     [0m[01;34mzips[0m/
MV411-PLAGL2_R1_T1_1_fastqc.html


In [None]:
# ADDING data TO THE SAMPLE TRACKER

In [63]:
! sudo rm -r ../../data/results

# HiCseq Preprocessing

### need to use juicer
https://github.com/aidenlab/juicer/wiki/Data-Extraction

https://github.com/aidenlab/straw/wiki/Python

https://github.com/aidenlab/juicer/wiki/Data

https://github.com/aidenlab/juicer

In [None]:
! pip install hic-straw

In [None]:
import straw

In [None]:
result = straw.straw('KR', 
                     '../../data/'+project+'/MV411_H3K27ac_all_rep_duplicate_removed_allValidPairs.hic',
                     'X', 'X', 'BP', 5000)

In [None]:
len(result[1])

In [None]:
! ls ../../data/$project