## Generate QC table

- Add additional crispr samples to the QC table.
- Fill out the details to [geo-submission-metadata-v4](https://docs.google.com/spreadsheets/d/1Y_OIFHcAEtofVmIso5LPKX3YpU04i1W6/edit#gid=1074977823)

In [1]:
import os
import pandas as pd
from pathlib import Path
from glob import glob
import json
import numpy as np
from tqdm import tqdm
from basepair.utils import md5sum, flatten_list
from joblib import Parallel, delayed

Using TensorFlow backend.


In [2]:
def read_config(config_path):
    config = json.load(open(config_path))
    # Merge all chip_nexus.fastqs into a list.
    config['fastqs'] = [p for k, paths in config.items()
                        if k.startswith('chip_nexus.fastqs')
                        for p in paths]
    return {k: v for k, v in config.items() if not k.startswith('chip_nexus.fastqs')}


def read_metadata(metadata_path, add_qc=True):
    from basepair.utils import flatten

    if not os.path.exists(metadata_path):
        return {}
    metadata = json.load(open(metadata_path))
    metadata['chip_nexus.qc_json'] = metadata['outputs']['chip_nexus.qc_json']
    metadata['chip_nexus.report'] = metadata['outputs']['chip_nexus.report']
    metadata = {k: v for k, v in metadata.items()
                if k in ['status', 'workflowRoot', 'id', 'chip_nexus.qc_json', 'chip_nexus.report']}
    if add_qc:
        metadata = {**metadata, **flatten(json.load(open(metadata['chip_nexus.qc_json'])), '/')}
    return metadata

In [3]:
# Read configs
samples = [
    'wt.Nanog',
    'wt.Sox2',
    'mutant.sox2crispr1.Nanog',
    'mutant.sox2crispr1.Sox2',
    'mutant.nanogcrispr1.Nanog',
    'mutant.nanogcrispr1.Sox2'
]

# Load configs
ddir = Path('/oak/stanford/groups/akundaje/avsec/basepair/data/raw/zeitlinger-chip-nexus-crispr/')
configs = {sample: {**read_config(f'{ddir}/{sample}.config.json'),
                    **read_metadata(f'{ddir}/{sample}.metadata.json')}
           for sample in samples}

df = pd.DataFrame(configs).T

In [5]:
# Sanity checks
all_fastqs = flatten_list(df.fastqs)
# No duplicates in fastqs
assert not pd.Series(all_fastqs).duplicated().any()
# All fastq files were processed
# assert {str(s) for s in ddir.glob('*.fastq.gz')} == set(all_fastqs)

# All adapters and barcodes were the same for all experiments
assert len(df['chip_nexus.adapter'].unique()) == 1
assert len(df['chip_nexus.barcodes'].unique()) == 1

# All files succeeded
assert np.all(df['status'] == 'Succeeded')

In [6]:
df['chip_nexus.report']

wt.Nanog                     /oak/stanford/groups/...
wt.Sox2                      /oak/stanford/groups/...
mutant.sox2crispr1.Nanog     /oak/stanford/groups/...
mutant.sox2crispr1.Sox2      /oak/stanford/groups/...
mutant.nanogcrispr1.Nanog    /oak/stanford/groups/...
mutant.nanogcrispr1.Sox2     /oak/stanford/groups/...
Name: chip_nexus.report, dtype: object

In [7]:
df.head()

Unnamed: 0,chip_nexus.adapter,chip_nexus.barcodes,chip_nexus.description,chip_nexus.enable_count_signal_track,chip_nexus.genome_tsv,chip_nexus.qc_json,chip_nexus.report,chip_nexus.title,fastqs,flagstat_qc/rep1/diff_chroms,flagstat_qc/rep1/diff_chroms_qc_failed,flagstat_qc/rep1/duplicates,flagstat_qc/rep1/duplicates_qc_failed,flagstat_qc/rep1/mapped,flagstat_qc/rep1/mapped_pct,flagstat_qc/rep1/mapped_qc_failed,flagstat_qc/rep1/paired,flagstat_qc/rep1/paired_properly,flagstat_qc/rep1/paired_properly_pct,flagstat_qc/rep1/paired_properly_qc_failed,flagstat_qc/rep1/paired_qc_failed,flagstat_qc/rep1/read1,flagstat_qc/rep1/read1_qc_failed,flagstat_qc/rep1/read2,flagstat_qc/rep1/read2_qc_failed,flagstat_qc/rep1/singletons,flagstat_qc/rep1/singletons_pct,flagstat_qc/rep1/singletons_qc_failed,flagstat_qc/rep1/total,flagstat_qc/rep1/total_qc_failed,flagstat_qc/rep1/with_itself,flagstat_qc/rep1/with_itself_qc_failed,flagstat_qc/rep2/diff_chroms,flagstat_qc/rep2/diff_chroms_qc_failed,flagstat_qc/rep2/duplicates,flagstat_qc/rep2/duplicates_qc_failed,flagstat_qc/rep2/mapped,flagstat_qc/rep2/mapped_pct,flagstat_qc/rep2/mapped_qc_failed,flagstat_qc/rep2/paired,flagstat_qc/rep2/paired_properly,flagstat_qc/rep2/paired_properly_pct,flagstat_qc/rep2/paired_properly_qc_failed,flagstat_qc/rep2/paired_qc_failed,flagstat_qc/rep2/read1,flagstat_qc/rep2/read1_qc_failed,flagstat_qc/rep2/read2,flagstat_qc/rep2/read2_qc_failed,flagstat_qc/rep2/singletons,flagstat_qc/rep2/singletons_pct,flagstat_qc/rep2/singletons_qc_failed,flagstat_qc/rep2/total,flagstat_qc/rep2/total_qc_failed,flagstat_qc/rep2/with_itself,flagstat_qc/rep2/with_itself_qc_failed,flagstat_qc/rep3/diff_chroms,flagstat_qc/rep3/diff_chroms_qc_failed,flagstat_qc/rep3/duplicates,flagstat_qc/rep3/duplicates_qc_failed,flagstat_qc/rep3/mapped,flagstat_qc/rep3/mapped_pct,flagstat_qc/rep3/mapped_qc_failed,flagstat_qc/rep3/paired,flagstat_qc/rep3/paired_properly,flagstat_qc/rep3/paired_properly_pct,flagstat_qc/rep3/paired_properly_qc_failed,flagstat_qc/rep3/paired_qc_failed,flagstat_qc/rep3/read1,flagstat_qc/rep3/read1_qc_failed,flagstat_qc/rep3/read2,flagstat_qc/rep3/read2_qc_failed,flagstat_qc/rep3/singletons,flagstat_qc/rep3/singletons_pct,flagstat_qc/rep3/singletons_qc_failed,flagstat_qc/rep3/total,flagstat_qc/rep3/total_qc_failed,flagstat_qc/rep3/with_itself,flagstat_qc/rep3/with_itself_qc_failed,frip_macs2_qc/pooled/FRiP,frip_macs2_qc/ppr1/FRiP,frip_macs2_qc/ppr2/FRiP,frip_macs2_qc/rep1-pr1/FRiP,frip_macs2_qc/rep1-pr2/FRiP,frip_macs2_qc/rep1/FRiP,frip_macs2_qc/rep2-pr1/FRiP,frip_macs2_qc/rep2-pr2/FRiP,frip_macs2_qc/rep2/FRiP,frip_macs2_qc/rep3-pr1/FRiP,frip_macs2_qc/rep3-pr2/FRiP,frip_macs2_qc/rep3/FRiP,general/date,general/description,general/genome,general/paired_end,general/peak_caller,general/pipeline_ver,general/title,id,idr_frip_qc/ppr/FRiP,idr_frip_qc/rep1-pr/FRiP,idr_frip_qc/rep1-rep2/FRiP,idr_frip_qc/rep1-rep3/FRiP,idr_frip_qc/rep2-pr/FRiP,idr_frip_qc/rep2-rep3/FRiP,idr_frip_qc/rep3-pr/FRiP,idr_reproducibility_qc/N1,idr_reproducibility_qc/N2,idr_reproducibility_qc/N3,idr_reproducibility_qc/N_consv,idr_reproducibility_qc/N_opt,idr_reproducibility_qc/Np,idr_reproducibility_qc/Nt,idr_reproducibility_qc/consv_set,idr_reproducibility_qc/opt_set,idr_reproducibility_qc/reproducibility,idr_reproducibility_qc/rescue_ratio,idr_reproducibility_qc/self_consistency_ratio,nodup_flagstat_qc/rep1/diff_chroms,nodup_flagstat_qc/rep1/diff_chroms_qc_failed,nodup_flagstat_qc/rep1/duplicates,nodup_flagstat_qc/rep1/duplicates_qc_failed,nodup_flagstat_qc/rep1/mapped,nodup_flagstat_qc/rep1/mapped_pct,nodup_flagstat_qc/rep1/mapped_qc_failed,nodup_flagstat_qc/rep1/paired,nodup_flagstat_qc/rep1/paired_properly,nodup_flagstat_qc/rep1/paired_properly_pct,nodup_flagstat_qc/rep1/paired_properly_qc_failed,nodup_flagstat_qc/rep1/paired_qc_failed,nodup_flagstat_qc/rep1/read1,nodup_flagstat_qc/rep1/read1_qc_failed,nodup_flagstat_qc/rep1/read2,nodup_flagstat_qc/rep1/read2_qc_failed,nodup_flagstat_qc/rep1/singletons,nodup_flagstat_qc/rep1/singletons_pct,nodup_flagstat_qc/rep1/singletons_qc_failed,nodup_flagstat_qc/rep1/total,nodup_flagstat_qc/rep1/total_qc_failed,nodup_flagstat_qc/rep1/with_itself,nodup_flagstat_qc/rep1/with_itself_qc_failed,nodup_flagstat_qc/rep2/diff_chroms,nodup_flagstat_qc/rep2/diff_chroms_qc_failed,nodup_flagstat_qc/rep2/duplicates,nodup_flagstat_qc/rep2/duplicates_qc_failed,nodup_flagstat_qc/rep2/mapped,nodup_flagstat_qc/rep2/mapped_pct,nodup_flagstat_qc/rep2/mapped_qc_failed,nodup_flagstat_qc/rep2/paired,nodup_flagstat_qc/rep2/paired_properly,nodup_flagstat_qc/rep2/paired_properly_pct,nodup_flagstat_qc/rep2/paired_properly_qc_failed,nodup_flagstat_qc/rep2/paired_qc_failed,nodup_flagstat_qc/rep2/read1,nodup_flagstat_qc/rep2/read1_qc_failed,nodup_flagstat_qc/rep2/read2,nodup_flagstat_qc/rep2/read2_qc_failed,nodup_flagstat_qc/rep2/singletons,nodup_flagstat_qc/rep2/singletons_pct,nodup_flagstat_qc/rep2/singletons_qc_failed,nodup_flagstat_qc/rep2/total,nodup_flagstat_qc/rep2/total_qc_failed,nodup_flagstat_qc/rep2/with_itself,nodup_flagstat_qc/rep2/with_itself_qc_failed,nodup_flagstat_qc/rep3/diff_chroms,nodup_flagstat_qc/rep3/diff_chroms_qc_failed,nodup_flagstat_qc/rep3/duplicates,nodup_flagstat_qc/rep3/duplicates_qc_failed,nodup_flagstat_qc/rep3/mapped,nodup_flagstat_qc/rep3/mapped_pct,nodup_flagstat_qc/rep3/mapped_qc_failed,nodup_flagstat_qc/rep3/paired,nodup_flagstat_qc/rep3/paired_properly,nodup_flagstat_qc/rep3/paired_properly_pct,nodup_flagstat_qc/rep3/paired_properly_qc_failed,nodup_flagstat_qc/rep3/paired_qc_failed,nodup_flagstat_qc/rep3/read1,nodup_flagstat_qc/rep3/read1_qc_failed,nodup_flagstat_qc/rep3/read2,nodup_flagstat_qc/rep3/read2_qc_failed,nodup_flagstat_qc/rep3/singletons,nodup_flagstat_qc/rep3/singletons_pct,nodup_flagstat_qc/rep3/singletons_qc_failed,nodup_flagstat_qc/rep3/total,nodup_flagstat_qc/rep3/total_qc_failed,nodup_flagstat_qc/rep3/with_itself,nodup_flagstat_qc/rep3/with_itself_qc_failed,overlap_frip_qc/ppr/FRiP,overlap_frip_qc/rep1-pr/FRiP,overlap_frip_qc/rep1-rep2/FRiP,overlap_frip_qc/rep1-rep3/FRiP,overlap_frip_qc/rep2-pr/FRiP,overlap_frip_qc/rep2-rep3/FRiP,overlap_frip_qc/rep3-pr/FRiP,overlap_reproducibility_qc/N1,overlap_reproducibility_qc/N2,overlap_reproducibility_qc/N3,overlap_reproducibility_qc/N_consv,overlap_reproducibility_qc/N_opt,overlap_reproducibility_qc/Np,overlap_reproducibility_qc/Nt,overlap_reproducibility_qc/consv_set,overlap_reproducibility_qc/opt_set,overlap_reproducibility_qc/reproducibility,overlap_reproducibility_qc/rescue_ratio,overlap_reproducibility_qc/self_consistency_ratio,pbc_qc/rep1/NRF,pbc_qc/rep1/PBC1,pbc_qc/rep1/PBC2,pbc_qc/rep1/distinct_read_pairs,pbc_qc/rep1/one_read_pair,pbc_qc/rep1/total_read_pairs,pbc_qc/rep1/two_read_pair,pbc_qc/rep2/NRF,pbc_qc/rep2/PBC1,pbc_qc/rep2/PBC2,pbc_qc/rep2/distinct_read_pairs,pbc_qc/rep2/one_read_pair,pbc_qc/rep2/total_read_pairs,pbc_qc/rep2/two_read_pair,pbc_qc/rep3/NRF,pbc_qc/rep3/PBC1,pbc_qc/rep3/PBC2,pbc_qc/rep3/distinct_read_pairs,pbc_qc/rep3/one_read_pair,pbc_qc/rep3/total_read_pairs,pbc_qc/rep3/two_read_pair,status,workflowRoot
wt.Nanog,AGATCGGAAGAGCACACGTCT...,"CAGT,TCAG,GTCA,AGTC",chipnexus data from j...,True,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...,chipnexus_wt.Nanog,[/oak/stanford/groups...,0,0,0,0,235330333,100,0,0,0,0,0,0,0,0,0,0,0,0,0,235330333,0,0,0,0,0,0,0,183517795,100,0,0,0,0,0,0,0,0,0,0,0,0,0,183517795,0,0,0,0.0,0.0,0.0,0.0,237732711.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,237732711.0,0.0,0.0,0.0,0.5808,0.5685,0.5684,0.4585,0.4584,0.4741,0.5938,0.5937,0.6067,0.5692,0.5695,0.5807,2020-02-20 12:12:56,chipnexus data from j...,mm10_sherlock.tsv,False,macs2,v1.1.6,chipnexus_wt.Nanog,1ed91889-8fad-48fa-a4...,0.4199,0.4079,0.5034,0.5077,0.3756,0.517,0.2765,66649,16716,10644.0,105037,105037,38127,105037,rep2-rep3,rep2-rep3,fail,2.755,6.262,0,0,0,0,65394814,100,0,0,0,0,0,0,0,0,0,0,0,0,0,65394814,0,0,0,0,0,0,0,84103211,100,0,0,0,0,0,0,0,0,0,0,0,0,0,84103211,0,0,0,0.0,0.0,0.0,0.0,148281085.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,148281085.0,0.0,0.0,0.0,0.568,0.4521,0.5488,0.5515,0.5886,0.5596,0.5661,122777,150581,182791.0,178369,196806,196806,178369,rep2-rep3,ppr,pass,1.103,1.489,0.2779,,,65393569,,235326446,,0.4583,,,84100961,,183513792,,0.6237,,,148277224.0,,237727258.0,,Succeeded,/oak/stanford/groups/...
wt.Sox2,AGATCGGAAGAGCACACGTCT...,"CAGT,TCAG,GTCA,AGTC",chipnexus data from j...,True,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...,chipnexus_wt.Sox2,[/oak/stanford/groups...,0,0,0,0,172068682,100,0,0,0,0,0,0,0,0,0,0,0,0,0,172068682,0,0,0,0,0,0,0,198240035,100,0,0,0,0,0,0,0,0,0,0,0,0,0,198240035,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,0.2244,0.2056,0.2059,0.1811,0.1818,0.1997,0.1959,0.1962,0.2137,,,,2020-02-19 10:01:02,chipnexus data from j...,mm10_sherlock.tsv,False,macs2,v1.1.6,chipnexus_wt.Sox2,dc47c819-09a0-4e3e-ab...,0.1599,0.1283,0.1585,,0.1433,,,46858,46439,,67574,69414,69414,67574,rep1-rep2,ppr,pass,1.027,1.009,0,0,0,0,101217199,100,0,0,0,0,0,0,0,0,0,0,0,0,0,101217199,0,0,0,0,0,0,0,96355001,100,0,0,0,0,0,0,0,0,0,0,0,0,0,96355001,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,0.1996,0.1704,0.1984,,0.1872,,,106405,104195,,133387,136157,136157,133387,rep1-rep2,ppr,pass,1.021,1.021,0.5882,,,101205995,,172051409,,0.486,,,96346535,,198224829,,,,,,,,,Succeeded,/oak/stanford/groups/...
mutant.sox2crispr1.Nanog,AGATCGGAAGAGCACACGTCT...,"CAGT,TCAG,GTCA,AGTC",chipnexus data from j...,True,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...,chipnexus_mutant.sox2...,[/oak/stanford/groups...,0,0,0,0,251068216,100,0,0,0,0,0,0,0,0,0,0,0,0,0,251068216,0,0,0,0,0,0,0,206108730,100,0,0,0,0,0,0,0,0,0,0,0,0,0,206108730,0,0,0,0.0,0.0,0.0,0.0,311704016.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,311704016.0,0.0,0.0,0.0,0.5419,0.5293,0.5294,0.5199,0.5197,0.5352,0.4794,0.4795,0.4948,0.5252,0.5261,0.5366,2020-02-20 19:26:24,chipnexus data from j...,mm10_sox2crispr1_muta...,False,macs2,v1.1.6,chipnexus_mutant.sox2...,a33b5888-8025-459b-ba...,0.3791,0.3095,0.487,0.4833,0.4018,0.482,0.2816,16798,68921,11643.0,124143,124143,37831,124143,rep1-rep2,rep1-rep2,fail,3.282,5.92,0,0,0,0,138471343,100,0,0,0,0,0,0,0,0,0,0,0,0,0,138471343,0,0,0,0,0,0,0,146789710,100,0,0,0,0,0,0,0,0,0,0,0,0,0,146789710,0,0,0,0.0,0.0,0.0,0.0,167587350.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,167587350.0,0.0,0.0,0.0,0.5298,0.5172,0.5223,0.52,0.4755,0.5205,0.5229,166923,172785,174903.0,194709,214856,214856,194709,rep1-rep2,ppr,pass,1.103,1.048,0.5515,,,138467256,,251061827,,0.7122,,,146785236,,206103029,,0.5376,,,167580780.0,,311693270.0,,Succeeded,/oak/stanford/groups/...
mutant.sox2crispr1.Sox2,AGATCGGAAGAGCACACGTCT...,"CAGT,TCAG,GTCA,AGTC",chipnexus data from j...,True,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...,chipnexus_mutant.sox2...,[/oak/stanford/groups...,0,0,0,0,213046217,100,0,0,0,0,0,0,0,0,0,0,0,0,0,213046217,0,0,0,0,0,0,0,194315905,100,0,0,0,0,0,0,0,0,0,0,0,0,0,194315905,0,0,0,0.0,0.0,0.0,0.0,230595069.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,230595069.0,0.0,0.0,0.0,0.2341,0.2131,0.2138,0.1786,0.1792,0.2006,0.1754,0.1752,0.1975,0.1786,0.1789,0.1983,2020-02-20 09:03:35,chipnexus data from j...,mm10_sox2crispr1_muta...,False,macs2,v1.1.6,chipnexus_mutant.sox2...,5cdac87a-fbcb-49a6-93...,0.1709,0.1276,0.1572,0.1541,0.1233,0.1523,0.1331,47379,46026,45156.0,73393,93352,93352,73393,rep1-rep2,ppr,pass,1.272,1.049,0,0,0,0,122073355,100,0,0,0,0,0,0,0,0,0,0,0,0,0,122073355,0,0,0,0,0,0,0,122871190,100,0,0,0,0,0,0,0,0,0,0,0,0,0,122871190,0,0,0,0.0,0.0,0.0,0.0,119947637.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,119947637.0,0.0,0.0,0.0,0.2066,0.1692,0.1956,0.1922,0.164,0.1917,0.1729,105288,104034,98364.0,139482,167498,167498,139482,rep1-rep2,ppr,pass,1.201,1.07,0.573,,,122060072,,213025150,,0.6323,,,122855458,,194292846,,0.5202,,,119928463.0,,230560508.0,,Succeeded,/oak/stanford/groups/...
mutant.nanogcrispr1.Nanog,AGATCGGAAGAGCACACGTCT...,"CAGT,TCAG,GTCA,AGTC",chipnexus data from j...,True,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...,chipnexus_mutant,[/oak/stanford/groups...,0,0,0,0,285734809,100,0,0,0,0,0,0,0,0,0,0,0,0,0,285734809,0,0,0,0,0,0,0,230000645,100,0,0,0,0,0,0,0,0,0,0,0,0,0,230000645,0,0,0,0.0,0.0,0.0,0.0,345095073.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,345095073.0,0.0,0.0,0.0,0.5814,0.5695,0.5699,0.5552,0.5558,0.5739,0.5669,0.5669,0.5839,0.5271,0.5273,0.5404,2020-01-29 02:32:52,chipnexus data from j...,mm10_mutant_annotatio...,False,macs2,v1.1.6,chipnexus_mutant,83036154-43b1-41b4-a0...,0.4897,0.5023,0.5161,0.5267,0.3115,0.5193,0.3314,96389,13055,24649.0,135483,135483,89445,135483,rep1-rep3,rep1-rep3,borderline,1.515,7.383,0,0,0,0,137717502,100,0,0,0,0,0,0,0,0,0,0,0,0,0,137717502,0,0,0,0,0,0,0,106200119,100,0,0,0,0,0,0,0,0,0,0,0,0,0,106200119,0,0,0,0.0,0.0,0.0,0.0,151433576.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,151433576.0,0.0,0.0,0.0,0.5693,0.5532,0.5587,0.5634,0.5628,0.5601,0.524,175215,167226,191744.0,211753,228004,228004,211753,rep1-rep3,ppr,pass,1.077,1.147,0.482,,,137715090,,285730536,,0.4617,,,106198054,,229996803,,0.4388,,,151425175.0,,345077892.0,,Succeeded,/oak/stanford/groups/...


In [50]:
df.to_excel('/users/avsec//gdrive/projects/chipnexus/paper/tables/crispr-all-qc.xlsx')

## Processed files to export

In [8]:
# Extract processed files from the output directory
def processed_bigwig(root, strand):
    bigwigs = list((Path(root) / 'call-count_signal_track_pooled/execution').glob(f'*.{strand}.bigwig'))
    assert len(bigwigs) == 1
    return bigwigs[0]

def processed_narrowPeak(root):
    return Path(root) / 'call-reproducibility_idr/execution/optimal_peak.narrowPeak.gz'


def processed_all_files(root):
    return {'bigwig.positive': processed_bigwig(root, 'positive'),
            'bigwig.negative': processed_bigwig(root, 'negative'),
            'narrowPeak': processed_narrowPeak(root)
            }

In [9]:
processed_files = {sample: processed_all_files(config['workflowRoot'])
                   for sample,config in configs.items()}
df_processed_files = pd.DataFrame(processed_files).T
df_processed_files

Unnamed: 0,bigwig.negative,bigwig.positive,narrowPeak
wt.Nanog,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...
wt.Sox2,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...
mutant.sox2crispr1.Nanog,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...
mutant.sox2crispr1.Sox2,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...
mutant.nanogcrispr1.Nanog,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...
mutant.nanogcrispr1.Sox2,/oak/stanford/groups/...,/oak/stanford/groups/...,/oak/stanford/groups/...


In [14]:
# Molten version
dfm = df_processed_files.stack().reset_index(level=1)
dfm.columns = ['file_type', 'path']
df_fastq = pd.DataFrame([{'sample': sample, 'file_type': 'fastq', 'path': path}
                         for sample, paths in df.fastqs.items() for path in paths]).set_index('sample')
dfm = pd.concat([dfm, df_fastq], axis=0)
assert dfm.path.map(os.path.exists).all()  # All files exist

# Compute MD5 hashes
dfm['md5'] = Parallel(-1, verbose=10)(delayed(md5sum)(path) for path in dfm.path)

dfm

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 48 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  35 | elapsed:  2.2min remaining: 17.3min
[Parallel(n_jobs=-1)]: Done   8 out of  35 | elapsed:  2.6min remaining:  8.8min
[Parallel(n_jobs=-1)]: Done  12 out of  35 | elapsed:  3.0min remaining:  5.8min
[Parallel(n_jobs=-1)]: Done  16 out of  35 | elapsed:  3.5min remaining:  4.1min
[Parallel(n_jobs=-1)]: Done  20 out of  35 | elapsed: 23.0min remaining: 17.2min
[Parallel(n_jobs=-1)]: Done  24 out of  35 | elapsed: 25.8min remaining: 11.8min
[Parallel(n_jobs=-1)]: Done  28 out of  35 | elapsed: 32.9min remaining:  8.2min
[Parallel(n_jobs=-1)]: Done  32 out of  35 | elapsed: 39.9min remaining:  3.7min
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed: 41.3min finished


Unnamed: 0,file_type,path,md5
wt.Nanog,bigwig.negative,/oak/stanford/groups/...,8469fa39110bf17f56952...
wt.Nanog,bigwig.positive,/oak/stanford/groups/...,655bfde8d14216c738202...
wt.Nanog,narrowPeak,/oak/stanford/groups/...,5976e226bc57531ce6c4b...
...,...,...,...
mutant.nanogcrispr1.Sox2,fastq,/oak/stanford/groups/...,fcab2929517ff7aa8ff26...
mutant.nanogcrispr1.Sox2,fastq,/oak/stanford/groups/...,fb25f045c9c4aeed88ff5...
mutant.nanogcrispr1.Sox2,fastq,/oak/stanford/groups/...,2ec0aa7956893d2bf2c5b...


In [49]:
dfm.to_excel('/users/avsec//gdrive/projects/chipnexus/paper/tables/file-hashes.xlsx')

In [16]:
## Make sure the md5-hashes are the same as from md5sum for some files
for i in [0, -1]:
    outputs = !md5sum {dfm.iloc[i].path}
    md5sum2 = outputs[0].split(' ')[0]
    print(md5sum2, dfm.iloc[i].md5)
    assert md5sum2 == dfm.iloc[i].md5

8469fa39110bf17f56952987a70ef311 8469fa39110bf17f56952987a70ef311
2ec0aa7956893d2bf2c5b3c2615d1955 2ec0aa7956893d2bf2c5b3c2615d1955


### Add qc to zip files

In [17]:
# from zipfile import ZipFile

# # writing files to a zipfile 
# with ZipFile(f'{ddir}/gdata/qc-htmls.zip','w') as zf: 
#     for i,row in reps.iterrows():
#         zf.write(f"{row.QC_dir}/qc.html", arcname="qc." + row['id'] + ".html") 

In [18]:
# Copy qc
for sample, row in df.iterrows():
    !mkdir -p /srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/{sample}/
    !cp {row['chip_nexus.report']} /srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/{sample}/qc.html
    !chmod o+rx /srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/{sample}/qc.html

In [25]:
dfo = []
for sample, row in df.iterrows():
    for j, fastq in enumerate(row.fastqs):
        dfo.append({'sample': sample,
                    'path': os.path.basename(fastq),
                    '#Rep-IDRpeaks (N1, N2, ..)': row[f'idr_reproducibility_qc/N{j+1}'],
                    '#IDR-optimal peaks (Np)': row[f'idr_reproducibility_qc/Np'],
                    'Unique deduped reads': row[f'pbc_qc/rep{j+1}/distinct_read_pairs'],
                    'QC report': f'http://mitra.stanford.edu/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/{sample}/qc.html',
                    'md5': dfm[dfm.path == fastq].iloc[0].md5,
                    'idr_reproducibility_qc/rescue_ratio': row['idr_reproducibility_qc/rescue_ratio'],
                    'idr_frip_qc/ppr/FRiP': row['idr_frip_qc/ppr/FRiP'],
                    })
dfo = pd.DataFrame(dfo)

In [28]:
print(dfo.to_csv(index=False))

#IDR-optimal peaks (Np),"#Rep-IDRpeaks (N1, N2, ..)",QC report,Unique deduped reads,idr_frip_qc/ppr/FRiP,idr_reproducibility_qc/rescue_ratio,md5,path,sample
95658,44149,http://mitra.stanford.edu/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/mutant.nanogcrispr1.Sox2/qc.html,85074201,0.193377713437,1.1960688698,fcab2929517ff7aa8ff26952e4060179,mesc_nanogcrispr1_sox2_B07_nexus_1.fastq.gz,mutant.nanogcrispr1.Sox2
95658,59005,http://mitra.stanford.edu/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/mutant.nanogcrispr1.Sox2/qc.html,100162662,0.193377713437,1.1960688698,fb25f045c9c4aeed88ff529c6b4c38c8,mesc_nanogcrispr1_sox2_B09_nexus_1.fastq.gz,mutant.nanogcrispr1.Sox2
95658,52844,http://mitra.stanford.edu/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/mutant.nanogcrispr1.Sox2/qc.html,118872840,0.193377713437,1.1960688698,2ec0aa7956893d2bf2c5b3c2615d1955,mesc_nanogcrispr1_sox2_F10_nexus_1.fastq.gz,mutant.nanogcrispr1.Sox2



In [29]:
# Copy files to webserver
filenames = {'bigwig.negative': 'counts.neg.bw',
             'bigwig.positive': 'counts.pos.bw',
             'narrowPeak': 'optimal_peak.narrowPeak.gz'}

for sample, row in dfm[dfm.file_type != 'fastq'].iterrows():
    !mkdir -p /srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/{sample}/
    fn = filenames[row['file_type']]
    file_path = f'/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/{sample}/{fn}'
    print(file_path)
    !cp {row['path']} {file_path}
    !chmod o+rx {file_path}

/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/wt.Nanog/counts.neg.bw
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/wt.Nanog/counts.pos.bw
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/wt.Nanog/optimal_peak.narrowPeak.gz
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/wt.Sox2/counts.neg.bw
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/wt.Sox2/counts.pos.bw
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/wt.Sox2/optimal_peak.narrowPeak.gz
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/mutant.sox2crispr1.Nanog/counts.neg.bw
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/mutant.sox2crispr1.Nanog/counts.pos.bw
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/mutant.sox2crispr1.Nanog/optimal_peak.narrowPeak.gz
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-nexus/crispr/mutant.sox2crispr1.Sox2/counts.neg.bw
/srv/www/kundaje/avsec/chipnexus/paper/data/chip-ne

### Generate Supplementary table 1

In [30]:
from basepair.exp.paper.config import data_sheet

In [37]:
reps = data_sheet()
reps = reps[(~reps['QC report'].isnull()) | (reps['Comments'] == 'Crispr samples')]
reps = reps[(reps['TF Name'] != 'control')]
reps['Rep Number'] = reps['Rep Number'].astype(int)
reps['ID'] = [f"{row['Data Type']}-{row['TF Name']}-{row['Rep Number']}" for i,row in reps.iterrows()]
reps = reps[reps['TF Name'].isin(['oct4', 'sox2', 'nanog', 'klf4'])]
reps = reps[~((reps['TF Name'] == 'klf4')&(reps['Data Type'] == 'chipseq'))]

In [36]:
new_reps

Unnamed: 0,Mnemonic,FTP Name,thenexus Name,Sample ID,Comments,Reference,Data Type,TF Name,Rep Number,Control Reps,QC report,Unique deduped reads,Held-out test,"#Rep-IDRpeaks (N1, N2, ..)",#IDR-optimal peaks (Np),Md5-hash-FASTQ,Md5-hash-IDRoptimal,Md5-bigwigs,idr_frip_qc/ppr/FRiP,idr_reproducibility_qc/rescue_ratio,Oak path bw neg,ID
49,N34,mesc_wt_nanog_nexus_1...,wt.Nanog,,Crispr samples,mm10,chipnexus,nanog,1,,http://mitra.stanford...,65M,False,66649.0,38127.0,ac80395cba9da08516f9d...,,,0.4199,2.755,,chipnexus-nanog-1
50,N35,mesc_wt_nanog_nexus_1...,wt.Nanog,,Crispr samples,mm10,chipnexus,nanog,2,,http://mitra.stanford...,84M,False,16716.0,38127.0,126dac02c288489bff786...,,,0.4199,2.755,,chipnexus-nanog-2
51,N36,mesc_wt_nanog_nexus_1...,wt.Nanog,,Crispr samples,mm10,chipnexus,nanog,3,,http://mitra.stanford...,148M,False,10644.0,38127.0,20cfef8acca36b46f641d...,,,0.4199,2.755,,chipnexus-nanog-3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,N48,mesc_nanogcrispr1_sox...,mutant.nanogcrispr1.Sox2,,Crispr samples,mm10_nanogcrispr1,chipnexus,sox2,1,,http://mitra.stanford...,85M,False,44149.0,95658.0,fcab2929517ff7aa8ff26...,,,0.1934,1.196,,chipnexus-sox2-1
64,N49,mesc_nanogcrispr1_sox...,mutant.nanogcrispr1.Sox2,,Crispr samples,mm10_nanogcrispr1,chipnexus,sox2,2,,http://mitra.stanford...,100M,False,59005.0,95658.0,fb25f045c9c4aeed88ff5...,,,0.1934,1.196,,chipnexus-sox2-2
65,N50,mesc_nanogcrispr1_sox...,mutant.nanogcrispr1.Sox2,,Crispr samples,mm10_nanogcrispr1,chipnexus,sox2,3,,http://mitra.stanford...,119M,False,52844.0,95658.0,2ec0aa7956893d2bf2c5b...,,,0.1934,1.196,,chipnexus-sox2-3


In [43]:
# Add df_data
df_data = pd.read_excel('/users/avsec/gdrive/projects/chipnexus/paper/tables/data-sheet.qc.xlsx')
new_reps = reps[~reps.Mnemonic.isin(df_data.Mnemonic)]
new_reps['idr_reproducibility_qc/rescue_ratio'] = df['idr_reproducibility_qc/rescue_ratio'].loc[new_reps['thenexus Name']].values
new_reps['idr_frip_qc/ppr/FRiP'] = df['idr_frip_qc/ppr/FRiP'].loc[new_reps['thenexus Name']].values
new_reps['ID'] = new_reps['thenexus Name'] + '-' + reps['Rep Number'].astype(str)
df_data = pd.concat([df_data, new_reps[df_data.columns]], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [33]:
df_data.to_excel('/users/avsec//gdrive/projects/chipnexus/paper/tables/data-sheet.qc.v2.xlsx')