## Get the QC metrics from the pipeline

In [141]:
# Imports
from basepair.imports import *
from basepair.exp.paper.config import data_sheet

In [142]:
# Load the experiment data sheet and narrow it down to the replicates of interest.
reps = data_sheet()
# Keep only rows that have a QC report.
reps = reps[~reps['QC report'].isnull()]
# Drop control rows. `.copy()` yields an independent frame so that the column
# assignments below do not write into a slice of the loaded sheet
# (this is the pattern that triggers pandas' SettingWithCopyWarning).
reps = reps[reps['TF Name'] != 'control'].copy()
reps['Rep Number'] = reps['Rep Number'].astype(int)
# Replicate identifier of the form "<Data Type>-<TF Name>-<Rep Number>",
# e.g. "chipseq-oct4-1". Built column-wise instead of via `iterrows()`.
reps['id'] = [
    f"{data_type}-{tf_name}-{rep_number}"
    for data_type, tf_name, rep_number in zip(
        reps['Data Type'], reps['TF Name'], reps['Rep Number']
    )
]
# Restrict to the four TFs of interest ...
reps = reps[reps['TF Name'].isin(['oct4', 'sox2', 'nanog', 'klf4'])]
# ... and exclude the klf4 ChIP-seq rows.
reps = reps[~((reps['TF Name'] == 'klf4')&(reps['Data Type'] == 'chipseq'))]

In [143]:
# Append QC columns to reps: the local QC directory plus two metrics read from
# each replicate's `qc.json`.

# Map the QC report URL to its directory on disk. `regex=False` forces literal
# replacement, so the dots in the URL / file name are not interpreted as regex
# wildcards on pandas versions where `str.replace` defaults to regex mode.
qc_dirs = (
    reps['QC report']
    .str.replace('http://mitra.stanford.edu', '/srv/www', regex=False)
    .str.replace("/qc.html", "", regex=False)
)

# Read every qc.json once, in row order.
qc_reports = [read_json(f"{qc_dir}/qc.json") for qc_dir in qc_dirs]

# `assign` returns a new frame (no write into a filtered slice) and keeps the
# existing columns even when `reps` is empty, unlike rebuilding the frame from
# a list of per-row Series.
reps = reps.assign(**{
    'QC_dir': qc_dirs,
    'idr_reproducibility_qc/rescue_ratio': [
        qc['idr_reproducibility_qc']['rescue_ratio'] for qc in qc_reports
    ],
    'idr_frip_qc/ppr/FRiP': [
        qc['idr_frip_qc']['ppr']['FRiP'] for qc in qc_reports
    ],
})

In [144]:
# Columns shown in the QC summary table (also reused for the Excel export).
qc_columns = [
    'Mnemonic',
    'Data Type',
    'TF Name',
    'Rep Number',
    'Unique deduped reads',
    '#Rep-IDRpeaks (N1, N2, ..)',
    '#IDR-optimal peaks (Np)',
    'idr_reproducibility_qc/rescue_ratio',
    'idr_frip_qc/ppr/FRiP',
]
# Plain-text dump of the full table (no row/column truncation).
print(reps[qc_columns].to_string())

   Mnemonic  Data Type TF Name  Rep Number Unique deduped reads  #Rep-IDRpeaks (N1, N2, ..)  #IDR-optimal peaks (Np)  idr_reproducibility_qc/rescue_ratio  idr_frip_qc/ppr/FRiP
0       C01    chipseq    oct4           1                  27M                   10770.0                    19351.0                    1.2593                           0.0326
1       C02    chipseq    oct4           2                  18M                   14402.0                    19351.0                    1.2593                           0.0326
2       C03    chipseq    sox2           1                  19M                     255.0                     9497.0                    1.3098                           0.0085
3       C04    chipseq    sox2           2                  49M                    8847.0                     9497.0                    1.3098                           0.0085
4       C05    chipseq    sox2           3                  32M                    4202.0                     9497.0    

In [138]:
# Root data directory; the exports below are written under `{ddir}/gdata/`.
ddir = get_data_dir()

In [99]:
# Write the replicate id together with the QC summary columns to an Excel sheet.
qc_export = reps[['id'] + qc_columns]
qc_export.to_excel(f'{ddir}/gdata/data-sheet.qc.xlsx')

## Zip all the QC reports

In [139]:
# List the ids of all replicates whose QC reports will be archived.
for rep_id in reps['id']:
    print(rep_id)

chipseq-oct4-1
chipseq-oct4-2
chipseq-sox2-1
chipseq-sox2-2
chipseq-sox2-3
chipseq-nanog-1
chipseq-nanog-2
chipnexus-oct4-1
chipnexus-oct4-2
chipnexus-oct4-3
chipnexus-oct4-4
chipnexus-oct4-5
chipnexus-oct4-6
chipnexus-sox2-1
chipnexus-sox2-2
chipnexus-sox2-3
chipnexus-sox2-4
chipnexus-nanog-1
chipnexus-nanog-2
chipnexus-nanog-3
chipnexus-nanog-4
chipnexus-nanog-5
chipnexus-klf4-1
chipnexus-klf4-2
chipnexus-klf4-3
chipnexus-klf4-4
chipnexus-klf4-5


In [140]:
from zipfile import ZIP_DEFLATED, ZipFile

# Bundle every replicate's qc.html into a single archive, renaming each member
# to "qc.<id>.html" so the reports do not collide inside the zip.
# ZipFile defaults to ZIP_STORED (no compression); ZIP_DEFLATED actually
# compresses the HTML reports.
with ZipFile(f'{ddir}/gdata/qc-htmls.zip', 'w', compression=ZIP_DEFLATED) as zf:
    for qc_dir, rep_id in zip(reps['QC_dir'], reps['id']):
        zf.write(f"{qc_dir}/qc.html", arcname="qc." + rep_id + ".html")

Link: https://drive.google.com/file/d/16tnnLxlIGiPdnPg7tM4thdT-0y0haOxa/view