In [1]:
import numpy as np
import pandas as pd
import collections

# ONT

In [2]:
# Load the per-file ONT metadata table (tab-separated).
ont_files_tsv = 'hprc_metadata_sample_files_ONT.tsv'
ont_df = pd.read_csv(ont_files_tsv, sep='\t')

In [3]:
# Drop the free-text 'notes' column when every value in it is NaN.
# The membership check keeps this cell safe to re-run: once the column has
# been dropped, indexing ont_df['notes'] again would raise KeyError.
if 'notes' in ont_df.columns and ont_df['notes'].isna().all():
    ont_df = ont_df.drop(columns='notes')

In [4]:
# TODO: Integrate with HPRC_metadata/merge_metadata.py combine_readstats.keep_columns

# Columns selected from ont_df as the per-sample metadata subset.
ont_meta_columns = [
    # file identity / location
    'filename', 'sample_ID', 'path', 'filetype',
    # library description
    'library_ID', 'library_strategy', 'library_source',
    'library_selection', 'library_layout',
    # sequencing platform and protocol
    'platform', 'instrument_model', 'design_description', 'data_type',
    'shear_method', 'size_selection', 'seq_kit',
    # basecalling
    'basecaller', 'basecaller_version', 'basecaller_model',
    # data generator
    'generator_facility', 'generator_contact',
]

# Columns selected from ont_df as the per-file read-statistics subset.
ont_readstat_columns = [
    'sample_ID', 'filename',
    'read_N50', 'Gb', 'coverage',
    '100kb+', '200kb+', '300kb+', '400kb+', '500kb+', '1Mb+',
    'whales',
]

# Names of the ONT columns that contain at least one missing value.
ont_column_has_nan = ont_df.isna().any(axis=0)
ont_df_nan_columns = ont_column_has_nan[ont_column_has_nan].index.tolist()

In [5]:
# File rows that have a missing value in at least one column.
ont_sample_missing_1_submitter_df = ont_df[ont_df.isna().any(axis=1)]

# All file rows of every sample that owns at least one such incomplete row.
ont_incomplete_sample_ids = ont_sample_missing_1_submitter_df['sample_ID'].tolist()
ont_incomplete_sample_files_df = ont_df[ont_df['sample_ID'].isin(ont_incomplete_sample_ids)]

# Number of ONT samples that have at least one file row with a missing value
print(len(set(ont_incomplete_sample_files_df['sample_ID'].tolist())))

# Export every file row of those samples
ont_incomplete_sample_files_df.reset_index(drop=True).to_csv('hprc_metadata_sample_files_missing_1_submitter_ONT.tsv', sep='\t')

# Forward-fill the gaps in every column that had a missing value.
# NOTE(review): the fill runs down the whole table in row order, not within
# each sample_ID, so a gap on the first row of a sample takes its value from
# the previous sample's rows - confirm this is intended.
for column_name in ont_df_nan_columns:
    ont_df[column_name] = ont_df[column_name].ffill().tolist()


14


In [6]:
# One metadata row per sample: the first file row encountered for each sample_ID.
ont_sample_meta_df = ont_df.loc[:, ont_meta_columns].drop_duplicates(subset='sample_ID', keep='first')
# Read-statistics columns for every file row.
ont_sample_readstats_df = ont_df.loc[:, ont_readstat_columns]

In [7]:
# Every sample_ID in ont_df must appear in both the sample-meta subset and
# the read-stats subset.
ont_all_sample_ids = ont_df['sample_ID']
assert ont_all_sample_ids.isin(ont_sample_meta_df['sample_ID'].tolist()).all()
assert ont_all_sample_ids.isin(ont_sample_readstats_df['sample_ID'].tolist()).all()

In [8]:
# Collapse the per-file read stats to one row per sample: read_N50 is averaged
# over a sample's files, every other listed column is summed.
# NOTE(review): 'Gb' is selected into this frame via ont_readstat_columns but
# has no entry in the mapping, so it is not carried into the per-sample
# result - confirm whether it should be summed as well.
ont_summed_columns = ['coverage', '100kb+', '200kb+', '300kb+', '400kb+', '500kb+', '1Mb+', 'whales']
ont_readstat_agg = {'read_N50': 'mean', **{column: 'sum' for column in ont_summed_columns}}

ont_sample_readstats_df = (
    ont_sample_readstats_df
    .groupby('sample_ID')
    .agg(ont_readstat_agg)
    .reset_index()
)

In [9]:
# Attach the per-sample read statistics to the per-sample metadata
# (inner join on sample_ID).
ont_sample_df = ont_sample_meta_df.merge(ont_sample_readstats_df, how='inner', on='sample_ID')

In [10]:
# The aggregate table must contain exactly one row per sample_ID.
assert ont_sample_df['sample_ID'].is_unique

In [11]:
# Write the per-sample ONT aggregate table (tab-separated, row index included).
ont_aggregate_tsv = 'hprc_metadata_sample_aggregate_ONT.tsv'
ont_sample_df.to_csv(ont_aggregate_tsv, sep='\t')

# HiFi

In [12]:
# Load the per-file HiFi metadata table (tab-separated) and preview it.
hifi_files_tsv = 'hprc_metadata_sample_files_HiFi.tsv'
hifi_df = pd.read_csv(hifi_files_tsv, sep='\t')
hifi_df.head()

Unnamed: 0,filename,sample_ID,path,accession,study,biosample_accession,total_reads,total_bp,total_Gbp,min,...,design_description,data_type,shear_method,size_selection,ccs_algorithm,polymerase_version,seq_plate_chemistry_version,generator_facility,generator_contact,notes
0,m54329U_200124_193652.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1596347,27122049640,27.12,47,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
1,m54329U_200127_180554.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1755465,29883779569,29.88,46,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
2,m54329U_200129_001928.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1779732,30200500419,30.2,45,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
3,m54329U_200130_064539.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1636421,27919782706,27.92,46,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
4,m54329U_200201_051510.ccs.bam,HG01123,s3://human-pangenomics/working/HPRC/HG01123/ra...,SRR13684290,SRP305758,SAMN17861232,2072143,34554070679,34.55,46,...,HiFi sequencing of 17kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,


In [13]:
# Drop the free-text 'notes' column when every value in it is NaN.
# The membership check keeps this cell safe to re-run: once the column has
# been dropped, indexing hifi_df['notes'] again would raise KeyError.
if 'notes' in hifi_df.columns and hifi_df['notes'].isna().all():
    hifi_df = hifi_df.drop(columns='notes')

In [14]:
# Submitter-metadata columns selected from hifi_df for the per-sample table.
hifi_1_submitter_metadata = [
    'filename', 'sample_ID',
    'library_ID', 'library_strategy', 'library_source',
    'library_selection', 'library_layout',
    'platform', 'instrument_model', 'design_description', 'data_type',
    'shear_method', 'size_selection',
    'ccs_algorithm', 'polymerase_version', 'seq_plate_chemistry_version',
    'generator_facility', 'generator_contact',
    'notes',
]

# Read-statistics columns selected from hifi_df (plus the sample/file keys).
hifi_5_readstats = [
    'sample_ID', 'filename',
    'total_reads', 'total_bp', 'total_Gbp',
    'min', 'max', 'mean',
    'quartile_25', 'quartile_50', 'quartile_75',
    'N25', 'N50', 'N75',
]


In [15]:
# One submitter-metadata row per sample (the first file row seen for each sample_ID).
# 'notes' is listed in hifi_1_submitter_metadata, but an earlier cell drops that
# column from hifi_df when it is entirely NaN; selecting it then would raise
# KeyError, so it is skipped when absent. Any other missing column still raises.
hifi_meta_columns_present = [
    column for column in hifi_1_submitter_metadata
    if column != 'notes' or 'notes' in hifi_df.columns
]
hifi_sample_meta_df = hifi_df[hifi_meta_columns_present].drop_duplicates(subset=['sample_ID']).reset_index(drop=True)

In [16]:
# Samples whose submitter metadata has a missing value in any column other
# than 'notes' (a missing note alone does not count as missing metadata).
# hifi_sample_meta_df holds one row per sample_ID, so a single row-wise check
# replaces re-filtering the whole frame once per sample.
hifi_required_meta_df = hifi_sample_meta_df.drop(columns=['notes'], errors='ignore')
hifi_row_has_gap = hifi_required_meta_df.isna().any(axis=1)
hifi_sample_missing_1_submitter_list = hifi_sample_meta_df.loc[hifi_row_has_gap, 'sample_ID'].tolist()
print(len(hifi_sample_missing_1_submitter_list))

20


In [17]:
# Export the metadata rows of the samples flagged as missing submitter fields.
# reset_index() is called without drop=True, so the previous row labels are
# written out as an extra 'index' column.
hifi_missing_meta_rows = hifi_sample_meta_df['sample_ID'].isin(hifi_sample_missing_1_submitter_list)
hifi_missing_meta_df = hifi_sample_meta_df[hifi_missing_meta_rows]
hifi_missing_meta_df.reset_index().to_csv('hprc_metadata_sample_files_missing_1_submitter_HiFi.tsv', sep='\t')

In [18]:
# Collapse the per-file HiFi read stats to one row per sample.
# Totals are summed; min/max/quartiles are averaged over a sample's files.
# NOTE(review): N25/N50/N75 are summed across files, which does not yield an
# N-statistic for the combined reads - confirm this is the intended value.
# NOTE(review): 'mean' is in hifi_5_readstats but has no entry here, so it is
# not carried into the per-sample result.
hifi_readstat_agg = {
    **dict.fromkeys(['total_reads', 'total_bp', 'total_Gbp'], 'sum'),
    **dict.fromkeys(['min', 'max', 'quartile_25', 'quartile_50', 'quartile_75'], 'mean'),
    **dict.fromkeys(['N25', 'N50', 'N75'], 'sum'),
}

hifi_sample_readstats_df = (
    hifi_df[hifi_5_readstats]
    .groupby('sample_ID')
    .agg(hifi_readstat_agg)
    .reset_index()
)

In [19]:
# Attach the per-sample read statistics to the per-sample metadata
# (inner join on sample_ID).
hifi_sample_df = hifi_sample_meta_df.merge(hifi_sample_readstats_df, how='inner', on='sample_ID')

In [20]:
# Check that the samples with missing 1_submitter metadata are still present
# in the merged table (one row for each flagged sample).
hifi_flagged_rows = hifi_sample_df['sample_ID'].isin(hifi_sample_missing_1_submitter_list)
assert hifi_flagged_rows.sum() == len(hifi_sample_missing_1_submitter_list)

In [21]:
# Write the per-sample HiFi aggregate table (tab-separated, row index included).
hifi_aggregate_tsv = 'hprc_metadata_sample_aggregate_HiFi.tsv'
hifi_sample_df.to_csv(hifi_aggregate_tsv, sep='\t')

# DeepConsensus

In [40]:
# Load the per-file DeepConsensus metadata table (tab-separated) and preview it.
dc_files_tsv = 'hprc_metadata_sample_files_DEEPCONSENSUS.tsv'
dc_df = pd.read_csv(dc_files_tsv, sep='\t')
dc_df.head()

Unnamed: 0,filename,sample_ID,path,accession,study,biosample_accession,total_reads,total_bp,total_Gbp,min,...,design_description,data_type,shear_method,size_selection,DeepConsensus_version,polymerase_version,seq_plate_chemistry_version,generator_facility,generator_contact,notes
0,HG00099.m54329U_220825_174247.dc.q20.fastq.gz,HG00099,s3://human-pangenomics/working/HPRC/HG00099/ra...,SRR26545347,SRP305758,SAMN33758778,2687625,56102007798,56.1,107,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
1,HG00099.m54329U_220827_143814.dc.q20.fastq.gz,HG00099,s3://human-pangenomics/working/HPRC/HG00099/ra...,SRR26545347,SRP305758,SAMN33758778,2643186,52918724515,52.92,345,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
2,HG00099.m54329U_220829_095708.dc.q20.fastq.gz,HG00099,s3://human-pangenomics/working/HPRC/HG00099/ra...,SRR26545347,SRP305758,SAMN33758778,2866917,57539149577,57.54,138,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
3,HG00140.m64043_220728_173215.dc.q20.fastq.gz,HG00140,s3://human-pangenomics/working/HPRC/HG00140/ra...,SRR26545346,SRP305758,SAMN33621941,572695,11345594738,11.35,117,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
4,HG00140.m64136_220715_182717.dc.q20.fastq.gz,HG00140,s3://human-pangenomics/working/HPRC/HG00140/ra...,SRR26545346,SRP305758,SAMN33621941,1674999,36918329655,36.92,400,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,


In [23]:
# Drop the free-text 'notes' column when every value in it is NaN.
# The membership check keeps this cell safe to re-run: once the column has
# been dropped, indexing dc_df['notes'] again would raise KeyError.
if 'notes' in dc_df.columns and dc_df['notes'].isna().all():
    dc_df = dc_df.drop(columns='notes')

In [28]:
# Submitter-metadata columns selected from dc_df for the per-sample table.
required_deepconsensus_submitter_columns = [
    'filename', 'sample_ID',
    'library_ID', 'library_strategy', 'library_source',
    'library_selection', 'library_layout',
    'platform', 'instrument_model', 'design_description', 'data_type',
    'shear_method', 'size_selection',
    'DeepConsensus_version', 'polymerase_version', 'seq_plate_chemistry_version',
    'generator_facility', 'generator_contact',
]

# Read-statistics columns selected from dc_df (plus the sample/file keys).
required_deepconsensus_readstats_columns = [
    'sample_ID', 'filename',
    'total_reads', 'total_bp', 'total_Gbp',
    'min', 'max', 'mean',
    'quartile_25', 'quartile_50', 'quartile_75',
    'N25', 'N50', 'N75',
]

In [25]:
# One submitter-metadata row per sample: the first file row seen for each
# sample_ID, renumbered from 0.
dc_submitter_df = dc_df.loc[:, required_deepconsensus_submitter_columns]
dc_sample_meta_df = dc_submitter_df.drop_duplicates(subset='sample_ID', keep='first').reset_index(drop=True)

In [26]:
# Samples whose submitter metadata has a missing value in any column other
# than 'notes' (a missing note alone does not count as missing metadata).
# dc_sample_meta_df holds one row per sample_ID, so a single row-wise check
# replaces re-filtering the whole frame once per sample. errors='ignore'
# covers the case where the frame has no 'notes' column.
dc_required_meta_df = dc_sample_meta_df.drop(columns=['notes'], errors='ignore')
dc_row_has_gap = dc_required_meta_df.isna().any(axis=1)
dc_sample_missing_1_submitter_list = dc_sample_meta_df.loc[dc_row_has_gap, 'sample_ID'].tolist()
print(len(dc_sample_missing_1_submitter_list))

0


In [33]:
# Check that no DeepConsensus 5_readstats column contains a missing value.
dc_readstat_has_nan = dc_df[required_deepconsensus_readstats_columns].isna().any()
assert not dc_readstat_has_nan.any()

In [36]:
# Collapse the per-file DeepConsensus read stats to one row per sample.
# Totals are summed; min/max/quartiles are averaged over a sample's files.
# NOTE(review): N25/N50/N75 are summed across files, which does not yield an
# N-statistic for the combined reads - confirm this is the intended value.
# NOTE(review): 'mean' is in required_deepconsensus_readstats_columns but has
# no entry here, so it is not carried into the per-sample result.
dc_readstat_agg = {
    **dict.fromkeys(['total_reads', 'total_bp', 'total_Gbp'], 'sum'),
    **dict.fromkeys(['min', 'max', 'quartile_25', 'quartile_50', 'quartile_75'], 'mean'),
    **dict.fromkeys(['N25', 'N50', 'N75'], 'sum'),
}

dc_sample_readstats_df = (
    dc_df[required_deepconsensus_readstats_columns]
    .groupby('sample_ID')
    .agg(dc_readstat_agg)
    .reset_index()
)

In [37]:
# Attach the per-sample read statistics to the per-sample metadata
# (inner join on sample_ID).
dc_sample_df = dc_sample_meta_df.merge(dc_sample_readstats_df, how='inner', on='sample_ID')

In [44]:
# Write the per-sample DeepConsensus aggregate table (tab-separated, row index included).
dc_aggregate_tsv = 'hprc_metadata_sample_aggregate_DEEPCONSENSUS.tsv'
dc_sample_df.to_csv(dc_aggregate_tsv, sep='\t')

Unnamed: 0,filename,sample_ID,library_ID,library_strategy,library_source,library_selection,library_layout,platform,instrument_model,design_description,...,total_bp,total_Gbp,min,max,quartile_25,quartile_50,quartile_75,N25,N50,N75
0,HG00099.m54329U_220825_174247.dc.q20.fastq.gz,HG00099,PG00099_1.HFSS_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,166559881890,166.56,196.666667,62248.666667,17449.333333,19636.333333,22651.666667,53851,61228,70913
1,HG00140.m64043_220728_173215.dc.q20.fastq.gz,HG00140,HG00140_lib1_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,126611334877,126.62,272.5,67774.5,18393.25,20448.5,23623.25,75494,85052,99985
2,HG00280.m54329U_220901_221341.dc.q20.fastq.gz,HG00280,PG00280.HFSS_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,151564344052,151.56,95.0,64738.666667,17189.666667,19452.666667,22546.0,53137,60800,70704
3,HG00323.m64043_220728_173215.dc.q20.fastq.gz,HG00323,HG00323_lib1_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,120027730827,120.03,368.75,60432.75,16941.5,19105.5,21983.25,69668,79575,91338
4,HG00408.m64136_211111_194404.dc.q20.fastq.gz,HG00408,HG00408_lib1_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,134450022475,134.45,396.666667,63712.666667,16502.333333,17981.0,21025.666667,50387,55732,67294
5,HG00558.m54329U_220107_233847.dc.q20.fastq.gz,HG00558,HG00558.HFSS_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,125476773513,125.47,278.25,71413.5,16664.25,20128.25,24542.0,71703,87285,106314
6,HG00597.m64043_211210_180342.dc.q20.fastq.gz,HG00597,HG00597_lib1_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,136188293000,136.19,507.5,56623.0,18041.5,19308.5,21024.75,72879,78351,85695
7,HG00639.m54329U_211222_104516.dc.q20.fastq.gz,HG00639,HG00639.HFSS_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,129823813816,129.83,171.75,67000.5,15945.5,19211.0,23407.5,68667,83391,101566
8,HG01074.m54329U_211110_112322.dc.q20.fastq.gz,HG01074,HG01074_SRE.HFSS_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,143703615375,143.7,450.75,65032.5,16286.25,18770.75,22208.25,67828,79291,94409
9,HG01081.m54329U_211223_214216.dc.q20.fastq.gz,HG01081,HG01081.HFSS_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,131091771689,131.09,279.0,73057.666667,15295.0,18874.0,23285.333333,50366,62247,76304
