In [2]:
import numpy as np
import pandas as pd
import collections

# Wrangling Functions

In [3]:
def check_missing_1_submitter(sample_df, required_1_submitter_columns):
    """Collapse per-file submitter metadata to one row per sample and flag gaps.

    For every ``sample_ID`` in ``sample_df`` each required column is reduced to
    a single scalar when all of the sample's rows compare equal in that column;
    otherwise the full per-row array of values is kept. NaN does not compare
    equal to itself, so a column containing NaN is kept as an array, which is
    the form ``_samples_missing_1_submitter`` inspects.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        Table with a ``sample_ID`` column plus every column named in
        ``required_1_submitter_columns``.
    required_1_submitter_columns : list of str
        Column names to summarise per sample.

    Returns
    -------
    submitter_df : pandas.DataFrame
        One row per sample (index = sample ID), one column per required column.
    samples_missing : list
        Unique sample IDs flagged by ``_samples_missing_1_submitter``, in
        first-flagged order so repeated runs return the same list.

    Raises
    ------
    AssertionError
        If the selected array does not have one column per required column.
    """
    # Seed every sample with one empty slot per required column. The dict keeps
    # samples in first-appearance order, which becomes the row order below.
    require_1_submitter_dict = {
        sample: dict.fromkeys(required_1_submitter_columns)
        for sample in sample_df['sample_ID'].tolist()
    }

    for sample in require_1_submitter_dict:
        sample_array = sample_df[sample_df['sample_ID'].isin([sample])][required_1_submitter_columns].values

        assert len(required_1_submitter_columns) == sample_array.shape[1]

        for column_index, column_name in enumerate(required_1_submitter_columns):
            column_values = sample_array[:, column_index]

            if np.all(column_values == column_values[0]):
                # Every row agrees: keep the single shared value.
                require_1_submitter_dict[sample][column_name] = column_values[0]
            else:
                # Rows disagree (or contain NaN): keep all per-row values.
                require_1_submitter_dict[sample][column_name] = column_values

    samples_missing_1_submitter = _samples_missing_1_submitter(require_1_submitter_dict)

    submitter_df = pd.DataFrame.from_dict(require_1_submitter_dict, orient='index')

    # De-duplicate while preserving order; set() ordering is not reproducible
    # between interpreter runs.
    return submitter_df, list(dict.fromkeys(samples_missing_1_submitter))


def _samples_missing_1_submitter(require_1_submitter_dict):
    """Return sample IDs whose array-valued metadata contains a missing entry.

    ``require_1_submitter_dict`` maps sample ID -> {column name -> value}.
    Only values that are ``numpy.ndarray`` are inspected; an entry counts as
    missing when it equals the string ``'nan'`` or ``pd.isna`` is true for it.
    The ``'notes'`` column is never reported. A sample ID is emitted once per
    offending column, so the result may contain duplicates.
    """

    def _contains_missing(entries):
        return any(entry == 'nan' or pd.isna(entry) for entry in entries)

    return [
        sample_id
        for sample_id, meta_dict in require_1_submitter_dict.items()
        for column_name, column_value in meta_dict.items()
        if isinstance(column_value, np.ndarray)
        and _contains_missing(column_value)
        and column_name != 'notes'
    ]


def sample_aggregate_5_readstats(sample_df, required_readstat_columns, readstat_dict):
    """Aggregate per-file read statistics into one row per ``sample_ID``.

    Selects ``required_readstat_columns`` from ``sample_df``, groups the rows
    by ``sample_ID`` and applies the column -> aggregation mapping in
    ``readstat_dict``. Selected columns without an entry in ``readstat_dict``
    do not appear in the result. ``sample_ID`` is returned as a regular column.
    """
    return (
        sample_df[required_readstat_columns]
        .groupby('sample_ID')
        .agg(readstat_dict)
        .reset_index()
    )

# ONT

In [5]:
# Load the ONT sample-files metadata table (tab-separated) into a DataFrame.
ont_df = pd.read_csv('sample-files/hprc_metadata_sample_files_ONT.tsv', sep='\t')

In [93]:
# Remove the 'notes' column when every value is NaN.
# The membership check keeps this cell safe to re-run once the column is gone.
if 'notes' in ont_df.columns and ont_df['notes'].isnull().all():
    ont_df = ont_df.drop('notes', axis=1)

In [94]:
# TODO: Integrate with HPRC_metadata/merge_metadata.py combine_readstats.keep_columns
# Submitter-provided columns that are summarised per ONT sample.
required_ONT_1_submitter_columns = [
    'filename', 'sample_ID', 'path', 'filetype',
    'library_ID', 'library_strategy', 'library_source', 'library_selection', 'library_layout',
    'platform', 'instrument_model', 'design_description', 'data_type',
    'shear_method', 'size_selection', 'seq_kit',
    'basecaller', 'basecaller_version', 'basecaller_model',
    'generator_facility', 'generator_contact',
]

# Write the column list out as a one-column, tab-separated table.
required_ONT_1_submitter_columns_df = pd.DataFrame(
    {'1_submitter_columns': required_ONT_1_submitter_columns}
)
required_ONT_1_submitter_columns_df.to_csv(
    'aggregate-sample-inputs/hprc_1_submitter_columns_ONT.tsv',
    sep='\t',
)

In [97]:
# Per-file read-statistic columns that feed the per-sample ONT aggregation.
required_ONT_5_readstats_columns = ['sample_ID',
                                    'filename',
                                    'read_N50',
                                    'Gb',
                                    'coverage',
                                    '100kb+',
                                    '200kb+',
                                    '300kb+',
                                    '400kb+',
                                    '500kb+',
                                    '1Mb+',
                                    'whales']

# Export the list under its own name. Rebinding required_ONT_5_readstats_columns
# to a DataFrame would break the sample_aggregate_5_readstats call further down,
# which uses this name to select columns.
required_ONT_5_readstats_columns_df = pd.DataFrame(required_ONT_5_readstats_columns, columns=['5_readstats_columns'])
required_ONT_5_readstats_columns_df.to_csv('aggregate-sample-inputs/hprc_5_readstats_columns_ONT.tsv', sep='\t')

In [105]:
# Aggregation applied to each ONT read statistic when per-file rows are
# collapsed to one row per sample: read_N50 is averaged, the rest are summed.
required_ONT_5_readstat_dict = {
    'read_N50': 'mean',
    **dict.fromkeys(
        ['coverage', '100kb+', '200kb+', '300kb+', '400kb+', '500kb+', '1Mb+', 'whales'],
        'sum',
    ),
}

# Save the rules as a tab-separated table indexed by statistic name.
pd.DataFrame.from_dict(
    required_ONT_5_readstat_dict, orient='index'
).to_csv(
    'aggregate-sample-inputs/hprc_5_readstat_sample_aggregate_rules_ONT.tsv',
    sep='\t',
)
# Snippet for loading the rules back (note: its file name differs from the one written above):
# pd.read_csv('hprc_5_readstat_sample_aggregate_rules.tsv',sep='\t', index_col=[0]).to_dict()['0']

In [47]:
# Summarise submitter metadata per ONT sample and collect the samples with gaps.
ont_1_submitter_df, samples_missing_1_submmiter_ont = check_missing_1_submitter(
    ont_df,
    required_ONT_1_submitter_columns,
)
# Write the report only when at least one sample was flagged.
if samples_missing_1_submmiter_ont:
    ont_1_submitter_df.loc[
        ont_1_submitter_df['sample_ID'].isin(samples_missing_1_submmiter_ont)
    ].to_csv('aggregate-sample-inputs/hprc_metadata_sample_files_missing_1_submitter_ONT.tsv', sep='\t')

In [48]:
# Collapse the per-file ONT read stats to one row per sample.
# Arguments: sample_df, required_readstat_columns, readstat_dict
ont_5_readstats_df = sample_aggregate_5_readstats(ont_df, required_ONT_5_readstats_columns, required_ONT_5_readstat_dict)

In [49]:
# Join per-sample submitter metadata with the aggregated read stats on sample_ID
# (pd.merge defaults to an inner join).
ont_sample_df = pd.merge(ont_1_submitter_df, ont_5_readstats_df, on='sample_ID')

In [72]:
# (rows, columns) of the merged ONT per-sample table
ont_sample_df.shape

(124, 30)

In [68]:
# Number of distinct sample IDs in the merged ONT table
len(set(ont_sample_df['sample_ID'].tolist()))

124

In [75]:
# Keep samples whose summed '100kb+' value is at most 30.0.
# NOTE(review): the variable says "low coverage" but the filter is on the
# '100kb+' column, not 'coverage', and 30.0 is an unexplained threshold — confirm.
ont_low_coverage_sample_df = ont_sample_df[ont_sample_df['100kb+'] <= 30.0]

In [78]:
# Samples that are both below the '100kb+' threshold and flagged for missing submitter metadata.
ont_low_coverage_sample_df.loc[
    ont_low_coverage_sample_df['sample_ID'].isin(set(samples_missing_1_submmiter_ont))
]

Unnamed: 0,filename,sample_ID,path,filetype,library_ID,library_strategy,library_source,library_selection,library_layout,platform,...,generator_contact,read_N50,coverage,100kb+,200kb+,300kb+,400kb+,500kb+,1Mb+,whales
3,[02_08_22_R941_HG03834_1_Guppy_6.5.7_450bps_mo...,HG03834,[s3://human-pangenomics/working/HPRC/HG03834/r...,"[nan, nan, nan]","[02_08_22_R941_HG03834_1, 02_08_22_R941_HG0383...","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]",...,"[nan, nan, nan]",107647.333333,48.8,26.33,8.97,3.23,1.35,0.64,0.03,99
6,[02_15_22_R941_HG03041_1_Guppy_6.5.7_450bps_mo...,HG03041,[s3://human-pangenomics/working/HPRC/HG03041/r...,"[nan, nan, nan]","[02_15_22_R941_HG03041_1, 02_15_22_R941_HG0304...","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]",...,"[nan, nan, nan]",92468.0,59.41,27.28,7.89,2.09,0.65,0.25,0.01,32
7,[02_15_22_R941_HG03050_1_Guppy_6.5.7_450bps_mo...,HG03050,[s3://human-pangenomics/working/HPRC/HG03050/r...,"[nan, nan, nan]","[02_15_22_R941_HG03050_1, 02_15_22_R941_HG0305...","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]",...,"[nan, nan, nan]",92270.666667,60.55,27.88,7.14,1.54,0.4,0.13,0.0,15
8,[02_15_22_R941_HG03239_1_Guppy_6.5.7_450bps_mo...,HG03239,[s3://human-pangenomics/working/HPRC/HG03239/r...,"[nan, nan, nan]","[02_15_22_R941_HG03239_1, 02_15_22_R941_HG0323...","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]",...,"[nan, nan, nan]",82147.0,64.02,25.48,5.39,1.03,0.29,0.12,0.0,16
19,[04_12_22_R941_HG03195_1_Guppy_6.5.7_450bps_mo...,HG03195,[s3://human-pangenomics/working/HPRC/HG03195/r...,"[nan, nan, nan, nan]","[04_12_22_R941_HG03195_1, 04_12_22_R941_HG0319...","[nan, nan, nan, nan]","[nan, nan, nan, nan]","[nan, nan, nan, nan]","[nan, nan, nan, nan]","[nan, nan, nan, nan]",...,"[nan, nan, nan, nan]",79603.5,47.55,18.08,3.57,0.78,0.27,0.14,0.02,46
22,[05_17_22_R941_HG03130_1_Guppy_6.5.7_450bps_mo...,HG03130,[s3://human-pangenomics/working/HPRC/HG03130/r...,"[nan, nan, nan]","[05_17_22_R941_HG03130_1, 05_17_22_R941_HG0313...","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]",...,"[nan, nan, nan]",77859.0,58.86,21.29,3.12,0.3,0.05,0.02,0.0,2
23,[05_17_22_R941_HG03139_1_Guppy_6.5.7_450bps_mo...,HG03139,[s3://human-pangenomics/working/HPRC/HG03139/r...,"[nan, nan, nan]","[05_17_22_R941_HG03139_1, 05_17_22_R941_HG0313...","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]",...,"[nan, nan, nan]",87795.333333,57.65,24.8,5.31,1.03,0.29,0.12,0.0,25
24,[05_17_22_R941_HG03209_1_Guppy_6.5.7_450bps_mo...,HG03209,[s3://human-pangenomics/working/HPRC/HG03209/r...,"[nan, nan, nan]","[05_17_22_R941_HG03209_1, 05_17_22_R941_HG0320...","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]",...,"[nan, nan, nan]",84171.333333,70.94,29.79,6.57,1.05,0.14,0.02,0.0,1
28,[05_24_22_R941_HG03458_1_Guppy_6.5.7_450bps_mo...,HG03458,[s3://human-pangenomics/working/HPRC/HG03458/r...,"[nan, nan, nan]","[05_24_22_R941_HG03458_1, 05_24_22_R941_HG0345...","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]",...,"[nan, nan, nan]",82253.0,68.27,27.82,7.42,1.58,0.3,0.05,0.0,0
89,[08_10_22_R941_HG03792_1_Guppy_6.5.7_450bps_mo...,HG03792,[s3://human-pangenomics/working/HPRC/HG03792/r...,"[nan, nan, nan]","[08_10_22_R941_HG03792_1, 08_10_22_R941_HG0379...","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]","[nan, nan, nan]",...,"[nan, nan, nan]",57058.0,73.05,17.48,2.69,0.38,0.06,0.02,0.0,1


### Remove

In [5]:
# Rows of ont_df that have a missing value in any column.
ont_sample_missing_1_submitter_df = ont_df[ont_df.isnull().any(axis=1)]

# Number of ONT samples with at least one file row that contains a missing value
print(len(set(ont_df[ont_df['sample_ID'].isin(ont_sample_missing_1_submitter_df['sample_ID'].tolist())]['sample_ID'].tolist())))

# Export every file row belonging to those samples
ont_df[ont_df['sample_ID'].isin(ont_sample_missing_1_submitter_df['sample_ID'].tolist())].reset_index(drop=True).to_csv('hprc_metadata_sample_files_missing_1_submitter_ONT.tsv', sep='\t')

# NOTE(review): ont_df_nan_columns is not defined in the cells visible here;
# it presumably comes from earlier kernel state — confirm before re-running.
# The forward fill runs down the whole column, so a gap is filled from the
# preceding row regardless of which sample that row belongs to.
for nan_column in ont_df_nan_columns:
    ont_df[nan_column] = ont_df[nan_column].ffill().tolist()


14


In [6]:
# Split ont_df into per-sample metadata (first row kept per sample_ID) and
# per-file read stats.
# NOTE(review): ont_meta_columns and ont_readstat_columns are not defined in
# the cells visible here — confirm where they come from before re-running.
ont_sample_meta_df = ont_df[ont_meta_columns].drop_duplicates(subset=['sample_ID'])
ont_sample_readstats_df = ont_df[ont_readstat_columns]

In [7]:
# Sanity check: every sample_ID in ont_df also appears in both the metadata
# subset and the read-stat subset (both counts of unmatched rows must be 0).
assert ont_df[~ont_df['sample_ID'].isin(ont_sample_meta_df['sample_ID'].tolist())].shape[0] == ont_df[~ont_df['sample_ID'].isin(ont_sample_readstats_df['sample_ID'].tolist())].shape[0] == 0

In [8]:
# Collapse per-file read stats to one row per sample.
# NOTE(review): these rules repeat required_ONT_5_readstat_dict defined in the
# ONT section above; keep the two in sync or reuse that dict.
ont_sample_readstats_df = ont_sample_readstats_df.groupby('sample_ID').agg({'read_N50': 'mean',
                                                                          'coverage':'sum',
                                                                          '100kb+': 'sum',
                                                                          '200kb+': 'sum',
                                                                          '300kb+': 'sum',
                                                                          '400kb+': 'sum',
                                                                          '500kb+': 'sum',
                                                                          '1Mb+': 'sum',
                                                                          'whales': 'sum'}).reset_index()

In [9]:
# Join the de-duplicated metadata with the aggregated read stats on sample_ID.
# NOTE(review): this rebinds ont_sample_df, replacing the frame built earlier
# from the check_missing_1_submitter output.
ont_sample_df = pd.merge(ont_sample_meta_df, ont_sample_readstats_df, on='sample_ID')

In [10]:
# sample_ID must be unique in the merged table (one row per sample).
assert len(set(ont_sample_df['sample_ID'].tolist())) == ont_sample_df.shape[0]

In [11]:
# Write the per-sample ONT table as a tab-separated file.
ont_sample_df.to_csv('hprc_metadata_sample_aggregate_ONT.tsv', sep='\t')

# HiFi

In [3]:
# Load the HiFi sample-files metadata table (tab-separated) and preview it.
hifi_df = pd.read_csv('sample-files/hprc_metadata_sample_files_HiFi.tsv', sep='\t')
hifi_df.head()

Unnamed: 0,filename,sample_ID,path,accession,study,biosample_accession,total_reads,total_bp,total_Gbp,min,...,design_description,data_type,shear_method,size_selection,ccs_algorithm,polymerase_version,seq_plate_chemistry_version,generator_facility,generator_contact,notes
0,m54329U_200124_193652.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1596347,27122049640,27.12,47,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
1,m54329U_200127_180554.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1755465,29883779569,29.88,46,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
2,m54329U_200129_001928.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1779732,30200500419,30.2,45,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
3,m54329U_200130_064539.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1636421,27919782706,27.92,46,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
4,m54329U_200201_051510.ccs.bam,HG01123,s3://human-pangenomics/working/HPRC/HG01123/ra...,SRR13684290,SRP305758,SAMN17861232,2072143,34554070679,34.55,46,...,HiFi sequencing of 17kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,


In [4]:
# Remove the 'notes' column when every value is NaN.
# The membership check keeps this cell safe to re-run once the column is gone.
if 'notes' in hifi_df.columns and hifi_df['notes'].isnull().all():
    hifi_df = hifi_df.drop('notes', axis=1)

In [14]:
# Submitter-provided columns that are summarised per HiFi sample.
required_hifi_1_submitter_columns = ['sample_ID', 'filename', 'library_ID', 'library_strategy', 'library_source', 'library_selection', 'library_layout', 'platform', 'instrument_model', 
                    'design_description', 'data_type', 'shear_method', 'size_selection', 'ccs_algorithm', 'polymerase_version', 'seq_plate_chemistry_version', 
                    'generator_facility', 'generator_contact']
# 'notes' is optional: the cleanup cell above drops it from hifi_df when every
# value is NaN, and selecting a dropped column would raise a KeyError.
if 'notes' in hifi_df.columns:
    required_hifi_1_submitter_columns.append('notes')

# Per-file read-statistic columns that feed the per-sample HiFi aggregation.
required_hifi_5_readstats_columns = ['sample_ID','filename', 'total_reads', 'total_bp', 'total_Gbp', 'min', 'max', 'mean', 'quartile_25', 'quartile_50', 'quartile_75', 'N25', 'N50', 'N75']

# Aggregation applied to each statistic when file rows are collapsed per sample.
# NOTE(review): 'mean' is selected above but has no rule here, so it does not
# appear in the aggregated output — confirm this is intended.
# NOTE(review): 'min' and 'max' are averaged across files rather than reduced
# with min/max — confirm this is intended.
required_hifi_5_readstat_dict = {'total_reads': 'sum',
                 'total_bp': 'sum',
                 'total_Gbp': 'sum',
                 'min': 'mean',
                 'max': 'mean',
                 'quartile_25': 'mean',
                 'quartile_50': 'mean',
                 'quartile_75': 'mean',
                 'N25': 'mean',
                 'N50': 'mean', # N50 is an assembly-style statistic; the ONT table uses 'read_N50' instead
                 'N75': 'mean'}


In [6]:
# Summarise submitter metadata per HiFi sample and collect the samples with gaps.
hifi_1_submitter_df, samples_missing_1_submmiter_hifi = check_missing_1_submitter(hifi_df, required_hifi_1_submitter_columns)
# Write the report only when something was flagged, as the ONT and
# DeepConsensus cells do; otherwise a header-only file would be written.
if len(samples_missing_1_submmiter_hifi) > 0:
    hifi_1_submitter_df[hifi_1_submitter_df['sample_ID'].isin(samples_missing_1_submmiter_hifi)].to_csv('hprc_metadata_sample_files_missing_1_submitter_HiFi.tsv', sep='\t')

In [7]:
# Collapse the per-file HiFi read stats to one row per sample.
# Arguments: sample_df, required_readstat_columns, readstat_dict
# (the bare name `readstat_dict` only exists as a function parameter, so the
# HiFi rules dict defined above is passed explicitly).
hifi_5_readstats_df = sample_aggregate_5_readstats(hifi_df, required_hifi_5_readstats_columns, required_hifi_5_readstat_dict)

In [8]:
# Join per-sample submitter metadata with the aggregated read stats on sample_ID
# (pd.merge defaults to an inner join).
hifi_sample_df = pd.merge(hifi_1_submitter_df, hifi_5_readstats_df, on='sample_ID')

In [9]:
# Check that every sample flagged for missing 1_submitter metadata appears
# exactly once in the merged table.
assert hifi_sample_df[hifi_sample_df['sample_ID'].isin(samples_missing_1_submmiter_hifi)].shape[0] == len(set(samples_missing_1_submmiter_hifi))

In [10]:
# Write the per-sample HiFi table as a tab-separated file.
hifi_sample_df.to_csv('hprc_metadata_sample_aggregate_HiFi.tsv', sep='\t')

### Remove

In [42]:

# Inline version of the per-sample collapse that check_missing_1_submitter
# performs: one dict per sample with one slot per required column.
require_hifi_1_submitter_dict = {sample: {key: None for key in required_hifi_1_submitter_columns} for sample in hifi_df['sample_ID'].tolist()} 
for sample in list(set(hifi_df['sample_ID'].tolist())):
    # All file rows of this sample, restricted to the required columns.
    sample_array = hifi_df[hifi_df['sample_ID'].isin([sample])][required_hifi_1_submitter_columns].values

    assert len(required_hifi_1_submitter_columns) == sample_array.shape[1]

    for column_index in range(sample_array.shape[1]):
        if np.all(sample_array[:, column_index] == sample_array[0, column_index]):
            # Every row agrees: keep the single shared value.
            require_hifi_1_submitter_dict[sample][required_hifi_1_submitter_columns[column_index]] = sample_array[0, column_index]
        else:
            # Rows disagree (NaN never compares equal): keep all per-row values.
            require_hifi_1_submitter_dict[sample][required_hifi_1_submitter_columns[column_index]] = sample_array[:, column_index]

In [19]:
# Inline version of _samples_missing_1_submitter: collect sample IDs whose
# array-valued metadata contains 'nan' or a missing value, ignoring 'notes'.
# A sample is appended once per offending column, so the list can hold duplicates.
samples_missing_submmiter_1_hifi = []
for sample_id, meta_dict in require_hifi_1_submitter_dict.items():
    for key, value in meta_dict.items():
        if isinstance(value, np.ndarray):
            if any(item == 'nan' or pd.isna(item) for item in value):
                if key == 'notes':
                    pass
                else:
                    samples_missing_submmiter_1_hifi.append(sample_id)
# One row per sample (index = sample ID), one column per required column.
hifi_1_submitter_df = pd.DataFrame.from_dict(require_hifi_1_submitter_dict, orient='index')

In [None]:
# Write the rows of flagged samples as a tab-separated report.
hifi_1_submitter_df[hifi_1_submitter_df['sample_ID'].isin(samples_missing_submmiter_1_hifi)].to_csv('hprc_metadata_sample_files_missing_1_submitter_HiFi.tsv', sep='\t')

In [22]:
# # Retired
# hifi_sample_meta_df = hifi_df[required_hifi_1_submitter_columns].drop_duplicates(subset=['sample_ID']).reset_index(drop=True)
# hifi_sample_missing_1_submitter_list = []
# for sample in hifi_sample_meta_df['sample_ID'].tolist():
#     if sum(hifi_sample_meta_df[hifi_sample_meta_df['sample_ID'].isin([sample])].isna().any()) == 0:
#         pass
#     else:
#         hifi_sample_missing_1_submitter_df = hifi_sample_meta_df[hifi_sample_meta_df['sample_ID'].isin([sample])]
#         if hifi_sample_missing_1_submitter_df.columns[hifi_sample_missing_1_submitter_df.isna().any()].tolist() == ['notes']:
#             pass
#         else:
#             # print(sample, hifi_sample_missing_1_submitter_df.columns[hifi_sample_missing_1_submitter_df.isna().any()].tolist())
#             hifi_sample_missing_1_submitter_list.append(sample)
# print(len(hifi_sample_missing_1_submitter_list))

# hifi_sample_meta_df[hifi_sample_meta_df['sample_ID'].isin(hifi_sample_missing_1_submitter_list)].reset_index().to_csv('hprc_metadata_sample_files_missing_1_submitter_HiFi.tsv', sep='\t')

In [23]:
# Collapse per-file HiFi read stats to one row per sample.
# NOTE(review): these rules repeat required_hifi_5_readstat_dict defined in the
# HiFi section above; keep the two in sync or reuse that dict.
hifi_sample_readstats_df = hifi_df[required_hifi_5_readstats_columns]
hifi_sample_readstats_df = hifi_sample_readstats_df.groupby('sample_ID').agg({'total_reads': 'sum',
                                                   'total_bp': 'sum',
                                                   'total_Gbp': 'sum',
                                                   'min': 'mean',
                                                   'max': 'mean',
                                                   'quartile_25': 'mean',
                                                   'quartile_50': 'mean',
                                                   'quartile_75': 'mean',
                                                   'N25': 'mean',
                                                   'N50': 'mean', # N50 is an assembly-style statistic; the ONT table uses 'read_N50' instead
                                                   'N75': 'mean'}).reset_index()

In [24]:
# Join per-sample submitter metadata with the aggregated read stats on sample_ID.
# NOTE(review): this rebinds hifi_sample_df, replacing the frame built in the
# HiFi section above.
hifi_sample_df = pd.merge(hifi_1_submitter_df, hifi_sample_readstats_df, on='sample_ID')

In [25]:
# Check that every sample flagged for missing 1_submitter metadata appears
# exactly once in the merged table.
assert hifi_sample_df[hifi_sample_df['sample_ID'].isin(samples_missing_submmiter_1_hifi)].shape[0] == len(set(samples_missing_submmiter_1_hifi))

In [26]:
# Write the per-sample HiFi table as a tab-separated file
# (same output path as the export cell in the HiFi section above).
hifi_sample_df.to_csv('hprc_metadata_sample_aggregate_HiFi.tsv', sep='\t')

In [28]:
# (rows, columns) of the merged HiFi per-sample table
hifi_sample_df.shape

(122, 30)

# DeepConsensus

In [17]:
# Load the DeepConsensus sample-files metadata table (tab-separated) and preview it.
# NOTE(review): unlike the ONT and HiFi loads, this path has no 'sample-files/'
# prefix — confirm the file location.
dc_df = pd.read_csv('hprc_metadata_sample_files_DEEPCONSENSUS.tsv', sep='\t')
dc_df.head()

Unnamed: 0,filename,sample_ID,path,accession,study,biosample_accession,total_reads,total_bp,total_Gbp,min,...,design_description,data_type,shear_method,size_selection,DeepConsensus_version,polymerase_version,seq_plate_chemistry_version,generator_facility,generator_contact,notes
0,HG00099.m54329U_220825_174247.dc.q20.fastq.gz,HG00099,s3://human-pangenomics/working/HPRC/HG00099/ra...,SRR26545347,SRP305758,SAMN33758778,2687625,56102007798,56.1,107,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
1,HG00099.m54329U_220827_143814.dc.q20.fastq.gz,HG00099,s3://human-pangenomics/working/HPRC/HG00099/ra...,SRR26545347,SRP305758,SAMN33758778,2643186,52918724515,52.92,345,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
2,HG00099.m54329U_220829_095708.dc.q20.fastq.gz,HG00099,s3://human-pangenomics/working/HPRC/HG00099/ra...,SRR26545347,SRP305758,SAMN33758778,2866917,57539149577,57.54,138,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
3,HG00140.m64043_220728_173215.dc.q20.fastq.gz,HG00140,s3://human-pangenomics/working/HPRC/HG00140/ra...,SRR26545346,SRP305758,SAMN33621941,572695,11345594738,11.35,117,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
4,HG00140.m64136_220715_182717.dc.q20.fastq.gz,HG00140,s3://human-pangenomics/working/HPRC/HG00140/ra...,SRR26545346,SRP305758,SAMN33621941,1674999,36918329655,36.92,400,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,


In [12]:
# Remove the 'notes' column when every value is NaN.
# The membership check keeps this cell safe to re-run once the column is gone.
if 'notes' in dc_df.columns and dc_df['notes'].isnull().all():
    dc_df = dc_df.drop('notes', axis=1)

In [19]:
# Submitter-provided columns that are summarised per DeepConsensus sample.
required_deepconsensus_1_submitter_columns = [
    'filename', 'sample_ID',
    'library_ID', 'library_strategy', 'library_source', 'library_selection', 'library_layout',
    'platform', 'instrument_model', 'design_description', 'data_type',
    'shear_method', 'size_selection',
    'DeepConsensus_version', 'polymerase_version', 'seq_plate_chemistry_version',
    'generator_facility', 'generator_contact',
]

# Per-file read-statistic columns that feed the per-sample aggregation.
required_deepconsensus_5_readstats_columns = [
    'sample_ID', 'filename',
    'total_reads', 'total_bp', 'total_Gbp',
    'min', 'max', 'mean',
    'quartile_25', 'quartile_50', 'quartile_75',
    'N25', 'N50', 'N75',
]

# Aggregation per statistic: totals are summed, the remaining statistics averaged.
# 'mean' is listed in the columns above but has no rule here.
required_deepconsensus_5_readstat_dict = {
    **dict.fromkeys(['total_reads', 'total_bp', 'total_Gbp'], 'sum'),
    **dict.fromkeys(
        ['min', 'max', 'quartile_25', 'quartile_50', 'quartile_75', 'N25', 'N50', 'N75'],
        'mean',
    ),
}

In [24]:
# Summarise submitter metadata per DeepConsensus sample and collect the samples with gaps.
# NOTE(review): the flagged list reuses the name samples_missing_1_submmiter_hifi,
# so it overwrites the HiFi list from the HiFi section — consider a dedicated name.
dc_1_submitter_df, samples_missing_1_submmiter_hifi = check_missing_1_submitter(dc_df, 
                                                                                required_deepconsensus_1_submitter_columns)
if len(samples_missing_1_submmiter_hifi) > 0:
    # The mask must come from the DeepConsensus frame itself; a mask built from
    # hifi_1_submitter_df is indexed by a different set of samples.
    dc_1_submitter_df[dc_1_submitter_df['sample_ID'].isin(samples_missing_1_submmiter_hifi)].to_csv('hprc_metadata_sample_files_missing_1_submitter_DeepConsensus.tsv', sep='\t')

In [26]:
# Collapse the per-file DeepConsensus read stats to one row per sample.
# Arguments: sample_df, required_readstat_columns, readstat_dict
dc_5_readstats_df = sample_aggregate_5_readstats(dc_df, 
                                                 required_deepconsensus_5_readstats_columns, 
                                                 required_deepconsensus_5_readstat_dict)

In [27]:
# Join per-sample submitter metadata with the aggregated read stats on sample_ID
# (pd.merge defaults to an inner join).
dc_sample_df = pd.merge(dc_1_submitter_df, dc_5_readstats_df, on='sample_ID')

In [30]:
# Write the per-sample DeepConsensus table as a tab-separated file.
dc_sample_df.to_csv('hprc_metadata_sample_aggregate_DEEPCONSENSUS.tsv', sep='\t')

# Sample Aggregate

In [66]:
# Every sample ID that appears in at least one of the three per-sample tables.
merge_samples = (
    set(dc_sample_df['sample_ID'].tolist())
    | set(hifi_sample_df['sample_ID'].tolist())
    | set(ont_sample_df['sample_ID'].tolist())
)

In [83]:
# Preview the first rows of the ONT per-sample table
ont_sample_df.head()

Unnamed: 0,filename,sample_ID,path,filetype,library_ID,library_strategy,library_source,library_selection,library_layout,platform,...,generator_contact,read_N50,coverage,100kb+,200kb+,300kb+,400kb+,500kb+,1Mb+,whales
0,02_08_22_R941_HG00558_1_Guppy_6.5.7_450bps_mod...,HG00558,s3://human-pangenomics/working/HPRC/HG00558/ra...,bam,02_08_22_R941_HG00558_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,98073.0,54.43,26.69,7.3,1.6,0.45,0.18,0.0,35
1,02_08_22_R941_HG01252_1_Guppy_6.5.7_450bps_mod...,HG01252,s3://human-pangenomics/working/HPRC/HG01252/ra...,bam,02_08_22_R941_HG01252_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,79697.666667,47.53,17.45,3.02,0.66,0.18,0.07,0.0,7
2,02_08_22_R941_HG02258_1_Guppy_6.5.7_450bps_mod...,HG02258,s3://human-pangenomics/working/HPRC/HG02258/ra...,bam,02_08_22_R941_HG02258_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,98838.5,49.73,25.01,6.62,1.26,0.26,0.08,0.0,5
3,02_08_22_R941_HG03834_1_Guppy_6.5.7_450bps_mod...,HG03834,s3://human-pangenomics/working/HPRC/HG03834/ra...,bam,02_08_22_R941_HG03834_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,107647.333333,48.8,26.33,8.97,3.23,1.35,0.64,0.03,99
4,02_15_22_R941_HG00658_1_Guppy_6.5.7_450bps_mod...,HG00658,s3://human-pangenomics/working/HPRC/HG00658/ra...,bam,02_15_22_R941_HG00658_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,90313.333333,68.25,30.8,8.37,1.85,0.49,0.18,0.0,16


In [None]:
# Group the ONT per-sample table by Age and Gender, collecting the other columns into lists.
# NOTE(review): 'Age' and 'Gender' are not among the submitter or read-stat
# columns used to build ont_sample_df in the cells above, and this cell has no
# execution count — confirm these columns exist before running.
ont_sample_df.groupby(['Age', 'Gender']).agg(lambda x: x.tolist()).reset_index()


In [82]:
# Collect the filename and path values into one list per sample_ID
ont_sample_df[['sample_ID','filename','path']].groupby(['sample_ID']).agg(lambda x: x.tolist()).reset_index()

Unnamed: 0,sample_ID,filename,path
0,GM18522,[05_17_22_R941_GM18522_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18522/r...
1,GM18570,[08_10_22_R941_GM18570_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18570/r...
2,GM18612,[06_28_22_R941_GM18612_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18612/r...
3,GM18747,[06_28_22_R941_GM18747_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18747/r...
4,GM18971,[06_28_22_R941_GM18971_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18971/r...
...,...,...,...
119,HG04184,[08_25_21_R941_HG04184_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/HG04184/r...
120,HG04187,[08_10_21_R941_HG04187_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/HG04187/r...
121,HG04199,[08_10_21_R941_HG04199_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/HG04199/r...
122,HG04204,[08_10_21_R941_HG04204_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/HG04204/r...
