In [1]:
import numpy as np
import pandas as pd
import collections

# ONT

In [2]:
ont_df = pd.read_csv('hprc_metadata_sample_files_ONT.tsv', sep='\t')

In [3]:
# Remove notes if all nan
if ont_df['notes'].isnull().all() == True:
    ont_df = ont_df.drop('notes', axis=1)

In [4]:
# TODO: Integrate with HPRC_metadata/merge_metadata.py combine_readstats.keep_columns
ont_meta_columns = ['filename',
        'sample_ID',
        'path',
        'filetype',
        'library_ID',
        'library_strategy',
        'library_source',
        'library_selection',
        'library_layout',
        'platform',
        'instrument_model',
        'design_description',
        'data_type',
        'shear_method',
        'size_selection',
        'seq_kit',
        'basecaller',
        'basecaller_version',
        'basecaller_model',
        'generator_facility',
        'generator_contact']

ont_readstat_columns = ['sample_ID', 
                        'filename',
                        'read_N50',
                        'Gb',
                        'coverage',
                        '100kb+',
                        '200kb+',
                        '300kb+',
                        '400kb+',
                        '500kb+',
                        '1Mb+',
                        'whales']

ont_df_nan_columns = ont_df.columns[ont_df.isna().any()].tolist()

In [5]:
ont_sample_missing_1_submitter_df = ont_df[ont_df.isnull().any(axis=1)]

# Number of ONT samples that have at least one missing submitter file missing
print(len(set(ont_df[ont_df['sample_ID'].isin(ont_sample_missing_1_submitter_df['sample_ID'].tolist())]['sample_ID'].tolist())))

# Export ONT sample files that have at least one missing submitter file missing
ont_df[ont_df['sample_ID'].isin(ont_sample_missing_1_submitter_df['sample_ID'].tolist())].reset_index(drop=True).to_csv('hprc_metadata_sample_files_missing_1_submitter_ONT.tsv', sep='\t')

for nan_column in ont_df_nan_columns:
    ont_df[nan_column] = ont_df[nan_column].ffill().tolist()


14


In [6]:
ont_sample_meta_df = ont_df[ont_meta_columns].drop_duplicates(subset=['sample_ID'])
ont_sample_readstats_df = ont_df[ont_readstat_columns]

In [7]:
# Check no missing sample_ID for the sample meta and readstat subset
assert ont_df[~ont_df['sample_ID'].isin(ont_sample_meta_df['sample_ID'].tolist())].shape[0] == ont_df[~ont_df['sample_ID'].isin(ont_sample_readstats_df['sample_ID'].tolist())].shape[0] == 0

In [8]:
ont_sample_readstats_df = ont_sample_readstats_df.groupby('sample_ID').agg({'read_N50': 'mean',
                                                                          'coverage':'sum',
                                                                          '100kb+': 'sum',
                                                                          '200kb+': 'sum',
                                                                          '300kb+': 'sum',
                                                                          '400kb+': 'sum',
                                                                          '500kb+': 'sum',
                                                                          '1Mb+': 'sum',
                                                                          'whales': 'sum'}).reset_index()

In [9]:
ont_sample_df = pd.merge(ont_sample_meta_df, ont_sample_readstats_df, on='sample_ID')

In [10]:
assert len(set(ont_sample_df['sample_ID'].tolist())) == ont_sample_df.shape[0]

In [11]:
ont_sample_df.to_csv('hprc_metadata_sample_aggregate_ONT.tsv', sep='\t')

# HiFi

In [84]:
hifi_df = pd.read_csv('hprc_metadata_sample_files_HiFi.tsv', sep='\t')
hifi_df.head()

Unnamed: 0,filename,sample_ID,path,accession,study,biosample_accession,total_reads,total_bp,total_Gbp,min,...,design_description,data_type,shear_method,size_selection,ccs_algorithm,polymerase_version,seq_plate_chemistry_version,generator_facility,generator_contact,notes
0,m54329U_200124_193652.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1596347,27122049640,27.12,47,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
1,m54329U_200127_180554.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1755465,29883779569,29.88,46,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
2,m54329U_200129_001928.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1779732,30200500419,30.2,45,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
3,m54329U_200130_064539.ccs.bam,HG01891,s3://human-pangenomics/working/HPRC/HG01891/ra...,SRR13684280,SRP305758,SAMN17861236,1636421,27919782706,27.92,46,...,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,
4,m54329U_200201_051510.ccs.bam,HG01123,s3://human-pangenomics/working/HPRC/HG01123/ra...,SRR13684290,SRP305758,SAMN17861232,2072143,34554070679,34.55,46,...,HiFi sequencing of 17kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,


In [85]:
# Remove notes if all nan
if hifi_df['notes'].isnull().all() == True:
    hifi_df = hifi_df.drop('notes', axis=1)

In [86]:
required_hifi_1_submitter_columns = ['filename', 'sample_ID', 'library_ID', 'library_strategy', 'library_source', 'library_selection', 'library_layout', 'platform', 'instrument_model', 
                    'design_description', 'data_type', 'shear_method', 'size_selection', 'ccs_algorithm', 'polymerase_version', 'seq_plate_chemistry_version', 
                    'generator_facility', 'generator_contact', 'notes']

required_hifi_5_readstats_columns = ['sample_ID','filename', 'total_reads', 'total_bp', 'total_Gbp', 'min', 'max', 'mean', 'quartile_25', 'quartile_50', 'quartile_75', 'N25', 'N50', 'N75']


In [87]:
# hifi_sample_meta_df = hifi_df[required_hifi_1_submitter_columns].drop_duplicates(subset=['sample_ID']).reset_index(drop=True)

In [179]:
require_hifi_1_submitter_dict = {sample: {key: None for key in required_hifi_1_submitter_columns} for sample in hifi_df['sample_ID'].tolist()} 
for sample in list(set(hifi_df['sample_ID'].tolist())):
    # print(sample, required_hifi_1_submitter_columns)
    
    sample_array = hifi_df[hifi_df['sample_ID'].isin([sample])][required_hifi_1_submitter_columns].values
    
    # print(sample)
    # print(sample_array)
    # print(len(required_hifi_1_submitter_columns))
    
    assert len(required_hifi_1_submitter_columns) == sample_array.shape[1]
    
    for column_index in range(sample_array.shape[1]):
        # print(sample_array[0, column_index], sample_array[:, column_index])
        if np.all(sample_array[:, column_index] == sample_array[0, column_index]):
            # print(sample_array[0, column_index], 
            #       required_hifi_1_submitter_columns[column_index], 
            #       sample_array[:, column_index].tolist()[0])
            # print('\n')
            require_hifi_1_submitter_dict[sample][required_hifi_1_submitter_columns[column_index]] = sample_array[0, column_index]
        else:
            # print(sample_array[0, column_index])
            require_hifi_1_submitter_dict[sample][required_hifi_1_submitter_columns[column_index]] = sample_array[:, column_index]
            # print(required_hifi_1_submitter_columns[column_index],
                  # sample_array[:, column_index])
            # print('\n')
            # pass

In [180]:
pd.DataFrame.from_dict(require_hifi_1_submitter_dict, orient='index')

Unnamed: 0,filename,sample_ID,library_ID,library_strategy,library_source,library_selection,library_layout,platform,instrument_model,design_description,data_type,shear_method,size_selection,ccs_algorithm,polymerase_version,seq_plate_chemistry_version,generator_facility,generator_contact,notes
HG01891,"[m54329U_200124_193652.ccs.bam, m54329U_200127...",HG01891,HG01891.HiFiEx_f2,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 18kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,"[nan, nan, nan, nan]"
HG01123,"[m54329U_200201_051510.ccs.bam, m54329U_200203...",HG01123,HG01123_HiFiEx_f2,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 17kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,"[nan, nan, nan, nan]"
HG02559,"[m54329U_200211_192235.ccs.bam, m54329U_200217...",HG02559,"[HG02559.HiFiEx_b6, HG02559.HiFiEx_f2, HG02559...",WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA,unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,"[nan, nan, nan, nan, nan]"
HG02486,"[m54329U_200215_021808.ccs.bam, m54329U_200313...",HG02486,"[HG02486.HiFiEx_f2, HG02486.HiFiEx2_f2, HG0248...",WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,"[HiFi sequencing of 22kb fractionated gDNA, Hi...",unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,"[nan, second library prep, second library prep..."
HG01361,"[m54329U_200306_185930.ccs.bam, m54329U_200308...",HG01361,"[HG01361.HiFiEx_f2, HG01361.HiFiEx_f2, HG01361...",WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,"[HiFi sequencing of 20kb fractionated gDNA, Hi...",unaligned reads,g-TUBE,SageELF,ccs 4.0.0 (commit SL-release-8.0.0),P2.0,C2.0,University of Washington,kmiyamot@uw.edu,"[nan, nan, nan, nan, nan]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HG01934,"[m64136_210723_184622.hifi_reads.bam, m64136_2...",HG01934,HG01934_lib1,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA,unaligned reads,Megaruptor 1,SageELF,10.1.0.115913,P2,C2,Washington University,tgraves@wustl.edu,bc1020 barcoded adapter
HG00408,[m64136_211111_194404-bc1002.5mc.hifi_reads.ba...,HG00408,HG00408_lib1,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA,unaligned reads,Megaruptor 1,SageELF,6.0.0,P2.2,C2,Washington University,tgraves@wustl.edu,"[nan, nan, nan]"
HG02129,[m64136_220422_181133-bc1001.5mc.hifi_reads.ba...,HG02129,HG02129_lib1,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA,unaligned reads,Megaruptor 1,SageELF,6.0.0,P2.2,C2,Washington University,tgraves@wustl.edu,"[nan, nan, nan]"
NA20805,[m64136_220726_203708-bc1020.5mc.hifi_reads.ba...,NA20805,NA20805_lib1,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA,unaligned reads,Megaruptor 1,SageELF,6.0.0,P2.2,C2,Washington University,tgraves@wustl.edu,"[nan, nan, nan]"


In [16]:
hifi_sample_missing_1_submitter_list = []
for sample in hifi_sample_meta_df['sample_ID'].tolist():
    if sum(hifi_sample_meta_df[hifi_sample_meta_df['sample_ID'].isin([sample])].isna().any()) == 0:
        pass
    else:
        hifi_sample_missing_1_submitter_df = hifi_sample_meta_df[hifi_sample_meta_df['sample_ID'].isin([sample])]
        if hifi_sample_missing_1_submitter_df.columns[hifi_sample_missing_1_submitter_df.isna().any()].tolist() == ['notes']:
            pass
        else:
            # print(sample, hifi_sample_missing_1_submitter_df.columns[hifi_sample_missing_1_submitter_df.isna().any()].tolist())
            hifi_sample_missing_1_submitter_list.append(sample)
print(len(hifi_sample_missing_1_submitter_list))

20


In [17]:
hifi_sample_meta_df[hifi_sample_meta_df['sample_ID'].isin(hifi_sample_missing_1_submitter_list)].reset_index().to_csv('hprc_metadata_sample_files_missing_1_submitter_HiFi.tsv', sep='\t')

In [52]:
hifi_sample_readstats_df = hifi_df[required_hifi_5_readstats_columns]
hifi_sample_readstats_df = hifi_sample_readstats_df.groupby('sample_ID').agg({'total_reads': 'sum',
                                                   'total_bp': 'sum',
                                                   'total_Gbp': 'sum',
                                                   'min': 'mean',
                                                   'max': 'mean',
                                                   'quartile_25': 'mean',
                                                   'quartile_50': 'mean',
                                                   'quartile_75': 'mean',
                                                   'N25': 'mean',
                                                   'N50': 'mean', # assembly term shortest contig of the total assembly length (different for ONT)
                                                   'N75': 'mean'}).reset_index()

In [53]:
hifi_sample_df = pd.merge(hifi_sample_meta_df, hifi_sample_readstats_df, on='sample_ID')

In [54]:
# Check that sample's with missing 1_submitter metadata are present
assert hifi_sample_df[hifi_sample_df['sample_ID'].isin(hifi_sample_missing_1_submitter_list)].shape[0] == len(hifi_sample_missing_1_submitter_list)

In [55]:
hifi_sample_df.to_csv('hprc_metadata_sample_aggregate_HiFi.tsv', sep='\t')

# Deep Consensus

In [56]:
dc_df = pd.read_csv('hprc_metadata_sample_files_DEEPCONSENSUS.tsv', sep='\t')
dc_df.head()

Unnamed: 0,filename,sample_ID,path,accession,study,biosample_accession,total_reads,total_bp,total_Gbp,min,...,design_description,data_type,shear_method,size_selection,DeepConsensus_version,polymerase_version,seq_plate_chemistry_version,generator_facility,generator_contact,notes
0,HG00099.m54329U_220825_174247.dc.q20.fastq.gz,HG00099,s3://human-pangenomics/working/HPRC/HG00099/ra...,SRR26545347,SRP305758,SAMN33758778,2687625,56102007798,56.1,107,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
1,HG00099.m54329U_220827_143814.dc.q20.fastq.gz,HG00099,s3://human-pangenomics/working/HPRC/HG00099/ra...,SRR26545347,SRP305758,SAMN33758778,2643186,52918724515,52.92,345,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
2,HG00099.m54329U_220829_095708.dc.q20.fastq.gz,HG00099,s3://human-pangenomics/working/HPRC/HG00099/ra...,SRR26545347,SRP305758,SAMN33758778,2866917,57539149577,57.54,138,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
3,HG00140.m64043_220728_173215.dc.q20.fastq.gz,HG00140,s3://human-pangenomics/working/HPRC/HG00140/ra...,SRR26545346,SRP305758,SAMN33621941,572695,11345594738,11.35,117,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,
4,HG00140.m64136_220715_182717.dc.q20.fastq.gz,HG00140,s3://human-pangenomics/working/HPRC/HG00140/ra...,SRR26545346,SRP305758,SAMN33621941,1674999,36918329655,36.92,400,...,HiFi sequencing of 20kb fractionated gDNA reba...,unaligned reads,Megaruptor 3,PippinHT,1.2,P2,C2,University of Washington,kmiyamot@uw.edu,


In [57]:
# Remove notes if all nan
if dc_df['notes'].isnull().all() == True:
    dc_df = dc_df.drop('notes', axis=1)

In [58]:
required_deepconsensus_submitter_columns = ['filename', 'sample_ID', 'library_ID', 'library_strategy', 'library_source', 
                                            'library_selection', 'library_layout', 'platform', 'instrument_model', 'design_description', 
                                            'data_type', 'shear_method', 'size_selection', 
                                            'DeepConsensus_version', 'polymerase_version', 
                                            'seq_plate_chemistry_version', 'generator_facility', 
                                            'generator_contact']

required_deepconsensus_readstats_columns = ['sample_ID', 'filename', 'total_reads', 'total_bp', 'total_Gbp', 'min', 'max', 'mean', 'quartile_25', 'quartile_50', 'quartile_75', 'N25', 'N50', 'N75']

In [59]:
dc_sample_meta_df = dc_df[required_deepconsensus_submitter_columns].drop_duplicates(subset=['sample_ID']).reset_index(drop=True)

In [69]:
# dc_sample_meta_df

In [60]:
dc_sample_missing_1_submitter_list = []
for sample in dc_sample_meta_df['sample_ID'].tolist():
    if sum(dc_sample_meta_df[dc_sample_meta_df['sample_ID'].isin([sample])].isna().any()) == 0:
        pass
    else:
        dc_sample_missing_1_submitter_df = dc_sample_meta_df[dc_sample_meta_df['sample_ID'].isin([sample])]
        if dc_sample_missing_1_submitter_df.columns[dc_sample_missing_1_submitter_df.isna().any()].tolist() == ['notes']:
            pass
        else:
            # print(sample, hifi_sample_missing_1_submitter_df.columns[hifi_sample_missing_1_submitter_df.isna().any()].tolist())
            dc_sample_missing_1_submitter_list.append(sample)
print(len(dc_sample_missing_1_submitter_list))

0


In [61]:
# Check all dc sample 5_readstats data is present
assert sum(dc_df[required_deepconsensus_readstats_columns].isna().any()) == 0

In [62]:
dc_sample_readstats_df = dc_df[required_deepconsensus_readstats_columns]

dc_sample_readstats_df = dc_sample_readstats_df.groupby('sample_ID').agg({'total_reads': 'sum',
                                               'total_bp': 'sum',
                                               'total_Gbp': 'sum',
                                               'min': 'mean',
                                               'max': 'mean',
                                               'quartile_25': 'mean',
                                               'quartile_50': 'mean',
                                               'quartile_75': 'mean',
                                               'N25': 'mean',
                                               'N50': 'mean',
                                               'N75': 'mean'}).reset_index()

In [63]:
dc_sample_df = pd.merge(dc_sample_meta_df, dc_sample_readstats_df, on='sample_ID')

In [64]:
dc_sample_df.to_csv('hprc_metadata_sample_aggregate_DEEPCONSENSUS.tsv', sep='\t')

In [65]:
dc_sample_df.head() # finished table with no non-unique
# filename, library_ID

Unnamed: 0,filename,sample_ID,library_ID,library_strategy,library_source,library_selection,library_layout,platform,instrument_model,design_description,...,total_bp,total_Gbp,min,max,quartile_25,quartile_50,quartile_75,N25,N50,N75
0,HG00099.m54329U_220825_174247.dc.q20.fastq.gz,HG00099,PG00099_1.HFSS_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,166559881890,166.56,196.666667,62248.666667,17449.333333,19636.333333,22651.666667,17950.333333,20409.333333,23637.666667
1,HG00140.m64043_220728_173215.dc.q20.fastq.gz,HG00140,HG00140_lib1_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,126611334877,126.62,272.5,67774.5,18393.25,20448.5,23623.25,18873.5,21263.0,24996.25
2,HG00280.m54329U_220901_221341.dc.q20.fastq.gz,HG00280,PG00280.HFSS_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,151564344052,151.56,95.0,64738.666667,17189.666667,19452.666667,22546.0,17712.333333,20266.666667,23568.0
3,HG00323.m64043_220728_173215.dc.q20.fastq.gz,HG00323,HG00323_lib1_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,120027730827,120.03,368.75,60432.75,16941.5,19105.5,21983.25,17417.0,19893.75,22834.5
4,HG00408.m64136_211111_194404.dc.q20.fastq.gz,HG00408,HG00408_lib1_dc,WGS,GENOMIC,size fractionation,single,PACBIO_SMRT,PacBio Sequel II,HiFi sequencing of 20kb fractionated gDNA reba...,...,134450022475,134.45,396.666667,63712.666667,16502.333333,17981.0,21025.666667,16795.666667,18577.333333,22431.333333


# Samples merged by modality

In [66]:
merge_samples = set(dc_sample_df['sample_ID'].tolist()).union(hifi_sample_df['sample_ID'].tolist(), ont_sample_df['sample_ID'].tolist())

In [83]:
ont_sample_df.head()

Unnamed: 0,filename,sample_ID,path,filetype,library_ID,library_strategy,library_source,library_selection,library_layout,platform,...,generator_contact,read_N50,coverage,100kb+,200kb+,300kb+,400kb+,500kb+,1Mb+,whales
0,02_08_22_R941_HG00558_1_Guppy_6.5.7_450bps_mod...,HG00558,s3://human-pangenomics/working/HPRC/HG00558/ra...,bam,02_08_22_R941_HG00558_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,98073.0,54.43,26.69,7.3,1.6,0.45,0.18,0.0,35
1,02_08_22_R941_HG01252_1_Guppy_6.5.7_450bps_mod...,HG01252,s3://human-pangenomics/working/HPRC/HG01252/ra...,bam,02_08_22_R941_HG01252_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,79697.666667,47.53,17.45,3.02,0.66,0.18,0.07,0.0,7
2,02_08_22_R941_HG02258_1_Guppy_6.5.7_450bps_mod...,HG02258,s3://human-pangenomics/working/HPRC/HG02258/ra...,bam,02_08_22_R941_HG02258_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,98838.5,49.73,25.01,6.62,1.26,0.26,0.08,0.0,5
3,02_08_22_R941_HG03834_1_Guppy_6.5.7_450bps_mod...,HG03834,s3://human-pangenomics/working/HPRC/HG03834/ra...,bam,02_08_22_R941_HG03834_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,107647.333333,48.8,26.33,8.97,3.23,1.35,0.64,0.03,99
4,02_15_22_R941_HG00658_1_Guppy_6.5.7_450bps_mod...,HG00658,s3://human-pangenomics/working/HPRC/HG00658/ra...,bam,02_15_22_R941_HG00658_1,WGS,GENOMIC,RANDOM,single,OXFORD_NANOPORE,...,iviolich@ucsc.edu,90313.333333,68.25,30.8,8.37,1.85,0.49,0.18,0.0,16


In [None]:
ont_sample_df.groupby(['Age', 'Gender']).agg(lambda x: x.tolist()).reset_index()


In [82]:
ont_sample_df[['sample_ID','filename','path']].groupby(['sample_ID']).agg(lambda x: x.tolist()).reset_index()

Unnamed: 0,sample_ID,filename,path
0,GM18522,[05_17_22_R941_GM18522_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18522/r...
1,GM18570,[08_10_22_R941_GM18570_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18570/r...
2,GM18612,[06_28_22_R941_GM18612_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18612/r...
3,GM18747,[06_28_22_R941_GM18747_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18747/r...
4,GM18971,[06_28_22_R941_GM18971_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/NA18971/r...
...,...,...,...
119,HG04184,[08_25_21_R941_HG04184_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/HG04184/r...
120,HG04187,[08_10_21_R941_HG04187_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/HG04187/r...
121,HG04199,[08_10_21_R941_HG04199_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/HG04199/r...
122,HG04204,[08_10_21_R941_HG04204_1_Guppy_6.5.7_450bps_mo...,[s3://human-pangenomics/working/HPRC/HG04204/r...
