In [111]:
import ast
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [112]:
batch11 = pd.read_csv('HPRC_Assembly_s3Locs_batch11-initial.tsv', sep='\t')

# Check that files are present in the main data tables
hifi_metadata = pd.read_csv('/private/groups/hprc/human-pangenomics/documentation-metadata/HPRC_metadata/data/hprc-data-explorer-tables/HPRC_PacBio_HiFi.tsv',sep='\t')

ont_metadata = pd.read_csv('/private/groups/hprc/human-pangenomics/documentation-metadata/HPRC_metadata/data/hprc-data-explorer-tables/HPRC_ONT.tsv',sep='\t')

illumina_pedigree_df = pd.read_csv('/private/groups/hprc/human-pangenomics/documentation-metadata/HPRC_metadata/submissions/Illumina_1KG/Illumina_pedigree.transfer.csv')
illumina_df = pd.read_csv('/private/groups/hprc/human-pangenomics/documentation-metadata/HPRC_metadata/submissions/Illumina_1KG/Illumina_Y1-Y4.transfer.csv')

ont_y4 = pd.read_csv('/private/groups/hprc/human-pangenomics/documentation-metadata/HPRC_metadata/submissions/UCSC_HPRC_nanopore_Year4/1_metadata/UCSC_HPRC_nanopore_Year4_final_table.csv')

In [113]:
# All missing:
# HiFi coverage is missing
# ONT coverage over 100kb 
assert batch11[batch11.hifi_cov.isna()].shape[0] == batch11.shape[0]
assert batch11[batch11.ont_cov_over100kb.isna()].shape[0] == batch11.shape[0]


In [114]:
# apply list character to  type  ist
batch11['hifi']  = batch11['hifi'].apply(ast.literal_eval)
batch11['nanopore'] = batch11['nanopore'].apply(ast.literal_eval)
batch11['hic'] = batch11['hic'].apply(ast.literal_eval)
batch11['hic_r1'] = batch11['hic_r1'].apply(ast.literal_eval)
batch11['hic_r2'] = batch11['hic_r2'].apply(ast.literal_eval)

In [115]:
for data in batch11[batch11['sample_id'].isin(['NA19909'])]['hifi'].tolist()[0]:
	print(data)

s3://human-pangenomics/working/HPRC/NA19909/raw_data/PacBio_HiFi/m84081_230523_180945_s1.hifi_reads.bc2005.bam
s3://human-pangenomics/working/HPRC/NA19909/raw_data/PacBio_HiFi/m84081_230525_172954_s3.hifi_reads.bc2005.bam
s3://human-pangenomics/working/HPRC/NA19909/raw_data/PacBio_HiFi/m84081_230609_194256_s2.hifi_reads.bc2001.bam
s3://human-pangenomics/working/HPRC/NA19909/raw_data/PacBio_HiFi/m84081_230728_194836_s2.hifi_reads.bc2029.bam
s3://human-pangenomics/working/HPRC/NA19909/raw_data/PacBio_HiFi/m84081_230728_201942_s3.hifi_reads.bc2029.bam
s3://human-pangenomics/working/HPRC/NA19909/raw_data/PacBio_HiFi/m84081_231207_200206_s1.hifi_reads.bc2008.bam


In [116]:
ont_y4['sample_ID'] = [sample_id.split('_')[-2] for sample_id in ont_y4['library_ID'].tolist()]

In [117]:
# Find samples that are missing or have mismatched HiFi and ONT files
missing_files = []  # List to store missing files information

for data in batch11[['sample_id', 'hifi', 'nanopore', 'hic']].values:
    sample_id, hifi, nanopore, hic = data[0], data[1], data[2], data[3]
    
    # Extracting HiFi and ONT sample information
    hifi_sample = hifi_metadata[hifi_metadata['sample_ID'].isin([sample_id])]
    ont_sample = ont_y4[ont_y4['sample_ID'].isin([sample_id])]

    print(f"Checking sample: {sample_id}")
    
    # Check HiFi and ONT files
    try:
        # Split and compare filenames for HiFi files
        hifi_expected = hifi_sample['filename'].tolist()
        hifi_found = [filename.split('/')[-1] for filename in hifi]
        
        # Split and compare filenames for ONT files
        ont_expected = ont_sample['filename'].tolist()
        ont_found = [filename.split('/')[-1] for filename in nanopore]
        
        # Check if HiFi filenames match
        if sorted(hifi_expected) != sorted(hifi_found):
            raise AssertionError(f"Mismatch in HiFi files for sample {sample_id}")

        # Check if ONT filenames match
        if sorted(ont_expected) != sorted(ont_found):
            raise AssertionError(f"Mismatch in ONT files for sample {sample_id}")

    except AssertionError as e:
        # If AssertionError occurs, log missing files and the error message
        print(f"AssertionError for sample: {sample_id} - {e}")
        missing_files.append({
            'sample_id': sample_id,
            'hifi_expected': hifi_expected,
            'hifi_found': hifi_found,
            'ont_expected': ont_expected,
            'ont_found': ont_found
        })
        continue  # Continue to the next sample

print('\n')
# After loop, print out missing or mismatched files
if missing_files:
    print("Samples with missing or mismatched files:")
    for missing in missing_files:
        print(f"Sample ID: {missing['sample_id']}")
        print(f"HiFi Expected: {missing['hifi_expected']}")
        print(f"HiFi Found: {missing['hifi_found']}")
        print(f"ONT Expected: {missing['ont_expected']}")
        print(f"ONT Found: {missing['ont_found']}\n")
else:
    print("No missing or mismatched files found.")


Checking sample: HG01167
Checking sample: NA20827
AssertionError for sample: NA20827 - Mismatch in HiFi files for sample NA20827
Checking sample: NA19131
Checking sample: HG00344
Checking sample: HG00350
Checking sample: HG00253
Checking sample: NA21102
Checking sample: NA20762
AssertionError for sample: NA20762 - Mismatch in HiFi files for sample NA20762
Checking sample: HG03521
Checking sample: HG00235
Checking sample: HG03369
Checking sample: NA20809
Checking sample: NA18565
Checking sample: NA20850
Checking sample: NA18879
Checking sample: HG04153
Checking sample: NA20282
Checking sample: NA20346
Checking sample: HG00329
Checking sample: HG00272
Checking sample: HG03784
Checking sample: NA20806
AssertionError for sample: NA20806 - Mismatch in HiFi files for sample NA20806
Checking sample: NA19682
Checking sample: NA19909
Checking sample: NA21144
Checking sample: HG00097
Checking sample: NA19776
Checking sample: NA19835


Samples with missing or mismatched files:
Sample ID: NA20827


In [102]:
# subset out human technopole to start assembly
batch11_ht = batch11[batch11['HiFi Prod Site'] == 'Human Technopole']
batch11 = batch11[~batch11['HiFi Prod Site'].isin(['Human Technopole'])]

In [103]:
batch11_ht

Unnamed: 0,sample_id,biosample_accession,cohort,Production Year,Sex,isMaleSample,paternal_id,maternal_id,Subpopulation,Superpopulation,HiFi Prod Site,hifi,hifi_cov,nanopore,ont_cov_over100kb,hic,hic_cov,hic_r1,hic_r2
1,NA20827,SAMN41021650,HPRC,YR4,male,True,,,TSI,EUR,Human Technopole,[],,[s3://human-pangenomics/working/HPRC/NA20827/r...,,[s3://human-pangenomics/submissions/1005B25C-E...,12.54,[s3://human-pangenomics/submissions/1005B25C-E...,[s3://human-pangenomics/submissions/1005B25C-E...
7,NA20762,SAMN41021652,HPRC,YR4,male,True,,,TSI,EUR,Human Technopole,[],,[s3://human-pangenomics/working/HPRC/NA20762/r...,,[s3://human-pangenomics/submissions/1005B25C-E...,16.1,[s3://human-pangenomics/submissions/1005B25C-E...,[s3://human-pangenomics/submissions/1005B25C-E...
21,NA20806,SAMN41021648,HPRC,YR4,male,True,,,TSI,EUR,Human Technopole,[],,[s3://human-pangenomics/working/HPRC/NA20806/r...,,[s3://human-pangenomics/submissions/1005B25C-E...,35.52,[s3://human-pangenomics/submissions/1005B25C-E...,[s3://human-pangenomics/submissions/1005B25C-E...


In [124]:
# batch11.loc[batch11['sample_id'] == 'NA19909', 'hifi'] = [['m84081_230523_180945_s1.hifi_reads.bc2005.bam',
#                                                            'm84081_230525_172954_s3.hifi_reads.bc2005.bam']]


In [None]:
# # Use a single list as the value for the selected row.
# batch11.loc[batch11['sample_id'] == 'NA19909', 'hifi'] = [['m84081_230523_180945_s1.hifi_reads.bc2005.bam',
#                                                            'm84081_230525_172954_s3.hifi_reads.bc2005.bam']]


In [104]:
# add hifi coverage 
hifi_coverage_df = pd.DataFrame(round(hifi_metadata[hifi_metadata['sample_ID'].isin(batch11.sample_id)].groupby('sample_ID')['total_Gbp'].sum() / 3.1).astype(int))
hifi_coverage_df['sample_id'] = hifi_coverage_df.index.tolist()
hifi_coverage_df.reset_index(drop=True, inplace=True)
hifi_coverage_df.rename(columns={'total_Gbp':'hifi_cov'},inplace=True)
hifi_coverage_dict= {data[1]:data[0] for data in hifi_coverage_df.values}
batch11['hifi_cov'] = [hifi_coverage_dict[sample] for sample in batch11['sample_id'].tolist()]
# hifi_coverage_df.set_index('sample_id').to_dict(orient='index')
# batch11.drop('hifi_cov', axis=1, inplace=True)
# batch11 = pd.merge(batch11, hifi_coverage_df, on='sample_id', how='inner')

In [105]:
# add ont coverage 
ont_y4_coverage_df = ont_y4[['sample_ID','100kb+']]
ont_y4_coverage_df.rename(columns={'sample_ID':'sample_id','100kb+':'ont_cov_over100kb'}, inplace=True)
ont_y4_coverage_df = pd.DataFrame(round(ont_y4_coverage_df.groupby('sample_id')['ont_cov_over100kb'].sum()).astype(int))
ont_y4_coverage_df['sample_id'] = ont_y4_coverage_df.index.tolist()
ont_y4_coverage_df.reset_index(drop=True, inplace=True)
ont_y4_coverage_dict = {data[1]:data[0] for data in ont_y4_coverage_df.values}
batch11['ont_cov_over100kb'] = [ont_y4_coverage_dict[sample] for sample in batch11['sample_id'].tolist()]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ont_y4_coverage_df.rename(columns={'sample_ID':'sample_id','100kb+':'ont_cov_over100kb'}, inplace=True)


In [74]:
# check if there are trios
# illumina_pedigree_df[illumina_pedigree_df['sample_id'].isin(batch11['sample_id'].tolist())]

In [106]:
illumina_dict = {sample[0]:sample[1] for sample in illumina_df[illumina_df['sample_id'].isin(batch11['sample_id'].tolist())][['sample_id','aws-submission-child']].values}

In [107]:
batch11['child_ilmn'] = [[illumina_dict[sample]] for sample in batch11['sample_id'].tolist()]

DNA extraction with Qiagen MagAttract HMW Kit

Sample sheared with Megaruptor1

Library Size selected with Sage ELF 1kb-18kb cassette


m84081_230523_180945_s1.hifi_reads.bc2005.bam

m84081_230525_172954_s3.hifi_reads.bc2005.bam

In [110]:
# # Use a single list as the value for the selected row.
# batch11.loc[batch11['sample_id'] == 'NA19909', 'hifi'] = [['m84081_230523_180945_s1.hifi_reads.bc2005.bam',
#                                                            'm84081_230525_172954_s3.hifi_reads.bc2005.bam']]


In [45]:
# batch11[batch11['sample_id'].isin(['NA19909'])]
# m84081_230523_180945_s1.hifi_reads.bc2005.bam
# m84081_230525_172954_s3.hifi_reads.bc2005.bam

In [35]:
batch11.to_csv('HPRC_Assembly_S3Locs_batch11.csv',index=False)

In [126]:
# manual update on NA19909
batch11 = pd.read_csv('HPRC_Assembly_S3Locs_batch11.csv')

In [132]:
batch11['hifi']  = batch11['hifi'].apply(ast.literal_eval)

In [136]:
for data in batch11[batch11['sample_id'].isin(['NA19909'])]['hifi'].tolist()[0]:
	print(data)

s3://human-pangenomics/working/HPRC/NA19909/raw_data/PacBio_HiFi/m84081_230523_180945_s1.hifi_reads.bc2005.bam
s3://human-pangenomics/working/HPRC/NA19909/raw_data/PacBio_HiFi/m84081_230525_172954_s3.hifi_reads.bc2005.bam


In [146]:
round(hifi_metadata[hifi_metadata['filename'].isin(['m84081_230523_180945_s1.hifi_reads.bc2005.bam',
								'm84081_230525_172954_s3.hifi_reads.bc2005.bam'])].groupby('sample_ID')['total_Gbp'].sum() / 3.1).astype(int)[0]

  round(hifi_metadata[hifi_metadata['filename'].isin(['m84081_230523_180945_s1.hifi_reads.bc2005.bam',


64

In [147]:
# set new coverage from metadata sheet
batch11.loc[batch11['sample_id'] == 'NA19909', 'hifi_cov'] = round(hifi_metadata[hifi_metadata['filename'].isin(['m84081_230523_180945_s1.hifi_reads.bc2005.bam',
								'm84081_230525_172954_s3.hifi_reads.bc2005.bam'])].groupby('sample_ID')['total_Gbp'].sum() / 3.1).astype(int)[0]

  batch11.loc[batch11['sample_id'] == 'NA19909', 'hifi_cov'] = round(hifi_metadata[hifi_metadata['filename'].isin(['m84081_230523_180945_s1.hifi_reads.bc2005.bam',
