In [1]:
import collections


import boto3
import glob
import json

import pandas as pd
from collections import defaultdict
import collections
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [5]:
assembly_table = pd.read_csv('/private/groups/hprc/human-pangenomics/hprc-data-index/hprc_intermediate_assembly/data_tables/assemblies_pre_release_v0.2.index.csv')
assembly_table = assembly_table[['sample_id','phasing']]

In [6]:
collections.Counter(assembly_table['phasing'])

Counter({'trio': 111, 'hi-c': 85})

In [11]:
illumina = pd.read_csv('/private/groups/hprc/human-pangenomics/hprc-synapse-1/HPRC_metadata/data/hprc-data-explorer-tables/Illumina_final_table.csv')
illumina.rename(columns={'sample_ID':'sample_id'},inplace=True)

In [12]:
pedigree = pd.read_csv('/private/groups/hprc/human-pangenomics/hprc-synapse-1/HPRC_metadata/data/IGSR/20130606_g1k.ped',sep='\t')
pedigree.rename(columns={'Individual ID':'sample_id'},inplace=True)

In [13]:
illumina_assembly = pd.merge(illumina,assembly_table,
							 on='sample_id',
							 how='inner')

In [68]:
# for data in illumina_assembly[illumina_assembly['phasing'].isin(['trio'])].sample_id.tolist():
# 	if data not in illumina.sample_id.tolist():
# 		print(data)

# for data in illumina_assembly[illumina_assembly['phasing'].isin(['hi-c'])].sample_id.tolist():
# 	if data not in illumina.sample_id.tolist():
# 		print(data)

In [51]:
# print(illumina_assembly.shape)
# print(assembly_table.shape)
# print(illumina.shape)

In [16]:
# samples withouth the Illumina path
assembly_table[~assembly_table['sample_id'].isin(illumina_assembly['sample_id'].tolist())]

Unnamed: 0,sample_id,phasing
50,HG02486,trio
51,HG02559,trio
79,HG03471,hi-c
95,HG01123,trio
166,HG02109,trio
173,NA21309,trio


In [41]:
# Initialize S3 client
s3 = boto3.client('s3', region_name='us-west-2')

# Define bucket and prefix
bucket_name = 'human-pangenomics'
prefix = 'working/HPRC/'

# Function to recursively list all files in the specified prefix
def list_s3_files(bucket, prefix):
    files = []
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        if 'Contents' in page:
            for obj in page['Contents']:
                files.append(obj['Key'])
    return files

# List files
all_files = list_s3_files(bucket_name, prefix)

# Print all files with the desired s3 path format
s3_paths = [f"s3://{bucket_name}/{file_path}" for file_path in all_files]
illumina_working = [illumina for illumina in s3_paths if 'Illumina' in illumina]

illumina_df = pd.DataFrame(illumina_working, columns=['path'])

# Extract sample_id from file paths and create a new column
illumina_df['sample_id'] = illumina_df['path'].str.extract(r'/HPRC/([^/]+)/')
illumina_df = illumina_df[~illumina_df['path'].str.endswith(('.final.cram.crai', '.final.cram.md5'))]

child_paths = illumina_df[illumina_df['path'].str.contains('/child/')]
parent_paths = illumina_df[illumina_df['path'].str.contains('/parents/')]

In [20]:
illumina_df.head()

Unnamed: 0,path,sample_id
0,s3://human-pangenomics/working/HPRC/HG00097/ra...,HG00097
1,s3://human-pangenomics/working/HPRC/HG00099/ra...,HG00099
2,s3://human-pangenomics/working/HPRC/HG00106/ra...,HG00106
3,s3://human-pangenomics/working/HPRC/HG00117/ra...,HG00117
4,s3://human-pangenomics/working/HPRC/HG00126/ra...,HG00126


In [52]:
print(illumina_assembly.shape)
print(assembly_table.shape)
print(illumina.shape)

(190, 21)
(196, 2)
(281, 20)


In [110]:
trio_assembly = illumina_assembly[illumina_assembly['phasing'].isin(['trio'])]
hic_assembly = illumina_assembly[illumina_assembly['phasing'].isin(['hi-c'])]

In [22]:
collections.Counter(illumina_assembly['phasing'])

Counter({'trio': 106, 'hi-c': 84})

In [93]:
# trio_assembly

In [37]:
# subset aws working paths to trio assembly - parent child
illumina_df[illumina_df['sample_id'].isin(trio_assembly['sample_id'].tolist())]

Unnamed: 0,path,sample_id
24,s3://human-pangenomics/working/HPRC/HG00408/ra...,HG00408
27,s3://human-pangenomics/working/HPRC/HG00423/ra...,HG00423
30,s3://human-pangenomics/working/HPRC/HG00423/ra...,HG00423
31,s3://human-pangenomics/working/HPRC/HG00423/ra...,HG00423
32,s3://human-pangenomics/working/HPRC/HG00438/ra...,HG00438
...,...,...
333,s3://human-pangenomics/working/HPRC/HG04160/ra...,HG04160
334,s3://human-pangenomics/working/HPRC/HG04184/ra...,HG04184
336,s3://human-pangenomics/working/HPRC/HG04199/ra...,HG04199
337,s3://human-pangenomics/working/HPRC/HG04204/ra...,HG04204


In [54]:
illumina_df[illumina_df['sample_id'].isin(['HG00423'])].values

array([['s3://human-pangenomics/working/HPRC/HG00423/raw_data/Illumina/child/HG00423.final.cram',
        'HG00423'],
       ['s3://human-pangenomics/working/HPRC/HG00423/raw_data/Illumina/parents/HG00421/HG00421.final.cram',
        'HG00423'],
       ['s3://human-pangenomics/working/HPRC/HG00423/raw_data/Illumina/parents/HG00422/HG00422.final.cram',
        'HG00423']], dtype=object)

In [56]:
result = (
    illumina_df.groupby('sample_id')['path']
    .apply(list)
    .reset_index()
)

# Display the result
# print(result)


In [109]:
# result[result['sample_id'].isin(['HG00423'])].path.values

In [45]:
parent_paths['parent'] = [sample.split('/')[-1].split('.')[0] for sample in parent_paths.path.tolist()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parent_paths['parent'] = [sample.split('/')[-1].split('.')[0] for sample in parent_paths.path.tolist()]


In [48]:
# 265
child_paths.sample_id.nunique()


265

In [50]:
trio_assembly.shape

(106, 21)

In [67]:
for data in trio_assembly['sample_id'].tolist():
	if data not in trio_assembly[trio_assembly['sample_id'].isin(child_paths['sample_id'].tolist())].sample_id.tolist():
		print(data)

NA18906
HG02818
HG01109
HG02055
HG03486
NA20129
HG02080
HG03098
HG02723


In [70]:
trio_assembly_working = trio_assembly[trio_assembly['sample_id'].isin(child_paths['sample_id'].tolist())]

In [96]:
trio_assembly_working = pd.merge(trio_assembly_working, child_paths, on='sample_id', how='inner')

In [80]:
parent_paths[parent_paths['parent'].isin(trio_assembly_working['Paternal ID'].tolist())].shape
parent_paths[parent_paths['parent'].isin(trio_assembly_working['Maternal ID'].tolist())].shape

(30, 3)

In [101]:
path_dict = parent_paths.set_index('parent')['path'].to_dict()

# Map paths to Paternal and Maternal IDs
trio_assembly_working['paternal_path'] = trio_assembly_working['Paternal ID'].map(path_dict)
trio_assembly_working['maternal_path'] = trio_assembly_working['Maternal ID'].map(path_dict)


In [115]:
hic_assembly[hic_assembly['sample_id'].isin(illumina_df['sample_id'].tolist())].shape

(84, 21)

In [124]:
hic_assembly_working = pd.merge(illumina_df,hic_assembly, on=['sample_id'], how='inner')

In [127]:
trio_assembly_working.head()

Unnamed: 0,sample_id,total_bp,coverage,filetype,instrument_model,library_construction_protocol,library_layout,library_strategy,read_length,Family ID,Paternal ID,Maternal ID,Gender,Phenotype,Population,Relationship,Siblings,Second Order,Third Order,Other Comments,phasing,path,paternal_path,maternal_path
0,HG03239,106491346500,34.35,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,PK41,HG03237,HG03238,2,0,PJL,child,0,0,0,0,trio,s3://human-pangenomics/working/HPRC/HG03239/ra...,,
1,HG01255,116578965900,37.61,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,CLM15,HG01253,HG01254,1,0,CLM,child,0,0,0,0,trio,s3://human-pangenomics/working/HPRC/HG01255/ra...,,
2,HG04157,115172611500,37.15,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,BD52,HG04155,HG04156,1,0,BEB,child,0,0,0,0,trio,s3://human-pangenomics/working/HPRC/HG04157/ra...,,
3,HG02074,142697641500,46.03,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,VN066,HG02076,HG02075,1,0,KHV,child,0,0,0,0,trio,s3://human-pangenomics/working/HPRC/HG02074/ra...,,
4,HG02886,126521326500,40.81,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,GB89,HG02884,HG02885,2,0,GWD,child,0,0,0,0,trio,s3://human-pangenomics/working/HPRC/HG02886/ra...,s3://human-pangenomics/working/HPRC/HG02886/ra...,s3://human-pangenomics/working/HPRC/HG02886/ra...


In [128]:
hic_assembly_working['paternal_path'] = hic_assembly_working['Paternal ID'].map(path_dict)
hic_assembly_working['maternal_path'] = hic_assembly_working['Maternal ID'].map(path_dict)

In [129]:
pd.concat([trio_assembly_working, hic_assembly_working]

Unnamed: 0,path,sample_id,total_bp,coverage,filetype,instrument_model,library_construction_protocol,library_layout,library_strategy,read_length,Family ID,Paternal ID,Maternal ID,Gender,Phenotype,Population,Relationship,Siblings,Second Order,Third Order,Other Comments,phasing,paternal_path,maternal_path
0,s3://human-pangenomics/working/HPRC/HG00097/ra...,HG00097,114209267700,36.84,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,HG00097,0,0,2,0,GBR,unrel,0,0,0,0,hi-c,,
1,s3://human-pangenomics/working/HPRC/HG00099/ra...,HG00099,128644524300,41.50,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,HG00099,0,0,2,0,GBR,unrel,0,0,0,0,hi-c,,
2,s3://human-pangenomics/working/HPRC/HG00126/ra...,HG00126,139636375200,45.04,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,HG00126,0,0,1,0,GBR,unrel,0,0,0,0,hi-c,,
3,s3://human-pangenomics/working/HPRC/HG00128/ra...,HG00128,147460895100,47.57,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,HG00128,0,0,2,0,GBR,unrel,0,0,0,0,hi-c,,
4,s3://human-pangenomics/working/HPRC/HG00133/ra...,HG00133,170378106600,54.96,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,HG00133,0,0,2,0,GBR,unrel,0,0,0,0,hi-c,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,s3://human-pangenomics/working/HPRC/NA21093/ra...,NA21093,143994153000,46.45,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,NA21093,0,0,1,0,GIH,unrel,0,0,0,0,hi-c,,
80,s3://human-pangenomics/working/HPRC/NA21102/ra...,NA21102,136537550400,44.04,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,NA21102,0,0,2,0,GIH,unrel,0,0,0,0,hi-c,,
81,s3://human-pangenomics/working/HPRC/NA21106/ra...,NA21106,179280390300,57.83,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,NA21106,0,0,2,0,GIH,unrel,0,0,0,0,hi-c,,
82,s3://human-pangenomics/working/HPRC/NA21110/ra...,NA21110,160804054500,51.87,cram,Illumina NovaSeq 6000,TruSeq DNA PCR-free,PAIRED,WGS,150,NA21110,0,0,2,0,GIH,unrel,0,0,0,0,hi-c,,
