In [1]:
import collections
import pandas as pd
from Bio import Entrez
import numpy as np

In [5]:
# Set your email (required by NCBI)
Entrez.email = "apblair@ucsc.edu"

def get_sra_accessions(biosample_id):
    handle = Entrez.esearch(db="biosample", term=biosample_id)
    record = Entrez.read(handle)
    handle.close()
    biosample_id = record['IdList'][0]
    handle = Entrez.esummary(db="biosample", id=biosample_id)
    summary_record = Entrez.read(handle)
    handle.close()
    print(summary_record)
    biosample_id_dict = {data[0]:data[1] for data in [sample_data.split(': ') \
                                                      for sample_data in summary_record['DocumentSummarySet']['DocumentSummary'][0]['Identifiers'].split('; ')]}
    return biosample_id_dict

In [6]:
biosample_id = "NA20346"  # Replace with your BioSample ID
biosample_id_dict = get_sra_accessions(biosample_id)

{'DocumentSummarySet': DictElement({'DocumentSummary': [DictElement({'Title': 'Human sample from Homo sapiens', 'Accession': 'SAMN41021609', 'Date': '2024/04/19', 'PublicationDate': '2024/04/19', 'ModificationDate': '2024/04/19', 'Organization': 'UCSC Genomics Institute', 'Taxonomy': '9606', 'Organism': 'Homo sapiens', 'SourceSample': 'BioSample:SAMN41021609', 'SampleData': '<BioSample access="public" publication_date="2024-04-19T00:00:00.000" last_update="2024-04-19T21:04:04.382" submission_date="2024-04-19T21:04:04.382" id="41021609" accession="SAMN41021609">   <Ids>     <Id db="BioSample" is_primary="1">SAMN41021609</Id>     <Id db_label="Sample name">NA20346</Id>   </Ids>   <Description>     <Title>Human sample from Homo sapiens</Title>     <Organism taxonomy_id="9606" taxonomy_name="Homo sapiens">       <OrganismName>Homo sapiens</OrganismName>     </Organism>   </Description>   <Owner>     <Name abbreviation="UCSC GI" url="https://ucscgenomics.soe.ucsc.edu/">UCSC Genomics Institu

In [46]:
biosample_id_dict

{'BioSample': 'SAMN39925242'}

### Import

In [47]:
hprc_production = pd.read_csv('production/hprc-production-20240409.tsv', sep='\t', header=1)
hprc_production = hprc_production[hprc_production['Pellets Status: Production'].notna()]

In [48]:
bio_sample_table_df = pd.read_csv('production/biosample_table_HPRC-20240416.tsv', sep='\t')
biosample_accession_ids = bio_sample_table_df['Accession'].tolist()
biosample_df = bio_sample_table_df[['Accession', 'BioSample.name']]

In [49]:
biosample_dict = {biosample_id:get_sra_accessions(biosample_id) for biosample_id in biosample_accession_ids}

In [50]:
biosample_df = pd.DataFrame.from_dict(biosample_dict,orient='index')

In [51]:
biosample_df = biosample_df[['BioSample','Sample name']]
biosample_df.columns = ['Accession', 'BioSample.name']

### Wrangle

In [52]:
hprc_production_current_scope = hprc_production[hprc_production['Production Year'].isin(['YR1','YR2', 'YR3', 'YR4'])]
hprc_production_current_scope = hprc_production_current_scope[['ChildID','N', 
                                                               'Sex', 'Subpopulation', 
                                                               'Superpopulation', 
                                                               'Production Year', 'Pellets Status: Production']]
hprc_production_current_scope_samples = hprc_production[hprc_production['Production Year'].isin(['YR1','YR2', 'YR3', 'YR4'])]['ChildID'].tolist()

In [53]:
collections.Counter(hprc_production_current_scope['Production Year'].tolist())

Counter({'YR4': 101, 'YR3': 69, 'YR2': 52, 'YR1': 30})

In [54]:
condition_1 = hprc_production_current_scope['ChildID'] == 'NA19043 (replacement GM19454)'
condition_2 = hprc_production_current_scope['ChildID'] == 'NA19120 (replacement NA18934)'

hprc_production_current_scope.loc[condition_1, 'ChildID'] = 'NA19043'
hprc_production_current_scope.loc[condition_2, 'ChildID'] = 'NA19120'

In [55]:
# GM to NA conversion
biosample_identifier_coversion = ['NA' + identifier[2:] if identifier.startswith('GM') else identifier \
                                  for identifier in biosample_df['BioSample.name'].tolist()]
biosample_df['Sample'] = biosample_identifier_coversion
biosample_df = biosample_df.drop(columns='BioSample.name')

biosample_identifier_coversion_ChildID = ['NA' + identifier[2:] if identifier.startswith('GM') else identifier \
                                  for identifier in hprc_production_current_scope['ChildID'].tolist()]
hprc_production_current_scope['Sample'] = biosample_identifier_coversion_ChildID

In [56]:
merged_production = pd.merge(hprc_production_current_scope, biosample_df, on='Sample', how='left')

In [57]:
merged_production

Unnamed: 0,ChildID,N,Sex,Subpopulation,Superpopulation,Production Year,Pellets Status: Production,Sample,Accession
0,HG01891,BB05,female,ACB,AFR,YR1,Pellets Banked (11/25/19),HG01891,SAMN17861236
1,HG02486,BB55,male,ACB,AFR,YR1,Pellets Banked (11/25/19),HG02486,SAMN17861238
2,HG02559,BB68,female,ACB,AFR,YR1,Pellets Banked (11/25/19),HG02559,SAMN17861239
3,HG02257,BB21,female,ACB,AFR,YR1,Pellets Banked (11/25/19),HG02257,SAMN17861237
4,HG01358,CLM31,male,CLM,AMR,YR1,Pellets Banked (11/25/19),HG01358,SAMN17861234
...,...,...,...,...,...,...,...,...,...
247,HG03388,,,MSL,AFR,YR4,Pellets Banked (08/15/2023),HG03388,
248,NA19042,fiber-seq pilot,,LWK,AFR,YR4,Pellets Banked (08/22/2023),NA19042,
249,HG00701,,,CHS,EAS,YR4,Pellets Banked (09/12/2023),HG00701,
250,HG03072,,,MSL,AFR,YR4,Pellets Banked (09/12/2023),HG03072,


In [58]:
merged_production.columns = ['ChildID',
                             'familyID',
                             'Sex',
                             'Subpopulation',
                             'Superpopulation',
                             'Production Year',
                             'collection_date',
                             'Sample',
                             'Accession']

In [59]:
merged_production = merged_production.drop(columns='ChildID')

In [60]:
merged_production.loc[merged_production['familyID'] == 'fiber-seq pilot', 'familyID'] = np.nan

In [61]:
merged_production.head()

Unnamed: 0,familyID,Sex,Subpopulation,Superpopulation,Production Year,collection_date,Sample,Accession
0,BB05,female,ACB,AFR,YR1,Pellets Banked (11/25/19),HG01891,SAMN17861236
1,BB55,male,ACB,AFR,YR1,Pellets Banked (11/25/19),HG02486,SAMN17861238
2,BB68,female,ACB,AFR,YR1,Pellets Banked (11/25/19),HG02559,SAMN17861239
3,BB21,female,ACB,AFR,YR1,Pellets Banked (11/25/19),HG02257,SAMN17861237
4,CLM31,male,CLM,AMR,YR1,Pellets Banked (11/25/19),HG01358,SAMN17861234


In [62]:
biosample_df[biosample_df['Accession'].isin(merged_production['Accession'].tolist())]

Unnamed: 0,Accession,Sample
SAMN37797095,SAMN37797095,NA20858
SAMN37797094,SAMN37797094,NA20799
SAMN37797092,SAMN37797092,NA19391
SAMN37797093,SAMN37797093,NA19468
SAMN37797091,SAMN37797091,NA19338
...,...,...
SAMN17861241,SAMN17861241,HG03516
SAMN17861240,SAMN17861240,HG02572
SAMN17861239,SAMN17861239,HG02559
SAMN17861233,SAMN17861233,HG01258


In [63]:
biosample_df[~biosample_df['Accession'].isin(merged_production['Accession'].tolist())]

Unnamed: 0,Accession,Sample
SAMN33621959,SAMN33621959,HG06807
SAMN17861242,SAMN17861242,HG002


In [64]:
biosample_df.shape

(156, 2)

In [65]:
collections.Counter(merged_production[~merged_production['Accession'].notna()]['Production Year'].tolist())

Counter({'YR4': 98})

In [66]:
merged_production[merged_production['Accession'].notna()].shape

(154, 8)

In [67]:
merged_production_collection_dates = [data.split('(')[-1].split(')')[0] for data in \
                                                 merged_production['collection_date'].tolist()]
standard_dates = pd.to_datetime(merged_production_collection_dates, errors='coerce')
assert standard_dates.shape[0] == len(merged_production_collection_dates)

merged_production['collection_date'] = standard_dates

  standard_dates = pd.to_datetime(merged_production_collection_dates, errors='coerce')


In [68]:
merged_production.to_csv('production/hprc-production-biosample-20240409.tsv', sep='\t',index=False)

In [69]:
merged_production[~merged_production['Accession'].notna()]

Unnamed: 0,familyID,Sex,Subpopulation,Superpopulation,Production Year,collection_date,Sample,Accession
154,,female,ASW,AFR,YR4,2022-10-04,NA19909,
155,,female,ASW,AFR,YR4,2022-10-04,NA20282,
156,,male,ASW,AFR,YR4,2022-10-04,NA20346,
157,,female,MXL,AMR,YR4,2022-10-04,NA19776,
158,,male,MXL,AMR,YR4,2022-09-27,NA19682,
...,...,...,...,...,...,...,...,...
247,,,MSL,AFR,YR4,2023-08-15,HG03388,
248,,,LWK,AFR,YR4,2023-08-22,NA19042,
249,,,CHS,EAS,YR4,2023-09-12,HG00701,
250,,,MSL,AFR,YR4,2023-09-12,HG03072,


In [70]:
collections.Counter(merged_production['Production Year'].tolist())

Counter({'YR4': 101, 'YR3': 69, 'YR2': 52, 'YR1': 30})

In [71]:
hprc_production_current_scope_samples_create_biosample_ids_df = merged_production[~merged_production['Accession'].notna()]

In [72]:
hprc_production_current_scope_samples_create_biosample_ids_df['biomaterial_provider'] = 'Coriell Institute'
hprc_production_current_scope_samples_create_biosample_ids_df['organism'] = 'Homo sapiens'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hprc_production_current_scope_samples_create_biosample_ids_df['biomaterial_provider'] = 'Coriell Institute'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hprc_production_current_scope_samples_create_biosample_ids_df['organism'] = 'Homo sapiens'


In [73]:
hprc_production_current_scope_samples_create_biosample_ids_df.shape

(98, 10)

In [175]:
hprc_production_current_scope_samples_create_biosample_ids_df[hprc_production_current_scope_samples_create_biosample_ids_df['Sex'].notna()].shape

(54, 10)

In [98]:
import mechanicalsoup

# Create a MechanicalSoup browser object
manual_resolve = []
browser = mechanicalsoup.Browser()
sample_collection_dict = {sample : {"tissue": None,
                                    "geo_loc_name": None,
                                    "sex": None
                                   } for sample in hprc_production_current_scope_samples_create_biosample_ids_df['Sample'].tolist()}
for sample in hprc_production_current_scope_samples_create_biosample_ids_df['Sample'].tolist():
    
    url = 'https://catalog.coriell.org/0/Sections/Search/Sample_Detail.aspx?Ref='+sample+'&Product=DNA' # +'&product=CC'
    page = browser.get(url)
    
    form = page.soup.select('form')[0]
    # print(form)
    # print(page.find('span', id='lblCountry').text)
    try:
        country_span = form.select_one('#lblCountry').text
        cell_type_span = form.select_one('#lblCell_Type').text
        sex_span = form.select_one('#lblGender').text
        sample_collection_dict[sample]['tissue'] = cell_type_span
        sample_collection_dict[sample]['geo_loc_name'] = country_span
        sample_collection_dict[sample]['sex'] = sex_span
    except AttributeError:
        print(sample)


HG00867
HG02178
HG01028


In [99]:
sample_collection_dict['HG00867']['tissue'] = 'B-Lymphocyte'
sample_collection_dict['HG00867']['geo_loc_name'] = 'CHINA'
sample_collection_dict['HG00867']['sex'] = 'female'

sample_collection_dict['HG02178']['tissue'] = 'B-Lymphocyte'
sample_collection_dict['HG02178']['geo_loc_name'] = 'CHINA'
sample_collection_dict['HG02178']['sex'] = 'female'

sample_collection_dict['HG01028']['tissue'] = 'B-Lymphocyte'
sample_collection_dict['HG01028']['geo_loc_name'] = 'CHINA'
sample_collection_dict['HG01028']['sex'] = 'male'


In [100]:
hprc_production_current_scope_samples_create_biosample_ids_df.head()

Unnamed: 0,familyID,Sex,Subpopulation,Superpopulation,Production Year,collection_date,Sample,Accession,biomaterial_provider,organism
154,,female,ASW,AFR,YR4,2022-10-04,NA19909,,Coriell Institute,Homo sapiens
155,,female,ASW,AFR,YR4,2022-10-04,NA20282,,Coriell Institute,Homo sapiens
156,,male,ASW,AFR,YR4,2022-10-04,NA20346,,Coriell Institute,Homo sapiens
157,,female,MXL,AMR,YR4,2022-10-04,NA19776,,Coriell Institute,Homo sapiens
158,,male,MXL,AMR,YR4,2022-09-27,NA19682,,Coriell Institute,Homo sapiens


In [108]:
sample_collection_df = pd.DataFrame.from_dict(sample_collection_dict,orient='index')
sample_collection_df['Sample'] = sample_collection_df.index.tolist()

In [109]:
sample_collection_df['sex'] = sample_collection_df['sex'].str.lower()

In [111]:
hprc_production_current_scope_samples_create_biosample_ids_export_df = pd.merge(sample_collection_df,
                                                                                hprc_production_current_scope_samples_create_biosample_ids_df, 
                                                                                on='Sample')

In [116]:
hprc_production_current_scope_samples_create_biosample_ids_export_df.drop(columns=['Sex'], inplace=True)


In [119]:
hprc_production_current_scope_samples_create_biosample_ids_export_df

Unnamed: 0,tissue,geo_loc_name,sex,Sample,familyID,Subpopulation,Superpopulation,Production Year,collection_date,Accession,biomaterial_provider,organism
0,B-Lymphocyte,USA,female,NA19909,,ASW,AFR,YR4,2022-10-04,,Coriell Institute,Homo sapiens
1,B-Lymphocyte,USA,female,NA20282,,ASW,AFR,YR4,2022-10-04,,Coriell Institute,Homo sapiens
2,B-Lymphocyte,USA,male,NA20346,,ASW,AFR,YR4,2022-10-04,,Coriell Institute,Homo sapiens
3,B-Lymphocyte,USA,female,NA19776,,MXL,AMR,YR4,2022-10-04,,Coriell Institute,Homo sapiens
4,B-Lymphocyte,USA,male,NA19682,,MXL,AMR,YR4,2022-09-27,,Coriell Institute,Homo sapiens
...,...,...,...,...,...,...,...,...,...,...,...,...
93,B-Lymphocyte,SIERRA LEONE,male,HG03388,,MSL,AFR,YR4,2023-08-15,,Coriell Institute,Homo sapiens
94,B-Lymphocyte,KENYA,female,NA19042,,LWK,AFR,YR4,2023-08-22,,Coriell Institute,Homo sapiens
95,B-Lymphocyte,CHINA,male,HG00701,,CHS,EAS,YR4,2023-09-12,,Coriell Institute,Homo sapiens
96,B-Lymphocyte,SIERRA LEONE,male,HG03072,,MSL,AFR,YR4,2023-09-12,,Coriell Institute,Homo sapiens


In [122]:
hprc_production_current_scope_samples_create_biosample_ids_export_df[['Sample','tissue','geo_loc_name','sex', 'organism', 'biomaterial_provider', 'collection_date']]

Unnamed: 0,Sample,tissue,geo_loc_name,sex,organism,biomaterial_provider,collection_date
0,NA19909,B-Lymphocyte,USA,female,Homo sapiens,Coriell Institute,2022-10-04
1,NA20282,B-Lymphocyte,USA,female,Homo sapiens,Coriell Institute,2022-10-04
2,NA20346,B-Lymphocyte,USA,male,Homo sapiens,Coriell Institute,2022-10-04
3,NA19776,B-Lymphocyte,USA,female,Homo sapiens,Coriell Institute,2022-10-04
4,NA19682,B-Lymphocyte,USA,male,Homo sapiens,Coriell Institute,2022-09-27
...,...,...,...,...,...,...,...
93,HG03388,B-Lymphocyte,SIERRA LEONE,male,Homo sapiens,Coriell Institute,2023-08-15
94,NA19042,B-Lymphocyte,KENYA,female,Homo sapiens,Coriell Institute,2023-08-22
95,HG00701,B-Lymphocyte,CHINA,male,Homo sapiens,Coriell Institute,2023-09-12
96,HG03072,B-Lymphocyte,SIERRA LEONE,male,Homo sapiens,Coriell Institute,2023-09-12


In [123]:
hprc_production_current_scope_samples_create_biosample_ids_export_df.to_csv('create-biosample-identifiers/hprc-production-create-YR4-biosamples-20240409.tsv', sep='\t',index=False)

In [80]:
# biosample_check = {sample:get_sra_accessions(sample) for sample in hprc_production_current_scope_samples_create_biosample_ids_export_df['Sample'].tolist()}
# biosample_check_df = pd.DataFrame.from_dict(biosample_check,orient='index')[['BioSample','Sample name']]
# biosample_check_df['Sample'] = biosample_check_df.index
# biosample_check_df.columns = ['Accession', 'Sample name', 'Sample']

In [98]:
# final_merged_production = pd.merge(merged_production, biosample_check_df, on='Sample', how='left')

# final_merged_production['Accession'] = final_merged_production['Accession_x'].fillna(final_merged_production['Accession_x'])

# final_merged_production.drop(columns=['Accession_x', 'Accession_x'], inplace=True)
# final_merged_production.drop(columns=['Sample name','Accession_y'], inplace=True)

# final_merged_production.to_csv('production/hprc-production-biosample-20240409.tsv', sep='\t',index=False)

In [150]:
hprc_add_biosamples = hprc_production_current_scope_samples_create_biosample_ids_export_df[['Sample','Accession','familyID','Subpopulation', 'Superpopulation','Production Year']]

In [126]:
new_biosamples = pd.read_csv('create-biosample-identifiers/attributes.tsv',sep='\t')

In [131]:
new_biosamples_dict = {sample[1]:sample[0] for sample in new_biosamples[['accession','sample_name']].values}

In [135]:
# merged_production_add_biosamples = merged_production[~merged_production['Accession'].notna()]

In [151]:
add_biosamples = []
for sample in hprc_add_biosamples['Sample'].tolist():
    add_biosamples.append(new_biosamples_dict[sample])

In [152]:
hprc_add_biosamples['Accession'] = add_biosamples

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hprc_add_biosamples['Accession'] = add_biosamples


In [156]:
pd.concat([merged_production[merged_production['Accession'].notna()][['Sample','Accession',
                                                                      'familyID','Subpopulation',
                                                                      'Superpopulation','Production Year']], 
           hprc_add_biosamples]).to_csv('production/hprc-production-biosample-table-20240409.tsv', sep='\t', index=False)