# Create TXT files
## Create folder and copy definitions

In [12]:
import pandas as pd
from shutil import copytree
from shutil import rmtree
import pathlib
import json
import yaml
import hubmapbags
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from warnings import warn as warning
from datetime import datetime

instance = 'prod'
token = ''

In [50]:
now = datetime.now()
output_directory = 'data'
if not Path(output_directory).exists():
    Path(output_directory).mkdir()

report_output_directory = 'uuid-protected-data-report'
if not Path(report_output_directory).exists():
    Path(report_output_directory).mkdir()

report_output_filename = report_output_directory + '/' + str(now.strftime('%Y%m%d')) + '.pkl'
print('File found on disk. Loading ' + report_output_filename + '.')
report = pd.read_pickle(report_output_filename.replace('tsv', 'pkl'))

File found on disk. Loading uuid-protected-data-report/20221219.pkl.


In [51]:
def get_dbgap_study_id( datum ):
        if ( datum['group_name'] == 'University of California San Diego TMC' ) or \
                ( datum['group_name'] == 'Broad Institute RTI' and datum['data_type'] == 'Slide-seq' ):
                return 'phs002249'
        elif datum['group_name'] == 'Stanford TMC':
                return 'phs002272'
        else:
                return None

In [52]:
def build_dataframe( report, instance=instance, token=token ):
    headers = ['donor_uuid','donor_hubmap_id',\
           'direct_sample_uuid','direct_sample_hubmap_id',\
           'organ_uuid','organ_hubmap_id','organ_type',\
           'direct_sample_type','dataset_metadata','donor_metadata']

    report['dbgap_study_id'] = None
    for index, datum in tqdm(report.iterrows()):
        report.loc[index,'dbgap_study_id'] = get_dbgap_study_id( datum )
        
    for key in headers:
        report[key] = None
        
    for index, datum in tqdm(report.iterrows()):
        pmetadata = hubmapbags.apis.get_provenance_info( datum['hubmap_id'], instance=instance, token=token)
        
        report.loc[index,'donor_uuid'] = pmetadata['donor_uuid']
        report.loc[index,'donor_hubmap_id'] = pmetadata['donor_hubmap_id']
        
        report.loc[index,'direct_sample_uuid'] = pmetadata['first_sample_uuid'][0]
        report.loc[index,'direct_sample_type'] = pmetadata['first_sample_type'][0]
        report.loc[index,'direct_sample_hubmap_id'] = pmetadata['first_sample_hubmap_id'][0]
        
        report.loc[index,'organ_uuid'] = pmetadata['organ_uuid']
        report.loc[index,'organ_hubmap_id'] = pmetadata['organ_hubmap_id']
        report.loc[index,'organ_type'] = pmetadata['organ_type']
        
        metadata = hubmapbags.apis.get_dataset_info( datum['hubmap_id'], instance=instance, token=token )
        report.loc[index,'donor_uuid'] = pmetadata.get('donor_uuid')[0]
        report.loc[index,'donor_hubmap_id'] = pmetadata.get('donor_hubmap_id')[0]
        
        try:
            report.loc[index,'dataset_metadata'] = [hubmapbags.apis.get_dataset_info( datum['hubmap_id'], instance=instance, token=token)]
        except Exception as e:
            print(datum['hubmap_id'])
            print(e)
            report.loc[index,'dataset_metadata'] = None
    
    return report

In [53]:
report = build_dataframe( report, instance=instance, token=token )
report.to_csv( 'dbgap.' + str(now.strftime('%Y%m%d')) + '.pkl', sep='\t', index=False)

417it [00:00, 6522.05it/s]
417it [04:01,  1.73it/s]


In [55]:
report.keys()

Index(['uuid', 'hubmap_id', 'status', 'is_protected', 'data_type', 'directory',
       'group_name', 'has_uuids', 'number_of_uuids', 'number_of_files',
       'dbgap_study_id', 'donor_uuid', 'donor_hubmap_id', 'direct_sample_uuid',
       'direct_sample_hubmap_id', 'organ_uuid', 'organ_hubmap_id',
       'organ_type', 'direct_sample_type', 'dataset_metadata',
       'donor_metadata'],
      dtype='object')

In [58]:
df = report[report['dbgap_study_id']=='phs002249']
df['group_name'

20    phs002249
21    phs002249
22    phs002249
23    phs002249
24    phs002249
        ...    
35    phs002249
36    phs002249
37    phs002249
38    phs002249
39    phs002249
Name: dbgap_study_id, Length: 179, dtype: object

In [45]:
#DISCLAIMER: @icaoberg this code is super alpha. Please be kind.
# remove submission folder if it exists
for tmc_name in tmc.unique():
    directory = 'submission-' + tmc_name.replace(' ','_').lower()
    p = pathlib.Path( directory )
    if p.exists() and p.is_dir():
        print( 'Removing existing folder ' + directory )
        rmtree(p)
    result = copytree( 'dbgap-submission-scripts/templates', directory )

## Subject Consent

In [None]:
donor = df[['donor_hubmap_id', 'donor_metadata']]
donor.insert(0,'TMC',tmc)
donor = donor.drop_duplicates(subset=['donor_hubmap_id'])
donor['sex'] = donor['donor_metadata'].str.find('Female',0)
donor.loc[donor['sex'] != -1, 'sex'] = 2
donor.loc[donor['sex'] == -1, 'sex'] = 1
donor.insert(1,'consent',[1]*len(donor))
donor = donor.drop('donor_metadata',axis=1)
donor = donor.rename(columns={'donor_hubmap_id':'SUBJECT_ID','consent':'CONSENT','sex':'SEX'})
donor.insert(3,'SUBJECT_SOURCE',['HuBMAP']*len(donor))
donor['SOURCE_SUBJECT_ID'] = donor['SUBJECT_ID']

donor=donor.reindex(columns=['TMC', 'SUBJECT_ID', 'CONSENT', 'SEX', 'SUBJECT_SOURCE', 'SOURCE_SUBJECT_ID'])
print(donor)

for tmc_name in tmc.unique():
    temp = donor
    temp = temp[temp.TMC.isin([tmc_name])]
    temp = temp.drop('TMC',axis=1)
    temp.to_csv('submission-' + tmc_name + '/2a_SubjectConsent_DS.txt', index=False, sep='\t')

## Sample Attributes

In [None]:
with open('search-api/src/search-schema/data/definitions/enums/organ_types.yaml') as file:
    organ_types = yaml.load(file, Loader=yaml.FullLoader)

sample_attributes = df[['dataset_hubmap_id','dataset_metadata']]
sample_attributes.insert(0,'TMC',tmc)

analyte_class = []
for index in range(len(sample_attributes)):
    metadata = sample_attributes.loc[index]['dataset_metadata']
    try:
        analyte_class.append( metadata[metadata.find('analyte_class',0)+16: \
                         metadata.find(',',metadata.find('analyte_class',0))])
    except:
        analyte_class.append('MISSING')
    
sample_attributes = sample_attributes.drop('dataset_metadata',axis=1)
sample_attributes['ANALYTE_TYPE'] = analyte_class
sample_attributes.insert(1,'BODY_SITE',df['organ_type'])
#sample_attributes.insert(1,'HISTOLOGICAL_TYPE',df['organ_type'])
sample_attributes.insert(3,'IS_TUMOR',['N']*len(sample_attributes))

for index in range(len(sample_attributes)):
    sample_attributes['BODY_SITE'].loc[index] = organ_types[sample_attributes['BODY_SITE'].loc[index]]['description']
    #sample_attributes['HISTOLOGICAL_TYPE'].loc[index] = organ_types[sample_attributes['HISTOLOGICAL_TYPE'].loc[index]]['description']

sample_attributes = sample_attributes.drop_duplicates(subset=['dataset_hubmap_id'])
sample_attributes = sample_attributes.rename(columns={'dataset_hubmap_id':'SAMPLE_ID'})

#sample_attributes=sample_attributes.reindex(columns=['TMC', 'SAMPLE_ID', 'BODY_SITE', 'ANALYTE_TYPE', 'IS_TUMOR', 'HISTOLOGICAL_TYPE'])
sample_attributes=sample_attributes.reindex(columns=['TMC', 'SAMPLE_ID', 'BODY_SITE', 'ANALYTE_TYPE', 'IS_TUMOR'])
#sample_attributes['HISTOLOGICAL_TYPE'].replace('Lymph Node 05', 'Lymph Node', inplace=True)
#sample_attributes['HISTOLOGICAL_TYPE'].replace('Lymph Node 01', 'Lymph Node', inplace=True)
sample_attributes['BODY_SITE'].replace('Lymph Node 05', 'Lymph Node', inplace=True)
sample_attributes['BODY_SITE'].replace('Lymph Node 01', 'Lymph Node', inplace=True)

print(sample_attributes)

for tmc_name in tmc.unique():
    temp = sample_attributes
    temp = temp[temp.TMC.isin([tmc_name])]
    temp = temp.drop('TMC',axis=1)
    temp.to_csv('submission-' + tmc_name + '/6a_SampleAttributes_DS.txt', index=False, sep='\t')

## Subject Sample Mapping (SSM)

In [None]:
sample_mapping = df[['donor_hubmap_id','dataset_hubmap_id']]
sample_mapping.insert(0,'TMC',tmc)
sample_mapping = sample_mapping.drop_duplicates(subset=['dataset_hubmap_id'])
sample_mapping = sample_mapping.rename(columns={'donor_hubmap_id':'SUBJECT_ID','dataset_hubmap_id':'SAMPLE_ID'})
print( sample_mapping )

for tmc_name in tmc.unique():
    temp = sample_mapping
    temp = temp[temp.TMC.isin([tmc_name])]
    temp = temp.drop('TMC',axis=1)
    temp.to_csv('submission-' + tmc_name + '/3a_SSM_DS.txt', index=False, sep='\t')

In [None]:
report.keys()

In [9]:
datum

NameError: name 'datum' is not defined