# Create TXT files
## Create folder and copy definitions

In [None]:
import pandas as pd
from shutil import copytree
from shutil import rmtree
import pathlib
import json
import yaml
import hubmapbags
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from warnings import warn as warning
from datetime import datetime

# Name of the HuBMAP environment passed as `instance=` to the hubmapbags API calls below.
instance = 'prod'
# Token passed as `token=` to the hubmapbags API calls below.
# NOTE(review): left empty here; presumably a valid token must be filled in before running — confirm.
token = ''

In [None]:
# Timestamp reused by every dated filename produced in this notebook.
now = datetime.now()

# Make sure the working folders exist (no error when they are already there).
output_directory = 'data'
Path(output_directory).mkdir(exist_ok=True)

report_output_directory = 'uuid-protected-data-report'
Path(report_output_directory).mkdir(exist_ok=True)

# Today's report is expected at <report_output_directory>/<YYYYMMDD>.pkl.
report_output_filename = report_output_directory + '/' + now.strftime('%Y%m%d') + '.pkl'

# Only announce the file as found after actually checking for it; fail with a
# clear message otherwise (pd.read_pickle would raise FileNotFoundError anyway).
if not Path(report_output_filename).is_file():
    raise FileNotFoundError('Report file ' + report_output_filename + ' not found on disk.')
print('File found on disk. Loading ' + report_output_filename + '.')

# NOTE: unpickling can execute arbitrary code; only load report files that
# were produced locally by a trusted run.
report = pd.read_pickle(report_output_filename)

In [None]:
def get_dbgap_study_id( datum ):
    """Return the dbGaP study accession for one report row, or None.

    `datum` is indexed by key: 'group_name' is always read, 'data_type'
    is only read for rows whose group is 'Broad Institute RTI'.

    * 'University of California San Diego TMC'            -> 'phs002249'
    * 'Broad Institute RTI' with data type 'Slide-seq'    -> 'phs002249'
    * 'Stanford TMC'                                      -> 'phs002272'
    * anything else                                       -> None
    """
    group = datum['group_name']

    if group == 'University of California San Diego TMC':
        return 'phs002249'
    if group == 'Broad Institute RTI' and datum['data_type'] == 'Slide-seq':
        return 'phs002249'
    if group == 'Stanford TMC':
        return 'phs002272'
    return None

In [None]:
report

In [None]:
# Tag every report row with the dbGaP study accession returned by
# get_dbgap_study_id (None when the row maps to no study).
report['dbgap_study_id'] = None
# iterrows() is a generator, so tqdm needs the row count to show progress/ETA.
for index, datum in tqdm(report.iterrows(), total=len(report)):
    report.loc[index,'dbgap_study_id'] = get_dbgap_study_id( datum )
report

In [None]:
def _copy_first_value( report, index, column, pmetadata, key ):
    """Copy the first element of ``pmetadata[key]`` into ``report.loc[index, column]``.

    Best effort: when the key is absent, or its value is empty or cannot be
    indexed, the error and the offending value are printed and the report
    cell is left untouched.
    """
    try:
        value = pmetadata[key][0]
    except (KeyError, IndexError, TypeError) as e:
        print(e)
        # Use .get() so that reporting a missing key cannot itself raise.
        print(pmetadata.get(key) if hasattr(pmetadata, 'get') else pmetadata)
        return
    report.loc[index, column] = value


def build_dataframe( report, instance=instance, token=token ):
    """Add donor, direct-sample and organ provenance columns to `report`.

    For every row, the provenance record of the dataset named in the row's
    'hubmap_id' column is fetched with hubmapbags.apis.get_provenance_info
    and the first element of each provenance list is copied into the row.

    Args:
        report: DataFrame with a 'hubmap_id' column. It is modified in place.
        instance: value forwarded as `instance=` to the hubmapbags API call.
        token: value forwarded as `token=` to the hubmapbags API call.

    Returns:
        The same `report` object, with the columns donor_uuid,
        donor_hubmap_id, direct_sample_uuid, direct_sample_type,
        direct_sample_hubmap_id, organ_uuid, organ_hubmap_id and organ_type
        filled in where available.

    Raises:
        KeyError, IndexError or TypeError when a row's provenance record has
        no usable first-sample entry (these three fields are not best effort).
    """
    # iterrows() is a generator, so tqdm needs the row count to show progress/ETA.
    for index, datum in tqdm(report.iterrows(), total=len(report)):
        pmetadata = hubmapbags.apis.get_provenance_info( datum['hubmap_id'], instance=instance, token=token )

        # Donor fields: best effort.
        _copy_first_value( report, index, 'donor_uuid', pmetadata, 'donor_uuid' )
        _copy_first_value( report, index, 'donor_hubmap_id', pmetadata, 'donor_hubmap_id' )

        # Direct (first) sample fields are deliberately unguarded: a
        # provenance record without them aborts the run.
        report.loc[index,'direct_sample_uuid'] = pmetadata['first_sample_uuid'][0]
        report.loc[index,'direct_sample_type'] = pmetadata['first_sample_type'][0]
        report.loc[index,'direct_sample_hubmap_id'] = pmetadata['first_sample_hubmap_id'][0]

        # Organ fields: best effort.
        _copy_first_value( report, index, 'organ_uuid', pmetadata, 'organ_uuid' )
        _copy_first_value( report, index, 'organ_hubmap_id', pmetadata, 'organ_hubmap_id' )
        _copy_first_value( report, index, 'organ_type', pmetadata, 'organ_type' )

    return report

In [None]:
# Enrich the report with provenance columns, then snapshot it as a dated,
# tab-separated file named dbgap.<YYYYMMDD>.tsv in the working directory.
report = build_dataframe( report, instance=instance, token=token )
dated_tsv = f"dbgap.{now.strftime('%Y%m%d')}.tsv"
report.to_csv( dated_tsv, index=False, sep='\t' )

In [None]:
# Keep only the UCSD and Broad rows, UCSD rows first.
# NOTE(review): get_dbgap_study_id only maps Broad rows with data_type
# 'Slide-seq' to phs002249, while this filter keeps every Broad row — confirm
# which of the two is intended.
ucsd_rows = report[report['group_name'] == 'University of California San Diego TMC']
broad_rows = report[report['group_name'] == 'Broad Institute RTI']
report = pd.concat([ucsd_rows, broad_rows])

In [None]:
report

In [None]:
#DISCLAIMER: @icaoberg this code is super alpha. Please be kind.
# Start from a clean submission folder: delete any previous copy, then
# recreate it from the template directory.
directory = 'phs002249'
p = pathlib.Path( directory )
# is_dir() is already False for a path that does not exist.
if p.is_dir():
    print( f'Removing existing folder {directory}' )
    rmtree(p)
result = copytree( 'dbgap-submission-scripts/templates', directory )
print(directory)

## Subject Consent

In [None]:
# Build the Subject Consent table (one row per unique donor) and write it to
# <directory>/2a_SubjectConsent_DS.txt as a tab-separated file.
# .copy() so the column assignments below act on an independent frame rather
# than on something derived from a slice of `report`.
donor = report[['donor_hubmap_id', 'donor_uuid']].drop_duplicates(subset=['donor_hubmap_id']).copy()

donor['sex'] = None
for index, datum in tqdm(donor.iterrows(), total=len(donor)):
    metadata = hubmapbags.apis.get_entity_info( datum['donor_hubmap_id'], token=token, instance=instance )
    donor_metadata = metadata.get('metadata') or {}

    # Living-donor records are used when present; otherwise organ-donor records.
    if 'living_donor_data' in donor_metadata:
        records = donor_metadata['living_donor_data']
    else:
        records = donor_metadata.get('organ_donor_data') or []

    sex = None
    for info in records:
        if info['grouping_concept_preferred_term'] == 'Sex':
            sex = info['preferred_term']  # the last matching record wins

    if sex is None:
        # Do not guess: a donor without a Sex record keeps an empty SEX cell
        # instead of being silently coded as 2.
        warning('No sex recorded for donor ' + str(datum['donor_hubmap_id']) + '; leaving SEX blank.')
    elif sex == 'Male':
        donor.loc[index,'sex'] = 1
    else:
        donor.loc[index,'sex'] = 2

# Same constant for every subject, so set it once for the whole column.
donor['subject_source'] = 'HuBMAP'

donor = donor.drop('donor_uuid',axis=1)
donor['SOURCE_SUBJECT_ID']=donor['donor_hubmap_id']
# Every subject is written with consent value 1.
donor['consent']=1
donor = donor.rename(columns={'donor_hubmap_id':'SUBJECT_ID','consent':'CONSENT','sex':'SEX', 'subject_source':'SUBJECT_SOURCE'})
donor=donor.reindex(columns=['SUBJECT_ID', 'CONSENT', 'SEX', 'SUBJECT_SOURCE', 'SOURCE_SUBJECT_ID'])
donor.to_csv(directory + '/2a_SubjectConsent_DS.txt', index=False, sep='\t')

donor

## Sample Attributes

In [None]:
# Build the Sample Attributes table (one row per dataset) and write it to
# <directory>/6a_SampleAttributes_DS.txt as a tab-separated file.

# NOTE(review): organ_types is loaded but not used below — BODY_SITE is written
# with the raw 'organ_type' value from the report. Confirm whether that value
# was meant to be translated through this table.
with open('search-api/src/search-schema/data/definitions/enums/organ_types.yaml') as file:
    organ_types = yaml.load(file, Loader=yaml.FullLoader)

# Datasets whose analyte class is supplied by hand instead of being read from
# their ingest metadata.
analyte_class_overrides = {
    'HBM347.RFGL.437': 'DNA',
    'HBM773.WCXC.264': 'RNA',
}

# .copy() so the column assignments below do not write into a slice of `report`.
sample_attributes = report[['hubmap_id']].copy()
# Both frames share the same index, so this aligns row by row.
sample_attributes['BODY_SITE'] = report['organ_type']

analyte_class = []
for hubmap_id in tqdm(sample_attributes['hubmap_id']):
    if hubmap_id in analyte_class_overrides:
        # No API call needed for hand-curated datasets.
        analyte_class.append(analyte_class_overrides[hubmap_id])
        continue

    metadata = hubmapbags.apis.get_dataset_info(hubmap_id, token=token, instance=instance)
    if 'ingest_metadata' in metadata:
        analyte_class.append(metadata['ingest_metadata']['metadata']['analyte_class'])
    else:
        # Report the dataset and keep a placeholder so the list stays aligned
        # with the rows; otherwise the column assignment below fails with a
        # length mismatch.
        print(hubmap_id)
        analyte_class.append(None)

sample_attributes['ANALYTE_TYPE'] = analyte_class
sample_attributes['IS_TUMOR'] = 'N'
sample_attributes = sample_attributes.rename(columns={'hubmap_id':'SAMPLE_ID'})
sample_attributes=sample_attributes.reindex(columns=['SAMPLE_ID', 'BODY_SITE', 'ANALYTE_TYPE', 'IS_TUMOR'])
sample_attributes.to_csv(directory + '/6a_SampleAttributes_DS.txt', index=False, sep='\t')

In [None]:
sample_attributes

## Subject Sample Mapping (SSM)

In [None]:
# Subject/Sample Mapping table: pairs each dataset id (SAMPLE_ID) with its
# donor id (SUBJECT_ID) and is written to <directory>/3a_SSM_DS.txt.
ssm_columns = {'donor_hubmap_id':'SUBJECT_ID','hubmap_id':'SAMPLE_ID'}
sample_mapping = report[list(ssm_columns)].rename(columns=ssm_columns)
sample_mapping.to_csv(f'{directory}/3a_SSM_DS.txt', sep='\t', index=False)

sample_mapping

# Missing metadata

In [None]:
# Display the raw dataset record for HBM347.RFGL.437, one of the two datasets
# whose analyte class is hard-coded in the Sample Attributes cell.
hmid = 'HBM347.RFGL.437'
metadata = hubmapbags.apis.get_dataset_info(hmid, instance=instance, token=token)
metadata

In [None]:
# Display the raw dataset record for HBM773.WCXC.264, the other dataset whose
# analyte class is hard-coded in the Sample Attributes cell.
hmid = 'HBM773.WCXC.264'
metadata = hubmapbags.apis.get_dataset_info(hmid, instance=instance, token=token)
metadata