# Inquiry into Kids First to FHIR model mapping

## Build a ship of pyrates

In [7]:
from fhir_pyrate import Pirate, Ahoy
import pandas as pd

# Display every column and keep wide frames on one line when rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# Base URLs of the PROD, DEV and QA FHIR endpoints.
# Note: QA_URL has no trailing slash, unlike the other two; kept as-is.
PROD_URL = 'https://kf-api-fhir-service.kidsfirstdrc.org/'
DEV_URL = 'https://kf-api-fhir-service-dev.kidsfirstdrc.org/'
QA_URL = 'https://kf-api-fhir-service-qa.kidsfirstdrc.org'

# Label -> base URL lookup, built from the constants above so each URL is
# spelled out exactly once.
ENDPOINT_URLS = {
    'PROD_URL': PROD_URL,
    'QA_URL': QA_URL,
    'DEV_URL': DEV_URL,
}

def get_pirate(url, username=None):
    """Build a FHIR-PYrate ``Pirate`` client for one FHIR endpoint.

    Args:
        url: Base URL of the FHIR server. It is used both as the ``Ahoy``
            ``auth_url`` and as the ``Pirate`` ``base_url``.
        username: Username handed to ``Ahoy``. When omitted, the value of the
            ``KF_FHIR_USERNAME`` environment variable is used, falling back to
            the address this notebook originally hard-coded.

    Returns:
        A ``Pirate`` configured with one process and request-URL printing
        turned off.
    """
    import os  # local import so this cell does not depend on a later one

    if username is None:
        # NOTE(review): the literal fallback only preserves the notebook's
        # original behaviour. Prefer exporting KF_FHIR_USERNAME so no personal
        # address has to live in the notebook.
        username = os.environ.get("KF_FHIR_USERNAME", "wnkhan32@gmail.com")

    # NOTE(review): auth_method=None presumably skips the login flow --
    # confirm against the Ahoy documentation.
    auth = Ahoy(
        username=username,
        auth_method=None,
        auth_url=url,
    )

    return Pirate(
        auth=auth,
        base_url=url,
        print_request_url=False,
        num_processes=1,
    )

# One Pirate client per endpoint, keyed by the same labels as ENDPOINT_URLS.
ship = dict(
    (endpoint_label, get_pirate(endpoint_url))
    for endpoint_label, endpoint_url in ENDPOINT_URLS.items()
)

## Check Pyrate Booty

In [8]:
import requests

def get_studies(pirate: "Pirate"):
    """Return the study ids tagged on an endpoint's ResearchStudy resources.

    The id is read from ``meta.tag[0].code`` of every ResearchStudy. This is a
    best-effort lookup: any ``requests`` failure (HTTP error, connection
    error, timeout, ...) is reported on stdout and yields an empty list, so
    one unreachable endpoint does not abort a loop over several endpoints.

    Args:
        pirate: Client exposing ``steal_bundles_to_dataframe``.

    Returns:
        List of study ids. Empty when the request fails, when the call does
        not return a DataFrame, or when no ``study_ids`` column is produced.
        Studies without a tag code are skipped instead of showing up as
        NaN/None entries.
    """
    try:
        studies_df = pirate.steal_bundles_to_dataframe(
            resource_type='ResearchStudy',
            fhir_paths=[
                ('study_ids', 'meta.tag[0].code'),
            ],
        )
    except requests.exceptions.RequestException as e:
        # e.response is None for failures that never got an HTTP response,
        # so print the exception itself as well.
        print(f"Could not fetch ResearchStudy resources: {e!r} (response: {e.response})")
        return []

    if not isinstance(studies_df, pd.DataFrame) or 'study_ids' not in studies_df.columns:
        return []

    return studies_df['study_ids'].dropna().to_list()

# Study ids reported by each endpoint, keyed by endpoint label.
endpoint_studies = dict(
    (endpoint_label, get_studies(client))
    for endpoint_label, client in ship.items()
)

endpoint_studies

Query & Build DF (ResearchStudy): 100%|██████████| 1/1 [00:00<00:00, 381.61it/s]
Query & Build DF (ResearchStudy): 100%|██████████| 1/1 [00:00<00:00, 396.59it/s]

Failed to reach: https://kf-api-fhir-service-dev.kidsfirstdrc.org/ResearchStudy?





{'PROD_URL': ['SD_DYPMEHHF',
  'SD_65064P2Z',
  'SD_Y6VRG6MD',
  'SD_FYCR78W0',
  'SD_T8VSYRSG',
  'SD_FFVQ3T38',
  'SD_JWS3V24D',
  'SD_Z6MWD3H0',
  'SD_BHJXBDQK',
  'SD_PREASA7S',
  'SD_ZXJFFMEF',
  'SD_46SK55A3',
  'SD_PET7Q6F2',
  'SD_46RR9ZR6',
  'SD_P445ACHV',
  'SD_8Y99QZJJ',
  'SD_6FPYJQBR',
  'SD_YGVA0E1C',
  'SD_DZTB5HRR',
  'SD_VTTSHWV4',
  'SD_0TYVY1TW',
  'SD_ZFGDG5YS',
  'SD_DZ4GPQX6',
  'SD_RM8AFW0R',
  'SD_R0EPRSGS',
  'SD_W0V965XZ',
  'SD_YNSSAPHE',
  'SD_NMVV8A1Y',
  'SD_DK0KRWK8',
  'SD_B8X3C1MX',
  'SD_7NQ9151J',
  'SD_9PYZAHHE',
  'SD_1P41Z782',
  'SD_HGHFVPFD'],
 'QA_URL': ['SD_HGHFVPFD',
  'SD_1P41Z782',
  'SD_9PYZAHHE',
  'SD_7NQ9151J',
  'SD_B8X3C1MX',
  'SD_DK0KRWK8',
  'SD_NMVV8A1Y',
  'SD_YNSSAPHE',
  'SD_W0V965XZ',
  'SD_R0EPRSGS',
  'SD_RM8AFW0R',
  'SD_DZ4GPQX6',
  'SD_ZFGDG5YS',
  'SD_0TYVY1TW',
  'SD_VTTSHWV4',
  'SD_DZTB5HRR',
  'SD_YGVA0E1C',
  'SD_6FPYJQBR',
  'SD_8Y99QZJJ',
  'SD_P445ACHV',
  'SD_46RR9ZR6',
  'SD_PET7Q6F2',
  'SD_46SK55A3',
  'SD_ZX

## Check Studies in Pyrate Booty

In [9]:
# Column name -> FHIRPath pairs extracted from each ResearchStudy.
study_fhir_paths = [
    ('kf_id', 'meta.tag[0].code'),
    ('program', 'keyword[0].text'),
    ('name', 'title'),
    ('short_name', 'keyword[1].coding[0].code'),
]

# Query the PROD endpoint and order the studies by their Kids First id.
metadata = (
    ship['PROD_URL']
    .steal_bundles_to_dataframe(
        resource_type='ResearchStudy',
        fhir_paths=study_fhir_paths,
    )
    .sort_values(by='kf_id', ignore_index=True)
)
metadata

Query & Build DF (ResearchStudy): 100%|██████████| 1/1 [00:00<00:00, 129.02it/s]


Unnamed: 0,kf_id,program,name,short_name
0,SD_0TYVY1TW,Kids First,Genomic Analysis of Esophageal Atresia and Tra...,KF-EATF
1,SD_1P41Z782,ICR,OpenDIPG: ICR London,ICR-DIPG
2,SD_46RR9ZR6,TARGET,TARGET: Acute Myeloid Leukemia (AML),TARGET-AML
3,SD_46SK55A3,Kids First,Kids First: Genomic Analysis of Congenital Dia...,KF-CDH
4,SD_65064P2Z,,INCLUDE: (Sherman) Genomic Analysis of Congeni...,INCLUDE
5,SD_6FPYJQBR,Kids First,Kids First: Genetic Basis of Disorders/Differe...,KF-DSD
6,SD_7NQ9151J,Kids First,Genome-wide Sequencing to Identify the Genes R...,KF-ED
7,SD_8Y99QZJJ,Pediatric Brain Tumor Atlas,Pediatric Brain Tumor Atlas: PNOC,PBTA-PNOC
8,SD_9PYZAHHE,Kids First,Genomic Studies of Orofacial Cleft Birth Defects,KF-OCEA
9,SD_B8X3C1MX,Kids First,Kids First: Craniofacial Microsomia: Genetic C...,KF-CM


## Noteworthy results
1. Creation time not identified
2. No column identified for age_at_enrollment

# KF FHIR Patient &rarr; C2M2 Subject
| C2M2 Field | FHIR Field |
| ---------- | ---------- |
| local_id   | Patient.identifier[].value |
| project_local_id | Patient.meta_tag[].code |
| sex | Patient.gender |
| ethnicity | Patient.extension_1_extension_0_valueString |
 


In [10]:
# Single-row frame naming the study whose patients are requested.
studies = pd.DataFrame({'studies': ['SD_VTTSHWV4']})

# Column name -> FHIRPath pairs extracted from each Patient.
patient_fhir_paths = [
    ("fhir_id", "id"),
    ("kf_id", "identifier[0].value"),
    ("study_id", "meta.tag[0].code"),
    ("sex", "gender"),
    ("ethnicity", "extension[1].extension[0].valueString"),
]

# The `studies` column supplies the value of the `_tag` search parameter.
patient_df = ship['PROD_URL'].trade_rows_for_dataframe(
    studies,
    resource_type="Patient",
    df_constraints={'_tag': 'studies'},
    fhir_paths=patient_fhir_paths,
)

patient_df.head()


Query & Build DF (Patient): 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]


Unnamed: 0,fhir_id,kf_id,study_id,sex,ethnicity,studies
0,184085,PT_7WSNR8P5,SD_VTTSHWV4,male,Not Hispanic or Latino,SD_VTTSHWV4
1,184086,PT_VDSFZMF6,SD_VTTSHWV4,female,Not Hispanic or Latino,SD_VTTSHWV4
2,184087,PT_R6553VPB,SD_VTTSHWV4,female,Not Hispanic or Latino,SD_VTTSHWV4
3,184088,PT_ER8QDHZH,SD_VTTSHWV4,female,Not Hispanic or Latino,SD_VTTSHWV4
4,184089,PT_WRGDD11W,SD_VTTSHWV4,female,Hispanic or Latino,SD_VTTSHWV4


# Biospecimens Ingest from FHIR
## Specimen &rarr; C2M2 Biosample
| FHIR Field | KF Field | C2M2 Field | 
| ---------- | ---------- | ---------- |
|  | kf_id | local_id | 
|  | study_id | project_local_id | 
|  | uberon_id_anatomical_site ***AND*** source_text_anatomical_site | anatomy | 

In [11]:
# Query frame holding the Patient reference whose specimens are requested.
# It deliberately gets its own name: rebinding `patient_df` here would
# discard the Patient frame (with `kf_id` / `fhir_id`) that the later
# participant join reads.
specimen_query_df = pd.DataFrame({'patient_id': ["Patient/184085"]})

biospec_df = ship['PROD_URL'].trade_rows_for_dataframe(
    specimen_query_df,
    resource_type='Specimen',
    # The `patient_id` column supplies the value of the `patient` search parameter.
    df_constraints={'patient': 'patient_id'},
    fhir_paths=[
        ('study_id', 'meta.tag[0].code'),
        ('kf_id', 'identifier[0].value'),
        ('source_text_anatomical_site', 'collection.bodySite.text'),
        # NOTE(review): same FHIRPath as source_text_anatomical_site above;
        # the UBERON id presumably lives under collection.bodySite.coding --
        # confirm against a Specimen resource before relying on this column.
        ('uberon_id_anatomical_site', 'collection.bodySite.text'),
        ('patient_id', 'subject.reference'),
        ('age_at_event_days', 'collection._collectedDateTime.extension[0].extension[3].valueDuration.value'),
        ('dbgap_consent_code', 'meta.security[1].code'),
    ],
)

biospec_df.head(5)


Query & Build DF (Specimen): 100%|██████████| 1/1 [00:00<00:00, 35.85it/s]


Unnamed: 0,study_id,kf_id,patient_id,age_at_event_days,dbgap_consent_code
0,SD_VTTSHWV4,BS_CB749AMH,Patient/184085,1520,phs001785.c2
1,SD_VTTSHWV4,BS_CB749AMH_Peripheral_Whole_Blood,Patient/184085,1520,phs001785.c2


Transform patient_id &rarr; participant_id to recover the Kids First participant association that is lost in the FHIR model.

In [15]:
# `patient_df` must be the Patient frame carrying `kf_id` and `fhir_id`.
# Renaming before the merge avoids relying on the implicit `_x` / `_y` suffixes.
participant_lookup = patient_df[['kf_id', 'fhir_id']].rename(columns={'kf_id': 'participant_id'})

biospec_df = (
    biospec_df
    # "Patient/184085" -> "184085"; the .str accessor leaves a missing
    # reference as NaN instead of raising like `.apply(... .split ...)` would.
    .assign(patient_id=lambda frame: frame['patient_id'].str.rsplit('/', n=1).str[-1])
    .merge(participant_lookup, how='inner', left_on='patient_id', right_on='fhir_id')
    # The FHIR ids were only needed as join keys.
    .drop(columns=['patient_id', 'fhir_id'])
)
biospec_df.head(5)

KeyboardInterrupt: 

# Genomic Files ingest from FHIR

## DocumentReference &rarr; C2M2 File

| FHIR Field | KF Field | C2M2 Field | 
| ---------- | ---------- | ---------- |
| identifier[0].value | kf_id | local_id | 
| content[1].attachment.url | external_id | filename | 
| content[0].format.display | file_format | file_format | 
| type.coding[0].code | data_type | data_type | 
| content[0].attachment.url | latest_did | ***Required for connecting file metadata*** | 

In [None]:
# Single-row frame naming the genomic file to look up.
file_of_interest_df = pd.DataFrame({'gf_id': ["GF_SAQE6J8C"]})

gf_df = ship['PROD_URL'].trade_rows_for_dataframe(
    file_of_interest_df,
    resource_type='DocumentReference',
    # The `gf_id` column supplies the value of the `identifier` search parameter.
    df_constraints={"identifier": "gf_id"},
    fhir_paths=[
        ('study_id', 'meta.tag[0].code'),
        ('kf_id', 'identifier[0].value'),
        ('external_id', 'content[1].attachment.url'),
        ('file_format', 'content[0].format.display'),
        ('data_type', 'type.coding[0].code'),
        ('latest_did', 'content[0].attachment.url'),
    ],
)

# Keep only the last path segment of the attachment URL. The .str accessor
# leaves a missing URL as NaN instead of raising like `.apply(... .split ...)`.
gf_df['latest_did'] = gf_df['latest_did'].str.rsplit('/', n=1).str[-1]
gf_df

Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00, 36.92it/s]


Unnamed: 0,study_id,kf_id,external_id,file_format,data_type,latest_did,gf_id
0,SD_65064P2Z,GF_SAQE6J8C,s3://kf-study-us-east-1-prd-sd-z6mwd3h0/source...,cram,Aligned-Reads,4d1fc083-acc1-4a25-9806-e68129c22500,GF_SAQE6J8C
1,SD_Z6MWD3H0,GF_SAQE6J8C,s3://kf-study-us-east-1-prd-sd-z6mwd3h0/source...,cram,Aligned-Reads,4d1fc083-acc1-4a25-9806-e68129c22500,GF_SAQE6J8C
