# Inquiry into Kids First to FHIR model mapping

## Build a ship of pyrates

In [18]:
from fhir_pyrate import Pirate, Ahoy
import pandas as pd

# Show every column of a frame and keep wide frames from wrapping onto several lines.
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)

# Base URLs of the Kids First FHIR services, one per deployment tier.
PROD_URL = "https://kf-api-fhir-service.kidsfirstdrc.org/"
QA_URL = "https://kf-api-fhir-service-qa.kidsfirstdrc.org"
DEV_URL = "https://kf-api-fhir-service-dev.kidsfirstdrc.org/"

# Label -> base URL lookup, built from the constants above so each URL is
# written down exactly once. Iteration order is PROD, QA, DEV.
ENDPOINT_URLS = dict(PROD_URL=PROD_URL, QA_URL=QA_URL, DEV_URL=DEV_URL)

def get_pirate(url):
    """Build a fhir_pyrate ``Pirate`` client for the FHIR server at ``url``.

    The same ``url`` is used both as the ``Ahoy`` auth URL and as the
    ``Pirate`` base URL. The client runs with a single process and does not
    print request URLs.

    NOTE(review): ``auth_method=None`` presumably means no login is performed,
    which would leave the hard-coded username unused -- confirm against the
    fhir_pyrate documentation and consider moving the username to configuration.
    """
    return Pirate(
        auth=Ahoy(
            auth_url=url,
            auth_method=None,
            username="wnkhan32@gmail.com",
        ),
        base_url=url,
        num_processes=1,
        print_request_url=False,
    )

# One client per endpoint, keyed by the same labels as ENDPOINT_URLS.
ship = dict(
    (endpoint_label, get_pirate(endpoint_url))
    for endpoint_label, endpoint_url in ENDPOINT_URLS.items()
)

## Check Pyrate Booty

In [19]:
import requests

def get_studies(pirate: Pirate):
    """Return the study ids found on the ResearchStudy resources of one endpoint.

    Parameters
    ----------
    pirate : Pirate
        Client bound to the endpoint to query.

    Returns
    -------
    list
        The ``study_ids`` column (FHIR path ``meta.tag[0].code``) as a list.
        Empty when the request fails or when the result is not a DataFrame
        containing a ``study_ids`` column.
    """
    try:
        studies_df = pirate.steal_bundles_to_dataframe(
            resource_type='ResearchStudy',
            fhir_paths=[
                ('study_ids', 'meta.tag[0].code'),
            ],
        )
    except requests.exceptions.RequestException as e:
        # RequestException is the base class of HTTPError, ConnectionError and
        # Timeout, so an unreachable endpoint is reported here as well instead
        # of aborting the caller's per-endpoint loop.
        # `e.response` is None when no reply was received, and a Response is
        # falsy for 4xx/5xx, hence the explicit `is not None` test.
        status = e.response.status_code if e.response is not None else 'no response'
        print(f'ResearchStudy query failed ({status}): {e}')
        return []

    if isinstance(studies_df, pd.DataFrame) and 'study_ids' in studies_df.columns:
        return studies_df['study_ids'].to_list()
    return []

# Study ids available on each endpoint, keyed by endpoint label.
endpoint_studies = dict(
    (endpoint_label, get_studies(endpoint_pirate))
    for endpoint_label, endpoint_pirate in ship.items()
)

endpoint_studies

Query & Build DF (ResearchStudy): 100%|██████████| 1/1 [00:00<00:00, 398.66it/s]
Query & Build DF (ResearchStudy): 100%|██████████| 1/1 [00:00<00:00, 394.24it/s]

Failed to reach: https://kf-api-fhir-service-dev.kidsfirstdrc.org/ResearchStudy?





{'PROD_URL': ['SD_DYPMEHHF',
  'SD_65064P2Z',
  'SD_Y6VRG6MD',
  'SD_FYCR78W0',
  'SD_T8VSYRSG',
  'SD_FFVQ3T38',
  'SD_JWS3V24D',
  'SD_Z6MWD3H0',
  'SD_BHJXBDQK',
  'SD_PREASA7S',
  'SD_ZXJFFMEF',
  'SD_46SK55A3',
  'SD_PET7Q6F2',
  'SD_46RR9ZR6',
  'SD_P445ACHV',
  'SD_8Y99QZJJ',
  'SD_6FPYJQBR',
  'SD_YGVA0E1C',
  'SD_DZTB5HRR',
  'SD_VTTSHWV4',
  'SD_0TYVY1TW',
  'SD_ZFGDG5YS',
  'SD_DZ4GPQX6',
  'SD_RM8AFW0R',
  'SD_R0EPRSGS',
  'SD_W0V965XZ',
  'SD_YNSSAPHE',
  'SD_NMVV8A1Y',
  'SD_DK0KRWK8',
  'SD_B8X3C1MX',
  'SD_7NQ9151J',
  'SD_9PYZAHHE',
  'SD_1P41Z782',
  'SD_HGHFVPFD',
  'SD_Z0D9N23X',
  'SD_2CEKQ05V',
  'SD_54G4WG4R',
  'SD_JK4Z4T6V',
  'SD_W6FWTD8A',
  'SD_AQ9KVN5P'],
 'QA_URL': ['SD_AQ9KVN5P',
  'SD_W6FWTD8A',
  'SD_JK4Z4T6V',
  'SD_54G4WG4R',
  'SD_2CEKQ05V',
  'SD_Z0D9N23X',
  'SD_HGHFVPFD',
  'SD_1P41Z782',
  'SD_9PYZAHHE',
  'SD_7NQ9151J',
  'SD_B8X3C1MX',
  'SD_DK0KRWK8',
  'SD_NMVV8A1Y',
  'SD_YNSSAPHE',
  'SD_W0V965XZ',
  'SD_R0EPRSGS',
  'SD_RM8AFW0R',
  'SD_DZ

## Check Studies in Pyrate Booty

In [20]:
# Descriptive fields of every ResearchStudy on PROD.
metadata = ship["PROD_URL"].steal_bundles_to_dataframe(
    resource_type="ResearchStudy",
    fhir_paths=[
        ("kf_id", "meta.tag[0].code"),
        ("program", "keyword[0].text"),
        ("name", "title"),
        ("short_name", "keyword[1].coding[0].code"),
    ],
)

# Order by study id and renumber the index from 0.
metadata = metadata.sort_values(by="kf_id", ignore_index=True)
metadata.head(10)

Query & Build DF (ResearchStudy): 100%|██████████| 1/1 [00:00<00:00, 114.19it/s]


Unnamed: 0,kf_id,program,name,short_name
0,SD_0TYVY1TW,Kids First,Genomic Analysis of Esophageal Atresia and Tra...,KF-EATF
1,SD_1P41Z782,ICR,OpenDIPG: ICR London,ICR-DIPG
2,SD_2CEKQ05V,Kids First,Kids First: Genomic Diagnostics in Cornelia de...,KF-CDL
3,SD_46RR9ZR6,TARGET,TARGET: Acute Myeloid Leukemia (AML),TARGET-AML
4,SD_46SK55A3,Kids First,Kids First: Genomic Analysis of Congenital Dia...,KF-CDH
5,SD_54G4WG4R,Kids First,Kids First: Genomic Etiologies of CHARGE Syndr...,KF-CHARGE
6,SD_65064P2Z,,INCLUDE: (Sherman) Genomic Analysis of Congeni...,INCLUDE
7,SD_6FPYJQBR,Kids First,Kids First: Genetic Basis of Disorders/Differe...,KF-DSD
8,SD_7NQ9151J,Kids First,Genome-wide Sequencing to Identify the Genes R...,KF-ED
9,SD_8Y99QZJJ,Pediatric Brain Tumor Atlas,Pediatric Brain Tumor Atlas: PNOC,PBTA-PNOC


## Noteworthy results
1. Creation time not identified
2. No column identified for age_at_enrollment

# KF FHIR Patient &rarr; C2M2 Subject
| C2M2 Field | FHIR Field |
| ---------- | ---------- |
| local_id   | Patient.identifier[].value |
| project_local_id | Patient.meta.tag[].code |
| sex | Patient.gender |
| ethnicity | Patient.extension[1].extension[0].valueString |
 


In [21]:
# The `_tag` search parameter is constrained by the `studies` column of this frame.
studies = pd.DataFrame({"studies": ["SD_VTTSHWV4"]})

patient_df = ship["PROD_URL"].trade_rows_for_dataframe(
    studies,
    df_constraints={"_tag": "studies"},
    resource_type="Patient",
    fhir_paths=[
        ("fhir_id", "id"),
        ("kf_id", "identifier[0].value"),
        ("study_id", "meta.tag[0].code"),
        ("sex", "gender"),
        ("ethnicity", "extension[1].extension[0].valueString"),
    ],
)

patient_df.head()


Query & Build DF (Patient): 100%|██████████| 1/1 [00:01<00:00,  1.37s/it]


Unnamed: 0,fhir_id,kf_id,study_id,sex,ethnicity,studies
0,184085,PT_7WSNR8P5,SD_VTTSHWV4,male,Not Hispanic or Latino,SD_VTTSHWV4
1,184086,PT_VDSFZMF6,SD_VTTSHWV4,female,Not Hispanic or Latino,SD_VTTSHWV4
2,184087,PT_R6553VPB,SD_VTTSHWV4,female,Not Hispanic or Latino,SD_VTTSHWV4
3,184088,PT_ER8QDHZH,SD_VTTSHWV4,female,Not Hispanic or Latino,SD_VTTSHWV4
4,184089,PT_WRGDD11W,SD_VTTSHWV4,female,Hispanic or Latino,SD_VTTSHWV4


# Biospecimens Ingest from FHIR
## Specimen &rarr; C2M2 Biosample
| FHIR Field | KF Field | C2M2 Field | 
| ---------- | ---------- | ---------- |
|  | kf_id | local_id | 
|  | study_id | project_local_id | 
|  | uberon_id_anatomical_site ***AND*** source_text_anatomical_site | anatomy | 

In [22]:
# Restrict the Specimen query to one patient, given as a FHIR reference.
# (To query a whole study instead, constrain `_tag` by a study-id column:
#  df_constraints={'_tag': 'studies'}.)
specific_patient_df = pd.DataFrame({'patient_id': ["Patient/184085"]})

biospec_df = ship['PROD_URL'].trade_rows_for_dataframe(
    specific_patient_df,
    resource_type='Specimen',
    df_constraints={'patient': 'patient_id'},
    fhir_paths=[
        ('study_id', 'meta.tag[0].code'),
        ('kf_id', 'identifier[0].value'),
        ('source_text_anatomical_site', 'collection.bodySite.text'),
        # The UBERON id is a code, so read it from the bodySite coding instead of
        # repeating the free-text `.text` path used for the source text above.
        # NOTE(review): assumes the UBERON coding is the first entry of
        # `bodySite.coding` -- confirm against the server's Specimen resources.
        ('uberon_id_anatomical_site', 'collection.bodySite.coding[0].code'),
        ('patient_id', 'subject.reference'),
        ('age_at_event_days', 'collection._collectedDateTime.extension[0].extension[3].valueDuration.value'),
        ('dbgap_consent_code', 'meta.security[1].code'),
    ],
)

biospec_df.head(5)


Query & Build DF (Specimen): 100%|██████████| 1/1 [00:00<00:00, 38.12it/s]


Unnamed: 0,study_id,kf_id,patient_id,age_at_event_days,dbgap_consent_code
0,SD_VTTSHWV4,BS_CB749AMH,Patient/184085,1520,phs001785.c2
1,SD_VTTSHWV4,BS_CB749AMH_Peripheral_Whole_Blood,Patient/184085,1520,phs001785.c2


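Transform patient_id &rarr; participant_id: the FHIR Specimen only references its Patient by FHIR id, so the Kids First participant id is recovered by joining back to the Patient table.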

In [23]:
# Name the patient's kf_id `participant_id` *before* merging, so the result does
# not depend on pandas' `_x`/`_y` suffixes (they only appear when both frames
# carry a `kf_id` column; otherwise the old rename silently did nothing).
participant_lookup = patient_df[['kf_id', 'fhir_id']].rename(columns={'kf_id': 'participant_id'})

# `patient_id` holds references such as "Patient/184085"; keep the part after the
# last "/" so it can be matched to Patient.fhir_id. The `.str` accessor leaves a
# missing reference as NaN instead of raising.
# NOTE: `patient_id` is dropped here, so re-run the Specimen query cell before
# re-running this one.
biospec_df = (
    biospec_df
    .assign(patient_id=biospec_df['patient_id'].str.split('/').str[-1])
    .merge(participant_lookup, how='inner', left_on='patient_id', right_on='fhir_id')
    .drop(columns=['patient_id', 'fhir_id'])
)
biospec_df.head(5)

Unnamed: 0,study_id,kf_id,age_at_event_days,dbgap_consent_code,participant_id
0,SD_VTTSHWV4,BS_CB749AMH,1520,phs001785.c2,PT_7WSNR8P5
1,SD_VTTSHWV4,BS_CB749AMH_Peripheral_Whole_Blood,1520,phs001785.c2,PT_7WSNR8P5


# Genomic Files ingest from FHIR

## DocumentReference &rarr; C2M2 File

| FHIR Field | KF Field | C2M2 Field | 
| ---------- | ---------- | ---------- |
| identifier[0].value | kf_id | local_id | 
| content[1].attachment.url | external_id | filename | 
| content[0].format.display | file_format | file_format | 
| type.coding[0].code | data_type | data_type | 
| content[0].attachment.url | latest_did | ***Required for connecting file metadata*** | 

In [24]:
# Look up one genomic file by its identifier.
file_of_interest_df = pd.DataFrame({"gf_id": ["GF_SAQE6J8C"]})

# `fhir_paths` is deliberately not passed here, so the call returns the
# flattened fields of the matching DocumentReference resources.
# The curated selection, kept for reference:
#     ('study_id', 'meta.tag[0].code'),
#     ('kf_id', 'identifier[0].value'),
#     ('external_id', 'content[1].attachment.url'),
#     ('file_format', 'content[0].format.display'),
#     ('data_type', 'type.coding[0].code'),
#     ('latest_did', 'content[0].attachment.url'),
#     ('specimen_reference', 'context.related[0].reference'),
# With that selection, `latest_did` would then be cut down to the part after
# the last "/":  gf_df['latest_did'].apply(lambda the_col: the_col.split('/')[-1])
gf_df = ship["PROD_URL"].trade_rows_for_dataframe(
    file_of_interest_df,
    df_constraints={"identifier": "gf_id"},
    resource_type="DocumentReference",
)

gf_df

Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00, 40.06it/s]


Unnamed: 0,resourceType,id,meta_versionId,meta_lastUpdated,meta_source,meta_profile_0,meta_tag_0_system,meta_tag_0_code,identifier_0_use,identifier_0_system,identifier_0_value,status,docStatus,type_coding_0_system,type_coding_0_code,type_coding_0_display,type_text,category_0_coding_0_system,category_0_coding_0_code,category_0_coding_0_display,category_0_text,category_1_coding_0_system,category_1_coding_0_code,category_1_coding_0_display,category_1_text,subject_reference,securityLabel_0_coding_0_system,securityLabel_0_coding_0_code,securityLabel_0_coding_0_display,securityLabel_0_text,securityLabel_1_coding_0_code,securityLabel_1_text,securityLabel_2_coding_0_code,securityLabel_2_text,securityLabel_3_text,content_0_attachment_extension_0_url,content_0_attachment_extension_0_valueDecimal,content_0_attachment_extension_1_url,content_0_attachment_extension_1_valueCodeableConcept_coding_0_display,content_0_attachment_extension_1_valueCodeableConcept_text,content_0_attachment_extension_2_url,content_0_attachment_extension_2_valueCodeableConcept_coding_0_display,content_0_attachment_extension_2_valueCodeableConcept_text,content_0_attachment_url,content_0_attachment_title,content_0_format_display,content_1_attachment_url,context_related_0_reference,gf_id
0,DocumentReference,45166,2,2023-06-14T17:29:04.954+00:00,#eGRFNnjYbREZmBfA,https://ncpi-fhir.github.io/ncpi-fhir-ig/Struc...,https://kf-api-dataservice.kidsfirstdrc.org/st...,SD_65064P2Z,official,https://kf-api-dataservice.kidsfirstdrc.org/ge...,GF_SAQE6J8C,current,final,https://includedcc.org/fhir/code-systems/data_...,Aligned-Reads,Aligned Reads,Aligned Reads,https://includedcc.org/fhir/code-systems/exper...,WGS,Whole Genome Sequencing,WGS,https://includedcc.org/fhir/code-systems/data_...,Genomics,Genomics,WGS,Patient/5267,https://includedcc.org/fhir/code-systems/data_...,controlled,Controlled,True,c1,phs002330.c1,c999,phs002330.c999,SD_Z6MWD3H0,https://nih-ncpi.github.io/ncpi-fhir-ig/Struct...,20326249715,https://nih-ncpi.github.io/ncpi-fhir-ig/Struct...,md5,985f7d0968cc6a6394299a44abf23ff2,https://nih-ncpi.github.io/ncpi-fhir-ig/Struct...,sha256,d23f5b850896f92963a581fb40cc57597a3c6f86401936...,drs://data.kidsfirstdrc.org/4d1fc083-acc1-4a25...,SSH890577.cram,cram,s3://kf-study-us-east-1-prd-sd-z6mwd3h0/source...,Specimen/13082,GF_SAQE6J8C
1,DocumentReference,626202,1,2023-08-10T18:15:05.183+00:00,#YfAzTQuS4Tvz89Zm,https://ncpi-fhir.github.io/ncpi-fhir-ig/Struc...,https://kf-api-dataservice.kidsfirstdrc.org/st...,SD_Z6MWD3H0,official,https://kf-api-dataservice.kidsfirstdrc.org/ge...,GF_SAQE6J8C,current,final,https://includedcc.org/fhir/code-systems/data_...,Aligned-Reads,Aligned Reads,Aligned Reads,https://includedcc.org/fhir/code-systems/exper...,WGS,Whole Genome Sequencing,WGS,https://includedcc.org/fhir/code-systems/data_...,Genomics,Genomics,WGS,Patient/115270,https://includedcc.org/fhir/code-systems/data_...,controlled,Controlled,True,c1,phs002330.c1,c999,phs002330.c999,SD_Z6MWD3H0,https://nih-ncpi.github.io/ncpi-fhir-ig/Struct...,20326249715,https://nih-ncpi.github.io/ncpi-fhir-ig/Struct...,md5,985f7d0968cc6a6394299a44abf23ff2,https://nih-ncpi.github.io/ncpi-fhir-ig/Struct...,sha256,d23f5b850896f92963a581fb40cc57597a3c6f86401936...,drs://data.kidsfirstdrc.org/4d1fc083-acc1-4a25...,SSH890577.cram,cram,s3://kf-study-us-east-1-prd-sd-z6mwd3h0/source...,Specimen/355134,GF_SAQE6J8C


In [25]:
# Sort every study into one of three buckets according to what the
# DocumentReference query for that study returns.
docref_populated_studies, docref_unpopulated_studies, docref_returned_empty_dict = [], [], []

for study_id in metadata["kf_id"].tolist():
    # One query per study, filtered by the study's `_tag`.
    gf_df = ship["PROD_URL"].trade_rows_for_dataframe(
        pd.DataFrame({"study": [study_id]}),
        df_constraints={"_tag": "study"},
        resource_type="DocumentReference",
        fhir_paths=[
            ("study_id", "meta.tag[0].code"),
            ("kf_id", "identifier[0].value"),
            ("external_id", "content[1].attachment.url"),
            ("file_format", "content[0].format.display"),
            ("data_type", "type.coding[0].code"),
            ("latest_did", "content[0].attachment.url"),
        ],
    )

    # A dict result gets its own bucket; a non-empty DataFrame counts as
    # populated; anything else (including an empty DataFrame) as unpopulated.
    if isinstance(gf_df, dict):
        docref_returned_empty_dict.append(study_id)
        continue
    if isinstance(gf_df, pd.DataFrame) and not gf_df.empty:
        docref_populated_studies.append(study_id)
    else:
        docref_unpopulated_studies.append(study_id)

    

Query & Build DF (DocumentReference):   0%|          | 0/1 [00:00<?, ?it/s]

Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.71it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.76it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.86it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.72it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.68it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:37<00:00, 37.25s/it]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.77it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
Query & Build DF (DocumentReference): 100%|██████████| 1/1 [00:00<00:00,  3.

In [26]:
# Sort every study into one of three buckets according to what the
# Specimen query for that study returns.
spec_populated_studies, spec_unpopulated_studies, spec_returned_empty_dict = [], [], []

for study_id in metadata["kf_id"].tolist():
    # One query per study, filtered by the study's `_tag`. No `fhir_paths` are
    # passed; only whether anything comes back matters here.
    # NOTE(review): the result is bound to `gf_df` although it holds Specimen
    # data, and it overwrites any earlier `gf_df`.
    gf_df = ship["PROD_URL"].trade_rows_for_dataframe(
        pd.DataFrame({"study": [study_id]}),
        df_constraints={"_tag": "study"},
        resource_type="Specimen",
    )

    # A dict result gets its own bucket; a non-empty DataFrame counts as
    # populated; anything else (including an empty DataFrame) as unpopulated.
    if isinstance(gf_df, dict):
        spec_returned_empty_dict.append(study_id)
        continue
    if isinstance(gf_df, pd.DataFrame) and not gf_df.empty:
        spec_populated_studies.append(study_id)
    else:
        spec_unpopulated_studies.append(study_id)

Query & Build DF (Specimen): 100%|██████████| 1/1 [00:02<00:00,  2.74s/it]
Query & Build DF (Specimen): 100%|██████████| 1/1 [00:02<00:00,  2.61s/it]
Query & Build DF (Specimen): 100%|██████████| 1/1 [00:00<00:00,  3.80it/s]
Query & Build DF (Specimen): 100%|██████████| 1/1 [00:08<00:00,  8.93s/it]
Query & Build DF (Specimen): 100%|██████████| 1/1 [00:13<00:00, 13.71s/it]
Query & Build DF (Specimen): 100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
Query & Build DF (Specimen): 100%|██████████| 1/1 [00:09<00:00,  9.55s/it]

Query & Build DF (Specimen): 100%|██████████| 1/1 [00:01<00:00,  1.65s/it]

Query & Build DF (Specimen): 100%|██████████| 1/1 [00:01<00:00,  1.84s/it]

Query & Build DF (Specimen): 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]
Query & Build DF (Specimen): 100%|██████████| 1/1 [00:09<00:00,  9.52s/it]
Query & Build DF (Specimen): 100%|██████████| 1/1 [00:00<00:00,  3.71it/s]

Query & Build DF (Specimen): 100%|██████████| 1/1 [00:01<00:00,  1.92s/it]
Query & Build DF (Spe

In [27]:
# Turn both collections into sets so they can be intersected.
docref_populated_studies, spec_populated_studies = (
    set(docref_populated_studies),
    set(spec_populated_studies),
)

# Studies populated for both DocumentReference and Specimen.
overlapping_studies = docref_populated_studies.intersection(spec_populated_studies)
overlapping_studies

{'SD_65064P2Z',
 'SD_BHJXBDQK',
 'SD_DYPMEHHF',
 'SD_FYCR78W0',
 'SD_JWS3V24D',
 'SD_T8VSYRSG',
 'SD_Y6VRG6MD',
 'SD_Z6MWD3H0'}

In [28]:
# Study ids whose DocumentReference query returned a non-empty DataFrame.
docref_populated_studies

{'SD_65064P2Z',
 'SD_BHJXBDQK',
 'SD_DYPMEHHF',
 'SD_FYCR78W0',
 'SD_JWS3V24D',
 'SD_T8VSYRSG',
 'SD_Y6VRG6MD',
 'SD_Z6MWD3H0'}

In [29]:
# Study ids whose Specimen query returned a non-empty DataFrame.
spec_populated_studies

{'SD_0TYVY1TW',
 'SD_1P41Z782',
 'SD_46RR9ZR6',
 'SD_46SK55A3',
 'SD_65064P2Z',
 'SD_6FPYJQBR',
 'SD_7NQ9151J',
 'SD_8Y99QZJJ',
 'SD_9PYZAHHE',
 'SD_B8X3C1MX',
 'SD_BHJXBDQK',
 'SD_DYPMEHHF',
 'SD_DZ4GPQX6',
 'SD_DZTB5HRR',
 'SD_FYCR78W0',
 'SD_HGHFVPFD',
 'SD_JWS3V24D',
 'SD_NMVV8A1Y',
 'SD_P445ACHV',
 'SD_PET7Q6F2',
 'SD_PREASA7S',
 'SD_R0EPRSGS',
 'SD_RM8AFW0R',
 'SD_T8VSYRSG',
 'SD_VTTSHWV4',
 'SD_W0V965XZ',
 'SD_Y6VRG6MD',
 'SD_YGVA0E1C',
 'SD_YNSSAPHE',
 'SD_Z6MWD3H0',
 'SD_ZXJFFMEF'}