# Synthea Data Mining

The Synthea Health tool was used to generate a total of 11,638 electronic health care records:

```bash
sh synthea_setup.sh
```

Health records in FIHR format: https://en.wikipedia.org/wiki/Fast_Healthcare_Interoperability_Resources are provided as JSON files in the output folder

# Setup

## Libraries

In [1]:
#!pip install fhir.resources
!pip install pandas
!pip install numpy



In [2]:
import pandas as pd 
import numpy as np
import datetime
import os

from fhir.resources.R4B.bundle import Bundle
from fhir.resources.R4B.patient import Patient
from fhir.resources.R4B.condition import Condition
from fhir.resources.R4B.observation import Observation
from fhir.resources.R4B.medicationrequest import MedicationRequest
from fhir.resources.R4B.procedure import Procedure
from fhir.resources.R4B.encounter import Encounter
from fhir.resources.R4B.claim import Claim
from fhir.resources.R4B.immunization import Immunization
from fhir.resources.R4B.humanname import HumanName

## Functions

### FIHR Import

In [3]:
def read_fihr_json(file_path):
    pt_resources = []
    pt_bundle = Bundle.parse_file(file_path)
    for entry in pt_bundle.entry:
        pt_resources.append(entry.resource)
    return(pt_resources)

### Patient Information

In [4]:
def get_patient_address_info(patient_obj):
    pt_country, pt_state, pt_city, pt_lat, pt_lon = '', '', '', 0,0
    for entry in patient_obj.address:
        pt_country += entry.country
        pt_state += entry.state
        pt_city += entry.city
        for ext in entry.extension:
            for ext2 in ext.extension:
                if ext2.url == 'latitude':
                    pt_lat += float(ext2.valueDecimal)
                if ext2.url == 'longitude':
                    pt_lon += float(ext2.valueDecimal)
    return pt_country, pt_state, pt_city, pt_lat, pt_lon

def get_patient_demographic(patient_obj):
    pt_birth_date = patient_obj.birthDate
    pt_gender = patient_obj.gender
    pt_gp = patient_obj.generalPractitioner
    pt_id = patient_obj.id
    pt_martial_status = patient_obj.maritalStatus.text
    if len(patient_obj.name[0].given) == 2:
        pt_first_name = patient_obj.name[0].given[0]
        pt_middle_name = patient_obj.name[0].given[1]
    else:
        pt_first_name = patient_obj.name[0].given[0]
        pt_middle_name = 'None'
    pt_family_name = patient_obj.name[0].family
    return pt_id, pt_first_name, pt_middle_name, pt_family_name, pt_gender, pt_birth_date, pt_martial_status, pt_gp

def get_patient_age(patient_obj):
    pt_bd = get_patient_demographic(patient_obj)[5]
    today = datetime.date.today()
    pt_age = int((today - pt_bd).days // 365.2425)
    return(pt_age)

def create_patient_entry(resource_obj):
    patient_ind = [ind for ind, entry in enumerate(resource_obj) if entry.resource_type == 'Patient'][0]
    patient_obj = resource_obj[patient_ind]
    pt_id, pt_first_name, pt_middle_name, pt_family_name, pt_gender, pt_birth_date, pt_marital_status, pt_gp = get_patient_demographic(patient_obj)
    pt_age = get_patient_age(patient_obj)
    pt_country, pt_state, pt_city, pt_lat, pt_lon = get_patient_address_info(patient_obj)
    pt_entry = {
        'uuid':pt_id,
        'first_name':pt_first_name,
        'middle_name':pt_middle_name,
        'family_name':pt_family_name,
        'gender':pt_gender,
        'birth_date': str(pt_birth_date),
        'marital_status': pt_marital_status,
        'general_practioner': pt_gp,
        'age_years': pt_age,
        'country': pt_country,
        'state': pt_state,
        'city': pt_city,
        'location_lat': pt_lat,
        'location_long': pt_lon
    }
    return pt_entry

### Condition

In [5]:
def get_condition_entry(condition_obj):
    pt_id = condition_obj.subject.reference.replace('urn:uuid:', '')
    cond_entry_id = condition_obj.encounter.reference.replace('urn:uuid:', '')
    cond_category = condition_obj.category[0].coding[0].display
    cond_code = condition_obj.code.coding[0].code
    cond_system = condition_obj.code.coding[0].system
    cond_text = condition_obj.code.coding[0].display
    cond_date = str(condition_obj.recordedDate.date())
    condition_entry = {
        'uuid':pt_id,
        'entry_id':cond_entry_id,
        'category':cond_category,
        'code':cond_code,
        'code_system':cond_system,
        'code_text':cond_text,
        'record_date':cond_date
    }
    return condition_entry

def create_condition_entries(resource_obj):
    condition_entries = {
    'uuid':[],
    'entry_id':[],
    'category':[],
    'code':[],
    'code_system':[],
    'code_text':[],
    'record_date':[]
    }
    condition_ind = [ind for ind, entry in enumerate(resource_obj) if entry.resource_type == 'Condition']
    for ind in condition_ind:
        entry = resource_obj[ind]
        entry_dict = get_condition_entry(entry)
        for key, item in entry_dict.items():
            condition_entries[key] += [item]
    return condition_entries

### Observation

In [6]:
def get_observation_entry(observation_obj):
    pt_id = []
    obs_entry_id = []
    obs_category = []
    obs_code = []
    obs_system = []
    obs_text = []
    obs_value = []
    obs_unit = []
    obs_date = []
    if observation_obj.component is not None:
        for obs_comp in observation_obj.component:
            pt_id.append(observation_obj.subject.reference.replace('urn:uuid:', ''))
            obs_entry_id.append(observation_obj.encounter.reference.replace('urn:uuid:', ''))
            obs_category.append(observation_obj.category[0].coding[0].display)
            obs_code.append(obs_comp.code.coding[0].code)
            obs_system.append(obs_comp.code.coding[0].system)
            obs_text.append(obs_comp.code.coding[0].display)
            if (obs_comp.valueQuantity is None) & (obs_comp.valueCodeableConcept is None):
                obs_value.append(None)
                obs_unit.append(None)
            elif (obs_comp.valueQuantity is None) & (obs_comp.valueCodeableConcept is not None):
                obs_value.append(obs_comp.valueCodeableConcept.text)
                obs_unit.append(obs_comp.valueCodeableConcept.coding[0].code)
            else:
                obs_value.append(obs_comp.valueQuantity.value)
                obs_unit.append(obs_comp.valueQuantity.unit)
            obs_date.append(str(observation_obj.effectiveDateTime.date()))
    else:
        pt_id.append(observation_obj.subject.reference.replace('urn:uuid:', ''))
        obs_entry_id.append(observation_obj.encounter.reference.replace('urn:uuid:', ''))
        obs_category.append(observation_obj.category[0].coding[0].display)
        obs_code.append(observation_obj.code.coding[0].code)
        obs_system.append(observation_obj.code.coding[0].system)
        obs_text.append(observation_obj.code.coding[0].display)
        if (observation_obj.valueQuantity is None) & (observation_obj.valueCodeableConcept is None):
                obs_value.append(None)
                obs_unit.append(None)
        elif (observation_obj.valueQuantity is None) & (observation_obj.valueCodeableConcept is not None):
            obs_value.append(observation_obj.valueCodeableConcept.text)
            obs_unit.append(observation_obj.valueCodeableConcept.coding[0].code)
        else:
            obs_value.append(observation_obj.valueQuantity.value)
            obs_unit.append(observation_obj.valueQuantity.unit)
        obs_date.append(str(observation_obj.effectiveDateTime.date()))
    observation_entry = {
        'uuid':pt_id,
        'entry_id':obs_entry_id,
        'category':obs_category,
        'code':obs_code,
        'code_system':obs_system,
        'code_text':obs_text,
        'code_value':obs_value,
        'code_unit':obs_unit,
        'record_date':obs_date
    }
    return observation_entry

def create_observation_entries(resource_obj):
    observation_entries = {
    'uuid':[],
    'entry_id':[],
    'category':[],
    'code':[],
    'code_system':[],
    'code_text':[],
    'code_value':[],
    'code_unit':[],
    'record_date':[]
    }
    observation_ind = [ind for ind, entry in enumerate(resource_obj) if entry.resource_type == 'Observation']
    for ind in observation_ind:
        entry = resource_obj[ind]
        entry_dict = get_observation_entry(entry)
        for key, item in entry_dict.items():
            observation_entries[key] += item
    return observation_entries

### Medication Request

In [7]:
def get_medication_entry(medication_obj):
    pt_id = []
    med_entry_id = []
    med_category = []
    med_code = []
    med_system = []
    med_text = []
    med_insurance = []
    med_intent = []
    med_dosage =  []
    med_reason = []
    med_requester = []
    med_date = []
    
    pt_id.append(medication_obj.subject.reference.replace('urn:uuid:', ''))
    med_entry_id.append(medication_obj.encounter.reference.replace('urn:uuid:', ''))
    med_category.append(medication_obj.category[0].coding[0].display)
    if medication_obj.medicationCodeableConcept is None:
        med_code.append(None)
        med_system.append(None)
        med_text.append(None)
    else:
        med_code.append(medication_obj.medicationCodeableConcept.coding[0].code)
        med_system.append(medication_obj.medicationCodeableConcept.coding[0].system)
        med_text.append(medication_obj.medicationCodeableConcept.coding[0].display)
    med_insurance.append(medication_obj.insurance)
    med_intent.append(medication_obj.intent)
    if medication_obj.dosageInstruction is not None:
        med_dosage.append(medication_obj.dosageInstruction[0].text)
    else:
        med_dosage.append(None)
    if medication_obj.reasonReference is not None:
         med_reason.append(medication_obj.reasonReference[0].display)
    else:
         med_reason.append(None)
    med_requester.append(medication_obj.requester.display)
    med_date.append(str(medication_obj.authoredOn.date()))
    medication_entry = {
        'uuid':pt_id,
        'entry_id':med_entry_id,
        'category':med_category,
        'code':med_code,
        'code_system':med_system,
        'code_text':med_text,
        'insurance':med_insurance,
        'intent':med_intent,
        'dosage':med_dosage,
        'reason':med_reason,
        'requester':med_requester,
        'request_date':med_date
    }
    return medication_entry

def create_medication_entries(resource_obj):
    medication_entries = {
    'uuid':[],
    'entry_id':[],
    'category':[],
    'code':[],
    'code_system':[],
    'code_text':[],
    'insurance':[],
    'intent':[],
    'dosage':[],
    'reason':[],
    'requester':[],
    'request_date':[]
    }
    medication_ind = [ind for ind, entry in enumerate(resource_obj) if entry.resource_type == 'MedicationRequest']
    for ind in medication_ind:
        entry = resource_obj[ind]
        entry_dict = get_medication_entry(entry)
        for key, item in entry_dict.items():
            medication_entries[key] += item
    return medication_entries

### Procedure

In [8]:
def get_procedure_entry(procedure_obj):
    pt_id = []
    proc_entry_id = []
    proc_code = []
    proc_system = []
    proc_text = []
    proc_location = []
    proc_start_date = []
    proc_end_date = []
    proc_duration_seconds = []
    
    pt_id.append(procedure_obj.subject.reference.replace('urn:uuid:', ''))
    proc_entry_id.append(procedure_obj.encounter.reference.replace('urn:uuid:', ''))
    proc_code.append(procedure_obj.code.coding[0].code)
    proc_system.append(procedure_obj.code.coding[0].system)
    proc_text.append(procedure_obj.code.coding[0].display)
    proc_location.append(procedure_obj.location.display)
    proc_start_date.append(str(procedure_obj.performedPeriod.start.date()))
    proc_end_date.append(str(procedure_obj.performedPeriod.end.date()))
    proc_duration_seconds.append(str((procedure_obj.performedPeriod.end - procedure_obj.performedPeriod.start).seconds))
    procedure_entry = {
        'uuid':pt_id,
        'entry_id':proc_entry_id,
        'code':proc_code,
        'code_system':proc_system,
        'code_text':proc_text,
        'location':proc_location,
        'start_date':proc_start_date,
        'end_date':proc_end_date,
        'duration_seconds':proc_duration_seconds
    }
    return procedure_entry

def create_procedure_entries(resource_obj):
    procedure_entries = {
    'uuid':[],
    'entry_id':[],
    'code':[],
    'code_system':[],
    'code_text':[],
    'location':[],
    'start_date':[],
    'end_date':[],
    'duration_seconds':[]
    }
    procedure_ind = [ind for ind, entry in enumerate(resource_obj) if entry.resource_type == 'Procedure']
    for ind in procedure_ind:
        entry = resource_obj[ind]
        entry_dict = get_procedure_entry(entry)
        for key, item in entry_dict.items():
            procedure_entries[key] += item
    return procedure_entries

### Claims

In [9]:
def get_claim_entry(claim_obj):
    pt_id = []
    claim_entry_id = []
    claim_code = []
    claim_system = []
    claim_item = []
    claim_payee = []
    claim_provider = []
    claim_insurance = []
    claim_value = []
    claim_unit = []
    claim_type = []
    claim_date = []
    
    pt_id.append(claim_obj.patient.reference.replace('urn:uuid:', ''))
    claim_entry_id.append(claim_obj.item[0].encounter[0].reference.replace('urn:uuid:', ''))
    claim_code.append(claim_obj.item[0].productOrService.coding[0].code)
    claim_system.append(claim_obj.item[0].productOrService.coding[0].system)
    claim_item.append(claim_obj.item[0].productOrService.coding[0].display)
    claim_payee.append(claim_obj.payee)
    claim_provider.append(claim_obj.provider.display)
    claim_insurance.append(claim_obj.insurance[0].coverage.display)
    claim_value.append(claim_obj.total.value)
    claim_unit.append(claim_obj.total.currency)
    claim_type.append(claim_obj.type.coding[0].code)
    claim_date.append(str(claim_obj.billablePeriod.start.date()))
    claim_entry = {
        'uuid':pt_id,
        'entry_id':claim_entry_id,
        'item_code':claim_code,
        'item_system':claim_system,
        'item_text':claim_item,
        'payee':claim_payee,
        'provider':claim_provider,
        'insurance':claim_insurance,
        'item_value':claim_value,
        'item_unit':claim_unit,
        'claim_type':claim_type,
        'claim_date':claim_date
    }
    return claim_entry

def create_claim_entries(resource_obj):
    claim_entries = {
    'uuid':[],
    'entry_id':[],
    'item_code':[],
    'item_system':[],
    'item_text':[],
    'payee':[],
    'provider':[],
    'insurance':[],
    'item_value':[],
    'item_unit':[],
    'claim_type':[],
    'claim_date':[]
    }
    claim_ind = [ind for ind, entry in enumerate(resource_obj) if entry.resource_type == 'Claim']
    for ind in claim_ind:
        entry = resource_obj[ind]
        entry_dict = get_claim_entry(entry)
        for key, item in entry_dict.items():
            claim_entries[key] += item
    return claim_entries

### Patient Bundle

In [10]:
class FihrRecord:
    def __init__(self, file_path):
        self.fihr_file_path = file_path
        self.fihr_object = read_fihr_json(self.fihr_file_path)
        self.patient = create_patient_entry(self.fihr_object)
        self.conditions = create_condition_entries(self.fihr_object)
        self.observations = create_observation_entries(self.fihr_object)
        self.medications = create_medication_entries(self.fihr_object)
        self.procedures = create_procedure_entries(self.fihr_object)
        self.claims = create_claim_entries(self.fihr_object)
    
    def patient_table(self):
        return pd.DataFrame(self.patient, index=[0])
    
    def conditions_table(self):
        return pd.DataFrame(self.conditions)

    def observations_table(self):
        return pd.DataFrame(self.observations)
    
    def medications_table(self):
        return pd.DataFrame(self.medications)
    
    def procedures_table(self):
        return pd.DataFrame(self.procedures)
    
    def claims_table(self):
        return pd.DataFrame(self.claims)

## Import Tables

In [11]:
fihr_directory = 'output/fhir/' 
fihr_files = os.listdir(fihr_directory)

patients_df = pd.DataFrame(
    columns = [
        'uuid', 'first_name', 'middle_name', 'family_name',
        'gender', 'birth_date', 'marital_status', 'general_practioner',
        'age_years', 'country', 'state', 'city', 'location_lat', 'location_long'
        ]
    )

conditions_df = pd.DataFrame(
    columns = [
        'uuid', 'entry_id', 'category', 'code', 'code_system', 'code_text', 'record_date'
        ]
    )

observations_df = pd.DataFrame(
    columns = [
        'uuid', 'entry_id', 'category', 'code', 'code_system', 'code_text', 'code_unit', 'record_date'
        ]
    )

medications_df = pd.DataFrame(
    columns = [
        'uuid', 'entry_id', 'category', 'code', 'code_system', 'code_text', 
        'insurance', 'intent', 'dosage', 'reason', 'requester', 'request_date'
        ]
    )

procedures_df = pd.DataFrame(
    columns = [
        'uuid', 'entry_id', 'code', 'code_system', 'code_text', 
        'location', 'start_date', 'end_date', 'duration_seconds'
        ]
    )

claims_df = pd.DataFrame(
    columns = [
        'uuid', 'entry_id', 'item_code', 'item_system', 'item_text', 
        'payee', 'provider', 'insurance', 'item_value', 'item_unit',
        'claim_type', 'claim_date'
        ]
    )

In [12]:
for file in fihr_files:
    if ('hospital' in file) | ('practitioner' in file):
        continue
    print('Processing', file)
    full_file = fihr_directory + file
    fihr_obj = FihrRecord(full_file)
    patients_df = pd.concat([patients_df, fihr_obj.patient_table()]).reset_index(drop=True)
    conditions_df = pd.concat([conditions_df, fihr_obj.conditions_table()]).reset_index(drop=True)
    observations_df = pd.concat([observations_df, fihr_obj.observations_table()]).reset_index(drop=True)
    medications_df = pd.concat([medications_df, fihr_obj.medications_table()]).reset_index(drop=True)
    procedures_df = pd.concat([procedures_df, fihr_obj.procedures_table()]).reset_index(drop=True)
    claims_df = pd.concat([claims_df, fihr_obj.claims_table()]).reset_index(drop=True)

Processing Abram53_Hettinger594_ad1c2021-e196-38c4-c2a5-c9b5edbd6161.json


Processing Adela471_Antonia30_Palomino953_900c6199-ef17-a5a0-715e-6e59dd3746b9.json
Processing Adolfo777_Kutch271_f5803203-b0cd-9e69-36f6-e2eb61fecdfc.json
Processing Adolfo777_Mills423_063eaafa-f743-9ce7-989d-7211f9332e9d.json
Processing Adolfo777_Okuneva707_f6d05aa0-2aa4-a262-73e5-19d0c95ea3aa.json
Processing Alaina222_Crissy767_D'Amore443_ae24ec52-8791-39a7-a153-e632a67e25f1.json
Processing Alan320_Schneider199_b8d53a21-538b-6e24-57e6-c956f3f56382.json
Processing Alec433_Rippin620_01a2fc31-415c-c755-9910-dac05827b876.json
Processing Alessandra932_Catherin804_Kertzmann286_bcdfd9aa-9a75-a4ed-e3ef-1adccb63a162.json
Processing Aleta47_Shawn523_Morissette863_a320c901-3832-f553-e910-27fc4b8efa1b.json
Processing Alfredo17_Sarabia507_4d834da9-fb50-534f-da5d-4ca9b888773b.json
Processing Alisia5_Lavera253_Douglas31_e758aaa0-13b2-dc63-b8d2-2df6e8d4c9e7.json
Processing Allen322_Lind531_eb8b3158-32d6-7836-eba4-4f8e3141c944.json
Processing Alonzo487_Renner328_1bd96630-b659-16d7-ad37-ad792fdeb25b.

In [20]:
patients_df.head(2)

Unnamed: 0,uuid,first_name,middle_name,family_name,gender,birth_date,marital_status,general_practioner,age_years,country,state,city,location_lat,location_long
0,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,Abram53,Eli762,Hettinger594,male,1961-10-16,Married,,62,US,CA,Santa Ana,33.723615,-117.873961
1,900c6199-ef17-a5a0-715e-6e59dd3746b9,Adela471,Antonia30,Palomino953,female,1964-11-17,Widowed,,59,US,CA,Los Angeles,34.054434,-118.323101


In [22]:
patients_df.shape

(526, 14)

In [15]:
conditions_df.head(2)

Unnamed: 0,uuid,entry_id,category,code,code_system,code_text,record_date
0,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,8b2c1e8c-f5ad-3946-927f-52caf5dd5270,Encounter Diagnosis,128613002,http://snomed.info/sct,Seizure disorder,1971-04-25
1,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,8b2c1e8c-f5ad-3946-927f-52caf5dd5270,Encounter Diagnosis,703151001,http://snomed.info/sct,History of single seizure (situation),1971-04-25


In [16]:
observations_df.head(2)

Unnamed: 0,uuid,entry_id,category,code,code_system,code_text,code_unit,record_date,code_value
0,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,c61e9875-cca9-bf41-5043-515380110a02,Laboratory,2345-7,http://loinc.org,Glucose [Mass/volume] in Serum or Plasma,mg/dL,2013-10-21,98.64
1,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,c61e9875-cca9-bf41-5043-515380110a02,Laboratory,3094-0,http://loinc.org,Urea nitrogen [Mass/volume] in Serum or Plasma,mg/dL,2013-10-21,16.68


In [17]:
medications_df.head(2)

Unnamed: 0,uuid,entry_id,category,code,code_system,code_text,insurance,intent,dosage,reason,requester,request_date
0,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,4e8d0356-d0c7-25d6-b863-1753bc9b2f04,Community,310798,http://www.nlm.nih.gov/research/umls/rxnorm,Hydrochlorothiazide 25 MG Oral Tablet,,order,,Essential hypertension (disorder),Dr. Michael441 Grady603,2012-12-10
1,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,4e8d0356-d0c7-25d6-b863-1753bc9b2f04,Community,314076,http://www.nlm.nih.gov/research/umls/rxnorm,lisinopril 10 MG Oral Tablet,,order,,Essential hypertension (disorder),Dr. Michael441 Grady603,2012-12-10


In [18]:
procedures_df.head(2)

Unnamed: 0,uuid,entry_id,code,code_system,code_text,location,start_date,end_date,duration_seconds
0,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,c61e9875-cca9-bf41-5043-515380110a02,430193006,http://snomed.info/sct,Medication Reconciliation (procedure),SOUTH COAST POST ACUTE,2013-10-21,2013-10-21,900
1,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,c61e9875-cca9-bf41-5043-515380110a02,710824005,http://snomed.info/sct,Assessment of health and social care needs (pr...,SOUTH COAST POST ACUTE,2013-10-21,2013-10-21,1889


In [19]:
claims_df.head(2)

Unnamed: 0,uuid,entry_id,item_code,item_system,item_text,payee,provider,insurance,item_value,item_unit,claim_type,claim_date
0,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,8b2c1e8c-f5ad-3946-927f-52caf5dd5270,50849002,http://snomed.info/sct,Emergency room admission (procedure),,SOUTH COAST POST ACUTE,Medicaid,10655.25,USD,institutional,1971-04-25
1,ad1c2021-e196-38c4-c2a5-c9b5edbd6161,1496fc93-3884-33d9-2ec7-4b946429954d,162673000,http://snomed.info/sct,General examination of patient (procedure),,SANO MEDICAL CLINIC INC,Dual Eligible,883.26,USD,professional,1979-12-10


# Export

In [24]:
patients_df.to_csv('csv/patient_data.csv', index=False)
conditions_df.to_csv('csv/conditions_data.csv', index=False)
observations_df.to_csv('csv/observations_data.csv', index=False)
medications_df.to_csv('csv/medications_data.csv', index=False)
procedures_df.to_csv('csv/procedures_data.csv', index=False)
claims_df.to_csv('csv/claims_data.csv', index=False)