# Synthea Data Mining

The Synthea Health tool was used to generate a total of 11,638 electronic health care records:

```bash
sh synthea_setup.sh
```

Health records in [FHIR](https://en.wikipedia.org/wiki/Fast_Healthcare_Interoperability_Resources) (Fast Healthcare Interoperability Resources) format are provided as JSON files in the `output/fhir/` folder.

# Setup

## Libraries

In [1]:
#!pip install fhir.resources
#!pip install pandas
#!pip install numpy



In [2]:
import pandas as pd 
import numpy as np
import datetime
import os

from fhir.resources.R4B.bundle import Bundle
from fhir.resources.R4B.patient import Patient
from fhir.resources.R4B.condition import Condition
from fhir.resources.R4B.observation import Observation
from fhir.resources.R4B.medicationrequest import MedicationRequest
from fhir.resources.R4B.procedure import Procedure
from fhir.resources.R4B.encounter import Encounter
from fhir.resources.R4B.claim import Claim
from fhir.resources.R4B.immunization import Immunization
from fhir.resources.R4B.humanname import HumanName

## Functions

### FHIR Import

In [3]:
def read_fihr_json(file_path):
    """Load a FHIR bundle from a JSON file and return its resources.

    Args:
        file_path: Path of a JSON file holding a FHIR ``Bundle``.

    Returns:
        A list with the ``resource`` of every bundle entry, in bundle order.
        The list is empty when the bundle carries no entries.
    """
    pt_bundle = Bundle.parse_file(file_path)
    # ``entry`` is an optional Bundle element and may be None when absent,
    # so fall back to an empty list instead of iterating over None.
    return [entry.resource for entry in (pt_bundle.entry or [])]

### Patient Information

In [4]:
def get_patient_address_info(patient_obj):
    """Return the location details of the patient's first listed address.

    Only the first address is used: combining several addresses would glue
    the text fields together and add the coordinates up, which yields
    meaningless values.

    Args:
        patient_obj: Patient resource exposing an ``address`` list whose items
            have ``country``, ``state``, ``city`` and nested ``extension``
            entries tagged ``'latitude'`` / ``'longitude'``.

    Returns:
        Tuple ``(country, state, city, latitude, longitude)``. Missing text
        fields are returned as ``''`` and missing coordinates as ``0``.
    """
    pt_country, pt_state, pt_city, pt_lat, pt_lon = '', '', '', 0, 0
    addresses = patient_obj.address or []
    if not addresses:
        return pt_country, pt_state, pt_city, pt_lat, pt_lon

    entry = addresses[0]
    pt_country = entry.country or ''
    pt_state = entry.state or ''
    pt_city = entry.city or ''
    # The coordinates sit one level down: an outer extension wraps the
    # latitude/longitude extensions. Either level may be absent.
    for ext in entry.extension or []:
        for ext2 in ext.extension or []:
            if ext2.url == 'latitude':
                pt_lat = float(ext2.valueDecimal)
            elif ext2.url == 'longitude':
                pt_lon = float(ext2.valueDecimal)
    return pt_country, pt_state, pt_city, pt_lat, pt_lon

def get_patient_demographic(patient_obj):
    """Extract the basic demographic fields of a Patient resource.

    Args:
        patient_obj: Patient resource exposing ``id``, ``name``, ``gender``,
            ``birthDate``, ``maritalStatus`` and ``generalPractitioner``.

    Returns:
        Tuple ``(id, first_name, middle_name, family_name, gender,
        birth_date, marital_status, general_practitioner)``. Only the first
        ``name`` entry is used. ``middle_name`` is the string ``'None'`` when
        there is a single given name, and all further given names joined by a
        space otherwise. ``marital_status`` is ``None`` when the resource has
        no marital status; the name parts are ``None`` when no name is present.
    """
    pt_birth_date = patient_obj.birthDate
    pt_gender = patient_obj.gender
    pt_gp = patient_obj.generalPractitioner
    pt_id = patient_obj.id

    marital = patient_obj.maritalStatus
    pt_marital_status = marital.text if marital is not None else None

    names = patient_obj.name or []
    primary_name = names[0] if names else None
    given = (primary_name.given or []) if primary_name is not None else []
    pt_first_name = given[0] if given else None
    # 'None' (a string) is the historical placeholder for "no middle name";
    # it is kept so existing exports do not change.
    pt_middle_name = ' '.join(given[1:]) if len(given) > 1 else 'None'
    pt_family_name = primary_name.family if primary_name is not None else None

    return (pt_id, pt_first_name, pt_middle_name, pt_family_name, pt_gender,
            pt_birth_date, pt_marital_status, pt_gp)

def get_patient_age(patient_obj, today=None):
    """Return the patient's age in completed years.

    The age is derived by comparing calendar dates rather than dividing a day
    count by an average year length, which under-counts by one around
    birthdays (365 elapsed days is less than 365.2425).

    Args:
        patient_obj: Patient resource exposing ``birthDate`` as a
            ``datetime.date``.
        today: Reference date for the calculation. Defaults to the current
            date; pass a fixed date for reproducible results.

    Returns:
        Whole years between the birth date and ``today`` as an ``int``, or
        ``None`` when the patient has no birth date.
    """
    pt_bd = patient_obj.birthDate
    if pt_bd is None:
        return None
    if today is None:
        today = datetime.date.today()
    # Subtract one year while this year's birthday has not been reached yet.
    birthday_reached = (today.month, today.day) >= (pt_bd.month, pt_bd.day)
    pt_age = today.year - pt_bd.year - (0 if birthday_reached else 1)
    return pt_age

def create_patient_entry(resource_obj):
    """Flatten the Patient resource of a bundle into one flat dictionary.

    Args:
        resource_obj: List of resources read from a single bundle.

    Returns:
        Dictionary with one value per patient column (identifiers, names,
        demographics, age and address details).

    Raises:
        IndexError: If the list holds no resource of type ``'Patient'``.
    """
    # Only the first Patient resource is used; [0] fails loudly when none exists.
    patient_obj = [res for res in resource_obj if res.resource_type == 'Patient'][0]

    demographic = get_patient_demographic(patient_obj)
    age_years = get_patient_age(patient_obj)
    location = get_patient_address_info(patient_obj)

    (uuid, first_name, middle_name, family_name,
     gender, birth_date, marital_status, practitioner) = demographic
    country, state, city, latitude, longitude = location

    return {
        'uuid': uuid,
        'first_name': first_name,
        'middle_name': middle_name,
        'family_name': family_name,
        'gender': gender,
        'birth_date': str(birth_date),
        'marital_status': marital_status,
        'general_practioner': practitioner,
        'age_years': age_years,
        'country': country,
        'state': state,
        'city': city,
        'location_lat': latitude,
        'location_long': longitude,
    }

### Condition

In [5]:
def get_condition_entry(condition_obj):
    """Flatten one Condition resource into a dictionary of scalar values.

    Args:
        condition_obj: Condition resource with ``subject``, ``encounter``,
            ``category``, ``code`` and ``recordedDate`` populated.

    Returns:
        Dictionary with the patient id (``uuid``), the encounter id
        (``entry_id``), the display text of the first category coding, the
        code/system/display of the first code coding, and the recorded date
        as an ISO date string.
    """
    urn_prefix = 'urn:uuid:'
    # References look like 'urn:uuid:<id>'; only the bare id is kept.
    patient_uuid = condition_obj.subject.reference.replace(urn_prefix, '')
    encounter_uuid = condition_obj.encounter.reference.replace(urn_prefix, '')
    category_label = condition_obj.category[0].coding[0].display
    primary_coding = condition_obj.code.coding[0]
    return {
        'uuid': patient_uuid,
        'entry_id': encounter_uuid,
        'category': category_label,
        'code': primary_coding.code,
        'code_system': primary_coding.system,
        'code_text': primary_coding.display,
        'record_date': str(condition_obj.recordedDate.date()),
    }

def create_condition_entries(resource_obj):
    """Collect every Condition resource of a bundle into column lists.

    Args:
        resource_obj: List of resources read from a single bundle.

    Returns:
        Dictionary mapping each condition column name to a list holding one
        value per Condition resource, in bundle order. All lists are empty
        when the bundle holds no Condition.
    """
    columns = (
        'uuid', 'entry_id', 'category', 'code',
        'code_system', 'code_text', 'record_date',
    )
    condition_entries = {column: [] for column in columns}
    for resource in resource_obj:
        if resource.resource_type != 'Condition':
            continue
        for column, value in get_condition_entry(resource).items():
            condition_entries[column].append(value)
    return condition_entries

### Observation

In [6]:
def _observation_value_and_unit(source):
    """Return ``(value, unit)`` for an observation or one of its components.

    A quantity wins when present (its value and unit). Otherwise a codeable
    concept supplies its text as the value and the code of its first coding
    as the unit. When neither is present both items are ``None``.
    """
    quantity = source.valueQuantity
    if quantity is not None:
        return quantity.value, quantity.unit
    concept = source.valueCodeableConcept
    if concept is not None:
        return concept.text, concept.coding[0].code
    return None, None


def get_observation_entry(observation_obj):
    """Flatten one Observation resource into column lists.

    An observation with components produces one row per component, each row
    using the component's own code and value; an observation without
    components produces a single row from its top-level code and value. The
    patient id, encounter id, category and date always come from the
    observation itself.

    Args:
        observation_obj: Observation resource with ``subject``, ``encounter``,
            ``category``, ``code`` and ``effectiveDateTime`` populated.

    Returns:
        Dictionary mapping each observation column name to a list with one
        item per produced row.
    """
    observation_entry = {
        'uuid': [],
        'entry_id': [],
        'category': [],
        'code': [],
        'code_system': [],
        'code_text': [],
        'code_value': [],
        'code_unit': [],
        'record_date': [],
    }
    components = observation_obj.component
    # Treat a component-less observation as a single "row source" so both
    # cases share the same extraction code.
    sources = components if components is not None else [observation_obj]
    for source in sources:
        coding = source.code.coding[0]
        value, unit = _observation_value_and_unit(source)
        observation_entry['uuid'].append(
            observation_obj.subject.reference.replace('urn:uuid:', ''))
        observation_entry['entry_id'].append(
            observation_obj.encounter.reference.replace('urn:uuid:', ''))
        observation_entry['category'].append(
            observation_obj.category[0].coding[0].display)
        observation_entry['code'].append(coding.code)
        observation_entry['code_system'].append(coding.system)
        observation_entry['code_text'].append(coding.display)
        observation_entry['code_value'].append(value)
        observation_entry['code_unit'].append(unit)
        observation_entry['record_date'].append(
            str(observation_obj.effectiveDateTime.date()))
    return observation_entry

def create_observation_entries(resource_obj):
    """Collect every Observation resource of a bundle into column lists.

    Args:
        resource_obj: List of resources read from a single bundle.

    Returns:
        Dictionary mapping each observation column name to a list with one
        item per extracted row, in bundle order. A single Observation may
        contribute several rows. All lists are empty when the bundle holds
        no Observation.
    """
    columns = (
        'uuid', 'entry_id', 'category', 'code', 'code_system',
        'code_text', 'code_value', 'code_unit', 'record_date',
    )
    observation_entries = {column: [] for column in columns}
    for resource in resource_obj:
        if resource.resource_type != 'Observation':
            continue
        # Each extracted column is already a list, so it is spliced in.
        for column, values in get_observation_entry(resource).items():
            observation_entries[column].extend(values)
    return observation_entries

### Medication Request

In [7]:
def get_medication_entry(medication_obj):
    """Flatten one MedicationRequest resource into single-item column lists.

    Args:
        medication_obj: MedicationRequest resource with ``subject``,
            ``encounter``, ``category``, ``requester`` and ``authoredOn``
            populated.

    Returns:
        Dictionary mapping each medication column name to a one-element list.
        The code, system and text are ``None`` when the request carries no
        ``medicationCodeableConcept``; the dosage and reason are ``None``
        when ``dosageInstruction`` / ``reasonReference`` are absent.
    """
    urn_prefix = 'urn:uuid:'
    patient_uuid = medication_obj.subject.reference.replace(urn_prefix, '')
    encounter_uuid = medication_obj.encounter.reference.replace(urn_prefix, '')
    category_label = medication_obj.category[0].coding[0].display

    concept = medication_obj.medicationCodeableConcept
    if concept is None:
        code = system = text = None
    else:
        drug_coding = concept.coding[0]
        code, system, text = drug_coding.code, drug_coding.system, drug_coding.display

    insurance = medication_obj.insurance
    intent = medication_obj.intent

    # Only the first dosage instruction / reason reference is reported.
    dosage = None
    if medication_obj.dosageInstruction is not None:
        dosage = medication_obj.dosageInstruction[0].text
    reason = None
    if medication_obj.reasonReference is not None:
        reason = medication_obj.reasonReference[0].display

    requester = medication_obj.requester.display
    requested_on = str(medication_obj.authoredOn.date())

    return {
        'uuid': [patient_uuid],
        'entry_id': [encounter_uuid],
        'category': [category_label],
        'code': [code],
        'code_system': [system],
        'code_text': [text],
        'insurance': [insurance],
        'intent': [intent],
        'dosage': [dosage],
        'reason': [reason],
        'requester': [requester],
        'request_date': [requested_on],
    }

def create_medication_entries(resource_obj):
    """Collect every MedicationRequest resource of a bundle into column lists.

    Args:
        resource_obj: List of resources read from a single bundle.

    Returns:
        Dictionary mapping each medication column name to a list with one
        item per MedicationRequest, in bundle order. All lists are empty when
        the bundle holds no MedicationRequest.
    """
    columns = (
        'uuid', 'entry_id', 'category', 'code', 'code_system', 'code_text',
        'insurance', 'intent', 'dosage', 'reason', 'requester', 'request_date',
    )
    medication_entries = {column: [] for column in columns}
    for resource in resource_obj:
        if resource.resource_type != 'MedicationRequest':
            continue
        # Each extracted column is already a list, so it is spliced in.
        for column, values in get_medication_entry(resource).items():
            medication_entries[column].extend(values)
    return medication_entries

### Procedure

In [8]:
def get_procedure_entry(procedure_obj):
    """Flatten one Procedure resource into single-item column lists.

    Args:
        procedure_obj: Procedure resource with ``subject``, ``encounter``,
            ``code`` and a ``performedPeriod`` holding ``start`` and ``end``
            datetimes.

    Returns:
        Dictionary mapping each procedure column name to a one-element list.
        ``duration_seconds`` is the full elapsed time between start and end,
        in whole seconds, as a string. ``location`` is ``None`` when the
        procedure has no location.
    """
    urn_prefix = 'urn:uuid:'
    coding = procedure_obj.code.coding[0]
    location = procedure_obj.location
    period = procedure_obj.performedPeriod
    elapsed = period.end - period.start
    return {
        'uuid': [procedure_obj.subject.reference.replace(urn_prefix, '')],
        'entry_id': [procedure_obj.encounter.reference.replace(urn_prefix, '')],
        'code': [coding.code],
        'code_system': [coding.system],
        'code_text': [coding.display],
        'location': [location.display if location is not None else None],
        'start_date': [str(period.start.date())],
        'end_date': [str(period.end.date())],
        # timedelta.seconds only holds the sub-day remainder, so a procedure
        # lasting a day or more would be under-reported; total_seconds()
        # covers the whole span. int() keeps the original integer formatting.
        'duration_seconds': [str(int(elapsed.total_seconds()))],
    }

def create_procedure_entries(resource_obj):
    """Collect every Procedure resource of a bundle into column lists.

    Args:
        resource_obj: List of resources read from a single bundle.

    Returns:
        Dictionary mapping each procedure column name to a list with one item
        per Procedure, in bundle order. All lists are empty when the bundle
        holds no Procedure.
    """
    columns = (
        'uuid', 'entry_id', 'code', 'code_system', 'code_text',
        'location', 'start_date', 'end_date', 'duration_seconds',
    )
    procedure_entries = {column: [] for column in columns}
    for resource in resource_obj:
        if resource.resource_type != 'Procedure':
            continue
        # Each extracted column is already a list, so it is spliced in.
        for column, values in get_procedure_entry(resource).items():
            procedure_entries[column].extend(values)
    return procedure_entries

### Claims

In [9]:
def get_claim_entry(claim_obj):
    """Flatten one Claim resource into single-item column lists.

    Only the first claim item is inspected: its first encounter reference
    becomes ``entry_id`` and the first coding of its product/service supplies
    the item code, system and text. The value and currency come from the
    claim total.

    Args:
        claim_obj: Claim resource with ``patient``, ``item``, ``provider``,
            ``insurance``, ``total``, ``type`` and ``billablePeriod``
            populated.

    Returns:
        Dictionary mapping each claim column name to a one-element list.
    """
    urn_prefix = 'urn:uuid:'
    patient_uuid = claim_obj.patient.reference.replace(urn_prefix, '')
    first_item = claim_obj.item[0]
    encounter_uuid = first_item.encounter[0].reference.replace(urn_prefix, '')
    service_coding = first_item.productOrService.coding[0]
    return {
        'uuid': [patient_uuid],
        'entry_id': [encounter_uuid],
        'item_code': [service_coding.code],
        'item_system': [service_coding.system],
        'item_text': [service_coding.display],
        'payee': [claim_obj.payee],
        'provider': [claim_obj.provider.display],
        'insurance': [claim_obj.insurance[0].coverage.display],
        'item_value': [claim_obj.total.value],
        'item_unit': [claim_obj.total.currency],
        'claim_type': [claim_obj.type.coding[0].code],
        'claim_date': [str(claim_obj.billablePeriod.start.date())],
    }

def create_claim_entries(resource_obj):
    """Collect every Claim resource of a bundle into column lists.

    Args:
        resource_obj: List of resources read from a single bundle.

    Returns:
        Dictionary mapping each claim column name to a list with one item per
        Claim, in bundle order. All lists are empty when the bundle holds no
        Claim.
    """
    columns = (
        'uuid', 'entry_id', 'item_code', 'item_system', 'item_text', 'payee',
        'provider', 'insurance', 'item_value', 'item_unit', 'claim_type',
        'claim_date',
    )
    claim_entries = {column: [] for column in columns}
    for resource in resource_obj:
        if resource.resource_type != 'Claim':
            continue
        # Each extracted column is already a list, so it is spliced in.
        for column, values in get_claim_entry(resource).items():
            claim_entries[column].extend(values)
    return claim_entries

### Patient Bundle

In [10]:
class FihrRecord:
    """All tables extracted from a single patient bundle file.

    Reading and flattening happen once, in the constructor; the ``*_table``
    methods only wrap the stored dictionaries in pandas DataFrames.

    Args:
        file_path: Path of the bundle JSON file to load.
    """

    def __init__(self, file_path):
        self.fihr_file_path = file_path
        # List of resources contained in the bundle.
        self.fihr_object = read_fihr_json(self.fihr_file_path)
        # One flat dictionary for the patient ...
        self.patient = create_patient_entry(self.fihr_object)
        # ... and one dictionary of column lists per record type.
        self.conditions = create_condition_entries(self.fihr_object)
        self.observations = create_observation_entries(self.fihr_object)
        self.medications = create_medication_entries(self.fihr_object)
        self.procedures = create_procedure_entries(self.fihr_object)
        self.claims = create_claim_entries(self.fihr_object)

    def patient_table(self):
        """Return the patient as a one-row DataFrame."""
        # Passing a one-element list of dicts makes every value a single cell.
        # Building from the dict with index=[0] instead raises ValueError as
        # soon as a value is a list whose length is not 1, which can happen
        # for 'general_practioner'.
        return pd.DataFrame([self.patient])

    def conditions_table(self):
        """Return the conditions as a DataFrame, one row per condition."""
        return pd.DataFrame(self.conditions)

    def observations_table(self):
        """Return the observations as a DataFrame, one row per value."""
        return pd.DataFrame(self.observations)

    def medications_table(self):
        """Return the medication requests as a DataFrame."""
        return pd.DataFrame(self.medications)

    def procedures_table(self):
        """Return the procedures as a DataFrame."""
        return pd.DataFrame(self.procedures)

    def claims_table(self):
        """Return the claims as a DataFrame."""
        return pd.DataFrame(self.claims)

## Import Tables

In [11]:
fihr_directory = 'output/fhir/'
# os.listdir returns names in arbitrary order and includes every file in the
# folder, so sort for a reproducible processing order and keep only the JSON
# bundles (stray files would otherwise be handed to the bundle parser).
fihr_files = sorted(
    name for name in os.listdir(fihr_directory) if name.endswith('.json')
)

# Empty frames that declare the column schema of every exported table before
# any bundle has been loaded.
patients_df = pd.DataFrame(
    columns=[
        'uuid', 'first_name', 'middle_name', 'family_name',
        'gender', 'birth_date', 'marital_status', 'general_practioner',
        'age_years', 'country', 'state', 'city', 'location_lat', 'location_long',
    ]
)

conditions_df = pd.DataFrame(
    columns=[
        'uuid', 'entry_id', 'category', 'code', 'code_system', 'code_text', 'record_date',
    ]
)

# 'code_value' was missing from this schema although every observation row
# carries it. It is listed last on purpose: that is where concatenation placed
# the undeclared column, so the exported column order stays unchanged.
observations_df = pd.DataFrame(
    columns=[
        'uuid', 'entry_id', 'category', 'code', 'code_system', 'code_text',
        'code_unit', 'record_date', 'code_value',
    ]
)

medications_df = pd.DataFrame(
    columns=[
        'uuid', 'entry_id', 'category', 'code', 'code_system', 'code_text',
        'insurance', 'intent', 'dosage', 'reason', 'requester', 'request_date',
    ]
)

procedures_df = pd.DataFrame(
    columns=[
        'uuid', 'entry_id', 'code', 'code_system', 'code_text',
        'location', 'start_date', 'end_date', 'duration_seconds',
    ]
)

claims_df = pd.DataFrame(
    columns=[
        'uuid', 'entry_id', 'item_code', 'item_system', 'item_text',
        'payee', 'provider', 'insurance', 'item_value', 'item_unit',
        'claim_type', 'claim_date',
    ]
)

In [12]:
# Collect the per-file tables in plain lists and concatenate once at the end.
# Calling pd.concat inside the loop copies every accumulated row again for
# each file, which grows quadratically with the number of files.
patient_tables = []
condition_tables = []
observation_tables = []
medication_tables = []
procedure_tables = []
claim_tables = []

for file in fihr_files:
    # Skip files whose name marks them as hospital/practitioner information
    # (presumably bundles without a Patient resource -- verify against the
    # generator output).
    if 'hospital' in file or 'practitioner' in file:
        continue
    print('Processing', file)
    full_file = os.path.join(fihr_directory, file)
    fihr_obj = FihrRecord(full_file)
    patient_tables.append(fihr_obj.patient_table())
    condition_tables.append(fihr_obj.conditions_table())
    observation_tables.append(fihr_obj.observations_table())
    medication_tables.append(fihr_obj.medications_table())
    procedure_tables.append(fihr_obj.procedures_table())
    claim_tables.append(fihr_obj.claims_table())

# The existing frame goes first so its column order (and any rows it already
# holds) is preserved in the result.
patients_df = pd.concat([patients_df, *patient_tables]).reset_index(drop=True)
conditions_df = pd.concat([conditions_df, *condition_tables]).reset_index(drop=True)
observations_df = pd.concat([observations_df, *observation_tables]).reset_index(drop=True)
medications_df = pd.concat([medications_df, *medication_tables]).reset_index(drop=True)
procedures_df = pd.concat([procedures_df, *procedure_tables]).reset_index(drop=True)
claims_df = pd.concat([claims_df, *claim_tables]).reset_index(drop=True)

Processing Abbey813_Schneider199_273aedd7-d2ea-b474-d017-f8ee7fc856c4.json
Processing Abby752_Raynor401_05594053-3b63-75d6-fd5f-c255abb4ab0f.json
Processing Abby752_Wyman904_8bebd181-8287-f22e-7e77-bca826c453e0.json
Processing Abdul218_Bayer639_a0543c14-d023-d8ce-6e25-626b77bfec28.json
Processing Abdul218_Bernhard322_a2e8d153-e0f3-e2c7-c0aa-80efa54db810.json
Processing Abdul218_Koepp521_12cee91c-4d2f-9463-99e6-4e12118df716.json
Processing Abdul218_Powlowski563_75837edd-2882-4f07-abfa-aac63044f786.json
Processing Abe604_Feest103_48a69545-3768-6b4d-d20d-3b6245f93671.json
Processing Abe604_Kerluke267_d28c4572-3974-a6b5-8231-11593bb43f17.json
Processing Abel832_Effertz744_485645d3-5e34-b1ff-da9c-22bb8b36eeba.json
Processing Abel832_Hammes673_bcff4fb1-525a-63b5-41a3-9eb3b752c412.json
Processing Abel832_Welch179_0ddfe9ca-11d2-066e-8219-c272d20ca6a8.json
Processing Abraham100_Hane680_0b8da3d4-1dcb-68d8-2942-a5df5f730fc4.json
Processing Abraham100_Murazik203_9faaf3d2-f618-5d8d-b3a5-f1bf68658b8

In [21]:
# Preview the first two rows of the patients table.
patients_df.head(2)

Unnamed: 0,uuid,first_name,middle_name,family_name,gender,birth_date,marital_status,general_practioner,age_years,country,state,city,location_lat,location_long
0,273aedd7-d2ea-b474-d017-f8ee7fc856c4,Abbey813,,Schneider199,female,2016-05-06,Never Married,,7,US,CA,Los Angeles,33.979613,-118.259022
1,05594053-3b63-75d6-fd5f-c255abb4ab0f,Abby752,,Raynor401,female,1925-05-28,Married,,98,US,CA,Santa Clara,37.341706,-122.031829


In [22]:
# (number of rows, number of columns) of the patients table.
patients_df.shape

(10471, 14)

In [15]:
# Preview the first two rows of the conditions table.
conditions_df.head(2)

Unnamed: 0,uuid,entry_id,category,code,code_system,code_text,record_date
0,273aedd7-d2ea-b474-d017-f8ee7fc856c4,c76c533e-34bf-ed92-ee51-c626a729abea,Encounter Diagnosis,314529007,http://snomed.info/sct,Medication review due (situation),2016-05-06
1,273aedd7-d2ea-b474-d017-f8ee7fc856c4,97b7b46a-b199-9a08-b3e2-125ea3ead616,Encounter Diagnosis,314529007,http://snomed.info/sct,Medication review due (situation),2016-08-12


In [16]:
# Preview the first two rows of the observations table.
observations_df.head(2)

Unnamed: 0,uuid,entry_id,category,code,code_system,code_text,code_unit,record_date,code_value
0,273aedd7-d2ea-b474-d017-f8ee7fc856c4,c76c533e-34bf-ed92-ee51-c626a729abea,Vital signs,8302-2,http://loinc.org,Body Height,cm,2016-05-06,48.9
1,273aedd7-d2ea-b474-d017-f8ee7fc856c4,c76c533e-34bf-ed92-ee51-c626a729abea,Vital signs,72514-3,http://loinc.org,Pain severity - 0-10 verbal numeric rating [Sc...,{score},2016-05-06,4.0


In [17]:
# Preview the first two rows of the medications table.
medications_df.head(2)

Unnamed: 0,uuid,entry_id,category,code,code_system,code_text,insurance,intent,dosage,reason,requester,request_date
0,273aedd7-d2ea-b474-d017-f8ee7fc856c4,001c0fa9-b1d3-cb66-25ec-42c3c4df058a,Community,308192,http://www.nlm.nih.gov/research/umls/rxnorm,Amoxicillin 500 MG Oral Tablet,,order,Take at regular intervals. Complete the prescr...,,Dr. Deane47 Howe413,2019-07-20
1,273aedd7-d2ea-b474-d017-f8ee7fc856c4,001c0fa9-b1d3-cb66-25ec-42c3c4df058a,Community,198405,http://www.nlm.nih.gov/research/umls/rxnorm,Ibuprofen 100 MG Oral Tablet,,order,Take as needed.,,Dr. Deane47 Howe413,2019-07-20


In [18]:
# Preview the first two rows of the procedures table.
procedures_df.head(2)

Unnamed: 0,uuid,entry_id,code,code_system,code_text,location,start_date,end_date,duration_seconds
0,273aedd7-d2ea-b474-d017-f8ee7fc856c4,77e9b4f5-9984-24ce-e44b-71de5a1e6ace,430193006,http://snomed.info/sct,Medication Reconciliation (procedure),"ST. JOHN'S WELL CHILD AND FAMILY CENTER, INC.",2016-06-10,2016-06-10,900
1,273aedd7-d2ea-b474-d017-f8ee7fc856c4,97b7b46a-b199-9a08-b3e2-125ea3ead616,430193006,http://snomed.info/sct,Medication Reconciliation (procedure),"ST. JOHN'S WELL CHILD AND FAMILY CENTER, INC.",2016-08-12,2016-08-12,900


In [19]:
# Preview the first two rows of the claims table.
claims_df.head(2)

Unnamed: 0,uuid,entry_id,item_code,item_system,item_text,payee,provider,insurance,item_value,item_unit,claim_type,claim_date
0,273aedd7-d2ea-b474-d017-f8ee7fc856c4,c76c533e-34bf-ed92-ee51-c626a729abea,410620009,http://snomed.info/sct,Well child visit (procedure),,"ST. JOHN'S WELL CHILD AND FAMILY CENTER, INC.",Medicaid,402.69,USD,professional,2016-05-06
1,273aedd7-d2ea-b474-d017-f8ee7fc856c4,77e9b4f5-9984-24ce-e44b-71de5a1e6ace,410620009,http://snomed.info/sct,Well child visit (procedure),,"ST. JOHN'S WELL CHILD AND FAMILY CENTER, INC.",Medicaid,1006.5,USD,professional,2016-06-10


# Export

In [20]:
# to_csv does not create missing folders, so make sure the target exists
# before writing; exist_ok keeps re-runs from failing.
os.makedirs('csv', exist_ok=True)

# Write one CSV per table, without the DataFrame index column.
patients_df.to_csv('csv/patient_data.csv', index=False)
conditions_df.to_csv('csv/conditions_data.csv', index=False)
observations_df.to_csv('csv/observations_data.csv', index=False)
medications_df.to_csv('csv/medications_data.csv', index=False)
procedures_df.to_csv('csv/procedures_data.csv', index=False)
claims_df.to_csv('csv/claims_data.csv', index=False)