# Synthea Data Mining

The Synthea Health tool was used to generate a total of 11,638 electronic health care records:

```bash
sh synthea_setup.sh
```

Health records in [FHIR (Fast Healthcare Interoperability Resources)](https://en.wikipedia.org/wiki/Fast_Healthcare_Interoperability_Resources) format are provided as JSON files in the `output/fhir` folder.

# Setup

## Libraries

In [3]:
#!pip install fhir.resources
!pip install pandas
!pip install numpy

Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting numpy>=1.20.3 (from pandas)
  Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [109]:
import pandas as pd 
import numpy as np
import datetime

from fhir.resources.R4B.bundle import Bundle
from fhir.resources.R4B.patient import Patient
from fhir.resources.R4B.condition import Condition
from fhir.resources.R4B.observation import Observation
from fhir.resources.R4B.medicationrequest import MedicationRequest
from fhir.resources.R4B.procedure import Procedure
from fhir.resources.R4B.encounter import Encounter
from fhir.resources.R4B.claim import Claim
from fhir.resources.R4B.immunization import Immunization
from fhir.resources.R4B.humanname import HumanName

## Functions

### FHIR Import

In [136]:
def read_fihr_json(file_path):
    """Load a FHIR bundle from a JSON file and return the resources it holds.

    Parameters
    ----------
    file_path : str or path-like
        Location of the bundle JSON file; passed straight to
        ``Bundle.parse_file``.

    Returns
    -------
    list
        The ``resource`` attribute of every bundle entry, in bundle order.
        An empty list is returned when the bundle has no entries.
    """
    pt_bundle = Bundle.parse_file(file_path)
    # ``entry`` is an optional field and may be None for an empty bundle;
    # fall back to an empty list instead of failing on iteration.
    return [entry.resource for entry in (pt_bundle.entry or [])]

# Read one sample bundle and show the distinct resource classes it contains.
example = 'output/fhir/Aaron697_Eichmann909_8f8b9664-6af2-a8f6-694e-1eed399ea223.json'
ex_resources = read_fihr_json(example)
{type(resource) for resource in ex_resources}

{fhir.resources.R4B.allergyintolerance.AllergyIntolerance,
 fhir.resources.R4B.careplan.CarePlan,
 fhir.resources.R4B.careteam.CareTeam,
 fhir.resources.R4B.claim.Claim,
 fhir.resources.R4B.condition.Condition,
 fhir.resources.R4B.diagnosticreport.DiagnosticReport,
 fhir.resources.R4B.documentreference.DocumentReference,
 fhir.resources.R4B.encounter.Encounter,
 fhir.resources.R4B.explanationofbenefit.ExplanationOfBenefit,
 fhir.resources.R4B.immunization.Immunization,
 fhir.resources.R4B.medicationrequest.MedicationRequest,
 fhir.resources.R4B.observation.Observation,
 fhir.resources.R4B.patient.Patient,
 fhir.resources.R4B.procedure.Procedure,
 fhir.resources.R4B.provenance.Provenance}

### Patient Information

In [159]:
def get_patient_address_info(patient_obj):
    """Return location details taken from a patient's first listed address.

    Only the first entry of ``patient_obj.address`` is used. Accumulating over
    every address (as an earlier version did) concatenated the text fields and
    summed the coordinates whenever more than one address was present.

    Parameters
    ----------
    patient_obj : object
        Patient-like object exposing an ``address`` list. Each address has
        ``country``, ``state``, ``city`` and an ``extension`` list whose items
        carry nested ``extension`` entries with ``url`` and ``valueDecimal``.

    Returns
    -------
    tuple
        ``(country, state, city, latitude, longitude)``. Missing text fields
        are returned as ``''`` and missing coordinates as ``0``.
    """
    pt_country, pt_state, pt_city, pt_lat, pt_lon = '', '', '', 0, 0

    addresses = patient_obj.address or []
    if not addresses:
        return pt_country, pt_state, pt_city, pt_lat, pt_lon

    address = addresses[0]
    # Any of these may be None on a sparse record; keep the '' default then.
    pt_country = address.country or ''
    pt_state = address.state or ''
    pt_city = address.city or ''

    # Coordinates sit one level down: an address extension that itself holds
    # sub-extensions whose url is 'latitude' / 'longitude'.
    for ext in address.extension or []:
        for sub_ext in ext.extension or []:
            if sub_ext.valueDecimal is None:
                continue
            if sub_ext.url == 'latitude':
                pt_lat = float(sub_ext.valueDecimal)
            elif sub_ext.url == 'longitude':
                pt_lon = float(sub_ext.valueDecimal)

    return pt_country, pt_state, pt_city, pt_lat, pt_lon

def get_patient_demographic(patient_obj):
    """Extract identifying and demographic fields from a patient resource.

    All values are read from ``patient_obj`` itself; an earlier version read
    a notebook-level global instead and ignored the argument.

    Parameters
    ----------
    patient_obj : object
        Patient-like object exposing ``id``, ``name``, ``gender``,
        ``birthDate``, ``maritalStatus`` and ``generalPractitioner``.

    Returns
    -------
    tuple
        ``(id, first_name, middle_name, family_name, gender, birth_date,
        marital_status, general_practitioner)``. A missing first or middle
        name is reported as the string ``'None'``; a missing marital status
        is reported as ``None``.
    """
    pt_name = patient_obj.name[0]
    given_names = pt_name.given or []
    pt_first_name = given_names[0] if given_names else 'None'
    # The second given name is treated as the middle name, including when
    # more than two given names are present.
    pt_middle_name = given_names[1] if len(given_names) > 1 else 'None'
    pt_family_name = pt_name.family

    marital = patient_obj.maritalStatus
    pt_marital_status = marital.text if marital is not None else None

    return (
        patient_obj.id,
        pt_first_name,
        pt_middle_name,
        pt_family_name,
        patient_obj.gender,
        patient_obj.birthDate,
        pt_marital_status,
        patient_obj.generalPractitioner,
    )

def get_patient_age(patient_obj, today=None):
    """Return the patient's age in completed years.

    The age is computed from calendar dates (year difference, minus one if
    this year's birthday has not happened yet). Dividing a day count by
    365.2425 is off by one on and around birthdays.

    Parameters
    ----------
    patient_obj : object
        Patient-like object exposing ``birthDate`` as a ``datetime.date``.
    today : datetime.date, optional
        Reference date for the calculation. Defaults to
        ``datetime.date.today()``.

    Returns
    -------
    int or None
        Age in whole years, or ``None`` when ``birthDate`` is missing.
    """
    pt_bd = patient_obj.birthDate
    if pt_bd is None:
        return None
    if today is None:
        today = datetime.date.today()
    # Tuple comparison tells whether the birthday has been reached this year.
    birthday_reached = (today.month, today.day) >= (pt_bd.month, pt_bd.day)
    return today.year - pt_bd.year - (0 if birthday_reached else 1)

def create_patient_entry(resource_obj):
    """Build a flat dictionary describing the patient found in a resource list.

    The Patient resource is looked up in ``resource_obj`` itself; an earlier
    version searched a notebook-level global and then indexed the argument
    with the position found there.

    Parameters
    ----------
    resource_obj : list
        Resources as returned by ``read_fihr_json``; each item exposes a
        ``resource_type`` attribute.

    Returns
    -------
    dict
        Patient identifiers, demographics, age and location, keyed by
        column name.

    Raises
    ------
    IndexError
        If ``resource_obj`` contains no Patient resource (same exception
        type as the original positional lookup).
    """
    patient_obj = next(
        (entry for entry in resource_obj if entry.resource_type == 'Patient'),
        None,
    )
    if patient_obj is None:
        raise IndexError('no Patient resource found in resource_obj')

    (pt_id, pt_first_name, pt_middle_name, pt_family_name,
     pt_gender, pt_birth_date, pt_marital_status, pt_gp) = get_patient_demographic(patient_obj)
    pt_age = get_patient_age(patient_obj)
    pt_country, pt_state, pt_city, pt_lat, pt_lon = get_patient_address_info(patient_obj)

    pt_entry = {
        'uuid': pt_id,
        'first_name': pt_first_name,
        'middle_name': pt_middle_name,
        'family_name': pt_family_name,
        'gender': pt_gender,
        'birth_date': str(pt_birth_date),
        'marital_status': pt_marital_status,
        # Key spelling kept unchanged so existing consumers of the dict still work.
        'general_practioner': pt_gp,
        'age_years': pt_age,
        'country': pt_country,
        'state': pt_state,
        'city': pt_city,
        'location_lat': pt_lat,
        'location_long': pt_lon
    }
    return pt_entry

In [160]:
# Build and display the flat patient record for the example bundle's resources.
create_patient_entry(ex_resources)

{'uuid': '8f8b9664-6af2-a8f6-694e-1eed399ea223',
 'first_name': 'Aaron697',
 'middle_name': 'Don899',
 'family_name': 'Eichmann909',
 'gender': 'male',
 'birth_date': '1956-04-06',
 'marital_status': 'Never Married',
 'general_practioner': None,
 'age_years': 67,
 'country': 'US',
 'state': 'MA',
 'city': 'East Longmeadow',
 'location_lat': 42.05782803279011,
 'location_long': -72.45628312557714}