In [None]:
# ==============================================================
# 📘 01_data_preparation.ipynb
# Author: Josmy Mathew
# Project: AI-Digital-Twin-for-Clinical-Data
# Description:
#   Load, clean, and integrate Synthea EHR data into
#   structured longitudinal patient time-series tables.
# ==============================================================



**Step 1 — Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook display configuration: show every column when a DataFrame is rendered
pd.options.display.max_columns = None


**Step 2 — Load Synthea CSV Data**

In [None]:
# --------------------------------------------------------------
# 2. Load Raw Synthea Data
# --------------------------------------------------------------

# Directory holding the Synthea CSV export. The default is the original
# Colab/Drive location; set the SYNTHEA_DATA_PATH environment variable to run
# the notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "SYNTHEA_DATA_PATH",
    "/content/drive/MyDrive/synthea_sample_data_csv_latest",
)

# Fail early with an actionable message instead of a per-file read error.
if not os.path.isdir(DATA_PATH):
    raise FileNotFoundError(
        f"Synthea data directory not found: {DATA_PATH!r}. "
        "Mount the drive or set SYNTHEA_DATA_PATH."
    )

patients = pd.read_csv(os.path.join(DATA_PATH, "patients.csv"))
encounters = pd.read_csv(os.path.join(DATA_PATH, "encounters.csv"))
observations = pd.read_csv(os.path.join(DATA_PATH, "observations.csv"))
conditions = pd.read_csv(os.path.join(DATA_PATH, "conditions.csv"))

# Quick sanity check: (rows, columns) of each raw table.
print("✅ Data loaded:")
print(f"Patients: {patients.shape}")
print(f"Encounters: {encounters.shape}")
print(f"Observations: {observations.shape}")
print(f"Conditions: {conditions.shape}")


# Column names of each table, used to pick join keys in the later steps.
print(f"Patients: {patients.columns}")
print(f"Encounters: {encounters.columns}")
print(f"Observations: {observations.columns}")
print(f"Conditions: {conditions.columns}")

✅ Data loaded:
Patients: (109, 28)
Encounters: (6414, 15)
Observations: (100980, 9)
Conditions: (4294, 7)
Patients: Index(['Id', 'BIRTHDATE', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX',
       'FIRST', 'MIDDLE', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'RACE',
       'ETHNICITY', 'GENDER', 'BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE',
       'COUNTY', 'FIPS', 'ZIP', 'LAT', 'LON', 'HEALTHCARE_EXPENSES',
       'HEALTHCARE_COVERAGE', 'INCOME'],
      dtype='object')
Encounters: Index(['Id', 'START', 'STOP', 'PATIENT', 'ORGANIZATION', 'PROVIDER', 'PAYER',
       'ENCOUNTERCLASS', 'CODE', 'DESCRIPTION', 'BASE_ENCOUNTER_COST',
       'TOTAL_CLAIM_COST', 'PAYER_COVERAGE', 'REASONCODE',
       'REASONDESCRIPTION'],
      dtype='object')
Observations: Index(['DATE', 'PATIENT', 'ENCOUNTER', 'CATEGORY', 'CODE', 'DESCRIPTION',
       'VALUE', 'UNITS', 'TYPE'],
      dtype='object')
Conditions: Index(['START', 'STOP', 'PATIENT', 'ENCOUNTER', 'SYSTEM', 'CODE',
       'DESCRIPTION'],
      dtype=

In [None]:
# --------------------------------------------------------------
# 3️⃣ Parse Datetimes
# --------------------------------------------------------------
# Convert every column whose name contains 'date', 'start' or 'stop' to
# datetime, in place; unparseable values become NaT (errors='coerce').
for df in [patients, encounters, observations, conditions]:
    for col in df.columns:
        if any(key in col.lower() for key in ['date', 'start', 'stop']):
            df[col] = pd.to_datetime(df[col], errors='coerce')

# --------------------------------------------------------------
# 4️⃣ Example: Merge Encounters + Observations
# --------------------------------------------------------------
# Each patient → time-series of observations (labs, vitals, etc.)
# Only 'Id' and 'START' are taken from encounters. observations already has
# PATIENT; selecting it from both sides makes pandas suffix the columns
# (PATIENT_obs / PATIENT_enc), so a plain 'PATIENT' lookup raises KeyError.
merged = pd.merge(
    observations,
    encounters[['Id', 'START']],
    left_on='ENCOUNTER',
    right_on='Id',
)

# DESCRIPTION is present on the observations side only, so it keeps its plain
# name (suffixes apply to overlapping columns only); the units column of
# observations is named 'UNITS'.
merged = (
    merged[['PATIENT', 'START', 'CODE', 'DESCRIPTION', 'VALUE', 'UNITS']]
    .rename(columns={'DESCRIPTION': 'FEATURE', 'START': 'DATE'})
    # Sort chronologically per patient
    .sort_values(['PATIENT', 'DATE'])
)


KeyError: "['PATIENT', 'DESCRIPTION_obs', 'UNIT'] not in index"

In [None]:
# Debug aid: list the column names of the merged frame
print(list(merged.columns))


NameError: name 'merged' is not defined

In [None]:
# Debug aid: list the column names of the merged frame
print(list(merged.columns))


NameError: name 'merged' is not defined

In [None]:
# --------------------------------------------------------------
# 3️⃣ Parse Datetimes
# --------------------------------------------------------------
# Convert every column whose name contains 'date', 'start' or 'stop' to
# datetime, in place; unparseable values become NaT (errors='coerce').
for df in [patients, encounters, observations, conditions]:
    for col in df.columns:
        if any(key in col.lower() for key in ['date', 'start', 'stop']):
            df[col] = pd.to_datetime(df[col], errors='coerce')

# --------------------------------------------------------------
# 4️⃣ Example: Merge Encounters + Observations
# --------------------------------------------------------------
# Each patient → time-series of observations (labs, vitals, etc.)
# Only 'Id' and 'START' are taken from encounters. observations already has
# PATIENT; selecting it from both sides makes pandas suffix the columns
# (PATIENT_obs / PATIENT_enc), so a plain 'PATIENT' lookup raises KeyError.
merged = pd.merge(
    observations,
    encounters[['Id', 'START']],
    left_on='ENCOUNTER',
    right_on='Id',
)

# DESCRIPTION is present on the observations side only, so it keeps its plain
# name (suffixes apply to overlapping columns only); the units column of
# observations is named 'UNITS'.
merged = (
    merged[['PATIENT', 'START', 'CODE', 'DESCRIPTION', 'VALUE', 'UNITS']]
    .rename(columns={'DESCRIPTION': 'FEATURE', 'START': 'DATE'})
    # Sort chronologically per patient
    .sort_values(['PATIENT', 'DATE'])
)


In [None]:
# --------------------------------------------------------------
# 5️⃣ Example Aggregation: One Row per (Patient, Date)
# --------------------------------------------------------------
# aggfunc='mean' needs numeric input, but VALUE is never converted after
# loading. Coerce it to numbers and drop entries that are not numeric
# (they become NaN) so the mean is computed over real measurements only.
numeric_obs = (
    merged
    .assign(VALUE=pd.to_numeric(merged['VALUE'], errors='coerce'))
    .dropna(subset=['VALUE'])
)

# Wide table: one column per FEATURE, averaged when a feature is recorded
# more than once for the same (PATIENT, DATE).
pivot = numeric_obs.pivot_table(
    index=['PATIENT', 'DATE'],
    columns='FEATURE',
    values='VALUE',
    aggfunc='mean'
).reset_index()

print("✅ Aggregated shape:", pivot.shape)
# A bare `pivot.head()` mid-cell displays nothing; print it explicitly.
print(pivot.head())

# --------------------------------------------------------------
# 6️⃣ Merge with Patient Demographics
# --------------------------------------------------------------
# Left join keeps every (PATIENT, DATE) row; the patients key 'Id' is dropped
# afterwards because it duplicates PATIENT.
data_final = pd.merge(
    pivot,
    patients,
    left_on='PATIENT',
    right_on='Id',
    how='left'
).drop(columns=['Id'])

# --------------------------------------------------------------
# 7️⃣ Save Processed Data
# --------------------------------------------------------------
# Single source of truth for the output location, so the message below
# always reports the path that was actually written.
OUTPUT_DIR = "../data/processed/"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "patient_timeseries.csv")

os.makedirs(OUTPUT_DIR, exist_ok=True)
data_final.to_csv(OUTPUT_FILE, index=False)

print(f"💾 Processed data saved to: {OUTPUT_FILE}")
print("✅ Done!")


# REFERENCES



1.   Synthea synthetic patient data (sample CSV downloads): https://synthea.mitre.org/downloads

