In [1]:
import boto3

# Sanity check: confirm the credentials work by listing the buckets
# visible to this account.
s3 = boto3.client('s3')
response = s3.list_buckets()
bucket_names = [entry['Name'] for entry in response['Buckets']]
print(bucket_names)

['ehr-genai-project-hadi']


In [2]:
import pandas as pd, json, os

# Create data/ and data/fhir_patients/ in one call; safe to re-run.
os.makedirs('data/fhir_patients', exist_ok=True)

# Synthetic patient table
patients = pd.DataFrame(
    [('P001', 65, 'M'), ('P002', 45, 'F'), ('P003', 82, 'F')],
    columns=['patient_id', 'age', 'gender'],
)

# Encounters table (one encounter per patient)
encounters = pd.DataFrame(
    [
        ('E001', 'P001', 5, 'I10'),
        ('E002', 'P002', 2, 'E11'),
        ('E003', 'P003', 7, 'J18'),
    ],
    columns=['encounter_id', 'patient_id', 'length_of_stay', 'diagnosis_code'],
)

# Simulated claims (one claim per patient)
claims = pd.DataFrame(
    [('C001', 'P001', 12000), ('C002', 'P002', 800), ('C003', 'P003', 20000)],
    columns=['claim_id', 'patient_id', 'claim_amount'],
)

# Persist each table as a header-only CSV (no index column).
patients.to_csv('data/patients.csv', index=False)
encounters.to_csv('data/encounters.csv', index=False)
claims.to_csv('data/claims.csv', index=False)

# One minimal FHIR Patient resource per patient. The birth date is derived
# from the age relative to the year 2025 and pinned to January 1st.
for record in patients.itertuples(index=False):
    birth_year = 2025 - record.age
    resource = {
        "resourceType": "Patient",
        "id": record.patient_id,
        "gender": record.gender.lower(),
        "birthDate": f"{birth_year}-01-01",
    }
    with open(f"data/fhir_patients/{record.patient_id}.json", 'w') as handle:
        json.dump(resource, handle, indent=2)

# Render the three synthetic tables, then list the generated FHIR patient
# files with a shell command (IPython `!` syntax).
display(patients, encounters, claims)
!ls data/fhir_patients


  from pandas.core.computation.check import NUMEXPR_INSTALLED


Unnamed: 0,patient_id,age,gender
0,P001,65,M
1,P002,45,F
2,P003,82,F


Unnamed: 0,encounter_id,patient_id,length_of_stay,diagnosis_code
0,E001,P001,5,I10
1,E002,P002,2,E11
2,E003,P003,7,J18


Unnamed: 0,claim_id,patient_id,claim_amount
0,C001,P001,12000
1,C002,P002,800
2,C003,P003,20000


P001.json  P002.json  P003.json


In [3]:
import boto3, glob

s3 = boto3.client('s3')
bucket = 'ehr-genai-project-hadi'

# Regular files directly under data/ land in raw_data/; sub-directories
# (such as fhir_patients/) are skipped here and handled separately below.
top_level_files = [path for path in glob.glob('data/*') if os.path.isfile(path)]
fhir_files = glob.glob('data/fhir_patients/*.json')

# Build the full (local path, S3 key) plan first, then upload in one loop.
upload_plan = [(path, 'raw_data/' + os.path.basename(path)) for path in top_level_files]
upload_plan += [
    (path, 'raw_data/fhir_patients/' + os.path.basename(path)) for path in fhir_files
]

for local_path, s3_key in upload_plan:
    s3.upload_file(local_path, bucket, s3_key)

print("Uploaded raw_data files to S3!")

Uploaded raw_data files to S3!


In [4]:
# Verify the upload: list every key stored under the raw_data/ prefix.
# 'Contents' is read with a default so an empty prefix prints [].
resp = s3.list_objects_v2(Bucket=bucket, Prefix='raw_data/')
uploaded_keys = [entry['Key'] for entry in resp.get('Contents', [])]
print(uploaded_keys)

['raw_data/claims.csv', 'raw_data/encounters.csv', 'raw_data/fhir_patients/P001.json', 'raw_data/fhir_patients/P002.json', 'raw_data/fhir_patients/P003.json', 'raw_data/patients.csv']


In [5]:
import pandas as pd

# Reload the three tables from disk so this cell does not depend on the
# in-memory frames built earlier.
patients = pd.read_csv("data/patients.csv")
encounters = pd.read_csv("data/encounters.csv")
claims = pd.read_csv("data/claims.csv")

# Left-join encounters, then claims, onto patients by patient_id so every
# patient row is kept even without a matching encounter or claim.
patient_encounters = pd.merge(patients, encounters, on="patient_id", how="left")
merged = pd.merge(patient_encounters, claims, on="patient_id", how="left")

merged.head()


Unnamed: 0,patient_id,age,gender,encounter_id,length_of_stay,diagnosis_code,claim_id,claim_amount
0,P001,65,M,E001,5,I10,C001,12000
1,P002,45,F,E002,2,E11,C002,800
2,P003,82,F,E003,7,J18,C003,20000


In [6]:
import json, os

fhir_folder = 'data/fhir_patients'
fhir_data = []

# Only parse *.json entries: os.listdir returns everything in the folder,
# including directories such as .ipynb_checkpoints that Jupyter may create,
# and opening those would raise. Sorting makes the row order deterministic.
for fname in sorted(os.listdir(fhir_folder)):
    if not fname.endswith('.json'):
        continue
    with open(os.path.join(fhir_folder, fname)) as f:
        fhir = json.load(f)
    fhir_data.append({
        'patient_id': fhir['id'],
        # birthDate is "YYYY-MM-DD"; keep only the year component.
        'birth_year': int(fhir['birthDate'].split('-')[0]),
        'fhir_gender': fhir['gender']
    })

# Name the columns explicitly so the merge below still finds 'patient_id'
# when the folder contains no patient files.
fhir_df = pd.DataFrame(fhir_data, columns=['patient_id', 'birth_year', 'fhir_gender'])

# Left join keeps every merged row even if its FHIR file is missing.
full_df = merged.merge(fhir_df, on="patient_id", how="left")

# Consistency checks against the tabular data. 2025 is the same reference
# year the synthetic birth dates were derived from, so age_check should
# equal age; gender_match is 1 when the lower-cased gender agrees with FHIR.
full_df["age_check"] = 2025 - full_df["birth_year"]
full_df["gender_match"] = (full_df["gender"].str.lower() == full_df["fhir_gender"]).astype(int)

full_df.head()

Unnamed: 0,patient_id,age,gender,encounter_id,length_of_stay,diagnosis_code,claim_id,claim_amount,birth_year,fhir_gender,age_check,gender_match
0,P001,65,M,E001,5,I10,C001,12000,1960,m,65,1
1,P002,45,F,E002,2,E11,C002,800,1980,f,45,1
2,P003,82,F,E003,7,J18,C003,20000,1943,f,82,1


In [8]:
import boto3

# The FHIR-derived helper columns were only needed for the consistency
# checks; drop them before persisting the cleaned table.
helper_columns = ["fhir_gender", "birth_year"]
cleaned_csv_path = "data/cleaned_data.csv"

cleaned_df = full_df.drop(columns=helper_columns)
cleaned_df.to_csv(cleaned_csv_path, index=False)

# Upload to S3
s3 = boto3.client('s3')
s3.upload_file(cleaned_csv_path, "ehr-genai-project-hadi", "cleaned_data/cleaned_data.csv")
print("Cleaned data saved and uploaded to S3.")


Cleaned data saved and uploaded to S3.


## Model building

The prediction target is `high_cost`, a binary flag set to 1 when `claim_amount` is at least 10,000 and 0 otherwise.

In [10]:
import boto3

model_input_path = "data/model_input.csv"

# Derive the binary target (1 when the claim is at least 10,000) and drop
# the columns that are not carried into the model-input file.
df = (
    pd.read_csv("data/cleaned_data.csv")
    .assign(high_cost=lambda frame: (frame["claim_amount"] >= 10000).astype(int))
    .drop(columns=["patient_id", "diagnosis_code"])
)

df.to_csv(model_input_path, index=False)

# Publish the model-input file next to the other project artefacts.
s3 = boto3.client("s3")
s3.upload_file(model_input_path, "ehr-genai-project-hadi", "model_data/model_input.csv")
print("Model data saved and uploaded to S3.")

Model data saved and uploaded to S3.


In [11]:
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator
import pandas as pd

session = sagemaker.Session()
bucket = 'ehr-genai-project-hadi'
role = sagemaker.get_execution_role()
region = session.boto_region_name

prefix = "model_data"
s3_path = f"s3://{bucket}/{prefix}/model_input.csv"

# Publish the model-input file unchanged; no local read/write round trip
# is needed before uploading it.
session.upload_data("data/model_input.csv", bucket=bucket, key_prefix=prefix)

df = pd.read_csv("data/model_input.csv")
columns = df.columns.tolist()

# Columns that must not be used as training features:
#   - high_cost: the label itself (re-attached as the first column below)
#   - claim_amount: high_cost is defined as claim_amount >= 10000, so keeping
#     it would leak the label into the features
#   - encounter_id / claim_id: string identifiers with no predictive meaning
non_feature_cols = {"high_cost", "claim_amount", "encounter_id", "claim_id"}
features = df[[col for col in columns if col not in non_feature_cols]].copy()

# The built-in XGBoost CSV input must be numeric, so encode gender as
# 1 for "M" and 0 otherwise.
if "gender" in features.columns:
    features["gender"] = (features["gender"].str.upper() == "M").astype(int)

# Guard against any other text column slipping into the training file.
features = features.select_dtypes(include="number")
train_cols = features.columns.tolist()

# SageMaker XGBoost expects CSV with the label first and no header row.
df_xgb = pd.concat([df[["high_cost"]], features], axis=1)
df_xgb.to_csv("data/train_xgb.csv", header=False, index=False)

xgb_uri = session.upload_data("data/train_xgb.csv", bucket=bucket, key_prefix="xgboost/input")

from sagemaker.image_uris import retrieve
image_uri = retrieve("xgboost", region, "1.5-1")

xgb_estimator = Estimator(
    image_uri=image_uri,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size=5,   # GB of attached storage
    max_run=300,     # stop the job after 5 minutes at most
    input_mode="File",
    output_path=f"s3://{bucket}/xgboost/output",
    sagemaker_session=session,
)

xgb_estimator.set_hyperparameters(objective="binary:logistic", num_round=50)

xgb_estimator.fit({"train": TrainingInput(xgb_uri, content_type="csv")})

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-06-14-16-23-22-450


2025-06-14 16:23:22 Starting - Starting the training job...
2025-06-14 16:23:45 Starting - Preparing the instances for training...
2025-06-14 16:24:08 Downloading - Downloading input data...
2025-06-14 16:24:53 Downloading - Downloading the training image......
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-06-14 16:25:54.447 ip-10-0-138-230.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-06-14 16:25:54.470 ip-10-0-138-230.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-06-14:16:25:54:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-06-14:16:25:54:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2025-06-14:16:25:54:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2025-06-14:16:25:54:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[34m[2025-06-14:16:25:54:INFO] Determined 

## Add GenAI Explanation for Predictions using LLMs

Use a Large Language Model (LLM) to explain each prediction (e.g., why a patient was flagged high-risk) in plain English using structured features from your dataset.

### Prompt Example
Given this patient:

- Age: 82
- Length of stay: 7
- Claim amount: $20,000

Explain in 2–3 sentences why this patient is at high risk.

In [12]:
pip install openai

Collecting openai
  Downloading openai-1.86.0-py3-none-any.whl.metadata (25 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Using cached httpcore-1.0.9-py3-none-any.whl.metadata (21 kB)
Downloading openai-1.86.0-py3-none-any.whl (730 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m730.3/730.3 kB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading distro-1.9.0-py3-none-any.whl (20 kB)
Using cached httpx-0.28.1-py3-none-any.whl (73 kB)
Using cached httpcore-1.0.9-py3-none-any.whl (78 kB)
Downloading jiter-0.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (352 kB)
Installing collecte

In [18]:
from openai import OpenAI
import pandas as pd


In [25]:
import pandas as pd

df = pd.read_csv("data/model_input.csv")

# Keep the first two patients whose claim is at least 10,000; the copy lets
# a new column be added without touching df.
is_high_cost = df["claim_amount"] >= 10000
high_risk = df.loc[is_high_cost].head(2).copy()

# Hand-written explanations, one per selected row and in the same order.
explanations = [
    "This 65-year-old patient had a moderately long hospital stay of 5 days and incurred a claim amount of $12,000, suggesting a complex or resource-intensive hospitalization. At this age, patients often have underlying chronic conditions that increase the likelihood of complications or slower recovery. These factors place the patient at elevated risk for both readmission and high post-discharge care needs.",
    "At 82 years old, this patient falls into a high-risk age group with increased vulnerability to adverse outcomes. A 7-day hospital stay combined with a $20,000 claim suggests significant medical intervention, possibly related to multiple comorbidities or an acute exacerbation of a chronic disease. The advanced age and high cost together indicate a strong risk for readmission and future complications.",
]
high_risk["GenAI_Explanation"] = explanations

high_risk.to_csv("data/high_risk_explained.csv", index=False)
print("Saved with explanations.")

Saved with explanations.


In [28]:
# List the contents of the notebook's current working directory.
!ls

data  lost+found  Untitled.ipynb


In [34]:
# Print the notebook's current working directory.
!pwd

/home/ec2-user/SageMaker
