In [1]:
import pandas as pd

# STEP 1: Load dataset
df = pd.read_csv('data/optimized_synthetic_patient_claims.csv')

# View shape and sample
print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head())

Dataset Shape: (8902, 11)
Columns: ['MemberID', 'Age', 'Gender', 'ClaimID', 'ClaimDate', 'DiagnosisCode', 'ProcedureCode', 'MedicationCode', 'PrimaryDisease', 'AmountBilled', 'PreventiveGap']
   MemberID  Age Gender     ClaimID   ClaimDate DiagnosisCode ProcedureCode  \
0      1001   71      F  CLM1001001  2024-01-26         C50.9         92928   
1      1001   71      F  CLM1001002  2024-06-20   J44.1;N18.9         90837   
2      1001   71      F  CLM1001003  2024-06-15   C50.9;N18.9         11721   
3      1001   71      F  CLM1001004  2024-04-12         N18.9         19120   
4      1002   69      F  CLM1002001  2024-01-16         I10.0         90837   

  MedicationCode PrimaryDisease  AmountBilled                PreventiveGap  
0         NDC007         Cancer          1091     Cancer screening overdue  
1  NDC006;NDC005           COPD           721    Pulmonary check-up missed  
2         NDC004         Cancer          1673     Cancer screening overdue  
3  NDC001;NDC002         

In [30]:
df = df.dropna(subset=['DiagnosisCode', 'MemberID'])

In [31]:
df['DiagnosisCode'] = df['DiagnosisCode'].astype(str)
df['ProcedureCode'] = df['ProcedureCode'].astype(str)

df['DiagnosisCode'] = df['DiagnosisCode'].str.split(';')
df['ProcedureCode'] = df['ProcedureCode'].str.split(';')

df = df.explode('DiagnosisCode')
df = df.explode('ProcedureCode')

In [32]:
df['ICD_Prefix'] = df['DiagnosisCode'].str[:3]

In [35]:
# Replace 'your_column_name' with the actual column name
unique_values = df['DiagnosisCode'].unique()
print(unique_values)

['C50.9' 'J44.1' 'N18.9' 'I10.0' 'E11.9' 'H66.9' 'F32.9' 'I21.9' 'J45.9']


In [5]:
# STEP 4: Map ICD Prefix to Disease Name
icd_prefix_map = {
    'E10': 'Type 1 Diabetes',
    'E11': 'Type 2 Diabetes',
    'E13': 'Other Diabetes',
    'I10': 'Hypertension',
    'I25': 'Heart Disease',
    'I50': 'Heart Failure',
    'J44': 'COPD',
    'J45': 'Asthma',
    'J18': 'Pneumonia',
    'N18': 'Kidney Disease',
    'C50': 'Breast Cancer',
    'C34': 'Lung Cancer',
    'C18': 'Colon Cancer',
    'F32': 'Depression',
    'F20': 'Schizophrenia',
    'G30': 'Alzheimer\'s Disease',
    'M54': 'Back Pain',
    'K21': 'Acid Reflux',
    'K50': 'Crohn\'s Disease',
    'R51': 'Headache',
    'R53': 'Fatigue',
    'Z00': 'General Check-up',
    'Z51': 'Palliative Care'
}

df['DiseaseName'] = df['ICD_Prefix'].map(icd_prefix_map)

In [6]:
# STEP 5: Assign Disease Criticality (1 = Low Risk, 5 = Critical)
disease_criticality = {
    'Type 1 Diabetes': 4,
    'Type 2 Diabetes': 4,
    'Other Diabetes': 4,
    'Hypertension': 3,
    'Heart Disease': 5,
    'Heart Failure': 5,
    'COPD': 4,
    'Asthma': 3,
    'Pneumonia': 4,
    'Kidney Disease': 4,
    'Breast Cancer': 5,
    'Lung Cancer': 5,
    'Colon Cancer': 5,
    'Depression': 3,
    'Schizophrenia': 4,
    'Alzheimer\'s Disease': 4,
    'Back Pain': 2,
    'Acid Reflux': 2,
    'Crohn\'s Disease': 3,
    'Headache': 2,
    'Fatigue': 2,
    'General Check-up': 1,
    'Palliative Care': 5
}

df['DiseaseCriticality'] = df['DiseaseName'].map(disease_criticality)

In [7]:
# STEP 6: Final preview
print("\nSample Mapped Records:")
print(df[['MemberID', 'DiagnosisCode', 'DiseaseName', 'DiseaseCriticality']].dropna().head(10))


Sample Mapped Records:
    MemberID DiagnosisCode      DiseaseName  DiseaseCriticality
0       1001         J45.9           Asthma                 3.0
1       1002         C50.9    Breast Cancer                 5.0
3       1004         J45.9           Asthma                 3.0
4       1005         E11.9  Type 2 Diabetes                 4.0
5       1006         E11.9  Type 2 Diabetes                 4.0
6       1007         E11.9  Type 2 Diabetes                 4.0
10      1011         E11.9  Type 2 Diabetes                 4.0
11      1012         F32.9       Depression                 3.0
12      1013         J44.9             COPD                 4.0
13      1014   C50.9;H66.9    Breast Cancer                 5.0


In [8]:
# STEP 1: Drop rows where disease mapping failed
df_risk = df.dropna(subset=['DiseaseName', 'DiseaseCriticality'])

# STEP 2: Define a function to assign rule-based risk score
def assign_risk_score(row):
    if row['MaxCriticality'] >= 5 or row['ChronicCount'] >= 2:
        return 5
    elif row['AvgCriticality'] >= 4:
        return 4
    elif row['ChronicCount'] == 1 or row['AvgCriticality'] >= 3:
        return 3
    elif row['AvgCriticality'] >= 2:
        return 2
    else:
        return 1

# STEP 3: Group by MemberID and aggregate
risk_features = df_risk.groupby('MemberID').agg(
    NumClaims=('ClaimID', 'count'),
    UniqueDiseases=('DiseaseName', 'nunique'),
    AvgCriticality=('DiseaseCriticality', 'mean'),
    MaxCriticality=('DiseaseCriticality', 'max'),
    ChronicCount=('DiseaseCriticality', lambda x: (x >= 4).sum())
).reset_index()

# STEP 4: Assign rule-based risk score (1 to 5)
risk_features['RiskScore'] = risk_features.apply(assign_risk_score, axis=1)

# Preview
print("\nSample Member-Level Risk Features:")
print(risk_features.head())



Sample Member-Level Risk Features:
   MemberID  NumClaims  UniqueDiseases  AvgCriticality  MaxCriticality  \
0      1001          1               1             3.0             3.0   
1      1002          1               1             5.0             5.0   
2      1004          1               1             3.0             3.0   
3      1005          1               1             4.0             4.0   
4      1006          1               1             4.0             4.0   

   ChronicCount  RiskScore  
0             0          3  
1             1          5  
2             0          3  
3             1          4  
4             1          4  


In [9]:
# STEP 1: Preventive care map
preventive_care_map = {
    'Type 1 Diabetes': "Regular insulin monitoring and diet control",
    'Type 2 Diabetes': "Weight management, avoid sugar, regular A1C tests",
    'Other Diabetes': "Monitor glucose, exercise, low-carb diet",
    'Hypertension': "Reduce salt, regular BP check, avoid stress",
    'Heart Disease': "Cardiologist visit, ECG, physical activity",
    'Heart Failure': "Low sodium diet, daily weight check, medications",
    'COPD': "Avoid smoking, pulmonary rehab, flu vaccination",
    'Asthma': "Use inhalers regularly, avoid triggers, annual checkups",
    'Pneumonia': "Vaccination, hygiene, timely antibiotics",
    'Kidney Disease': "Limit protein/sodium, regular blood/urine tests",
    'Breast Cancer': "Mammograms, regular check-ups, self-exams",
    'Lung Cancer': "Quit smoking, screening for early detection",
    'Colon Cancer': "Colonoscopy, high-fiber diet, routine screening",
    'Depression': "Therapy, medication, regular mental health checks",
    'Schizophrenia': "Psychiatric care, medication adherence",
    'Back Pain': "Stretching, physiotherapy, avoid heavy lifting",
    'Acid Reflux': "Avoid spicy food, small meals, raise head while sleeping",
    'Crohn\'s Disease': "Anti-inflammatory drugs, avoid trigger foods",
    'Alzheimer\'s Disease': "Memory exercises, caregiver support",
    'Headache': "Hydration, sleep, limit screen time",
    'Fatigue': "Balanced diet, sleep, manage stress",
    'General Check-up': "Routine health screening and lifestyle advice",
    'Palliative Care': "Comfort-focused care and emotional support"
}


In [10]:
# STEP 2: Merge disease info with risk_features table
df_prevent = df_risk[['MemberID', 'DiseaseName']].dropna()
df_prevent['Recommendation'] = df_prevent['DiseaseName'].map(preventive_care_map)


In [11]:
# STEP 3: Aggregate recommendations per patient
recommendation_df = df_prevent.groupby('MemberID')['Recommendation'].unique().reset_index()
recommendation_df['PreventiveCareAdvice'] = recommendation_df['Recommendation'].apply(lambda x: "; ".join(x))
recommendation_df = recommendation_df.drop(columns=['Recommendation'])

In [17]:
# STEP 4: Merge with risk scores
final_patient_profile = pd.merge(risk_features, recommendation_df, on='MemberID', how='left')

# Preview result
print("\nSample Final Patient Profiles:")
print(final_patient_profile[['MemberID', 'RiskScore', 'PreventiveCareAdvice']].head())

final_patient_profile.to_csv("data/final_patient_profile.csv", index=False)


Sample Final Patient Profiles:
   MemberID  RiskScore                               PreventiveCareAdvice
0      1001          3  Use inhalers regularly, avoid triggers, annual...
1      1002          5          Mammograms, regular check-ups, self-exams
2      1004          3  Use inhalers regularly, avoid triggers, annual...
3      1005          4  Weight management, avoid sugar, regular A1C tests
4      1006          4  Weight management, avoid sugar, regular A1C tests


In [18]:
df.columns

Index(['MemberID', 'Age', 'Gender', 'ClaimID', 'ClaimDate', 'DiagnosisCode',
       'ProcedureCode', 'MedicationCode', 'PrimaryDisease', 'AmountBilled',
       'PreventiveGap', 'ICD_Prefix', 'DiseaseName', 'DiseaseCriticality'],
      dtype='object')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Step 1: Load original data and final profile
claims_df = pd.read_csv("data/synthetic_patient_claims.csv")
profile_df = pd.read_csv("data/final_patient_profile.csv")  # Save it after rule-based processing

# Step 2: Merge to bring RiskScore to each row
df = claims_df.merge(profile_df[['MemberID', 'RiskScore']], on='MemberID', how='inner')

# Step 3: Drop missing data
df.dropna(subset=['Age', 'Gender', 'DiagnosisCode', 'ProcedureCode', 'RiskScore'], inplace=True)

# Step 4: Encode categorical columns
le_gender = LabelEncoder()
df['Gender'] = le_gender.fit_transform(df['Gender'])

le_diag = LabelEncoder()
df['DiagnosisCode'] = le_diag.fit_transform(df['DiagnosisCode'])

le_proc = LabelEncoder()
df['ProcedureCode'] = le_proc.fit_transform(df['ProcedureCode'])

# Step 5: Define Features & Target
features = ['Age', 'Gender', 'DiagnosisCode', 'ProcedureCode', 'AmountBilled']
X = df[features]
y = df['RiskScore'].astype(int)

# Step 6: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Step 7: Train Model
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)

cv_scores = cross_val_score(rf, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Step 8: Evaluation
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Step 9: Save Model and Encoders
joblib.dump(rf, 'output/patient_risk_model.pkl')
joblib.dump(le_gender, 'output/le_gender.pkl')
joblib.dump(le_diag, 'output/le_diag.pkl')
joblib.dump(le_proc, 'output/le_proc.pkl')


Cross-validation scores: [1.         1.         0.99595142 0.99190283 0.99595142]
Mean CV Accuracy: 0.9967611336032389
Test Accuracy: 0.9935483870967742
[[132   2   0]
 [  0 132   0]
 [  0   0  44]]
              precision    recall  f1-score   support

           3       1.00      0.99      0.99       134
           4       0.99      1.00      0.99       132
           5       1.00      1.00      1.00        44

    accuracy                           0.99       310
   macro avg       1.00      1.00      0.99       310
weighted avg       0.99      0.99      0.99       310



['output/le_proc.pkl']