# AIM-AHEAD Health Disparities ML Project
This notebook analyzes synthetic patient data to explore how race, sex, age, and vaccination status relate to hospitalization and mortality outcomes among COVID-19 patients with cardiovascular disease.

## Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
sns.set(style="whitegrid")

## Step 2: Load the Dataset

In [None]:
# Location of the synthetic cohort CSV, relative to the notebook's directory.
DATA_PATH = '../data/synthetic_cvd_covid.csv'

# Read the whole file into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

## Step 3: Explore Distributions by Race and Mortality

In [None]:
# Grouped count plot: one cluster of bars per Race value, split by Mortality.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=df, x='Race', hue='Mortality', ax=ax)
ax.set_title("Mortality Counts by Race")
plt.xticks(rotation=45)  # slant the x tick labels
fig.tight_layout()
plt.show()

## Step 4: Preprocess and Encode Data

In [None]:
# Work on a copy so the raw `df` keeps its original string categories.
df_encoded = df.copy()

# Replace each categorical column with integer codes. A fresh LabelEncoder is
# fitted per column, so codes are independent between columns.
categorical_columns = ('Sex', 'Race', 'VaccinationStatus')
for column in categorical_columns:
    encoder = LabelEncoder()
    df_encoded[column] = encoder.fit_transform(df_encoded[column])

# Model inputs (X) and the prediction target (y).
features = ['Age', *categorical_columns]
X = df_encoded[features]
y = df_encoded['Mortality']

## Step 5: Apply SMOTE to Balance Mortality Classes

In [None]:
# Class counts of the target before any resampling.
counts_before = y.value_counts().to_dict()

# Balance the Mortality classes with SMOTE; the seed is fixed so the
# resampled rows are reproducible between runs.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
counts_after = pd.Series(y_res).value_counts().to_dict()

print("Before SMOTE:", counts_before)
print("After SMOTE:", counts_after)

## Step 6: Train Random Forest and Logistic Regression

In [None]:
# Split the ORIGINAL (un-resampled) data first. Splitting after SMOTE leaks
# information: synthetic rows interpolated from test-set patients end up in the
# training set, and the test set itself contains synthetic rows, so every
# downstream metric is optimistically biased.
# `stratify=y` keeps the Mortality class ratio the same in both partitions, so
# the held-out set reflects the real class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Oversample the training partition only; X_test / y_test stay untouched so
# evaluation is done purely on real, unseen rows.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Random forest with 100 trees; seed fixed for reproducibility.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# Logistic regression baseline; max_iter is raised above the library default
# to give the solver room to converge on the unscaled features.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
log_preds = logreg.predict(X_test)

## Step 7: Model Evaluation

In [None]:
# Print precision / recall / F1 per class for each model on the held-out split.
reports = (
    ("Random Forest Report:", rf_preds),
    ("Logistic Regression Report:", log_preds),
)
for heading, predictions in reports:
    print(heading)
    print(classification_report(y_test, predictions))

## Step 8: Feature Importance (Random Forest)

In [None]:

# Importance scores from the fitted forest, paired with the column names of X
# (the forest was trained on those same columns, in the same order).
importances = rf.feature_importances_
feature_names = X.columns

# Horizontal bar chart: one bar per feature, bar length = importance.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(x=importances, y=feature_names, ax=ax)
ax.set_title("Random Forest Feature Importance")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()


## Step 9: ROC Curves

In [None]:

from sklearn.metrics import roc_curve, auc

# Predicted probability of the second class (column 1 of predict_proba) for
# every test row, per model.
rf_probs = rf.predict_proba(X_test)[:, 1]
log_probs = logreg.predict_proba(X_test)[:, 1]

# ROC points for each model; the thresholds returned by roc_curve are unused.
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_probs)
log_fpr, log_tpr, _ = roc_curve(y_test, log_probs)

# Area under each curve, computed once and reused in the legend labels.
rf_auc = auc(rf_fpr, rf_tpr)
log_auc = auc(log_fpr, log_tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(rf_fpr, rf_tpr, label=f"Random Forest (AUC = {rf_auc:.2f})")
ax.plot(log_fpr, log_tpr, label=f"Logistic Regression (AUC = {log_auc:.2f})")
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_title("ROC Curve Comparison")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()
