In [None]:
import seaborn as sns
import pandas as pd
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Load the patient table. `cardio_risk` is the prediction target; every other
# column is used as a feature.
df = pd.read_csv('../data/synthetic_patients.csv')

# Cast the categorical columns to pandas' `category` dtype so that
# `pd.get_dummies` below one-hot encodes them.
categorical_cols = ['gender', 'ethnicity', 'smoking_status', 'diabetes_status', 'physical_activity', 'family_history']
for col in categorical_cols:
    df[col] = df[col].astype('category')

# One-hot encode the features; `drop_first=True` drops one level per
# categorical column to avoid a redundant indicator.
X = pd.get_dummies(df.drop(columns=['cardio_risk']), drop_first=True)
y = df['cardio_risk']

# Hold out 20% of the rows for evaluation. The seed is fixed so the split
# (and therefore the per-group AUCs printed below) is reproducible.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Per-ethnicity AUC, computed on the held-out rows ONLY. Scoring rows the
# model was trained on would give optimistically biased AUCs.
# `train_test_split` preserves the DataFrame index, so the original (un-encoded)
# ethnicity of each test row can be looked up by index.
test_ethnicity = df.loc[X_test.index, 'ethnicity']
groups = df['ethnicity'].unique()

for group in groups:
    in_group = test_ethnicity == group
    group_y = y_test[in_group]
    # ROC AUC is undefined unless both classes are present; this also covers
    # a group that has no rows in the held-out split.
    if group_y.nunique() < 2:
        print(f"{group}: Only one class present in y_true, skipping AUC calculation.")
        continue
    # X_test is already encoded with the training columns, so no re-encoding
    # or column re-alignment is needed for the subgroup.
    group_X = X_test[in_group]
    auc = roc_auc_score(group_y, model.predict_proba(group_X)[:, 1])
    print(f"{group} AUC: {auc}")


ModuleNotFoundError: No module named 'numpy'