In [1]:
# Install once if needed: pip install scikit-learn
import duckdb, pandas as pd, os, json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 1) Load features from DuckDB
con = duckdb.connect("../data/features.duckdb")

# IMPORTANT: Avoid NULL -> NaN in target by using CASE ... THEN 1 ELSE 0
df = con.execute("""
SELECT
  CASE WHEN control_status = 'Poorly Controlled' THEN 1 ELSE 0 END AS y,
  age, bmi,
  (smoking_status != 'Never')::INT AS smoker_flag,
  er_visits, fev1,
  gender,
  CASE
    WHEN bmi < 18.5 THEN 'Underweight'
    WHEN bmi < 25   THEN 'Normal'
    WHEN bmi < 30   THEN 'Overweight'
    ELSE 'Obese'
  END AS bmi_category,
  COALESCE(comorbidity, 'None') AS comorbidity,
  environment
FROM asthma_features
""").fetch_df()
con.close()

# 2) Define X, y (no NaNs in y now)
y = df["y"].astype(int)
X = df.drop(columns=["y"])
print("Shapes → X:", X.shape, " y:", y.shape)
print("y value counts:", y.value_counts().to_dict())

# 3) Preprocess (impute NaNs) + train baseline logistic regression
num_cols = ["age","bmi","smoker_flag","er_visits","fev1"]
cat_cols = ["gender","bmi_category","comorbidity","environment"]

numeric_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
])

categorical_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

pre = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols),
])

clf = Pipeline([
    ("pre", pre),
    ("lr", LogisticRegression(max_iter=300, solver="lbfgs"))
])

# If class counts are extremely imbalanced and stratify errors, remove stratify=y
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

clf.fit(X_tr, y_tr)
proba = clf.predict_proba(X_te)[:, 1]
pred  = (proba >= 0.5).astype(int)

# 4) Metrics + export
print(classification_report(y_te, pred))
roc_auc = float(roc_auc_score(y_te, proba))
print(f"ROC-AUC: {roc_auc:.3f}")

os.makedirs("../reports/figures", exist_ok=True)
with open("../reports/figures/model_metrics.json", "w") as f:
    json.dump({"roc_auc": roc_auc}, f, indent=2)

print("✅ Saved ../reports/figures/model_metrics.json")

Shapes → X: (5000000, 9)  y: (5000000,)
y value counts: {0: 4956877, 1: 43123}
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    991375
           1       0.32      0.03      0.06      8625

    accuracy                           0.99   1000000
   macro avg       0.66      0.52      0.53   1000000
weighted avg       0.99      0.99      0.99   1000000

ROC-AUC: 0.955
✅ Saved ../reports/figures/model_metrics.json


In [3]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, confusion_matrix
import seaborn as sns

# ROC Curve
fpr, tpr, _ = roc_curve(y_te, proba)
plt.figure()
plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0,1],[0,1],"--",color="grey")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.savefig("../reports/figures/roc_curve.png")
plt.close()

# Confusion Matrix
cm = confusion_matrix(y_te, pred)
plt.figure()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Well/Partly Controlled","Poorly Controlled"],
            yticklabels=["Well/Partly Controlled","Poorly Controlled"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.savefig("../reports/figures/confusion_matrix.png")
plt.close()

print("✅ Saved ROC curve and Confusion Matrix charts")

✅ Saved ROC curve and Confusion Matrix charts


In [4]:
# Feature importance for Logistic Regression (coef-based)
import pandas as pd
import matplotlib.pyplot as plt
import os

# 1) Pull pieces from the pipeline
pre = clf.named_steps["pre"]
lr  = clf.named_steps["lr"]

# 2) Recover output feature names from the preprocessor
num_cols_out = num_cols  # passthrough (after imputation), same names

ohe = pre.named_transformers_["cat"].named_steps["onehot"]
cat_cols_out = list(ohe.get_feature_names_out(cat_cols))

all_features = num_cols_out + cat_cols_out

# 3) Coefficients (positive => higher odds of "Poorly Controlled")
coefs = lr.coef_[0]
feat_imp = pd.DataFrame({"feature": all_features, "coef": coefs}) \
           .sort_values("coef", ascending=True)

# 4) Save CSV + PNG
os.makedirs("../reports/figures", exist_ok=True)
feat_imp.to_csv("../reports/figures/feature_importance.csv", index=False)

plt.figure(figsize=(9, 10))
plt.barh(feat_imp["feature"], feat_imp["coef"])
plt.title("Feature Importance (Logistic Regression Coefficients)")
plt.xlabel("Coefficient (log-odds)")
plt.tight_layout()
plt.savefig("../reports/figures/feature_importance.png")
plt.close()

print("✅ Saved feature_importance.csv and feature_importance.png")
feat_imp.tail(10)

✅ Saved feature_importance.csv and feature_importance.png


Unnamed: 0,feature,coef
11,comorbidity_Both,-0.899791
12,comorbidity_Diabetes,-0.835606
14,comorbidity_None,-0.83536
13,comorbidity_Hypertension,-0.814244
4,fev1,-0.01727
1,bmi,-0.015786
0,age,-0.00017
3,er_visits,1.446606
8,bmi_category_Obese,2.121888
2,smoker_flag,2.292666
