In [None]:
# 02-model-comparison.ipynb 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from collections import Counter

# -------- Load encoded diabetes dataset (path is relative to the notebook) --------
df = pd.read_csv("../data/diabetes_encoded.csv")   # encoded version
print("Loaded diabetes_encoded.csv, shape:", df.shape)

# -------- Features / Target --------
X = df.drop(columns=["Outcome"])
y = df["Outcome"]

# -------- Train / Test split (stratified) --------
# stratify=y keeps the class ratio of "Outcome" the same in both splits;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train target distribution:", Counter(y_train))
print("Test  target distribution:", Counter(y_test))

# -------- Scaling --------
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics enter the fitted parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# Convert back to DataFrame for convenience & saving (keep column names and the
# original row index of each split).
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled  = pd.DataFrame(X_test_scaled,  columns=X.columns, index=X_test.index)

# Save scaled train/test into data/ for reuse.
# pd.concat(axis=1) aligns rows on index labels. The scaled features carry the
# split's original (shuffled) index, so the target must carry the SAME index:
# resetting only the target's index would pair features with the wrong labels
# and introduce NaN rows via the outer join.
train_scaled = pd.concat([X_train_scaled, y_train], axis=1)
test_scaled  = pd.concat([X_test_scaled,  y_test],  axis=1)

# Guard against any index misalignment: one row per sample, features + target.
assert train_scaled.shape == (len(X_train), X_train.shape[1] + 1), "train features/target misaligned"
assert test_scaled.shape == (len(X_test), X_test.shape[1] + 1), "test features/target misaligned"

train_scaled.to_csv("../data/diabetes_train_scaled.csv", index=False)
test_scaled.to_csv("../data/diabetes_test_scaled.csv", index=False)

# Save scaler for later use by API / app
joblib.dump(scaler, "../models/diabetes_scaler.pkl")

# Quick check (means should be ~0, std ~1 for train)
print("\nScaled train feature means (approx):\n", np.round(X_train_scaled.mean(),3).to_dict())
print("Scaled train feature stds (approx):\n", np.round(X_train_scaled.std(),3).to_dict())

print("\nSaved:")
print("- ../data/diabetes_train_scaled.csv")
print("- ../data/diabetes_test_scaled.csv")
print("- ../models/diabetes_scaler.pkl")


Loaded diabetes_encoded.csv, shape: (768, 9)
Train shape: (614, 8) Test shape: (154, 8)
Train target distribution: Counter({0: 400, 1: 214})
Test  target distribution: Counter({0: 100, 1: 54})

Scaled train feature means (approx):
 {'Pregnancies': -0.0, 'Glucose': -0.0, 'BloodPressure': 0.0, 'SkinThickness': -0.0, 'Insulin': -0.0, 'BMI': -0.0, 'DiabetesPedigreeFunction': -0.0, 'Age': -0.0}
Scaled train feature stds (approx):
 {'Pregnancies': 1.001, 'Glucose': 1.001, 'BloodPressure': 1.001, 'SkinThickness': 1.001, 'Insulin': 1.001, 'BMI': 1.001, 'DiabetesPedigreeFunction': 1.001, 'Age': 1.001}

Saved:
- ../data/diabetes_train_scaled.csv
- ../data/diabetes_test_scaled.csv
- ../models/diabetes_scaler.pkl


In [2]:
# 02-model-comparison.ipynb 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score

# -------- Logistic Regression: fit on the scaled train split, score on test --------
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)
y_pred_lr = log_reg.predict(X_test_scaled)
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:, 1]   # probability of class 1
score_lr = roc_auc_score(y_test, y_prob_lr)

print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_lr))
print("ROC-AUC:", score_lr)
print("F1:", f1_score(y_test, y_pred_lr))
print("-" * 50)

# -------- Random Forest: same protocol as above --------
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)
y_prob_rf = rf.predict_proba(X_test_scaled)[:, 1]   # probability of class 1
score_rf = roc_auc_score(y_test, y_prob_rf)

print("Random Forest Results:")
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", score_rf)
print("F1:", f1_score(y_test, y_pred_rf))
print("-" * 50)

# -------- Pick the best model by test ROC-AUC (ties go to Random Forest) --------
# NOTE(review): the winner is chosen with the same test split that is used for
# the reported metrics, so those metrics are optimistic for the selected model;
# consider selecting on a validation split or via cross-validation instead.
best_model, best_name = (
    (rf, "RandomForest") if score_rf >= score_lr else (log_reg, "LogisticRegression")
)

print(f"✅ Best model for Diabetes: {best_name}")

# Persist the selected estimator
joblib.dump(best_model, "../models/diabetes_model.pkl")
print("Saved model -> ../models/diabetes_model.pkl")


Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       100
           1       0.60      0.50      0.55        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154

ROC-AUC: 0.812962962962963
F1: 0.5454545454545454
--------------------------------------------------
Random Forest Results:
              precision    recall  f1-score   support

           0       0.78      0.84      0.81       100
           1       0.65      0.56      0.60        54

    accuracy                           0.74       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.73      0.74      0.73       154

ROC-AUC: 0.8161111111111112
F1: 0.6
--------------------------------------------------
✅ Best model for Diabetes: RandomForest
Saved model -> ../models/diabetes_model.pkl


In [1]:
# 02-model-comparison.ipynb
# ----- Heart Dataset Train/Test Split + Scaling -----

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# Read the pre-encoded heart dataset
heart = pd.read_csv("../data/heart_encoded.csv")
print("Loaded heart_encoded.csv, shape:", heart.shape)

# Separate the label column from the feature columns
y_heart = heart["target"]
X_heart = heart.drop(columns=["target"])

# Stratified 80/20 split with a fixed seed for reproducibility
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
    X_heart,
    y_heart,
    test_size=0.2,
    random_state=42,
    stratify=y_heart,
)

print("Train shape:", X_train_h.shape, "Test shape:", X_test_h.shape)

# Standardize: statistics are learned from the training split only,
# then applied unchanged to both splits.
scaler_h = StandardScaler().fit(X_train_h)
X_train_h_scaled = scaler_h.transform(X_train_h)
X_test_h_scaled = scaler_h.transform(X_test_h)

# Rebuild DataFrames (fresh 0..n-1 index) and append the label positionally;
# using the raw array of the target avoids any index-based alignment.
train_h_scaled = pd.DataFrame(X_train_h_scaled, columns=X_train_h.columns).assign(
    target=y_train_h.to_numpy()
)
test_h_scaled = pd.DataFrame(X_test_h_scaled, columns=X_test_h.columns).assign(
    target=y_test_h.to_numpy()
)

# Persist the scaled splits and the fitted scaler
train_h_scaled.to_csv("../data/heart_train_scaled.csv", index=False)
test_h_scaled.to_csv("../data/heart_test_scaled.csv", index=False)
joblib.dump(scaler_h, "../models/heart_scaler.pkl")

print(
    "Saved:",
    "- ../data/heart_train_scaled.csv",
    "- ../data/heart_test_scaled.csv",
    "- ../models/heart_scaler.pkl",
    sep="\n",
)


Loaded heart_encoded.csv, shape: (309, 13)
Train shape: (247, 12) Test shape: (62, 12)
Saved:
- ../data/heart_train_scaled.csv
- ../data/heart_test_scaled.csv
- ../models/heart_scaler.pkl


# Models on the Heart dataset

In [2]:
# ----- Logistic Regression on Heart Dataset -----
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, f1_score
import joblib

# Read back the scaled splits written by the split/scaling cell
train_h = pd.read_csv("../data/heart_train_scaled.csv")
test_h = pd.read_csv("../data/heart_test_scaled.csv")

# Features are every column except the label
X_train_h, y_train_h = train_h.drop(columns=["target"]), train_h["target"]
X_test_h, y_test_h = test_h.drop(columns=["target"]), test_h["target"]

# Fit the classifier on the training split
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_h, y_train_h)

# Hard labels and class-1 probabilities for the test split
y_pred = log_reg.predict(X_test_h)
y_proba = log_reg.predict_proba(X_test_h)[:, 1]

# Report test-set metrics
print("Logistic Regression Results (Heart):")
print(classification_report(y_test_h, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_h, y_proba))
print("F1:", f1_score(y_test_h, y_pred))

# Persist the fitted estimator
joblib.dump(log_reg, "../models/heart_logreg.pkl")
print("✅ Saved model -> ../models/heart_logreg.pkl")


Logistic Regression Results (Heart):
              precision    recall  f1-score   support

           0       0.76      0.72      0.74        36
           1       0.64      0.69      0.67        26

    accuracy                           0.71        62
   macro avg       0.70      0.71      0.70        62
weighted avg       0.71      0.71      0.71        62

ROC-AUC: 0.7553418803418803
F1: 0.6666666666666666
✅ Saved model -> ../models/heart_logreg.pkl


In [3]:
# ----- Heart Dataset - Random Forest -----
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, f1_score

# Read back the scaled splits written by the split/scaling cell
train_h = pd.read_csv("../data/heart_train_scaled.csv")
test_h = pd.read_csv("../data/heart_test_scaled.csv")

# Features are every column except the label
X_train_h, y_train_h = train_h.drop(columns=["target"]), train_h["target"]
X_test_h, y_test_h = test_h.drop(columns=["target"]), test_h["target"]

# Fit a 200-tree forest with a fixed seed
rf_h = RandomForestClassifier(n_estimators=200, random_state=42)
rf_h.fit(X_train_h, y_train_h)

# Hard labels and class-1 probabilities for the test split
y_pred_rf_h = rf_h.predict(X_test_h)
y_prob_rf_h = rf_h.predict_proba(X_test_h)[:, 1]

# Report test-set metrics
print("Random Forest Results (Heart):")
print(classification_report(y_test_h, y_pred_rf_h))
print("ROC-AUC:", roc_auc_score(y_test_h, y_prob_rf_h))
print("F1:", f1_score(y_test_h, y_pred_rf_h))

# Persist the fitted estimator
joblib.dump(rf_h, "../models/heart_rf.pkl")
print("✅ Saved model -> ../models/heart_rf.pkl")


Random Forest Results (Heart):
              precision    recall  f1-score   support

           0       0.73      0.75      0.74        36
           1       0.64      0.62      0.63        26

    accuracy                           0.69        62
   macro avg       0.68      0.68      0.68        62
weighted avg       0.69      0.69      0.69        62

ROC-AUC: 0.7142094017094017
F1: 0.6274509803921569
✅ Saved model -> ../models/heart_rf.pkl


In [4]:
import joblib

# Promote the saved logistic-regression model to the final heart model.
# NOTE(review): the source pickle is deleted by the cleanup cell further down,
# so this cell needs the logistic-regression cell to have been run first.
source_model_path = "../models/heart_logreg.pkl"
final_model_path = "../models/heart_model.pkl"

# Round-trip through joblib: read the intermediate artifact, write the final one
logreg_model = joblib.load(source_model_path)
joblib.dump(logreg_model, final_model_path)

print("✅ Final Heart Model saved as ../models/heart_model.pkl")


✅ Final Heart Model saved as ../models/heart_model.pkl


In [5]:
import os

# Directory holding model artifacts and the intermediate files to delete from it
models_dir = "../models/"
temp_files = ["heart_logreg.pkl", "heart_rf.pkl"]

# Delete each intermediate artifact that is present; missing files are skipped silently
for temp_name in temp_files:
    temp_path = os.path.join(models_dir, temp_name)
    if not os.path.exists(temp_path):
        continue
    os.remove(temp_path)
    print(f"🗑️ Removed {temp_name}")

# Summary is printed unconditionally (it does not inspect the directory)
print(
    "✅ Cleaned up. Only final models remain:",
    "- diabetes_model.pkl",
    "- heart_model.pkl",
    sep="\n",
)


🗑️ Removed heart_logreg.pkl
🗑️ Removed heart_rf.pkl
✅ Cleaned up. Only final models remain:
- diabetes_model.pkl
- heart_model.pkl
