# 03_model_baselines

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# ----------------------------------------------
# 1. Load processed data (NO engineered features)
# ----------------------------------------------
df = pd.read_csv("../data/processed/diabetes_processed.csv")

# Split the frame: "diabetes" is the target, every other column is a feature.
# `df` itself is left unmodified (drop returns a new frame).
y = df["diabetes"]
X = df.drop(columns="diabetes")

# ----------------------------------------------
# 2. Define baseline models
# ----------------------------------------------
# Fixed seed so the tree-based baselines produce the same scores on every run.
# Without it, the Decision Tree and Random Forest results drift between
# executions, which makes the baseline numbers impossible to compare against
# later experiments.
RANDOM_STATE = 42

# Maps a display name to an unfitted estimator; keys are used as result labels.
models = {
    # Left unseeded: the default solver does not use random_state.
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        random_state=RANDOM_STATE,
    ),
}

# ----------------------------------------------
# 3. Cross-validation evaluation (ROC-AUC)
# ----------------------------------------------
# For each baseline, run 5-fold cross-validation scored by ROC-AUC and keep
# the mean of the fold scores, keyed by the model's display name.
results = {
    name: cross_val_score(model, X, y, cv=5, scoring="roc_auc").mean()
    for name, model in models.items()
}

print("Baseline results:", results)


Baseline results: {'Logistic Regression': np.float64(0.9613339601414335), 'Decision Tree': np.float64(0.8558227483124397), 'Random Forest': np.float64(0.9639327675988427)}
