
# 🩺 Diabetes Prediction Project

This notebook demonstrates data preprocessing, exploratory data analysis (EDA), 
model training, evaluation, and results visualization for the Pima Indians Diabetes dataset.

---


In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import joblib

RANDOM_STATE = 42
NUM_COLS = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']


In [None]:

# Load dataset
csv_path = "data/raw/diabetes.csv"  # adjust if needed
df = pd.read_csv(csv_path)

# Replace zeros with NaN for specific clinical columns
zero_as_nan = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for col in zero_as_nan:
    df[col] = df[col].replace(0, np.nan)

df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/diabetes.csv'

In [None]:

# Summary statistics
df.describe()

# Histograms
df[NUM_COLS].hist(bins=30, figsize=(12, 10))
plt.suptitle("Feature Distributions")
plt.show()

# Correlation heatmap
plt.figure(figsize=(10, 8))
corr = df[NUM_COLS + ['Outcome']].corr()
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix")
plt.show()


In [None]:

X = df[NUM_COLS].copy()
y = df['Outcome'].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)


In [None]:

# Preprocessor
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_pipeline, NUM_COLS)],
    remainder='drop'
)

# Models and grids
models = {
    'logreg': (LogisticRegression(max_iter=1000, solver='liblinear', class_weight='balanced', random_state=RANDOM_STATE),
               {'model__C':[0.01,0.1,1,10], 'model__penalty':['l1','l2']}),
    'svm': (SVC(probability=True, class_weight='balanced', random_state=RANDOM_STATE),
            {'model__C':[0.1,1,10], 'model__kernel':['rbf','linear'], 'model__gamma':['scale','auto']}),
    'knn': (KNeighborsClassifier(),
            {'model__n_neighbors':[3,5,7,9,11], 'model__weights':['uniform','distance']}),
    'dt': (DecisionTreeClassifier(class_weight='balanced', random_state=RANDOM_STATE),
           {'model__max_depth':[None,3,5,7,9], 'model__min_samples_split':[2,5,10]}),
    'rf': (RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE),
           {'model__n_estimators':[200,500], 'model__max_depth':[None,5,10], 'model__max_features':['sqrt','log2']})
}


In [None]:

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

best_score = -np.inf
best_name = None
best_estimator = None
results_rows = []

for name, (estimator, grid) in models.items():
    pipe = Pipeline(steps=[('preprocess', preprocessor), ('model', estimator)])
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring='roc_auc',
        cv=cv,
        n_jobs=-1,
        verbose=0
    )
    gs.fit(X_train, y_train)
    mean_score = gs.best_score_
    results_rows.append({'model': name, 'cv_roc_auc': mean_score, 'best_params': gs.best_params_})
    if mean_score > best_score:
        best_score = mean_score
        best_name = name
        best_estimator = gs.best_estimator_

pd.DataFrame(results_rows)


In [None]:

# Evaluate best model
y_prob = best_estimator.predict_proba(X_test)[:,1]
y_pred = (y_prob >= 0.5).astype(int)

metrics = {
    'accuracy': accuracy_score(y_test, y_pred),
    'precision': precision_score(y_test, y_pred, zero_division=0),
    'recall': recall_score(y_test, y_pred, zero_division=0),
    'f1': f1_score(y_test, y_pred, zero_division=0),
    'roc_auc': roc_auc_score(y_test, y_prob),
}
metrics


In [None]:

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(4,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# ROC curve
RocCurveDisplay.from_predictions(y_test, y_prob)
plt.title("ROC Curve")
plt.show()

# Feature importance / coefficients
model = best_estimator.named_steps['model']
importances = None
if hasattr(model, 'feature_importances_'):
    importances = model.feature_importances_
elif hasattr(model, 'coef_'):
    importances = np.abs(model.coef_).ravel()

if importances is not None:
    order = np.argsort(importances)[::-1]
    plt.figure(figsize=(8,4))
    plt.bar(range(len(importances)), importances[order])
    plt.xticks(range(len(importances)), [NUM_COLS[i] for i in order], rotation=45)
    plt.title("Feature Importance / Coefficients")
    plt.show()


In [None]:

# Save the best pipeline for later use in the Streamlit app
Path("models").mkdir(exist_ok=True)
joblib.dump(best_estimator, "models/best_pipeline.joblib")
print(f"Saved best model: {best_name}")



---

## 📝 Reflection
- Logistic Regression often performs best on this dataset due to linear separability.  
- Recall (~70%) is important in medical screening, even if Precision is a bit lower.  
- The ROC AUC ~0.81 shows good discriminative power.  
- Future improvements: try ensemble methods, calibration, or feature engineering.

---
