# Diabetes Prediction Project
This notebook implements **Week 1 (EDA & Cleaning)** and **Week 2 (Modeling & Evaluation)** for the NIDDK diabetes dataset.

Dataset: `diabetes.csv`

Steps:
- Data exploration & visualization
- Data cleaning & imputation
- Class balance analysis
- Scatter plots & correlations
- Model building (KNN, Logistic Regression, Random Forest, XGBoost)
- Model comparison & evaluation metrics


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv("diabetes.csv")

# Quick overview
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())
print(df['Outcome'].value_counts())


In [None]:

# Columns in which a recorded 0 is treated as "value missing"
zero_missing_cols = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

# Swap every 0 in those columns for NaN so pandas handles them as missing
df[zero_missing_cols] = df[zero_missing_cols].replace(to_replace=0, value=np.nan)

# Per-column tally of missing entries (displayed as the cell output)
df.isna().sum()


In [None]:

# One histogram (with KDE overlay) per column that had zeros marked as missing
for col in zero_missing_cols:
    fig, ax = plt.subplots(figsize=(6, 3))
    observed = df[col].dropna()  # plot only the values that were actually recorded
    sns.histplot(observed, kde=True, ax=ax)
    ax.set_title(f"Histogram of {col} (0s treated as missing)")
    ax.set_xlabel(col)
    fig.tight_layout()
# Render all figures created by the loop
plt.show()


In [None]:

# Bar chart: how many columns there are of each dtype
dtype_counts = df.dtypes.value_counts()

fig, ax = plt.subplots(figsize=(5, 3))
sns.barplot(
    x=dtype_counts.index.astype(str),  # dtype objects -> readable tick labels
    y=dtype_counts.values,
    ax=ax,
)
ax.set_title("Count of variable data types")
ax.set_xlabel("Data type")
ax.set_ylabel("Count")
plt.show()


In [None]:

# Class balance of the target: number of rows per Outcome value
ax = sns.countplot(data=df, x='Outcome')
ax.set_title("Outcome (0 = non-diabetic, 1 = diabetic)")
plt.show()


In [None]:

# Pairwise scatter plots for selected variables, coloured by Outcome.
# Rows missing any of the plotted variables are dropped first.
pair_vars = ['Glucose', 'BMI', 'Age', 'Insulin']
pair_rows = df.dropna(subset=pair_vars)

sns.pairplot(
    pair_rows,
    vars=pair_vars,
    hue='Outcome',
    plot_kws={'alpha': 0.5, 's': 30},
)
# y > 1 lifts the figure-level title above the top row of panels
plt.suptitle("Pairwise scatterplots (subset without missing)", y=1.02)
plt.show()


In [None]:

# Annotated heatmap of the pairwise correlations between all columns
corr = df.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title("Correlation heatmap")
fig.tight_layout()
plt.show()


In [None]:

# Median imputation
# Work on a copy so `df` keeps its NaN markers for the exploratory cells.
df_imputed = df.copy()

# Fill each zero-as-missing column with that column's own median.
# The filled columns are assigned back explicitly: the chained form
# `df_imputed[col].fillna(..., inplace=True)` operates on a temporary Series,
# which raises a FutureWarning in pandas 2.x and leaves `df_imputed`
# unchanged once Copy-on-Write is enabled.
column_medians = df_imputed[zero_missing_cols].median()
df_imputed[zero_missing_cols] = df_imputed[zero_missing_cols].fillna(column_medians)

# NOTE(review): the medians are computed on the full dataset, i.e. before the
# train/test split performed later, so test rows influence the imputed values.
# Consider moving imputation into the modelling pipelines.

# Optional: Missing flag for Insulin (taken from `df`, where missing is still NaN)
df_imputed['Insulin_missing_flag'] = df['Insulin'].isnull().astype(int)


In [None]:

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
import matplotlib.pyplot as plt

# Target vector and feature matrix (every column except the target)
y = df_imputed['Outcome']
X = df_imputed.drop(columns=['Outcome'])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Helper for specificity
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate) for binary 0/1 labels.

    Specificity is TN / (TN + FP): the fraction of actual negatives (label 0)
    that were predicted as negative.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels, same encoding and length as ``y_true``.

    Returns
    -------
    float
        The specificity, or ``nan`` when ``y_true`` contains no negatives
        (the ratio is undefined in that case).
    """
    # labels=[0, 1] pins the matrix to 2x2 even if only one class occurs in the
    # inputs; without it a single-class input yields a 1x1 matrix and the
    # unpacking below fails with ValueError.
    tn, fp = confusion_matrix(y_true, y_pred, labels=[0, 1])[0]  # row 0 = actual negatives
    negatives = tn + fp
    if negatives == 0:
        return float("nan")
    return tn / negatives


In [None]:

# KNN baseline model: standardise the features, then classify by nearest neighbours
pipe_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Hyper-parameter grid: odd k from 3 to 15, both weighting schemes,
# Manhattan (p=1) and Euclidean (p=2) distance
param_knn = {
    'knn__n_neighbors': list(range(3, 16, 2)),
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],
}

# Shuffled, stratified 5-fold CV with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Select the configuration with the best cross-validated ROC AUC on the training split
gs_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_knn,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
gs_knn.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out test split
knn_best = gs_knn.best_estimator_
y_proba_knn = knn_best.predict_proba(X_test)[:, 1]  # probability of class 1
y_pred_knn = knn_best.predict(X_test)

print("Best KNN params:", gs_knn.best_params_)
print("KNN Test AUC:", roc_auc_score(y_test, y_proba_knn))
print("KNN Classification Report:\n", classification_report(y_test, y_pred_knn))
print("KNN Specificity:", specificity_score(y_test, y_pred_knn))


In [None]:

# Logistic Regression: standardise the features, then fit a regularised linear classifier
pipe_log = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('log', LogisticRegression(solver='liblinear', max_iter=1000)),
])

# Grid over regularisation strength C and penalty type
param_log = {
    'log__C': [0.01, 0.1, 1, 10, 100],
    'log__penalty': ['l1', 'l2'],
}

# Select the configuration with the best cross-validated ROC AUC on the training split
gs_log = GridSearchCV(
    estimator=pipe_log,
    param_grid=param_log,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
gs_log.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out test split
best_log = gs_log.best_estimator_
y_proba_log = best_log.predict_proba(X_test)[:, 1]  # probability of class 1
y_pred_log = best_log.predict(X_test)

print("Best Logistic Regression params:", gs_log.best_params_)
print("Logistic Regression Test AUC:", roc_auc_score(y_test, y_proba_log))
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_log))
print("Logistic Regression Specificity:", specificity_score(y_test, y_pred_log))


In [None]:

# Random Forest: single-step pipeline (no scaler), seeded for repeatable results
pipe_rf = Pipeline(steps=[
    ('rf', RandomForestClassifier(random_state=42)),
])

# Grid over forest size, tree depth and the minimum samples needed to split a node
param_rf = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 5, 8, 12],
    'rf__min_samples_split': [2, 5, 10],
}

# Select the configuration with the best cross-validated ROC AUC on the training split
gs_rf = GridSearchCV(
    estimator=pipe_rf,
    param_grid=param_rf,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
gs_rf.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out test split
best_rf = gs_rf.best_estimator_
y_proba_rf = best_rf.predict_proba(X_test)[:, 1]  # probability of class 1
y_pred_rf = best_rf.predict(X_test)

print("Best RF params:", gs_rf.best_params_)
print("Random Forest Test AUC:", roc_auc_score(y_test, y_proba_rf))
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Random Forest Specificity:", specificity_score(y_test, y_pred_rf))


In [None]:

# ROC Curve comparison: overlay the test-set ROC curve of each fitted model
fig, ax = plt.subplots(figsize=(6, 5))

model_probas = {
    'KNN': y_proba_knn,
    'Logistic': y_proba_log,
    'RF': y_proba_rf,
}
for name, proba in model_probas.items():
    fpr, tpr, _ = roc_curve(y_test, proba)
    auc_value = roc_auc_score(y_test, proba)
    ax.plot(fpr, tpr, label=f"{name} (AUC = {auc_value:.3f})")

# Dashed diagonal = reference line for a classifier with no discriminative power
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate (Recall)")
ax.set_title("ROC Curves")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()
