# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
)

# ----------------------------------
# Step 1: Load and inspect dataset
# ----------------------------------
# Read the CSV from the current working directory into a DataFrame.
df = pd.read_csv("diabetes.csv")

# First look at the raw data before any cleaning: size, column dtypes,
# per-column null counts, and descriptive statistics.
column_types = df.dtypes
null_counts = df.isna().sum()
summary_stats = df.describe()

print("Dataset Shape:", df.shape)
print("\nData Types:\n", column_types)
print("\nMissing Values:\n", null_counts)
print("\nStatistical Summary:\n", summary_stats)

# ----------------------------------
# Step 2: Data Cleaning
# ----------------------------------
from scipy.stats import zscore

# Columns where 0 is biologically invalid, i.e. a 0 is really a missing
# measurement that was recorded as zero.
invalid_zero_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Replace 0s with NaN (all columns at once) so they are treated as missing
# values by the imputation step below.
df[invalid_zero_cols] = df[invalid_zero_cols].replace(0, np.nan)

# Check for missing values
print("\nMissing values after replacement:\n", df.isnull().sum())

# Visualize missing values: each highlighted cell is a NaN.
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap="viridis")
plt.title("Missing Values Heatmap")
plt.show()

# Impute each column's NaNs with that column's median.
# NOTE: the medians are computed over the full dataset, i.e. before the
# train/test split performed later in this script, so the test rows
# influence the imputed values.
df.fillna(df.median(), inplace=True)

# Additional cleaning: remove outlier rows using z-scores.
# Only the feature columns are scored. The "Outcome" label is excluded so
# that a rare class can never be discarded as an "outlier".
z_scores = np.abs(zscore(df.drop(columns="Outcome")))

# Keep rows where every feature lies within 3 standard deviations of its
# column mean. .copy() makes df_clean independent of df, so later edits to
# one do not affect (or warn about) the other.
df_clean = df[(z_scores < 3).all(axis=1)].copy()

print("\nOriginal dataset size:", df.shape[0])
print("Cleaned dataset size:", df_clean.shape[0])

# ----------------------------------
# Step 3: Visualizations
# ----------------------------------

# Histogram (with KDE overlay) of patient ages in the cleaned data.
plt.figure(figsize=(10, 4))
sns.histplot(df_clean["Age"], kde=True)
plt.title("Age Distribution")
plt.show()  # render this figure on its own before starting the next one

# Per-age counts of each Outcome value, drawn as grouped bars.
# DataFrame.plot opens a brand-new figure unless it is handed an Axes, so
# the Axes is created explicitly and passed in; calling plt.figure() first
# would only leave an empty extra figure behind.
_, age_ax = plt.subplots(figsize=(20, 6))
pd.crosstab(df_clean.Age, df_clean.Outcome).plot(
    kind="bar", ax=age_ax, color=["yellow", "blue"]
)
plt.title("Disease Frequency for Ages")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

# Pairwise correlation of all columns, annotated with the coefficients.
plt.figure(figsize=(10, 8))
sns.heatmap(df_clean.corr(), annot=True, cmap="coolwarm")
plt.title("Feature Correlation")
plt.show()

# ----------------------------------
# Step 4: Train-Test Split & Scaling
# ----------------------------------

# Features are every column except the label; the label is "Outcome".
X = df_clean.drop("Outcome", axis=1)
y = df_clean["Outcome"]

# Hold out 20% of the rows for testing and train on the remaining 80%.
# (The previous train_size=0.2 did the opposite: it trained on only 20%.)
# stratify=y keeps the class proportions the same in both partitions, and
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features. The scaler is fitted on the training partition only
# and then applied to the test partition, so test statistics never leak
# into the scaling parameters.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# ----------------------------------
# Step 5: Model Setup
# ----------------------------------

# Candidate classifiers, keyed by the display name used in reports/plots.
# Estimators with internal randomness are seeded with the same
# random_state as the train/test split so repeated runs give the same
# accuracies (and therefore the same "best" model).
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}

# name -> test accuracy in percent, filled in by the evaluation loop.
accuracies = {}
# Best estimator seen so far and its accuracy (as a 0-1 fraction).
best_model = None
best_accuracy = 0

# ----------------------------------
# Step 6: Training and Evaluation
# ----------------------------------

# Fit every candidate on the training partition, score it on the test
# partition, report its metrics, and remember the most accurate one.
# NOTE: the "best" model is chosen by accuracy on the test partition.
for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracies[name] = acc * 100  # stored as a percentage

    print(f"\n{name} Accuracy: {acc * 100:.2f}%")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix, labelled with the classes the model was fitted on.
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(
        confusion_matrix=cm,
        display_labels=model.classes_,
    )
    disp.plot(cmap="Blues")
    plt.title(f"{name} - Confusion Matrix")
    plt.show()

    # Only a strictly higher accuracy replaces the current best, so the
    # first model wins ties.
    if acc <= best_accuracy:
        continue
    best_accuracy, best_model = acc, model

# ----------------------------------
# Step 7: Model Comparison
# ----------------------------------

# Bar chart of test accuracy (in percent) per model, in insertion order.
model_names = list(accuracies)
model_scores = [accuracies[label] for label in model_names]

plt.figure(figsize=(10, 6))
sns.barplot(x=model_names, y=model_scores, palette="viridis")
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy (%)")
plt.xticks(rotation=45)  # tilt the model names so they do not overlap
plt.show()

# ----------------------------------
# Step 8: Hyperparameter Tuning (for best model type)
# ----------------------------------

# Search space per estimator type. Every model type used in this script has
# an entry, so tuning is not silently skipped whichever model wins.
param_grids = [
    (
        RandomForestClassifier,
        {
            "n_estimators": [50, 100, 150],
            "max_depth": [3, 5, 10],
            "criterion": ["gini", "entropy"],
        },
    ),
    (
        KNeighborsClassifier,
        {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform", "distance"],
        },
    ),
    (
        SVC,
        {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"],
        },
    ),
    (
        LogisticRegression,
        {
            "C": [0.1, 1, 10],
        },
    ),
    (
        DecisionTreeClassifier,
        {
            "max_depth": [3, 5, 10],
            "criterion": ["gini", "entropy"],
        },
    ),
]

# Pick the grid matching the best model's type; an empty dict means
# "nothing to tune" (also the case when best_model is still None).
param_grid = next(
    (grid_spec for est_type, grid_spec in param_grids if isinstance(best_model, est_type)),
    {},
)

if param_grid:
    # GridSearchCV clones the estimator, so the already-fitted best_model is
    # left untouched. It runs 3-fold CV on the training partition only.
    grid = GridSearchCV(best_model, param_grid, cv=3, n_jobs=-1)
    grid.fit(X_train, y_train)
    print("\nBest Parameters Found:\n", grid.best_params_)
    print(f"Best CV Accuracy: {grid.best_score_ * 100:.2f}%")

    # With the default refit=True, grid.predict uses the best estimator
    # refitted on the whole training partition; report how it does on the
    # held-out test partition.
    tuned_accuracy = accuracy_score(y_test, grid.predict(X_test))
    print(f"Tuned {type(best_model).__name__} Test Accuracy: {tuned_accuracy * 100:.2f}%")
else:
    print("\nNo hyperparameter grid available for the best model; skipping tuning.")

# ----------------------------------
# Step 9: Summary
# ----------------------------------

# Recap: one line per model with its test accuracy, two decimal places.
print("\nFinal Accuracy Scores:")
for model_name in accuracies:
    score = accuracies[model_name]
    print(f"{model_name}: {score:.2f}%")
