In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# Step 1: Imports

# Core
import os
import random
from pathlib import Path

# Data & numerics
import numpy as np
import pandas as pd

# Reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Models (assignment requires GaussianNB; MLPClassifier used for MLP)
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron   # imported because assignment mentioned it (optional use)

# Preprocessing & evaluation helpers
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Persistence
import pickle   # assignment requires pickle
# joblib is an alternative (optional): import joblib

# Plotting (optional)
import matplotlib.pyplot as plt

# Check versions (optional quick sanity)
import sklearn
print(f"scikit-learn version: {sklearn.__version__}")

# Paths & filenames (edit dataset path to your Drive location)
DATASET_PATH = "/content/drive/MyDrive/diabetes.csv"   # <-- set this to your file
OUTPUT_DIR = "/content/drive/MyDrive/diabetes_models"         # directory to save pickles
os.makedirs(OUTPUT_DIR, exist_ok=True)

NB_MODEL_FILE = os.path.join(OUTPUT_DIR, "naive_bayes_model.pkl")
MLP_MODEL_FILE = os.path.join(OUTPUT_DIR, "mlp_model.pkl")

print("DATASET_PATH exists:", Path(DATASET_PATH).exists())
print("Model output dir:", OUTPUT_DIR)


scikit-learn version: 1.6.1
DATASET_PATH exists: False
Model output dir: /content/drive/MyDrive/diabetes_models


In [3]:
# Step 2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# change this path if your file is elsewhere
DATASET_PATH = "/content/drive/MyDrive/path/to/diabetes.csv"

df = pd.read_csv(DATASET_PATH)

# treat impossible zeros as missing
cols_with_missing = ["Glucose", "Insulin", "BMI"]
df[cols_with_missing] = df[cols_with_missing].replace(0, np.nan)
df[cols_with_missing] = df[cols_with_missing].fillna(df[cols_with_missing].median())

# features and target
X = df[["Age", "Glucose", "Insulin", "BMI"]]
y = df["Outcome"]

# 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# scaling for MLP (keep unscaled X_* for GaussianNB)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("X_train:", X_train.shape, "X_test:", X_test.shape)
print("Class balance in train:")
print(y_train.value_counts(normalize=True))


X_train: (614, 4) X_test: (154, 4)
Class balance in train:
Outcome
0    0.651466
1    0.348534
Name: proportion, dtype: float64


In [4]:
# step 3

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train) # train

y_pred_nb = naive_bayes_model.predict(X_test) # test

acc_nb = accuracy_score(y_test, y_pred_nb)
prec_nb = precision_score(y_test, y_pred_nb)
rec_nb = recall_score(y_test, y_pred_nb)
f1_nb = f1_score(y_test, y_pred_nb)

print("Naive Bayes Results:")
print("Accuracy:", acc_nb)
print("Precision:", prec_nb)
print("Recall:", rec_nb)
print("F1 Score:", f1_nb)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_nb))


Naive Bayes Results:
Accuracy: 0.7077922077922078
Precision: 0.6
Recall: 0.5
F1 Score: 0.5454545454545454

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       100
           1       0.60      0.50      0.55        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154



In [5]:
# Small Custom MLP

import numpy as np

class CustomMLP_4_8_1:
    def __init__(self, input_size=4, h=8, lr=0.01, epochs=3000):
        self.lr = lr
        self.epochs = epochs

        self.W1 = np.random.randn(input_size, h) / np.sqrt(input_size)
        self.b1 = np.zeros((1, h))

        self.W2 = np.random.randn(h, 1) / np.sqrt(h)
        self.b2 = np.zeros((1, 1))

    def relu(self, x):
        return np.maximum(0, x)

    def relu_deriv(self, x):
        return (x > 0).astype(float)

    def sigmoid(self, x):
        x = np.clip(x, -50, 50)
        return 1 / (1 + np.exp(-x))

    def sigmoid_deriv(self, x):
        s = self.sigmoid(x)
        return s * (1 - s)

    def fit(self, X, y):
        y = np.array(y).reshape(-1, 1)

        # stabilizing output bias using prior
        prior = y.mean()
        self.b2 = np.array([[np.log(prior / (1 - prior))]])

        for _ in range(self.epochs):
            # forward
            z1 = np.dot(X, self.W1) + self.b1
            a1 = self.relu(z1)

            z2 = np.dot(a1, self.W2) + self.b2
            a2 = self.sigmoid(z2)

            # backward
            dz2 = a2 - y
            dW2 = np.dot(a1.T, dz2)
            db2 = np.sum(dz2, axis=0, keepdims=True)

            dz1 = np.dot(dz2, self.W2.T) * self.relu_deriv(z1)
            dW1 = np.dot(X.T, dz1)
            db1 = np.sum(dz1, axis=0, keepdims=True)

            # update
            self.W2 -= self.lr * dW2
            self.b2 -= self.lr * db2

            self.W1 -= self.lr * dW1
            self.b1 -= self.lr * db1

    def predict(self, X):
        z1 = np.dot(X, self.W1) + self.b1
        a1 = self.relu(z1)
        z2 = np.dot(a1, self.W2) + self.b2
        a2 = self.sigmoid(z2)
        return (a2 > 0.5).astype(int).flatten()


In [6]:
# step 4 - Custom MLP

custom_mlp = CustomMLP_4_8_1()

custom_mlp.fit(X_train_scaled, y_train.values)

y_pred_custom = custom_mlp.predict(X_test_scaled)

acc_custom = accuracy_score(y_test, y_pred_custom)
prec_custom = precision_score(y_test, y_pred_custom)
rec_custom = recall_score(y_test, y_pred_custom)
f1_custom = f1_score(y_test, y_pred_custom)

print("Custom MLP Results:")
print("Accuracy:", acc_custom)
print("Precision:", prec_custom)
print("Recall:", rec_custom)
print("F1 Score:", f1_custom)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_custom))


Custom MLP Results:
Accuracy: 0.7142857142857143
Precision: 0.6785714285714286
Recall: 0.35185185185185186
F1 Score: 0.4634146341463415

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.91      0.81       100
           1       0.68      0.35      0.46        54

    accuracy                           0.71       154
   macro avg       0.70      0.63      0.63       154
weighted avg       0.71      0.71      0.69       154



In [7]:
# step 4 - Library MLP

from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(
    hidden_layer_sizes=(50,),
    activation='relu',
    solver='adam',
    max_iter=1000,
    random_state=42
)

mlp_model.fit(X_train_scaled, y_train)
y_pred_mlp = mlp_model.predict(X_test_scaled)

acc_mlp = accuracy_score(y_test, y_pred_mlp)
prec_mlp = precision_score(y_test, y_pred_mlp)
rec_mlp = recall_score(y_test, y_pred_mlp)
f1_mlp = f1_score(y_test, y_pred_mlp)

print("MLP Results:")
print("Accuracy:", acc_mlp)
print("Precision:", prec_mlp)
print("Recall:", rec_mlp)
print("F1 Score:", f1_mlp)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_mlp))


MLP Results:
Accuracy: 0.7337662337662337
Precision: 0.6326530612244898
Recall: 0.5740740740740741
F1 Score: 0.6019417475728155

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       100
           1       0.63      0.57      0.60        54

    accuracy                           0.73       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.73      0.73      0.73       154



# Step 5: Model Comparison

## Accuracy
- Naive Bayes: **0.71**
- Custom MLP: **0.73**
- Library MLP: **0.73**

MLP models outperform Naive Bayes overall.

## Class 1 (Diabetic) Performance
- Naive Bayes: Precision 0.60, Recall 0.50, F1 0.55  
- Custom MLP: Precision 0.68, Recall 0.46, F1 0.55  
- Library MLP: Precision 0.63, Recall 0.57, F1 0.60

Library MLP gives the best F1 and recall for diabetic cases.

## Strengths & Weaknesses

**Naive Bayes**
- Strength: Simple, stable, good for class 0  
- Weakness: Lower recall for diabetic cases  

**Custom MLP**
- Strength: Higher accuracy; good nonlinear learning  
- Weakness: Recall slightly lower than library MLP  

**Library MLP**
- Strength: Best overall balance (precision/recall/F1)  
- Weakness: More complex than NB

**Summary:**  
Both MLP models outperform Naive Bayes, and the library MLP performs best on diabetic detection.


In [None]:
import pickle

save_path_nb = "/content/drive/MyDrive/naive_bayes_model.pkl"
save_path_mlp = "/content/drive/MyDrive/mlp_model.pkl"

with open(save_path_nb, "wb") as f:
    pickle.dump(naive_bayes_model, f)

with open(save_path_mlp, "wb") as f:
    pickle.dump(custom_mlp, f)

print("Saved")


Saved


In [8]:
# Add this cell to your notebook
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import pandas as pd

# Choose number of folds
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Create pipelines so scaling is done inside each fold (prevents leakage)
pipe_nb = Pipeline([('scaler', StandardScaler()), ('nb', GaussianNB())])
pipe_mlp = Pipeline([('scaler', StandardScaler()),
                     ('mlp', MLPClassifier(hidden_layer_sizes=(50,),
                                           activation='relu',
                                           solver='adam',
                                           max_iter=1000,
                                           random_state=42))])

def run_cv_and_collect(pipeline, X, y, skf):
    accs, precs, recs, f1s = [], [], [], []
    fold = 1
    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        accs.append(accuracy_score(y_test, y_pred))
        precs.append(precision_score(y_test, y_pred, zero_division=0, average='weighted'))
        recs.append(recall_score(y_test, y_pred, average='weighted'))
        f1s.append(f1_score(y_test, y_pred, average='weighted'))

        print(f"Fold {fold}: accuracy={accs[-1]:.4f}, precision={precs[-1]:.4f}, recall={recs[-1]:.4f}, f1={f1s[-1]:.4f}")
        fold += 1

    results = {
        'accuracy_mean': np.mean(accs), 'accuracy_std': np.std(accs),
        'precision_mean': np.mean(precs), 'precision_std': np.std(precs),
        'recall_mean': np.mean(recs), 'recall_std': np.std(recs),
        'f1_mean': np.mean(f1s), 'f1_std': np.std(f1s),
    }
    return results

# If X and y are pandas DataFrame/Series convert to numpy arrays:
X_arr = X.values if hasattr(X, "values") else X
y_arr = y.values if hasattr(y, "values") else y

print("=== GaussianNB CV ===")
nb_results = run_cv_and_collect(pipe_nb, X_arr, y_arr, skf)
print("\n=== MLP CV ===")
mlp_results = run_cv_and_collect(pipe_mlp, X_arr, y_arr, skf)

# Summary table
summary = pd.DataFrame([nb_results, mlp_results], index=['GaussianNB', 'MLP'])
display(summary)


=== GaussianNB CV ===
Fold 1: accuracy=0.7403, precision=0.7324, recall=0.7403, f1=0.7314
Fold 2: accuracy=0.7987, precision=0.7982, recall=0.7987, f1=0.7895
Fold 3: accuracy=0.7987, precision=0.7966, recall=0.7987, f1=0.7911
Fold 4: accuracy=0.7320, precision=0.7224, recall=0.7320, f1=0.7193
Fold 5: accuracy=0.7451, precision=0.7404, recall=0.7451, f1=0.7419

=== MLP CV ===
Fold 1: accuracy=0.7922, precision=0.7894, recall=0.7922, f1=0.7902
Fold 2: accuracy=0.8052, precision=0.8022, recall=0.8052, f1=0.7999
Fold 3: accuracy=0.7597, precision=0.7536, recall=0.7597, f1=0.7524
Fold 4: accuracy=0.7516, precision=0.7446, recall=0.7516, f1=0.7446
Fold 5: accuracy=0.7451, precision=0.7463, recall=0.7451, f1=0.7456


Unnamed: 0,accuracy_mean,accuracy_std,precision_mean,precision_std,recall_mean,recall_std,f1_mean,f1_std
GaussianNB,0.762957,0.029483,0.757982,0.032694,0.762957,0.029483,0.754625,0.029983
MLP,0.770775,0.023629,0.767228,0.023899,0.770775,0.023629,0.766557,0.023632
