<a href="https://colab.research.google.com/github/jkpoff/Data-Science-II-Group-Project/blob/main/notebooks/TalhaNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, recall_score, roc_auc_score

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
%pip install nbstripout
os.chdir("/content")

REPO_URL="https://github.com/jkpoff/Data-Science-II-Group-Project.git"
REPO="Data-Science-II-Group-Project"

# If repo path exists update it, otherwise clone fresh
if os.path.exists(REPO):
    print(f"Repo '{REPO}' exists, pulling latest changes..")
    os.chdir(REPO)
    !git reset --hard HEAD   # discard local changes
    !git pull
else:
    print(f"Cloning repo '{REPO}'..")
    !git clone "$REPO_URL" "$REPO"
    os.chdir(REPO)

!nbstripout --install
!git branch -a

In [None]:
# Read the BRFSS 2015 diabetes health-indicator CSV from the cloned repository.
df = pd.read_csv(
    "/content/Data-Science-II-Group-Project/dataset/diabetes_012_health_indicators_BRFSS2015.csv"
)

# Fail fast if the label column is missing from the file.
assert "Diabetes_012" in df.columns, "Diabetes_012 column not found"

# Labels: the Diabetes_012 column as an integer NumPy array.
y = df['Diabetes_012'].astype(int).to_numpy()

# Features: every other column, cast to float32, as a 2-D NumPy array.
X = df.drop(columns=["Diabetes_012"]).astype("float32").to_numpy()

In [None]:
# Sanity check: feature-matrix shape, then label-vector shape, one per line.
print(X.shape, y.shape, sep="\n")

In [None]:
# Stage 1: hold out 30% of the rows, stratified on the label so class
# proportions are preserved; the fixed seed keeps the split reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=4337,
)

# Stage 2: cut the held-out rows in half -> validation and test
# (each roughly 15% of the full dataset), again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=4337,
)

In [None]:
# Learn the scaling statistics from the training rows only, so nothing from
# the validation/test rows leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to all three splits.
X_train, X_val, X_test = map(scaler.transform, (X_train, X_val, X_test))

In [None]:
# Distinct training labels and how many rows carry each of them.
unique, counts = np.unique(y_train, return_counts=True)
num_classes = len(unique)
total = y_train.shape[0]

# weight = total / (num_classes * class_count): the rarer a class is in the
# training split, the larger its weight. Keys are Python ints, values floats.
class_weight = dict(
    zip(unique.tolist(), (total / (num_classes * counts)).tolist())
)
print("Class weights:", class_weight)

In [None]:
def build_model(in_dim, out_classes):
  """Assemble the (uncompiled) MLP classifier.

  The network is an input of width ``in_dim`` followed by three SELU dense
  layers (256, 128, 64 units, ``lecun_normal`` initialisation), each followed
  by ``AlphaDropout(0.1)``, and a softmax output layer.

  Args:
    in_dim: Number of input features per sample.
    out_classes: Number of units in the softmax output layer.

  Returns:
    A ``keras.Sequential`` model; the caller is responsible for compiling it.
  """
  stack = [layers.Input(shape=(in_dim,))]
  # Hidden blocks shrink in width: Dense(selu) -> AlphaDropout, three times.
  for width in (256, 128, 64):
    stack.append(layers.Dense(width, activation="selu", kernel_initializer="lecun_normal"))
    stack.append(layers.AlphaDropout(0.1))
  stack.append(layers.Dense(out_classes, activation="softmax"))
  return keras.Sequential(stack)

In [None]:
print("Unique classes", len(unique))

# Build the network: one input per feature column, three softmax outputs
# (compare against the "Unique classes" count printed above).
model = build_model(X_train.shape[1], 3)

# Configure training. Sparse categorical cross-entropy is used because the
# labels are plain integers (not one-hot vectors).
model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

# Print the layer-by-layer overview of the compiled model.
model.summary()

In [None]:
# Weights-only checkpoints must use the ".weights.h5" suffix on Keras 3
# (a ".ckpt" path raises ValueError there); the same suffix is written as
# HDF5 weights on older Keras, so it works on both.
ckpt_path = "keras_mlp_talha_diabetes_best.weights.h5"
cbs = [
    # Keep only the weights of the epoch with the best validation accuracy.
    callbacks.ModelCheckpoint(
        ckpt_path,
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=True,
    ),
    # Halve the learning rate after 3 epochs without val_loss improvement,
    # never going below 1e-5.
    callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        factor=0.5,
        patience=3,
        min_lr=1e-5,
        verbose=1,
    ),
    # Stop after 6 epochs without val_loss improvement and roll the model
    # back to its best weights.
    callbacks.EarlyStopping(
        monitor="val_loss",
        patience=6,
        restore_best_weights=True,
        verbose=1,
    ),
]

In [None]:
# Training-loop settings: epoch count and mini-batch size.
epochs, batch_size = 50, 1024