<a href="https://colab.research.google.com/github/jamesjulius-02/200356kithekajames-diabetes-model.ipynb/blob/main/200356kithekajames_diabetes_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
import pandas as pd

# Open Colab's file picker; the chosen file is saved under the name Colab reports.
uploaded = files.upload()
if not uploaded:
    # An empty result (e.g. the dialog was dismissed) would otherwise surface as a
    # bare IndexError on the next line; fail with an actionable message instead.
    raise ValueError("No file was uploaded. Re-run this cell and choose the diabetes CSV.")

# Use the name Colab reports: a repeated upload can be saved under a new name
# such as "diabetes (1).csv".
csv_name = next(iter(uploaded))
df = pd.read_csv(csv_name)

print("Loaded file:", csv_name)
df.head()


Saving diabetes.csv to diabetes.csv
Loaded file: diabetes.csv


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Read the uploaded CSV again so `df` holds the file's contents as stored on disk.
df = pd.read_csv(filepath_or_buffer=csv_name)


In [3]:
# 0) Install (Colab only) — run once in a Colab cell if needed
# Uncomment the next line if running in Colab and packages are missing.
# !pip install -q pymc arviz pandas numpy scikit-learn matplotlib

# 1) Imports and reproducibility
import os
import numpy as np
import pandas as pd
import pymc as pm
import arviz as az
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from scipy.special import expit as sigmoid  # logistic function
# One seed shared by NumPy's global RNG, the train/test split and the PyMC sampler below.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # seeds NumPy's global random state

# -----------------------
# 2) LOAD DATA - OPTION A (Colab upload) is tried first; OPTION B (local file) is the fallback
# -----------------------

# OPTION A - Google Colab: upload the CSV interactively
# ---------------------------------------------------------------------
# In Colab this opens a file chooser so you can pick the CSV from your PC.
# Outside Colab the import fails and we fall through to OPTION B with df = None.
df = None
try:
    from google.colab import files
    _in_colab = True
except ImportError:
    # Not running in Colab: this is the expected, silent fall-through case.
    _in_colab = False

if _in_colab:
    print("Detected Colab environment. Use upload dialog to pick your CSV from your PC.")
    try:
        uploaded = files.upload()  # choose your CSV file (e.g. diabetes.csv)
        if not uploaded:
            raise ValueError("no file was selected in the upload dialog")
        csv_name = list(uploaded.keys())[0]
        df = pd.read_csv(csv_name)
        print("Loaded uploaded file:", csv_name)
    except Exception as exc:
        # Keep the original best-effort fallback, but report why the upload path
        # failed instead of hiding it behind OPTION B's "file not found" error.
        df = None
        print(f"Colab upload failed ({type(exc).__name__}: {exc}); falling back to OPTION B.")

# OPTION B - Running locally (Jupyter/Lab/Python script)
# --------------------------------------------------------------------------
# Used only when OPTION A did not produce a DataFrame (df is None).
# The CSV location is taken from the DIABETES_CSV environment variable when it is
# set; otherwise the original Windows Downloads path is used as the default.
if df is None:
    local_path = os.environ.get("DIABETES_CSV", r"C:\Users\SLIMJAY\Downloads\diabetes.csv")
    # isfile (not exists) so that a directory at this path is rejected up front.
    if not os.path.isfile(local_path):
        raise FileNotFoundError(f"File not found at {local_path}. Put the CSV there or use Colab upload.")
    df = pd.read_csv(local_path)
    print("Loaded local file:", local_path)

# Quick look at what was loaded
print("Data shape:", df.shape)
display(df.head())

# -----------------------
# 3) Normalize column names: trim, lower-case, spaces -> underscores
# -----------------------
df.columns = list(map(lambda label: label.strip().lower().replace(" ", "_"), df.columns))
print("Columns:", list(df.columns))

# -----------------------
# 4) Required columns: optional 'sex', mandatory 'outcome'
# -----------------------
# Columns seen in the Pima CSV after normalization:
# pregnancies, glucose, bloodpressure, skinthickness, insulin, bmi,
# diabetespedigreefunction (dpf), age, outcome - and possibly sex.

# A missing 'sex' column is created as a constant 0 so later steps can rely on it.
# NOTE(review): the placeholder is numeric, so it is picked up as a model feature
# even though it carries no information - confirm this is intended.
_has_sex_column = "sex" in df.columns
if not _has_sex_column:
    df["sex"] = 0
    print("Added placeholder 'sex' column (all zeros) because it was missing in your CSV.")

# The target column is mandatory; stop early when it is absent.
if "outcome" not in df.columns:
    raise ValueError("Dataset must contain an 'outcome' column with 0/1 values.")

# -----------------------
# 5) Treat zeros in physiological measurements as missing and impute the median
# -----------------------
_zero_means_missing = ['glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi']
cols_to_fix = [name for name in _zero_means_missing if name in df.columns]
for col in cols_to_fix:
    # Zeros become NaN first so they do not take part in the median.
    # NOTE(review): the median is computed on the full dataset, before the train/test split.
    _with_gaps = df[col].replace(0, np.nan)
    med = _with_gaps.median()
    df[col] = _with_gaps.fillna(med)
    print(f"Column {col}: replaced 0 with NaN and filled with median = {med}")

# -----------------------
# 6) Split the frame into numeric features X and integer target y
# -----------------------
# Every numeric column except 'outcome' is a feature; column order is kept as-is.
X = df.drop(columns=["outcome"]).select_dtypes(include=[np.number])
feature_names = list(X.columns)
n_features = len(feature_names)
y = df["outcome"].astype(int).to_numpy()
print("Using features:", feature_names, "-> n_features =", n_features)

# -----------------------
# 7) Stratified 80/20 train/test split
# -----------------------
_split_parts = train_test_split(
    X.values, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)
X_train, X_test, y_train, y_test = _split_parts

# -----------------------
# 8) Standardize features; the scaler statistics come from the training rows only
# -----------------------
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------
# 9) Bayesian logistic regression in PyMC: priors, likelihood, sampling
# -----------------------
# Sampler configuration gathered in one place so it is easy to tune.
_sampler_settings = dict(
    draws=2000,
    tune=1500,
    chains=4,
    target_accept=0.9,
    random_seed=RANDOM_SEED,
    progressbar=True,
)

with pm.Model() as diabetes_model:
    # Normal(0, 5) priors on the intercept and on one coefficient per feature
    intercept = pm.Normal("intercept", mu=0, sigma=5)
    betas = pm.Normal("betas", mu=0, sigma=5, shape=n_features)

    # Linear predictor for the scaled training rows, mapped to a probability
    logits = intercept + pm.math.dot(X_train_scaled, betas)
    theta = pm.Deterministic("theta", pm.math.sigmoid(logits))

    # Bernoulli likelihood tied to the observed training labels
    likelihood = pm.Bernoulli("likelihood", p=theta, observed=y_train)

    # Draw posterior samples
    trace = pm.sample(**_sampler_settings)

# -----------------------
# 10) Posterior summary table
# -----------------------
print("\nPosterior summary:")
az_summary = az.summary(trace, round_to=2)
display(az_summary)

# -----------------------
# 11) Collapse the chain and draw axes into a single sample axis
# -----------------------
posterior = trace.posterior  # xarray Dataset, indexed by variable name
intercept_samples = np.ravel(posterior["intercept"].values)                 # (n_samples,)
betas_samples = np.reshape(posterior["betas"].values, (-1, n_features))     # (n_samples, n_features)
n_posterior_samples = len(intercept_samples)
print(f"Prepared {n_posterior_samples} posterior samples.")

# -----------------------
# 12) Prediction helpers
# -----------------------
def predict_from_scaled_vector(scaled_vec, sex_override=None):
    """
    Turn one already-scaled feature vector into a diabetes probability and a 0/1 label.

    Parameters
    ----------
    scaled_vec : array-like with n_features values, already standardized.
    sex_override : optional str. When it equals "male" (ignoring case and
        surrounding whitespace) the model is bypassed and a forced positive
        result is returned.

    Returns
    -------
    dict with "probability" (float) and "prediction" (0 or 1); the forced
    result additionally carries a "note" explaining the override.
    """
    # NOTE(review): this hard-coded rule ignores the model for male patients;
    # it is kept because it is an explicit request, not a modelling result.
    forced_positive = isinstance(sex_override, str) and sex_override.strip().lower() == "male"
    if forced_positive:
        return {"probability": 1.0, "prediction": 1, "note": "Overridden: sex == male -> forced diabetes"}

    features = np.asarray(scaled_vec).reshape(n_features,)
    # One probability per posterior draw, then averaged over all draws.
    per_draw_probs = sigmoid(intercept_samples + (betas_samples @ features))
    mean_prob = float(per_draw_probs.mean())
    return {"probability": mean_prob, "prediction": int(mean_prob >= 0.5)}

def predict_from_raw(glucose, bloodpressure, skinthickness, insulin, bmi, dpf, age, sex="unknown", pregnancies=0.0):
    """
    Predict diabetes from raw (unscaled) patient measurements.

    The values are arranged in the order of the module-level `feature_names`,
    standardized with the fitted `scaler`, and passed to
    `predict_from_scaled_vector`.

    Parameters
    ----------
    glucose, bloodpressure, skinthickness, insulin, bmi, dpf, age : numeric
        Raw measurements; `dpf` fills the "diabetespedigreefunction" feature.
    sex : str, default "unknown"
        Encoded as 1 for "male" (ignoring case and surrounding whitespace),
        otherwise 0. It is also forwarded unchanged as `sex_override`.
    pregnancies : numeric, default 0.0
        Value for the "pregnancies" feature. Previously there was no way to
        supply it, so it was always silently treated as 0; the default keeps
        that behaviour for existing callers.

    Returns
    -------
    dict as returned by `predict_from_scaled_vector`.

    Warns
    -----
    UserWarning if `feature_names` contains a feature with no supplied value;
    that feature is filled with 0.0.
    """
    import warnings

    # Raw values keyed by the normalized column names used for training.
    mapping = {
        "pregnancies": pregnancies,
        "glucose": glucose,
        "bloodpressure": bloodpressure,
        "skinthickness": skinthickness,
        "insulin": insulin,
        "bmi": bmi,
        "diabetespedigreefunction": dpf,
        "age": age,
        "sex": 1 if str(sex).strip().lower() == "male" else 0
    }

    # Make the 0.0 fallback visible instead of silently feeding it to the model.
    unmapped = [name for name in feature_names if name not in mapping]
    if unmapped:
        warnings.warn(
            f"No input value for model feature(s) {unmapped}; using 0.0 for them.",
            stacklevel=2,
        )

    # Build the row in training column order, then apply the training-set scaling.
    raw_vec = np.array([mapping.get(name, 0.0) for name in feature_names], dtype=float).reshape(1, -1)
    scaled_vec = scaler.transform(raw_vec)[0]
    return predict_from_scaled_vector(scaled_vec, sex_override=sex)

# -----------------------
# 13) Worked example (set sex to "male" to see the forced-positive override)
# -----------------------
_example_inputs = dict(
    glucose=150, bloodpressure=70, skinthickness=32, insulin=100,
    bmi=30, dpf=0.5, age=35, sex="female",
)
example = predict_from_raw(**_example_inputs)

print("\nExample patient (female):")
print("Probability:", round(example["probability"], 4))
_example_label = "HAS diabetes (1)" if example["prediction"] == 1 else "DOES NOT have diabetes (0)"
print("Prediction:", _example_label)
# The "note" key is only present when the override was applied.
if example.get("note"):
    print("Note:", example["note"])

# -----------------------
# 14) Interactive prediction: read the measurements from the keyboard
# -----------------------
print("\n--- Interactive prediction (type values when prompted) ---")
# Numeric prompts, asked in this order; each answer is converted to float right away.
# No prompt asks for pregnancies; predict_from_raw then uses 0 for that feature.
_numeric_prompts = (
    "Glucose: ",
    "Blood Pressure: ",
    "Skin Thickness: ",
    "Insulin: ",
    "BMI: ",
    "Diabetes Pedigree Function: ",
    "Age: ",
)
try:
    _answers = [float(input(_prompt).strip()) for _prompt in _numeric_prompts]
    glucose_v, bp_v, skin_v, ins_v, bmi_v, dpf_v, age_v = _answers
    sex_v = input("Sex (male/female): ").strip().lower()
except Exception:
    # Any failed conversion discards everything typed so far and uses the example patient.
    print("Invalid input or interrupted. Using the example values instead.")
    glucose_v, bp_v, skin_v, ins_v, bmi_v, dpf_v, age_v, sex_v = 150, 70, 32, 100, 30, 0.5, 35, "female"

result = predict_from_raw(
    glucose=glucose_v,
    bloodpressure=bp_v,
    skinthickness=skin_v,
    insulin=ins_v,
    bmi=bmi_v,
    dpf=dpf_v,
    age=age_v,
    sex=sex_v,
)

print("\n=== Prediction Results ===")
print(f"Probability of Diabetes: {result['probability']:.4f}")
# The "note" key is only present when the override was applied.
if result.get("note"):
    print("Note:", result["note"])
_outcome_label = "HAS diabetes (1)" if result["prediction"] == 1 else "DOES NOT have diabetes (0)"
print("Outcome:", _outcome_label)

# -----------------------
# 15) Held-out accuracy using posterior-mean probabilities
# -----------------------
# Logit for every (posterior draw, test row) pair: shape (n_samples, n_test)
test_logits = intercept_samples.reshape(-1, 1) + np.dot(betas_samples, X_test_scaled.T)
test_probs = sigmoid(test_logits)
# Average over draws, then threshold at 0.5 to get hard labels
test_prob_means = np.mean(test_probs, axis=0)
test_preds = (test_prob_means >= 0.5).astype(int)
test_acc = np.mean(test_preds == y_test)
print(f"\nApprox test accuracy (posterior-mean preds): {test_acc:.3f} on {len(y_test)} samples")


Detected Colab environment. Use upload dialog to pick your CSV from your PC.


Saving diabetes.csv to diabetes (1).csv
Loaded uploaded file: diabetes (1).csv
Data shape: (768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


Columns: ['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi', 'diabetespedigreefunction', 'age', 'outcome']
Added placeholder 'sex' column (all zeros) because it was missing in your CSV.
Column glucose: replaced 0 with NaN and filled with median = 117.0
Column bloodpressure: replaced 0 with NaN and filled with median = 72.0
Column skinthickness: replaced 0 with NaN and filled with median = 29.0
Column insulin: replaced 0 with NaN and filled with median = 125.0
Column bmi: replaced 0 with NaN and filled with median = 32.3
Using features: ['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi', 'diabetespedigreefunction', 'age', 'sex'] -> n_features = 9


Output()


Posterior summary:


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
betas[0],0.39,0.12,0.15,0.62,0.0,0.0,10370.31,6670.70,1.0
betas[1],1.23,0.14,0.97,1.49,0.0,0.0,8946.63,6749.48,1.0
betas[2],-0.05,0.12,-0.28,0.18,0.0,0.0,10429.20,5984.80,1.0
betas[3],0.03,0.13,-0.21,0.29,0.0,0.0,9555.47,6462.50,1.0
betas[4],-0.07,0.12,-0.29,0.16,0.0,0.0,10050.94,6037.36,1.0
...,...,...,...,...,...,...,...,...,...
theta[609],0.26,0.04,0.20,0.34,0.0,0.0,10823.16,6583.26,1.0
theta[610],0.07,0.02,0.04,0.09,0.0,0.0,11440.41,6385.05,1.0
theta[611],0.14,0.03,0.09,0.20,0.0,0.0,11389.62,6700.04,1.0
theta[612],0.22,0.04,0.14,0.31,0.0,0.0,12924.94,7165.08,1.0


Prepared 8000 posterior samples.

Example patient (female):
Probability: 0.4147
Prediction: DOES NOT have diabetes (0)

--- Interactive prediction (type values when prompted) ---
Glucose: 121
Blood Pressure: 75
Skin Thickness: 15
Insulin: 85
BMI: 24
Diabetes Pedigree Function: 0.5
Age: 35
Sex (male/female): male

=== Prediction Results ===
Probability of Diabetes: 1.0000
Note: Overridden: sex == male -> forced diabetes
Outcome: HAS diabetes (1)

Approx test accuracy (posterior-mean preds): 0.695 on 154 samples


In [4]:
# Colab-only: open the posterior summary table (az_summary) as an interactive Google Sheet.
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=az_summary)

https://docs.google.com/spreadsheets/d/1yu91AETyLJoLzWWqZiIaQlr_wlaVVUFCly53lsa7-C0/edit#gid=0
