In [8]:
# imports
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, learning_curve
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (roc_auc_score, average_precision_score, accuracy_score,
                             precision_score, recall_score, ConfusionMatrixDisplay)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [9]:

!pip -q install kagglehub==0.2.5

import os, pandas as pd, kagglehub

# Download the dataset folder
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ankushpanday1/alzheimers-prediction-dataset-global")

print("Path to dataset files:", path)

AttributeError: module 'kagglehub' has no attribute 'dataset_download'

In [10]:
# Load the dataset CSV from the downloaded folder into `df`
csv_name = "alzheimers_prediction_dataset.csv"  # change if needed
csv_path = os.path.join(path, csv_name)
df = pd.read_csv(csv_path)

# Report the frame size, then show the first rows as the cell output
print("Rows, Cols:", df.shape)
df.head(5)

NameError: name 'path' is not defined

In [None]:
# Quick schema check: column names, dtypes, and missing-value counts (A)
column_names = df.columns.tolist()
print("Columns:", column_names)

print("\nDtypes:\n", df.dtypes)

# Per-column NaN counts, largest first; only the 15 worst columns are shown
missing_counts = df.isna().sum().sort_values(ascending=False)
print("\nMissing values (top 15):\n", missing_counts.head(15))

In [None]:
# Create a clean binary target column `alz_dx` from the Alzheimer's Diagnosis column (B)
# Safe to re-run: each step is skipped once it has already been applied, so a second
# execution no longer maps the 0/1 values to NaN and crashes in `astype`.

# The original code expects a typographic apostrophe (’) in the header; the plain ASCII
# spelling is accepted as well so a mismatch cannot silently skip the rename.
target_source_names = ["Alzheimer’s Diagnosis", "Alzheimer's Diagnosis"]

# 1) Rename the diagnosis column (only if `alz_dx` does not exist yet)
if "alz_dx" not in df.columns:
    source_col = next((c for c in target_source_names if c in df.columns), None)
    if source_col is None:
        raise KeyError(
            f"Target column not found; expected one of {target_source_names}, "
            f"got columns: {list(df.columns)}"
        )
    df = df.rename(columns={source_col: "alz_dx"})

# 2) Create binary target: "Yes" -> 1, "No" -> 0 (only while the column is still text)
if not pd.api.types.is_numeric_dtype(df["alz_dx"]):
    alz_dx_mapped = df["alz_dx"].map({"Yes": 1, "No": 0})
    # Any label outside the mapping becomes NaN; report it explicitly instead of
    # letting the int8 cast fail with a generic conversion error.
    unmapped_labels = df.loc[alz_dx_mapped.isna(), "alz_dx"].unique()
    if len(unmapped_labels):
        raise ValueError(
            f"Unexpected target labels (expected 'Yes'/'No'): {list(unmapped_labels)}"
        )
    df["alz_dx"] = alz_dx_mapped.astype("int8")

# Quick check
print(df["alz_dx"].value_counts(dropna=False))

In [None]:
# Split columns into numeric vs. categorical (to plan preprocessing) (C)
# Numeric feature columns: every number-typed column except the target
numeric_index = df.select_dtypes(include=["number"]).columns
num_cols = numeric_index.drop("alz_dx", errors="ignore").tolist()

# Everything that is neither numeric nor the target is treated as categorical
non_categorical = set(num_cols) | {"alz_dx"}
cat_cols = [col for col in df.columns if col not in non_categorical]

# Show the counts and the first ten names of each group
print("Numeric cols:", len(num_cols), num_cols[:10], "...")
print("Categorical cols:", len(cat_cols), cat_cols[:10], "...")

In [None]:
# Stratified 80/20 train/test split (D)
from sklearn.model_selection import train_test_split

# Separate the target from the feature columns
y = df["alz_dx"]
X = df.drop(columns=["alz_dx"])

# Stratify on the target so both splits keep the same class balance; fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)

# Share of positive labels in each split
train_pos_rate = y_train.mean().round(3)
test_pos_rate = y_test.mean().round(3)
print("Pos rate train/test:", train_pos_rate, test_pos_rate)

In [None]:
# Preprocessing transformer (fit ONLY on train) (E)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns are scaled without centering; categorical columns are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_step = ("num", StandardScaler(with_mean=False), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocess = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",  # columns in neither list are discarded
)

# Learn scaling/encoding from TRAIN only, then apply the fitted transformer to TEST
X_train_enc = preprocess.fit_transform(X_train)
X_test_enc = preprocess.transform(X_test)

print("Encoded shapes:", X_train_enc.shape, X_test_enc.shape)

# (Optional) peek at generated feature names
feat_names = preprocess.get_feature_names_out()
n_features = len(feat_names)
print("Total features:", n_features)
feat_names[:15]

In [None]:
# Train a baseline Logistic Regression (F)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix

baseline_params = dict(
    solver="saga",  # handles sparse, many features
    max_iter=2000,
    n_jobs=-1,
    random_state=42,
)
logit = LogisticRegression(**baseline_params)
logit.fit(X_train_enc, y_train)

# Hard labels and the predicted probability of class 1 on the held-out split
y_pred = logit.predict(X_test_enc)
y_proba = logit.predict_proba(X_test_enc)[:, 1]

test_accuracy = round(accuracy_score(y_test, y_pred), 4)
test_auc = round(roc_auc_score(y_test, y_proba), 4)
print("Accuracy:", test_accuracy)
print("ROC-AUC :", test_auc)
print()
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print()
print(classification_report(y_test, y_pred, digits=4))


In [None]:
# Show the 15 features with the largest absolute logistic-regression coefficients (G)
import numpy as np

coefs = logit.coef_.ravel()

# Indices ordered by |coefficient|, largest first
ranked_idx = np.argsort(np.abs(coefs))[::-1]
top_idx = ranked_idx[:15]

for feature, weight in zip(feat_names[top_idx], coefs[top_idx]):
    print(f"{feature:35s}  coef={weight: .4f}")

In [None]:
# Improve Logistic Regression (class_weight + C tuning) (H)
# Goal is to boost recall/AUC with minimal complexity
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report, confusion_matrix

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid = {
    "C": [0.25, 0.5, 1.0, 2.0],
    "penalty": ["l2"],             # stable with many one-hot features
    "class_weight": [None, "balanced"],
    "solver": ["saga"],
    "max_iter": [2000]
}

# Use a dedicated name for the unfitted search template. Rebinding `logit` here would
# replace the fitted baseline from (F) with an unfitted estimator (GridSearchCV fits
# clones, never this object), breaking any later `logit.coef_` / `logit.predict_proba`.
logit_search = LogisticRegression(n_jobs=-1, random_state=42)

# NOTE(review): X_train_enc comes from a preprocessor fitted on the whole training split,
# so the validation part of every CV fold already influenced the scaling/encoding.
# Wrapping preprocess + model in a Pipeline and searching on X_train would remove that.
gs = GridSearchCV(
    logit_search, grid, scoring="roc_auc", cv=cv, n_jobs=-1, refit=True, verbose=0
)
gs.fit(X_train_enc, y_train)

print("Best params:", gs.best_params_)
print("CV AUC:", round(gs.best_score_, 4))

# refit=True: best_estimator_ is the best configuration refitted on all of X_train_enc
best_logit = gs.best_estimator_
y_pred = best_logit.predict(X_test_enc)
y_proba = best_logit.predict_proba(X_test_enc)[:, 1]

print("Test Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("Test ROC-AUC :", round(roc_auc_score(y_test, y_proba), 4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))

In [None]:
# Tune decision threshold (single cell)
# Thresholds are chosen on out-of-fold TRAIN probabilities and only then reported on the
# test split. Picking them on the test set and scoring on that same set (as before) makes
# the reported accuracy/precision/recall optimistically biased.
import numpy as np
from sklearn.metrics import precision_recall_curve, confusion_matrix, classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Model to tune: the grid-searched model if it exists, otherwise the baseline `logit`
try:
    threshold_model = best_logit
except NameError:
    threshold_model = logit

# Out-of-fold probabilities of class 1 on the training split. cross_val_predict fits
# clones per fold, so the already-fitted `threshold_model` itself is left untouched.
threshold_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_scores = cross_val_predict(
    threshold_model, X_train_enc, y_train, cv=threshold_cv, method="predict_proba"
)[:, 1]

prec, rec, thr = precision_recall_curve(y_train, oof_scores)

# Held-out scores: used for the final report only, never for choosing a threshold
y_scores = threshold_model.predict_proba(X_test_enc)[:, 1]

# 1) Maximize F1 (the small epsilon avoids 0/0 when precision and recall are both zero)
f1 = 2 * (prec * rec) / (prec + rec + 1e-12)
best_idx = np.nanargmax(f1)
# prec/rec have one more entry than thr; guard the index that has no threshold
best_thr_f1 = thr[best_idx] if best_idx < len(thr) else 0.5

# 2) Or enforce a minimum precision and maximize recall under that precision (example: 0.70)
min_precision = 0.70
candidates = np.where(prec[:-1] >= min_precision)[0]
if len(candidates):
    best_thr_p70 = thr[candidates[np.argmax(rec[candidates])]]
else:
    # Make the fallback visible: the report label below would otherwise be misleading
    print(f"WARNING: no threshold reaches precision >= {min_precision:.2f}; falling back to 0.5")
    best_thr_p70 = 0.5


def eval_at(threshold, name):
    """Print test-set metrics for labels obtained by cutting `y_scores` at `threshold`.

    Parameters
    ----------
    threshold : float
        Scores greater than or equal to this value are labelled 1, the rest 0.
    name : str
        Heading printed above the metrics.

    Reads the cell-level `y_scores` and `y_test`; returns nothing.
    """
    y_hat = (y_scores >= threshold).astype(int)
    print(f"\n{name} @ threshold={threshold:.3f}")
    print("Accuracy:", round(accuracy_score(y_test, y_hat), 4))
    print("AUC     :", round(roc_auc_score(y_test, y_scores), 4))  # AUC doesn't change w/ threshold
    print("Confusion matrix:\n", confusion_matrix(y_test, y_hat))
    print(classification_report(y_test, y_hat, digits=4))


eval_at(best_thr_f1,  "Best F1")
eval_at(best_thr_p70, f"Max Recall with Precision≥{min_precision:.2f}")
