In [45]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [46]:
df=pd.read_csv('model2_data.csv')

In [47]:
df.columns

Index(['NACCID', 'NACCVNUM', 'DEMENTED', 'NACCMMSE', 'NACCMOCA', 'CDRSUM',
       'CDRGLOB', 'NACCAGE', 'SEX', 'EDUC', 'NACCNE4S'],
      dtype='object')

In [48]:
# Step 1: Handle high-missingness features (thousands)
# NACCMMSE, NACCMOCA, NACCNE4S
high_missing_cols = ['NACCMMSE', 'NACCMOCA', 'NACCNE4S']
for col in high_missing_cols:
    if col in df.columns:
        # Convert sentinel codes to NaN first
        df[col] = df[col].apply(lambda x: np.nan if x < 0 or x in [88, 95, 96, 97, 98, 99] else x)
        # Create missingness indicator
        df[f'{col}_missing'] = df[col].isnull().astype(int)
        # Impute with median
        df[col] = df[col].fillna(df[col].median())

# Step 2: Handle low-missingness features (not in thousands)
# EDUC (~401), CDRSUM (3), CDRGLOB (3)
# For these, we impute directly without adding indicators

# EDUC and CDRSUM (Numeric) -> Median
for col in ['EDUC', 'CDRSUM']:
    if col in df.columns:
        # Still handle potential sentinels for EDUC if they exist
        if col == 'EDUC':
             df[col] = df[col].apply(lambda x: np.nan if x < 0 or x in [95, 96, 97, 98, 99] else x)
        df[col] = df[col].fillna(df[col].median())

# CDRGLOB (Categorical/Ordinal) -> Mode
if 'CDRGLOB' in df.columns:
    df['CDRGLOB'] = df['CDRGLOB'].fillna(df['CDRGLOB'].mode()[0])

# Final check
print(f"Dataset shape: {df.shape}")
df.isnull().sum()

Dataset shape: (55268, 14)


NACCID              0
NACCVNUM            0
DEMENTED            0
NACCMMSE            0
NACCMOCA            0
CDRSUM              0
CDRGLOB             0
NACCAGE             0
SEX                 0
EDUC                0
NACCNE4S            0
NACCMMSE_missing    0
NACCMOCA_missing    0
NACCNE4S_missing    0
dtype: int64

In [49]:
df.columns

Index(['NACCID', 'NACCVNUM', 'DEMENTED', 'NACCMMSE', 'NACCMOCA', 'CDRSUM',
       'CDRGLOB', 'NACCAGE', 'SEX', 'EDUC', 'NACCNE4S', 'NACCMMSE_missing',
       'NACCMOCA_missing', 'NACCNE4S_missing'],
      dtype='object')

In [50]:
FEATURES_SCREENING = [
    'NACCMMSE',
    'NACCMOCA',
    'NACCAGE',
    'SEX',
    'EDUC',
    'NACCNE4S',
    'NACCMMSE_missing',
    'NACCMOCA_missing',
    'NACCNE4S_missing'
]

TARGET = 'DEMENTED'

X = df[FEATURES_SCREENING]
y = df[TARGET]

In [51]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

numeric_features = [
    'NACCMMSE',
    'NACCMOCA',
    'NACCAGE',
    'EDUC',
    'NACCNE4S'
]

binary_features = [
    'SEX',
    'NACCMMSE_missing',
    'NACCMOCA_missing',
    'NACCNE4S_missing'
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('bin', 'passthrough', binary_features)
    ]
)

In [53]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(
    max_iter=3000,
    class_weight='balanced',
    solver='lbfgs'
)

lr_params = {
    'model__C': [0.01, 0.1, 1, 10]
}

In [54]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf_params = {
    'model__n_estimators': [200, 400],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_leaf': [5, 10]
}

In [55]:
from sklearn.ensemble import HistGradientBoostingClassifier

hgb = HistGradientBoostingClassifier(
    random_state=42
)

hgb_params = {
    'model__learning_rate': [0.03, 0.05, 0.1],
    'model__max_depth': [4, 6, 8],
    'model__max_iter': [200, 300]
}

In [56]:
from sklearn.model_selection import RandomizedSearchCV

models = {
    'LogisticRegression': (lr, lr_params),
    'RandomForest': (rf, rf_params),
    'HistGradientBoosting': (hgb, hgb_params)
}

results = {}

In [57]:
from sklearn.metrics import roc_auc_score, classification_report

for name, (model, params) in models.items():
    print(f"\n===== Tuning {name} =====")

    pipe = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', model)
    ])

    search = RandomizedSearchCV(
        pipe,
        param_distributions=params,
        n_iter=10,
        scoring='roc_auc',
        cv=5,
        random_state=42,
        n_jobs=-1,
        verbose=1
    )

    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    y_prob = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)

    results[name] = {
        'best_model': best_model,
        'best_params': search.best_params_,
        'auc': auc
    }

    print(f"{name} Test AUC: {auc:.4f}")
    print("Best Params:", search.best_params_)


===== Tuning LogisticRegression =====
Fitting 5 folds for each of 4 candidates, totalling 20 fits




LogisticRegression Test AUC: 0.9087
Best Params: {'model__C': 0.1}

===== Tuning RandomForest =====
Fitting 5 folds for each of 10 candidates, totalling 50 fits
RandomForest Test AUC: 0.9118
Best Params: {'model__n_estimators': 400, 'model__min_samples_leaf': 10, 'model__max_depth': 10}

===== Tuning HistGradientBoosting =====
Fitting 5 folds for each of 10 candidates, totalling 50 fits
HistGradientBoosting Test AUC: 0.9129
Best Params: {'model__max_iter': 300, 'model__max_depth': 4, 'model__learning_rate': 0.1}


In [58]:
for name, res in results.items():
    print(f"{name}: AUC = {res['auc']:.4f}")

    

LogisticRegression: AUC = 0.9087
RandomForest: AUC = 0.9118
HistGradientBoosting: AUC = 0.9129


In [59]:
best_model_name = max(results, key=lambda x: results[x]['auc'])
best_screening_model = results[best_model_name]['best_model']

print("FINAL SCREENING MODEL:", best_model_name)

FINAL SCREENING MODEL: HistGradientBoosting


In [60]:
import joblib

joblib.dump(
    best_screening_model,
    'screening_dementia_model.pkl'
)

['screening_dementia_model.pkl']