In [20]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [21]:
df=pd.read_csv('final_drop.csv')

In [22]:
df.columns

Index(['NACCID', 'NACCVNUM', 'DEMENTED', 'NACCMMSE', 'NACCMOCA', 'CDRSUM',
       'CDRGLOB', 'NACCAGE', 'SEX', 'EDUC', 'NACCNE4S'],
      dtype='object')

In [23]:
df.isnull().sum()   

NACCID          0
NACCVNUM        0
DEMENTED        0
NACCMMSE    23875
NACCMOCA    34692
CDRSUM          3
CDRGLOB         3
NACCAGE         0
SEX             0
EDUC          401
NACCNE4S    14322
dtype: int64

In [24]:
# Step 1: Handle high-missingness features (thousands)
# NACCMMSE, NACCMOCA, NACCNE4S
high_missing_cols = ['NACCMMSE', 'NACCMOCA', 'NACCNE4S']
for col in high_missing_cols:
    if col in df.columns:
        # Convert sentinel codes to NaN first
        df[col] = df[col].apply(lambda x: np.nan if x < 0 or x in [88, 95, 96, 97, 98, 99] else x)
        # Create missingness indicator
        df[f'{col}_missing'] = df[col].isnull().astype(int)
        # Impute with median
        df[col] = df[col].fillna(df[col].median())

# Step 2: Handle low-missingness features (not in thousands)
# EDUC (~401), CDRSUM (3), CDRGLOB (3)
# For these, we impute directly without adding indicators

# EDUC and CDRSUM (Numeric) -> Median
for col in ['EDUC', 'CDRSUM']:
    if col in df.columns:
        # Still handle potential sentinels for EDUC if they exist
        if col == 'EDUC':
             df[col] = df[col].apply(lambda x: np.nan if x < 0 or x in [95, 96, 97, 98, 99] else x)
        df[col] = df[col].fillna(df[col].median())

# CDRGLOB (Categorical/Ordinal) -> Mode
if 'CDRGLOB' in df.columns:
    df['CDRGLOB'] = df['CDRGLOB'].fillna(df['CDRGLOB'].mode()[0])

# Final check
print(f"Dataset shape: {df.shape}")
df.isnull().sum()

Dataset shape: (55268, 14)


NACCID              0
NACCVNUM            0
DEMENTED            0
NACCMMSE            0
NACCMOCA            0
CDRSUM              0
CDRGLOB             0
NACCAGE             0
SEX                 0
EDUC                0
NACCNE4S            0
NACCMMSE_missing    0
NACCMOCA_missing    0
NACCNE4S_missing    0
dtype: int64

In [25]:
## defining the X and Y
FEATURES_CLINICAL = [
    'NACCMMSE', 'NACCMOCA',
    'CDRSUM', 'CDRGLOB',
    'NACCAGE', 'SEX', 'EDUC', 'NACCNE4S',
    'NACCMMSE_missing', 'NACCMOCA_missing', 'NACCNE4S_missing'
]

TARGET = 'DEMENTED'

X = df[FEATURES_CLINICAL]
y = df[TARGET]

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

numeric_features = [
    'NACCMMSE', 'NACCMOCA',
    'CDRSUM', 'CDRGLOB',
    'NACCAGE', 'EDUC', 'NACCNE4S'
]

binary_features = [
    'SEX',
    'NACCMMSE_missing', 'NACCMOCA_missing', 'NACCNE4S_missing'
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('bin', 'passthrough', binary_features)
    ]
)

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

In [29]:
models = {
    'LogisticRegression': LogisticRegression(
        max_iter=2000,
        class_weight='balanced',
        random_state=42
    ),

    'RandomForest': RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_leaf=5,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ),

    'HistGradientBoosting': HistGradientBoostingClassifier(
        max_depth=6,
        learning_rate=0.05,
        max_iter=300,
        random_state=42
    )
}

In [30]:
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

results = {}

for name, model in models.items():
    print(f"\n===== Training {name} =====")

    pipe = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', model)
    ])

    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    y_prob = pipe.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_prob)

    results[name] = {
        'model': pipe,
        'auc': auc
    }

    print(f"AUC: {auc:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


===== Training LogisticRegression =====
AUC: 0.9768
Confusion Matrix:
[[7026  539]
 [ 291 3198]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      7565
           1       0.86      0.92      0.89      3489

    accuracy                           0.92     11054
   macro avg       0.91      0.92      0.91     11054
weighted avg       0.93      0.92      0.93     11054


===== Training RandomForest =====
AUC: 0.9759
Confusion Matrix:
[[7029  536]
 [ 297 3192]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      7565
           1       0.86      0.91      0.88      3489

    accuracy                           0.92     11054
   macro avg       0.91      0.92      0.91     11054
weighted avg       0.93      0.92      0.93     11054


===== Training HistGradientBoosting =====
AUC: 0.9771
Confusion Matrix:
[[7224  341]
 [ 436 3053]]
Classifica

In [31]:
for name, res in results.items():
    print(f"{name}: AUC = {res['auc']:.4f}")

LogisticRegression: AUC = 0.9768
RandomForest: AUC = 0.9759
HistGradientBoosting: AUC = 0.9771


In [32]:
best_model_name = max(results, key=lambda x: results[x]['auc'])
best_model = results[best_model_name]['model']

print("Best model:", best_model_name)

Best model: HistGradientBoosting


In [33]:
import joblib

joblib.dump(best_model, 'clinical_dementia_model.pkl')

['clinical_dementia_model.pkl']