# Data Preprocessing & Understanding

In [None]:
import pandas as pd

In [None]:
# Load the raw dataset from "Dataset.csv" (path is relative to the working directory).
data = pd.read_csv("Dataset.csv")


In [None]:
# (rows, columns) of the raw dataset.
data.shape

In [None]:
# Inferred dtype of every column in the raw dataset.
data.dtypes

In [None]:
# Count of missing values in each column of the raw dataset.
print(data.isna().sum())

# Feature Engineering & Selection

In [None]:
# Columns selected from the raw dataset for this analysis (92 names).
# The order matches the original selection and is relied on when slicing `data`.
non_medical_columns = [
    'NACCADC', 'PACKET', 'FORMVER', 'VISITMO', 'VISITDAY', 'VISITYR',
    'NACCVNUM', 'NACCAVST', 'NACCNVST', 'NACCDAYS', 'NACCFDYS', 'NACCREAS',
    'NACCREFR', 'BIRTHMO', 'BIRTHYR', 'SEX', 'HISPANIC', 'HISPOR',
    'RACE', 'PRIMLANG', 'EDUC', 'MARISTAT', 'NACCLIVS', 'INDEPEND',
    'RESIDENC', 'HANDED', 'INBIRMO', 'INBIRYR', 'INSEX', 'NEWINF',
    'INHISP', 'INHISPOR', 'NACCNINR', 'INRACE', 'INEDUC', 'INRELTO',
    'INKNOWN', 'INLIVWTH', 'INVISITS', 'INCALLS', 'INRELY', 'BILLS',
    'TAXES', 'SHOPPING', 'GAMES', 'STOVE', 'MEALPREP', 'EVENTS',
    'PAYATTN', 'REMDATES', 'TRAVEL', 'NACCAGEB', 'NACCNIHR', 'HEIGHT',
    'WEIGHT', 'NACCBMI', 'NACCAGE', 'NACCDIED', 'NACCAUTP', 'NACCACTV',
    'NACCNOVS', 'NACCNURP', 'NACCFTD', 'NACCMDSS', 'NACCPAFF', 'NACCLBDM',
    'NACCACSF', 'NACCPCSF', 'NACCTCSF', 'NACCMRSA', 'NACCNMRI', 'NACCAPSA',
    'NACCNAPA', 'TELCOV', 'TELMOD', 'NOGDS', 'SATIS', 'DROPACT',
    'EMPTY', 'BORED', 'SPIRITS', 'AFRAID', 'HAPPY', 'HELPLESS',
    'STAYHOME', 'MEMPROB', 'WONDRFUL', 'WRTHLESS', 'ENERGY', 'HOPELESS',
    'BETTER', 'NACCGDS',
]

In [None]:
# Working frame: the selected columns plus the DEMENTED target.
# Take an explicit copy so later column assignments on `lev1` never act on
# (or warn about) a slice of `data`.
lev1 = data[non_medical_columns + ['DEMENTED']].copy()

In [None]:
# (rows, columns) of the working frame after column selection.
lev1.shape

In [None]:
# Persist the selected columns. `index=False` keeps the row index (the default
# positional index from `read_csv`) out of the file, so it does not come back
# as an extra unnamed column when the CSV is reloaded.
lev1.to_csv('cleaned_data.csv', index=False)

In [None]:
# Number of distinct values per column; a missing value counts as one value.
for column_name, distinct_count in lev1.nunique(dropna=False).items():
    print(column_name, distinct_count, 'labels')



In [None]:
# Integer codes for the string values of the PACKET column.
packt_dec = {"I": 1, "F": 2, "IT": 3, "T": 4}

# Values that are already encoded are passed through, so re-running this cell
# is harmless; any other unknown value still raises KeyError, as before.
_packet_codes = set(packt_dec.values())

# `assign` returns a new frame instead of writing into `lev1` in place, which
# avoids pandas' SettingWithCopyWarning when `lev1` is a slice of `data`.
lev1 = lev1.assign(
    PACKET=[g if g in _packet_codes else packt_dec[g] for g in lev1['PACKET']]
)

In [None]:
import seaborn as su
import matplotlib.pyplot as plt

# Correlation heatmap of the working frame.
# `numeric_only=True` restricts the correlation to numeric columns, so the cell
# does not raise if any selected column is still non-numeric.
plt.figure(figsize=(14, 12))  # enlarged so a wide correlation matrix stays legible
su.heatmap(lev1.corr(numeric_only=True), cmap="YlGnBu")
plt.title('Feature correlation matrix')
plt.show()

# Model Development & Training

In [None]:
# Columns removed from the feature matrix before modelling.
# Reasons noted by the author: some repeat information already carried by
# another column, and others describe neuropathology/autopsy data availability.
columns_to_drop = [
    'NACCFDYS', 'NACCAGEB', 'HEIGHT', 'WEIGHT',
    'SATIS', 'DROPACT', 'EMPTY', 'BORED', 'SPIRITS',
    'AFRAID', 'HAPPY', 'HELPLESS', 'STAYHOME', 'MEMPROB',
    'WONDRFUL', 'WRTHLESS', 'ENERGY', 'HOPELESS', 'BETTER',
    'NACCADC', 'NACCAVST', 'NACCNVST',
    'NACCDIED', 'NACCAUTP', 'NACCACTV', 'NACCNOVS', 'NACCNURP',
    'NACCFTD', 'NACCMDSS', 'NACCPAFF', 'NACCLBDM',
    'NACCACSF', 'NACCPCSF', 'NACCTCSF',
    'NACCMRSA', 'NACCNMRI', 'NACCAPSA', 'NACCNAPA',
]


# Feature lists
# Columns that receive median imputation and standard scaling.
# BIRTHYR is listed here because it is selected in `non_medical_columns` and is
# not in `columns_to_drop`; without an entry in a feature list it would reach
# the models raw (no imputation, no scaling) via the transformer's passthrough.
numerical_features = [
    'NACCVNUM', 'NACCDAYS', 'EDUC', 'INBIRMO', 'INBIRYR', 'INEDUC', 'INKNOWN',
    'BILLS', 'TAXES', 'SHOPPING', 'GAMES', 'STOVE', 'MEALPREP', 'EVENTS',
    'PAYATTN', 'REMDATES', 'TRAVEL', 'NACCAGE', 'NACCBMI', 'NACCGDS',
    'BIRTHYR',
]


# Columns that receive most-frequent imputation and one-hot encoding.
categorical_features = [
    'PACKET', 'FORMVER', 'VISITMO', 'VISITDAY', 'VISITYR',
    'NACCREAS', 'NACCREFR', 'BIRTHMO', 'SEX', 'HISPANIC',
    'HISPOR', 'RACE', 'PRIMLANG', 'MARISTAT', 'NACCLIVS',
    'INDEPEND', 'RESIDENC', 'HANDED', 'INSEX', 'NEWINF',
    'INHISP', 'INHISPOR', 'NACCNINR', 'INRACE', 'INRELTO',
    'INLIVWTH', 'INVISITS', 'INCALLS', 'INRELY', 'NACCNIHR',
    'TELCOV', 'TELMOD', 'NOGDS',
]



# Hyperparameter Tuning, Optimization, Model Evaluation & Metrics

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier 
from sklearn.linear_model import LogisticRegression


# Target is the DEMENTED column; features are every remaining column except
# the ones listed in `columns_to_drop`.
y = lev1['DEMENTED']
X = lev1.drop(columns=['DEMENTED'] + columns_to_drop)

# Hold out 20% of the rows for testing. `stratify=y` keeps the class
# proportions of the target the same in the train and test partitions.
# NOTE(review): NACCVNUM looks like a per-participant visit number; if one
# participant contributes several rows, a row-level split can place the same
# participant in both partitions. Confirm, and consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:

# Numeric columns: fill gaps with the column median, then standardise.
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories not seen during fitting are ignored.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

# Route each feature list to its transformer. Columns named in neither list
# are forwarded unchanged because of remainder='passthrough'.
_column_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=_column_routes, remainder='passthrough')


**Full Model Pipeline (Using LightGBM)** 

In [None]:

# LightGBM candidate: shared preprocessing followed by the classifier.
lgbm_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LGBMClassifier(random_state=42)),
    ]
)

# 2 x 2 x 2 = 8 hyperparameter combinations; the `model__` prefix targets the
# pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[100, 200],
    model__learning_rate=[0.05, 0.1],
    model__max_depth=[5, 10],
)

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_search = GridSearchCV(
    estimator=lgbm_pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)



In [None]:
# Run the LightGBM search on the training partition.
print("Starting Grid Search...")
grid_search.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
print(
    "\n--- Grid Search Complete ---",
    f"Best Score (Accuracy): {grid_search.best_score_:.4f}",
    "Best Parameters Found:",
    grid_search.best_params_,
    sep="\n",
)

# Score the refitted best pipeline on the held-out partition.
best_model = grid_search.best_estimator_
final_accuracyLightGBM = best_model.score(X_test, y_test)
print(f"Accuracy on Test Set with Best Model: {final_accuracyLightGBM:.4f}")

**Full Model Pipeline (Using Random Forest)**

In [None]:

# Random Forest candidate on top of the shared preprocessing.
rf_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(random_state=42)),
    ]
)

# 2 x 3 x 3 = 18 hyperparameter combinations.
param_grid_rf = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[10, 20, None],
    model__min_samples_leaf=[1, 2, 4],
)

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_search_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid_rf,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

print("Starting Grid Search for Random Forest...")
grid_search_rf.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
print(
    "\n--- Random Forest Grid Search Complete ---",
    f"Best Score (Accuracy): {grid_search_rf.best_score_:.4f}",
    "Best Parameters Found:",
    grid_search_rf.best_params_,
    sep="\n",
)

# Score the refitted best pipeline on the held-out partition.
best_rf_model = grid_search_rf.best_estimator_
final_rf_accuracyRandomForest = best_rf_model.score(X_test, y_test)
print(f"Accuracy on Test Set with Best Random Forest: {final_rf_accuracyRandomForest:.4f}")

**Full Model Pipeline (Using LogisticRegression)**

In [None]:
# Logistic Regression candidate; the liblinear solver is used so that both the
# l1 and l2 penalties in the grid below are available.
lr_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LogisticRegression(random_state=42, max_iter=1000, solver='liblinear')),
    ]
)

# 2 penalties x 4 regularisation strengths = 8 combinations.
param_grid_lr = dict(
    model__penalty=['l1', 'l2'],
    model__C=[0.01, 0.1, 1.0, 10.0],
)

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_search_lr = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=param_grid_lr,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

print("Starting Grid Search for Logistic Regression...")
grid_search_lr.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
print(
    "\n--- Logistic Regression Grid Search Complete ---",
    f"Best Score (Accuracy): {grid_search_lr.best_score_:.4f}",
    "Best Parameters Found:",
    grid_search_lr.best_params_,
    sep="\n",
)

# Score the refitted best pipeline on the held-out partition.
best_lr_model = grid_search_lr.best_estimator_
final_lr_accuracy = best_lr_model.score(X_test, y_test)
print(f"Accuracy on Test Set with Best Logistic Regression: {final_lr_accuracy:.4f}")

**Full Model Pipeline (Using CatBoostClassifier)**

In [None]:
from catboost import CatBoostClassifier


# CatBoost candidate on top of the shared preprocessing; verbose=0 silences
# the library's per-iteration training log.
cb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', CatBoostClassifier(random_state=42, verbose=0)),
    ]
)

# 2 x 2 x 2 = 8 hyperparameter combinations.
param_grid_cb = dict(
    model__n_estimators=[100, 200],
    model__learning_rate=[0.05, 0.1],
    model__max_depth=[5, 10],
)

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_search_cb = GridSearchCV(
    estimator=cb_pipeline,
    param_grid=param_grid_cb,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

print("Starting Grid Search for CatBoost...")
grid_search_cb.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
print(
    "\n--- CatBoost Grid Search Complete ---",
    f"Best Score (Accuracy): {grid_search_cb.best_score_:.4f}",
    "Best Parameters Found:",
    grid_search_cb.best_params_,
    sep="\n",
)

# Score the refitted best pipeline on the held-out partition.
best_cb_model = grid_search_cb.best_estimator_
final_cb_accuracy = best_cb_model.score(X_test, y_test)
print(f"Accuracy on Test Set with Best CatBoost: {final_cb_accuracy:.4f}")

**Full Model Pipeline (Using XGBClassifier)**

In [None]:
from xgboost import XGBClassifier

# XGBoost candidate on top of the shared preprocessing, evaluated with log-loss
# during training.
xgb_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGBClassifier(random_state=42, eval_metric='logloss')),
    ]
)

# 2 x 2 x 2 = 8 hyperparameter combinations.
param_grid_xgb = dict(
    model__n_estimators=[100, 200],
    model__learning_rate=[0.05, 0.1],
    model__max_depth=[5, 10],
)

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_search_xgb = GridSearchCV(
    estimator=xgb_pipeline,
    param_grid=param_grid_xgb,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

print("Starting Grid Search for XGBoost...")
grid_search_xgb.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
print(
    "\n--- XGBoost Grid Search Complete ---",
    f"Best Score (Accuracy): {grid_search_xgb.best_score_:.4f}",
    "Best Parameters Found:",
    grid_search_xgb.best_params_,
    sep="\n",
)

# Score the refitted best pipeline on the held-out partition.
best_xgb_model = grid_search_xgb.best_estimator_
final_xgb_accuracy = best_xgb_model.score(X_test, y_test)
print(f"Accuracy on Test Set with Best XGBoost: {final_xgb_accuracy:.4f}")

**Neural Network (MLPClassifier) Pipeline**

In [None]:
from sklearn.neural_network import MLPClassifier

# Neural-network candidate on top of the shared preprocessing, trained with
# early stopping enabled and at most 1000 iterations.
mlp_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', MLPClassifier(random_state=42, max_iter=1000, early_stopping=True)),
    ]
)

# 2 architectures x 2 activations x 2 alpha values = 8 combinations.
param_grid_mlp = dict(
    model__hidden_layer_sizes=[(100,), (100, 50)],
    model__activation=['relu', 'tanh'],
    model__alpha=[0.0001, 0.001],
)

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_search_mlp = GridSearchCV(
    estimator=mlp_pipeline,
    param_grid=param_grid_mlp,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

print("Starting Grid Search for MLPClassifier (Neural Network)...")
grid_search_mlp.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
print(
    "\n--- MLPClassifier Grid Search Complete ---",
    f"Best Score (Accuracy): {grid_search_mlp.best_score_:.4f}",
    "Best Parameters Found:",
    grid_search_mlp.best_params_,
    sep="\n",
)

# Score the refitted best pipeline on the held-out partition.
best_mlp_model = grid_search_mlp.best_estimator_
final_mlp_accuracy = best_mlp_model.score(X_test, y_test)
print(f"Accuracy on Test Set with Best MLPClassifier: {final_mlp_accuracy:.4f}")

**StackingClassifier Pipeline (Meta-Model)**

In [None]:
from sklearn.ensemble import StackingClassifier

def _with_preprocessing(classifier):
    """Return a pipeline that applies the shared preprocessor, then `classifier`."""
    return Pipeline(steps=[('preprocessor', preprocessor), ('model', classifier)])


# Base learners for the stack, each with fixed (hard-coded) hyperparameters.
estimator_list = [
    (
        'lgbm',
        _with_preprocessing(
            LGBMClassifier(random_state=42, learning_rate=0.1, max_depth=10, n_estimators=200)
        ),
    ),
    (
        'rf',
        _with_preprocessing(
            RandomForestClassifier(
                random_state=42, max_depth=None, min_samples_leaf=1, n_estimators=200
            )
        ),
    ),
    (
        'lr',
        _with_preprocessing(
            LogisticRegression(
                random_state=42, max_iter=1000, solver='liblinear', C=1.0, penalty='l1'
            )
        ),
    ),
]

# A logistic-regression meta-model combines the base learners' predictions.
stacking_model = StackingClassifier(
    estimators=estimator_list,
    final_estimator=LogisticRegression(random_state=42, max_iter=1000),
    cv=3,
    n_jobs=-1,
)

# Only the meta-model's regularisation strength is tuned.
param_grid_stacking = dict(final_estimator__C=[0.1, 1.0, 10.0])

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_search_stacking = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid_stacking,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

print("Starting Grid Search for StackingClassifier...")
grid_search_stacking.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it.
print(
    "\n--- StackingClassifier Grid Search Complete ---",
    f"Best Score (Accuracy): {grid_search_stacking.best_score_:.4f}",
    "Best Parameters Found:",
    grid_search_stacking.best_params_,
    sep="\n",
)

# Score the refitted best stack on the held-out partition.
best_stacking_model = grid_search_stacking.best_estimator_
final_stacking_accuracy = best_stacking_model.score(X_test, y_test)
print(f"Accuracy on Test Set with Best StackingClassifier: {final_stacking_accuracy:.4f}")

In [None]:
# Recap of the held-out accuracy of each individually tuned model.
# The LightGBM label keeps its trailing space so the printed line is unchanged.
_recap_rows = [
    ("LightGBM ", final_accuracyLightGBM),
    ("Random Forest", final_rf_accuracyRandomForest),
    ("Logistic Regression", final_lr_accuracy),
    ("CatBoost", final_cb_accuracy),
    ("XGBoost", final_xgb_accuracy),
    ("MLPClassifier", final_mlp_accuracy),
]
for _label, _accuracy in _recap_rows:
    print(f"Accuracy on Test Set with Best {_label}: {_accuracy:.4f}")

In [None]:
import pandas as pd
import seaborn as su
import matplotlib.pyplot as plt

# Held-out accuracy of each tuned model, keyed by its display name.
model_scores = {
    'LightGBM': final_accuracyLightGBM,
    'Random Forest': final_rf_accuracyRandomForest,
    'Logistic Regression': final_lr_accuracy,
    'CatBoost': final_cb_accuracy,
    'XGBoost': final_xgb_accuracy,
    'MLP': final_mlp_accuracy,
    # 'Stacking': final_stacking_accuracy
}

# One row per model, best accuracy first, so the bars are drawn in rank order.
scores_df = pd.DataFrame(list(model_scores.items()), columns=['Model', 'Accuracy'])
scores_df = scores_df.sort_values(by='Accuracy', ascending=False)

plt.figure(figsize=(12, 7))

splot = su.barplot(
    x='Model',
    y='Accuracy',
    data=scores_df,
    palette='viridis'
)

plt.title('Model Accuracy Comparison', fontsize=18)
plt.xlabel('Model', fontsize=14)
plt.ylabel('Accuracy Score', fontsize=14)

# Zoom the y-axis around the observed scores rather than a fixed window: a
# hard-coded range clips or hides any bar whose accuracy falls outside it.
# The margin above the best score leaves room for the value labels.
y_margin = 0.02
y_lower = max(0.0, scores_df['Accuracy'].min() - y_margin)
y_upper = scores_df['Accuracy'].max() + y_margin
plt.ylim(y_lower, y_upper)

# Write each bar's accuracy just above its top edge.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.4f'),
                   (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha='center',
                   va='center',
                   xytext=(0, 9),
                   textcoords='offset points',
                   fontsize=12)

plt.tight_layout()
plt.savefig('model_accuracy_comparison.png')

print("Model accuracy comparison plot saved as 'model_accuracy_comparison.png'")

# Explainability & Insights