In [1]:
import pandas as pd 

data = pd.read_csv('C:\\Users\\lclai\\Desktop\\data_dementia\\clean\\alzheimer.csv')

In [2]:
data['ALZHEIMER'].value_counts()

ALZHEIMER
0    47630
1      519
Name: count, dtype: int64

In [3]:
#undersampling

import pandas as pd

df_alzheimer = data[data['ALZHEIMER'] == 1]
df_no_alzheimer = data[data['ALZHEIMER'] == 0]

df_no_alzheimer_sampled = pd.DataFrame()

for (age, gender), group in df_alzheimer.groupby(['AGE', 'GENDER']):
 
    matched_controls = df_no_alzheimer[(df_no_alzheimer['AGE'] == age) & (df_no_alzheimer['GENDER'] == gender)]
    
    if len(matched_controls) > len(group):
        matched_controls = matched_controls.sample(n=len(group), random_state=42)
    
    df_no_alzheimer_sampled = pd.concat([df_no_alzheimer_sampled, matched_controls])

df_balanced = pd.concat([df_alzheimer, df_no_alzheimer_sampled])

df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
# outliers from numerical values

def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

df_balanced = remove_outliers_iqr(df_balanced, 'AGE')
df_balanced = remove_outliers_iqr(df_balanced, 'LAST_ADMISSION_LENGTH')

In [5]:
df_balanced.columns

Index(['AGE', 'GENDER', 'ALZHEIMER', 'HYPERTENSION',
       'CORONARY_ATHEROSCLEROSIS', 'ATRIAL_FIBRILLATION', 'HEART_FAILURE',
       'KIDNEY_FAILURE', 'HYPERLIPIDEMIA', 'DIABETES', 'RESPIRATORY_FAILURE',
       'UTI', 'PROPHYLAXIS_VIRAL_HEPATITIS', 'OBSERVATION_INFECTIOUS',
       'HYPERCHOLESTEROLEMIA', 'ESOPHAGEAL_REFLUX', 'ANEMIA', 'PNEUMONIA',
       'POSTHEMORRHAGIC_ANEMIA', 'ACIDOSIS', 'LAST_ADMISSION_LENGTH',
       'ETHNICITY', 'MARITAL_STATUS', 'POTASSIUM_CHLORIDE',
       'SODIUM_CHLORIDE_FLUSH', 'ACETAMINOPHEN', 'MAGNESIUM_SULFATE',
       'INSULIN', 'HEPARIN', 'DOCUSATE_SODIUM', 'D5W', 'ISO_OSMOTIC_DEXTROSE',
       'MORPHINE_SULFATE', 'NS', 'SW', 'FUROSEMIDE', 'CALCIUM_GLUCONATE',
       'BISACODYL'],
      dtype='object')

In [8]:
binary = ['GENDER', 'ALZHEIMER', 'HYPERTENSION',
       'CORONARY_ATHEROSCLEROSIS', 'ATRIAL_FIBRILLATION', 'HEART_FAILURE',
       'KIDNEY_FAILURE', 'HYPERLIPIDEMIA', 'DIABETES', 'RESPIRATORY_FAILURE',
       'UTI', 'PROPHYLAXIS_VIRAL_HEPATITIS', 'OBSERVATION_INFECTIOUS',
       'HYPERCHOLESTEROLEMIA', 'ESOPHAGEAL_REFLUX', 'ANEMIA', 'PNEUMONIA',
       'POSTHEMORRHAGIC_ANEMIA', 'ACIDOSIS', 'LAST_ADMISSION_LENGTH',
       'ETHNICITY', 'MARITAL_STATUS', 'POTASSIUM_CHLORIDE',
       'SODIUM_CHLORIDE_FLUSH', 'ACETAMINOPHEN', 'MAGNESIUM_SULFATE',
       'INSULIN', 'HEPARIN', 'DOCUSATE_SODIUM', 'D5W', 'ISO_OSMOTIC_DEXTROSE',
       'MORPHINE_SULFATE', 'NS', 'SW', 'FUROSEMIDE', 'CALCIUM_GLUCONATE',
       'BISACODYL']
categorical = ['ETHNICITY','MARITAL_STATUS']
numerical = ['LAST_ADMISSION_LENGTH','AGE']

In [11]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

le = LabelEncoder()
scaler = StandardScaler()

for col in binary + categorical:
    df_balanced[col] = le.fit_transform(df_balanced[col]) 

df_balanced[numerical] = scaler.fit_transform(df_balanced[numerical])
df_balanced

Unnamed: 0,AGE,GENDER,ALZHEIMER,HYPERTENSION,CORONARY_ATHEROSCLEROSIS,ATRIAL_FIBRILLATION,HEART_FAILURE,KIDNEY_FAILURE,HYPERLIPIDEMIA,DIABETES,...,HEPARIN,DOCUSATE_SODIUM,D5W,ISO_OSMOTIC_DEXTROSE,MORPHINE_SULFATE,NS,SW,FUROSEMIDE,CALCIUM_GLUCONATE,BISACODYL
0,0.384259,0,1,1,0,1,0,1,0,1,...,1,1,1,1,1,1,1,0,1,1
1,0.822919,0,0,1,0,1,1,0,0,1,...,1,0,0,1,0,0,1,0,1,0
2,-2.540137,1,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
3,1.261578,0,1,1,0,0,0,0,0,0,...,1,1,0,0,1,0,1,1,1,1
4,0.969138,1,1,0,0,1,1,0,1,0,...,1,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,-1.370379,1,1,1,1,1,1,1,1,0,...,1,1,1,1,1,1,1,1,0,1
1034,-0.785499,1,1,1,0,0,0,0,0,0,...,1,1,0,0,1,0,1,0,1,0
1035,-1.370379,0,1,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,0,0,1
1036,0.091819,0,1,1,0,0,0,0,0,0,...,1,0,1,1,0,1,1,0,0,0


In [12]:
from sklearn.model_selection import train_test_split

y = df_balanced['ALZHEIMER']  
X = df_balanced.drop(columns=['ALZHEIMER']) 

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

logreg = LogisticRegression(random_state=42)

logreg_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'max_iter': [100, 200, 500], 
}

logreg_grid_search = GridSearchCV(logreg, logreg_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
logreg_grid_search.fit(X, y)

print("Best parameters found: ", logreg_grid_search.best_params_)

logreg_auc = cross_val_score(logreg_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"Logistic Regression AUC: {logreg_auc}")

Best parameters found:  {'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}
Logistic Regression AUC: 0.656647838029417


In [24]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

rf_param_grid = {
    'n_estimators': [200, 500, 1000],  
    'max_depth': [None, 10, 20],  
    'min_samples_split': [2, 5], 
    'min_samples_leaf': [1, 2, 4], 
    'max_features': ['sqrt', 'log2'],  
    'bootstrap': [False],  
    'max_samples': [None],  
}

rf_grid_search = GridSearchCV(rf, rf_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
rf_grid_search.fit(X, y)

print("Best parameters found: ", rf_grid_search.best_params_)

rf_auc = cross_val_score(rf_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"Random Forest AUC: {rf_auc}")

Best parameters found:  {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'max_samples': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Random Forest AUC: 0.773683767499557


In [25]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

xgb_param_grid = {
    'n_estimators': [200, 500,1000], 
    'max_depth': [3, 6, 10, 20], 
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],  
    'gamma': [0, 0.1, 0.2], 
}

xgb_grid_search = GridSearchCV(xgb_model, xgb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
xgb_grid_search.fit(X, y)

print("Best parameters found: ", xgb_grid_search.best_params_)

xgb_auc = cross_val_score(xgb_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"XGBoost AUC: {xgb_auc}")


Best parameters found:  {'colsample_bytree': 0.6, 'gamma': 0.1, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 200, 'subsample': 1.0}
XGBoost AUC: 0.743019448874712


In [31]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()

knn_param_grid = {
    'n_neighbors': [ 10, 15,20,25,30],  
    'weights': ['uniform', 'distance'], 
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 
    'leaf_size': [20, 30, 40],
    'p': [1, 2], 
}

knn_grid_search = GridSearchCV(knn_model, knn_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
knn_grid_search.fit(X, y)

print("Best parameters found: ", knn_grid_search.best_params_)

knn_auc = cross_val_score(knn_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"KNN AUC: {knn_auc}")

Best parameters found:  {'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 25, 'p': 1, 'weights': 'distance'}
KNN AUC: 0.7197962076909444


In [32]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()

nb_param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]  
}

nb_grid_search = GridSearchCV(nb_model, nb_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
nb_grid_search.fit(X, y)

print("Best parameters found: ", nb_grid_search.best_params_)

nb_auc = cross_val_score(nb_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"Naive Bayes AUC: {nb_auc}")

Best parameters found:  {'var_smoothing': 1e-06}
Naive Bayes AUC: 0.6322239943292575


In [33]:
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(max_iter=1000, random_state=42)

mlp_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],  
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'], 
    'alpha': [0.0001, 0.001, 0.01],  
    'learning_rate': ['constant', 'invscaling', 'adaptive'], 
    'learning_rate_init': [0.001, 0.01, 0.1],  
}

mlp_grid_search = GridSearchCV(mlp_model, mlp_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
mlp_grid_search.fit(X, y)
print("Best parameters found: ", mlp_grid_search.best_params_)
mlp_auc = cross_val_score(mlp_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()
print(f"Neural Network AUC: {mlp_auc}")


Best parameters found:  {'activation': 'tanh', 'alpha': 0.01, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'solver': 'adam'}
Neural Network AUC: 0.7187947457026405


In [34]:
from sklearn.svm import SVC

svm_model = SVC(random_state=42)

svm_param_grid = {
    'C': [0.1, 1, 10, 100], 
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  
    'degree': [2, 3, 4], 
    'gamma': ['scale', 'auto', 0.1, 1, 10],
    'coef0': [0.0, 0.1, 0.5],  
    'shrinking': [True, False],  
    'probability': [True],
}

svm_grid_search = GridSearchCV(svm_model, svm_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
svm_grid_search.fit(X, y)

print("Best parameters found: ", svm_grid_search.best_params_)

svm_auc = cross_val_score(svm_grid_search.best_estimator_, X, y, cv=cv, scoring='roc_auc').mean()

print(f"SVM AUC: {svm_auc}")


Best parameters found:  {'C': 1, 'coef0': 0.0, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly', 'probability': True, 'shrinking': True}
SVM AUC: 0.7217681197944357
