In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Alzheimer.csv')

In [3]:
df.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


In [4]:
df.shape

(373, 15)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject ID  373 non-null    object 
 1   MRI ID      373 non-null    object 
 2   Group       373 non-null    object 
 3   Visit       373 non-null    int64  
 4   MR Delay    373 non-null    int64  
 5   M/F         373 non-null    object 
 6   Hand        373 non-null    object 
 7   Age         373 non-null    int64  
 8   EDUC        373 non-null    int64  
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    int64  
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(5), int64(5), object(5)
memory usage: 43.8+ KB


In [7]:
(df.isna().sum()/df.shape[0])*100

Unnamed: 0,0
Subject ID,0.0
MRI ID,0.0
Group,0.0
Visit,0.0
MR Delay,0.0
M/F,0.0
Hand,0.0
Age,0.0
EDUC,0.0
SES,5.093834


In [8]:
ses_replace , mmse_replace = df.SES.median(), df.MMSE.median()
ses_replace , mmse_replace

(2.0, 29.0)

In [9]:
df.SES.fillna(ses_replace,inplace=True)
df.MMSE.fillna(mmse_replace,inplace=True)

In [10]:
df.isna().sum()

Unnamed: 0,0
Subject ID,0
MRI ID,0
Group,0
Visit,0
MR Delay,0
M/F,0
Hand,0
Age,0
EDUC,0
SES,0


In [45]:
to_scale = ['Age','EDUC','SES','MMSE','CDR','eTIV','nWBV','ASF']
to_ohe = ['M/F']

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler

preprocessor = ColumnTransformer(
    transformers=[
        ('StandardScaler',StandardScaler(),to_scale),
        ('One-Hot Encoder',OneHotEncoder(),to_ohe)
    ])

In [47]:
processed_df = df[['Group','M/F','Age','EDUC','SES','MMSE','CDR','eTIV','nWBV','ASF']]
demented_df = processed_df[processed_df['Group']=='Demented']
demented_df.EDUC.value_counts()

Unnamed: 0_level_0,count
EDUC,Unnamed: 1_level_1
12,57
16,29
14,12
18,10
15,10
8,7
20,6
13,6
11,4
6,3


In [48]:
processed_df.groupby('Group').count()

Unnamed: 0_level_0,M/F,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Converted,37,37,37,37,37,37,37,37,37
Demented,146,146,146,146,146,146,146,146,146
Nondemented,190,190,190,190,190,190,190,190,190


In [49]:
X = processed_df[['Age','EDUC','SES','MMSE','CDR','eTIV','nWBV','ASF','M/F']]
Y = processed_df[['Group']]

In [50]:
from sklearn.model_selection import train_test_split,GridSearchCV

X_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

In [51]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import RandomizedSearchCV

In [53]:
pipeline_rf = Pipeline([
    ('Preprocessor',preprocessor),
    ('RandomForest',RandomForestClassifier(random_state=42))
])
pipeline_svc = Pipeline([
    ('Preprocessor', preprocessor),
    ('SVM-C', SVC(random_state=42))
])
pipeline_xgb = Pipeline([
    ('Preprocessor', preprocessor),
    ('xgb', XGBClassifier(random_state=42))
])

In [54]:
# Random Forest parameter grid
rf_param_grid = {
    'RandomForest__n_estimators': [100, 200, 500],
    'RandomForest__max_depth': [10, 20, 30, None],
    'RandomForest__min_samples_split': [2, 5, 10],
    'RandomForest__min_samples_leaf': [1, 2, 4],
    'RandomForest__bootstrap': [True, False]
}

# SVM parameter grid
svc_param_grid = {
    'SVM-C__C': [0.1, 1, 10, 100],
    'SVM-C__gamma': [1, 0.1, 0.01, 0.001],
    'SVM-C__kernel': ['rbf', 'linear', 'poly']
}

# XGBoost parameter grid
xgb_param_grid = {
    'xgb__n_estimators': [100, 200, 500],
    'xgb__max_depth': [3, 5, 7],
    'xgb__learning_rate': [0.01, 0.1, 0.2],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.6, 0.8, 1.0]
}

In [60]:
# GridSearchCV for Random Forest
grid_rf = GridSearchCV(estimator=pipeline_rf, param_grid=rf_param_grid,
                       cv=10, verbose=2, n_jobs=-1, scoring='accuracy')

# GridSearchCV for SVM
grid_svc = GridSearchCV(estimator=pipeline_svc, param_grid=svc_param_grid,
                        cv=10, verbose=2, n_jobs=-1, scoring='accuracy')

# GridSearchCV for XGBoost
grid_xgb = GridSearchCV(estimator=pipeline_xgb, param_grid=xgb_param_grid,
                        cv=10, verbose=2, n_jobs=-1, scoring='accuracy')

# Fit the GridSearchCV models on the training data
grid_rf.fit(X_train, y_train)
grid_svc.fit(X_train, y_train)
grid_xgb.fit(X_train, y_train)

Fitting 10 folds for each of 216 candidates, totalling 2160 fits
Fitting 10 folds for each of 48 candidates, totalling 480 fits
Fitting 10 folds for each of 162 candidates, totalling 1620 fits


In [61]:
y_pred_rf = grid_rf.predict(x_test)
y_pred_svc = grid_svc.predict(x_test)
y_pred_xgb = grid_xgb.predict(x_test)

In [62]:
print("Best Random Forest Parameters:", grid_rf.best_params_)
print(f"Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf) * 100:.2f}%")
print("Classification report for RF: ")
print(classification_report(y_test, y_pred_rf))

Best Random Forest Parameters: {'RandomForest__bootstrap': True, 'RandomForest__max_depth': 10, 'RandomForest__min_samples_leaf': 1, 'RandomForest__min_samples_split': 5, 'RandomForest__n_estimators': 200}
Random Forest Accuracy: 85.33%
Classification report for RF: 
              precision    recall  f1-score   support

           0       1.00      0.09      0.17        11
           1       0.89      1.00      0.94        32
           2       0.82      0.97      0.89        32

    accuracy                           0.85        75
   macro avg       0.90      0.69      0.66        75
weighted avg       0.87      0.85      0.80        75



In [63]:
print("Best SVM Parameters:", grid_svc.best_params_)
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svc) * 100:.2f}%")
print("Classification report for SVM: ")
print(classification_report(y_test, y_pred_svc))

Best SVM Parameters: {'SVM-C__C': 100, 'SVM-C__gamma': 1, 'SVM-C__kernel': 'linear'}
SVM Accuracy: 81.33%
Classification report for SVM: 
              precision    recall  f1-score   support

           0       0.29      0.18      0.22        11
           1       0.97      0.88      0.92        32
           2       0.79      0.97      0.87        32

    accuracy                           0.81        75
   macro avg       0.68      0.68      0.67        75
weighted avg       0.79      0.81      0.80        75



In [64]:
print("Best XGBoost Parameters:", grid_xgb.best_params_)
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb) * 100:.2f}%")
print("Classification report for XGB: ")
print(classification_report(y_test, y_pred_xgb))

Best XGBoost Parameters: {'xgb__colsample_bytree': 0.6, 'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 100, 'xgb__subsample': 0.8}
XGBoost Accuracy: 84.00%
Classification report for XGB: 
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        11
           1       0.89      1.00      0.94        32
           2       0.79      0.97      0.87        32

    accuracy                           0.84        75
   macro avg       0.56      0.66      0.60        75
weighted avg       0.72      0.84      0.77        75



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


According to analysis the best model is Random Forest as it has the most accuracy and highest recall

In [65]:
best_model = grid_rf

In [66]:
import pickle

with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

with open('best_model.pkl', 'rb') as file:
    best_model = pickle.load(file)

Demented - 1

In [67]:
new_data_d = pd.DataFrame({'Age': [65],'EDUC': [12],'SES': [2],'MMSE': [29],'CDR': [0.5],'eTIV': [1500],'nWBV': [0.75],'ASF': [1.2],'M/F': ['M']})

Non Demented - 2

In [68]:
new_data_nd = pd.DataFrame({'Age': [88],'EDUC': [14],'SES': [2.0],'MMSE': [30.0],'CDR': [0.0],'eTIV': [2004],'nWBV': [0.681],'ASF': [0.876],'M/F': ['M']})

Converted - 0

In [69]:
new_data_c = pd.DataFrame({'Age': [92],'EDUC': [14],'SES': [1.0],'MMSE': [27.0],'CDR': [0.5],'eTIV': [1423],'nWBV': [0.696],'ASF': [1.234],'M/F': ['F']})

In [70]:
prediction_d = best_model.predict(new_data_d)[0]
prediction_nd = best_model.predict(new_data_nd)[0]
prediction_c = best_model.predict(new_data_c)[0]

In [71]:
print(prediction_d, prediction_nd, prediction_c)

1 2 0
