In [110]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import pandas as pd

### Load the data

In [111]:
data = pd.read_csv("data/alzheimers_disease_data.csv")
data = data.drop("DoctorInCharge", axis=1) # this attribute is confidential in the data, and thus not useful 
data.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,6.045039,0,0,0.014691,0,0,1,1,0,0


In [112]:
data.isna().sum() # check for missing values

PatientID                    0
Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfu

### Split the data

In [113]:
X, Y = data.drop("Diagnosis", axis=1), data["Diagnosis"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=22, stratify=Y)
X_train.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
1062,5813,74,0,0,2,16.327315,0,11.281357,1.151913,7.17136,...,20.474488,0.921736,0,1,4.217614,1,0,0,0,0
1982,6733,71,0,1,1,15.138279,0,3.570242,0.331743,3.376891,...,16.943652,5.492227,0,1,5.023993,0,0,0,0,1
1390,6141,88,1,0,2,35.751527,0,10.876879,7.052473,9.31438,...,17.270831,4.266609,0,1,9.352727,0,1,0,1,0
243,4994,70,0,0,2,23.185997,0,8.451036,6.184948,9.927077,...,24.199335,4.225134,0,0,5.454608,1,0,0,0,1
76,4827,68,1,0,1,24.487589,0,14.060047,8.112291,9.864426,...,22.640682,5.815445,0,0,3.301422,1,0,0,0,1


### Train the model

In [114]:
classifier = MLPClassifier(max_iter=5000, random_state=12)
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)

In [115]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

           0       0.89      0.61      0.72       348
           1       0.55      0.86      0.67       190

    accuracy                           0.70       538
   macro avg       0.72      0.73      0.70       538
weighted avg       0.77      0.70      0.70       538



In [116]:
data["Ethnicity"].value_counts()

Ethnicity
0    1278
1     454
3     211
2     206
Name: count, dtype: int64

### Compare performance based on ethnicity

In [117]:
# Join X_test, Y_test, Y_pred for analysis
results = X_test.copy()
results["TrueDiagnosis"] = Y_test
results["PredictedDiagnosis"] = Y_pred

In [118]:
# Ethnicity 0
eth0_results = results[results["Ethnicity"] == 0]
print(classification_report(eth0_results["TrueDiagnosis"], eth0_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.90      0.60      0.72       222
           1       0.55      0.88      0.68       124

    accuracy                           0.70       346
   macro avg       0.72      0.74      0.70       346
weighted avg       0.77      0.70      0.70       346



In [119]:
# Ethncity 1
eth1_results = results[results["Ethnicity"] == 1]
print(classification_report(eth1_results["TrueDiagnosis"], eth1_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.89      0.63      0.74        63
           1       0.54      0.84      0.66        32

    accuracy                           0.71        95
   macro avg       0.71      0.74      0.70        95
weighted avg       0.77      0.71      0.71        95



In [120]:
# Ethnicity 2  
eth2_results = results[results["Ethnicity"] == 2]
print(classification_report(eth2_results["TrueDiagnosis"], eth2_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.78      0.61      0.68        23
           1       0.64      0.80      0.71        20

    accuracy                           0.70        43
   macro avg       0.71      0.70      0.70        43
weighted avg       0.71      0.70      0.70        43



In [121]:
# Ethncity 3
eth3_results = results[results["Ethnicity"] == 3]
print(classification_report(eth3_results["TrueDiagnosis"], eth3_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.90      0.65      0.75        40
           1       0.44      0.79      0.56        14

    accuracy                           0.69        54
   macro avg       0.67      0.72      0.66        54
weighted avg       0.78      0.69      0.70        54

