In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

### Load the data

In [4]:
data = pd.read_csv("../data/alzheimers_disease_data.csv")
data = data.drop("DoctorInCharge", axis=1) # this attribute is confidential in the data, and thus not useful 
data.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,6.045039,0,0,0.014691,0,0,1,1,0,0


### Undersampling by ethnicity

In [5]:
e = data["Ethnicity"]
print('Original dataset shape %s' % Counter(e))

ethnicity_counts = dict(data["Ethnicity"].value_counts())
num_ethnicities = len(ethnicity_counts)
min_count = min(ethnicity_counts.values())

strategy_under = {ethnicity: min_count for ethnicity in range(num_ethnicities)}
under = RandomUnderSampler(sampling_strategy=strategy_under)
print(strategy_under, sep='\n')

data_under, e_under=under.fit_resample(data, e)
print('Undersampled dataset shape %s' % Counter(e_under))

Original dataset shape Counter({0: 1278, 1: 454, 3: 211, 2: 206})
{0: np.int64(206), 1: np.int64(206), 2: np.int64(206), 3: np.int64(206)}
Undersampled dataset shape Counter({0: 206, 1: 206, 2: 206, 3: 206})


### Split the data

In [6]:
X, y = data_under.drop("Diagnosis", axis=1), data_under["Diagnosis"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22, stratify=y)
X_train.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
292,5043,71,1,1,1,32.713562,1,10.195223,8.767569,9.891479,...,5.18331,7.399717,0,0,4.575794,0,0,1,0,1
2065,6816,88,0,1,0,23.749855,0,7.98612,4.191477,1.699245,...,22.738996,8.436638,1,0,7.870278,1,1,0,0,1
2125,6876,70,1,2,2,35.592158,1,17.105628,4.548318,1.264489,...,2.76303,6.558419,1,0,5.918483,0,0,0,0,1
1705,6456,74,0,3,0,31.410381,0,6.142996,2.543784,7.897166,...,28.089451,6.078292,0,0,2.249988,0,0,0,0,0
1290,6041,82,1,1,1,29.224278,0,10.277011,8.534762,1.759313,...,14.987596,1.556912,1,0,5.0319,0,0,0,0,1


### Train the model

In [7]:
classifier = MLPClassifier(max_iter=5000, random_state=12)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [8]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      1.00      0.81       135
           1       1.00      0.13      0.23        71

    accuracy                           0.70       206
   macro avg       0.84      0.56      0.52       206
weighted avg       0.79      0.70      0.61       206



In [None]:
X["Ethnicity"].value_counts()

Ethnicity
0    1278
1     454
3     211
2     206
Name: count, dtype: int64

### Compare performance based on ethnicity

In [10]:
# Join X_test, y_test, y_pred for analysis
results = X_test.copy()
results["TrueDiagnosis"] = y_test
results["PredictedDiagnosis"] = y_pred

In [11]:
# Ethnicity 0
eth0_results = results[results["Ethnicity"] == 0]
print(classification_report(eth0_results["TrueDiagnosis"], eth0_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.73      1.00      0.85        33
           1       0.00      0.00      0.00        12

    accuracy                           0.73        45
   macro avg       0.37      0.50      0.42        45
weighted avg       0.54      0.73      0.62        45



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [12]:
# Ethncity 1
eth1_results = results[results["Ethnicity"] == 1]
print(classification_report(eth1_results["TrueDiagnosis"], eth1_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.69      1.00      0.82        34
           1       0.00      0.00      0.00        15

    accuracy                           0.69        49
   macro avg       0.35      0.50      0.41        49
weighted avg       0.48      0.69      0.57        49



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [13]:
# Ethnicity 2  
eth2_results = results[results["Ethnicity"] == 2]
print(classification_report(eth2_results["TrueDiagnosis"], eth2_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.64      1.00      0.78        32
           1       1.00      0.22      0.36        23

    accuracy                           0.67        55
   macro avg       0.82      0.61      0.57        55
weighted avg       0.79      0.67      0.60        55



In [14]:
# Ethncity 3
eth3_results = results[results["Ethnicity"] == 3]
print(classification_report(eth3_results["TrueDiagnosis"], eth3_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.68      1.00      0.81        36
           1       1.00      0.19      0.32        21

    accuracy                           0.70        57
   macro avg       0.84      0.60      0.56        57
weighted avg       0.80      0.70      0.63        57

