In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

### Load the data

In [3]:
data = pd.read_csv("../data/alzheimers_disease_data.csv")
data = data.drop("DoctorInCharge", axis=1) # this attribute is confidential in the data, and thus not useful 
data.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,6.045039,0,0,0.014691,0,0,1,1,0,0


### Oversampling by ethnicity

In [4]:
e = data["Ethnicity"]
print('Original dataset shape %s' % Counter(e))

ethnicity_counts = dict(data["Ethnicity"].value_counts())
num_ethnicities = len(ethnicity_counts)
max_count = max(ethnicity_counts.values())

strategy_over = {ethnicity: 10*max_count for ethnicity in range(num_ethnicities)}
over = SMOTE(sampling_strategy=strategy_over)
print(strategy_over, sep='\n')

data_over, e_over=over.fit_resample(data, e)
print('Oversampled dataset shape %s' % Counter(e_over))

Original dataset shape Counter({0: 1278, 1: 454, 3: 211, 2: 206})
{0: np.int64(12780), 1: np.int64(12780), 2: np.int64(12780), 3: np.int64(12780)}
Oversampled dataset shape Counter({0: 12780, 3: 12780, 1: 12780, 2: 12780})


### Split the data

In [5]:
X, y = data_over.drop("Diagnosis", axis=1), data_over["Diagnosis"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22, stratify=y)
X_train.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness
41349,5140,75,0,3,2,38.07269,0,9.178253,5.323416,7.240387,...,15.868205,1.156471,0,0,5.813641,0,0,0,0,1
18988,5393,79,1,1,2,25.489277,0,12.500795,3.289202,5.209685,...,14.35972,7.783848,0,0,7.283212,0,0,0,0,0
27790,6084,72,0,2,2,33.321303,0,2.36186,3.603741,7.702591,...,17.875659,3.580915,0,0,5.347003,0,0,0,0,0
7034,6384,89,1,0,1,28.568585,0,11.822969,3.773971,7.706554,...,20.65254,8.612014,0,0,8.135215,0,0,0,0,0
10810,5265,71,0,0,1,32.343167,0,2.121308,1.418734,3.005132,...,19.599956,4.075893,0,0,5.012721,0,0,0,0,0


### Train the model

In [6]:
classifier = MLPClassifier(max_iter=5000, random_state=12)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [7]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.98      0.93     10964
           1       0.62      0.22      0.33      1816

    accuracy                           0.87     12780
   macro avg       0.75      0.60      0.63     12780
weighted avg       0.85      0.87      0.84     12780



In [8]:
X["Ethnicity"].value_counts()

Ethnicity
0    12780
3    12780
1    12780
2    12780
Name: count, dtype: int64

### Compare performance based on ethnicity

In [9]:
# Join X_test, y_test, y_pred for analysis
results = X_test.copy()
results["TrueDiagnosis"] = y_test
results["PredictedDiagnosis"] = y_pred

In [10]:
# Ethnicity 0
eth0_results = results[results["Ethnicity"] == 0]
print(classification_report(eth0_results["TrueDiagnosis"], eth0_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.87      0.97      0.92      2717
           1       0.60      0.23      0.34       512

    accuracy                           0.85      3229
   macro avg       0.74      0.60      0.63      3229
weighted avg       0.83      0.85      0.83      3229



In [11]:
# Ethncity 1
eth1_results = results[results["Ethnicity"] == 1]
print(classification_report(eth1_results["TrueDiagnosis"], eth1_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2820
           1       0.56      0.27      0.36       365

    accuracy                           0.89      3185
   macro avg       0.74      0.62      0.65      3185
weighted avg       0.87      0.89      0.87      3185



In [12]:
# Ethnicity 2  
eth2_results = results[results["Ethnicity"] == 2]
print(classification_report(eth2_results["TrueDiagnosis"], eth2_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.84      0.97      0.90      2580
           1       0.66      0.21      0.32       612

    accuracy                           0.83      3192
   macro avg       0.75      0.59      0.61      3192
weighted avg       0.80      0.83      0.79      3192



In [13]:
# Ethncity 3
eth3_results = results[results["Ethnicity"] == 3]
print(classification_report(eth3_results["TrueDiagnosis"], eth3_results["PredictedDiagnosis"]))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      2847
           1       0.69      0.18      0.29       327

    accuracy                           0.91      3174
   macro avg       0.80      0.59      0.62      3174
weighted avg       0.89      0.91      0.88      3174

