In [1]:
!pip install xgboost -q

In [2]:
!pip install graphviz -q

In [3]:
import pandas as pd

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [14]:
data = pd.read_csv('../data/clinical_training_data.csv')
data.head()

Unnamed: 0,DX.bl,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,APOE4,MMSE,AD_dx_in_5_yrs
0,LMCI,67.5,Male,10,Hisp/Latino,White,0,27,0
1,CN,73.7,Male,16,Not Hisp/Latino,White,0,29,0
2,LMCI,80.4,Female,13,Not Hisp/Latino,White,0,25,0
3,CN,78.5,Female,12,Hisp/Latino,White,0,29,0
4,CN,80.8,Male,18,Not Hisp/Latino,White,1,29,0


In [15]:
X = data.drop(columns=['AD_dx_in_5_yrs'])
y = data['AD_dx_in_5_yrs']

In [16]:
X["DX.bl"] = X["DX.bl"].astype("category")
X["PTGENDER"] = X["PTGENDER"].astype("category")
X["PTETHCAT"] = X["PTETHCAT"].astype("category")
X["PTRACCAT"] = X["PTRACCAT"].astype("category")

In [17]:
# split test, train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
# create model instance
model = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, device="cuda")
# fit model
model.fit(X_train, y_train)
# make predictions
y_pred = model.predict(X_test)

# evaluate predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.74      0.76        66
           1       0.53      0.58      0.55        33

    accuracy                           0.69        99
   macro avg       0.65      0.66      0.66        99
weighted avg       0.69      0.69      0.69        99



In [28]:
features = X.columns.to_list()
importance = list(model.feature_importances_)

In [31]:
feature_importance = dict(zip(features, importance))
feature_importance_df = pd.DataFrame.from_dict(feature_importance, orient='index').reset_index()
feature_importance_df.columns = ['Feature', 'Importance']
feature_importance_df.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
0,DX.bl,0.721209
6,APOE4,0.060033
7,MMSE,0.058399
1,AGE,0.036757
2,PTGENDER,0.034487
3,PTEDUCAT,0.032423
4,PTETHCAT,0.028618
5,PTRACCAT,0.028074


## Remove DX.bl

In [32]:
X = data.drop(columns=['DX.bl','AD_dx_in_5_yrs'])
y = data['AD_dx_in_5_yrs']

In [33]:
X["PTGENDER"] = X["PTGENDER"].astype("category")
X["PTETHCAT"] = X["PTETHCAT"].astype("category")
X["PTRACCAT"] = X["PTRACCAT"].astype("category")

In [34]:
# split test, train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
# create model instance
model = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, device="cuda")
# fit model
model.fit(X_train, y_train)
# make predictions
y_pred = model.predict(X_test)

# evaluate predictions
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.81      0.79        72
           1       0.42      0.37      0.39        27

    accuracy                           0.69        99
   macro avg       0.59      0.59      0.59        99
weighted avg       0.68      0.69      0.68        99



In [35]:
features = X.columns.to_list()
importance = list(model.feature_importances_)

In [36]:
feature_importance = dict(zip(features, importance))
feature_importance_df = pd.DataFrame.from_dict(feature_importance, orient='index').reset_index()
feature_importance_df.columns = ['Feature', 'Importance']
feature_importance_df.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
6,MMSE,0.304788
5,APOE4,0.214837
4,PTRACCAT,0.115015
0,AGE,0.105785
2,PTEDUCAT,0.104718
1,PTGENDER,0.101483
3,PTETHCAT,0.053373
