In [1]:
#pip install xgboost

In [4]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [9]:
# match your subgroups again
sub1 = pd.read_csv("../processed_data/subgroup_1.csv").assign(subgroup = 1) # Complicated Diabetes
sub2 = pd.read_csv("../processed_data/subgroup_2.csv").assign(subgroup = 2) # Cardiac
sub3 = pd.read_csv("../processed_data/subgroup_3.csv").assign(subgroup = 3) # Hepatic and Addiction
sub4 = pd.read_csv("../processed_data/subgroup_4.csv").assign(subgroup = 4) # Uncomplicated Diabetes
sub5 = pd.read_csv("../processed_data/subgroup_5.csv").assign(subgroup = 5) # Young
sub6 = pd.read_csv("../processed_data/subgroup_6.csv").assign(subgroup = 6) # Unknown
sub7 = pd.read_csv("../processed_data/subgroup_7.csv").assign(subgroup = 7) # Cardiopulmonary

In [10]:
subject_ids = [
    26494, 1113, 22202, 6702, 12613, 3011, 11167, 25699, 6873, 3868, 17807,
    25111, 8566, 7354, 16847, 6659, 11952, 145, 17797, 17795, 9467, 453
]

In [11]:
all_subgroups = pd.concat([sub1, sub2, sub3, sub4, sub5, sub6, sub7], ignore_index=True)
sub_df = all_subgroups[~all_subgroups['subject_id'].isin(subject_ids)].drop(['rounded_age'], axis=1)

In [12]:
# 'subgroup' is the target column
X = sub_df.drop(['subject_id', 'subgroup', 'hadm_id'], axis = 1)
y = sub_df['subgroup']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_test

Unnamed: 0,admission_type_encoded,congestive_heart_failure,cardiac_arrhythmias,valvular_disease,pulmonary_circulation,peripheral_vascular,hypertension,paralysis,other_neurological,chronic_pulmonary,...,obesity,weight_loss,fluid_electrolyte,blood_loss_anemia,deficiency_anemias,alcohol_abuse,drug_abuse,psychoses,depression,age
13032,1,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,83
32998,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,48
38027,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,51
11929,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,83
30890,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15666,1,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,54
15253,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,49
40481,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,90
33040,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,43


In [14]:
# train the XGBoost model
y_train_adj = y_train - 1
y_test_adj = y_test - 1

dtrain = xgb.DMatrix(X_train, label=y_train_adj)
dtest = xgb.DMatrix(X_test, label=y_test_adj)


# Specify parameters for the XGBoost model
params = {
    'max_depth': 6,  # the maximum depth of each tree
    'eta': 0.3,  # learning rate
    'objective': 'multi:softprob',  # learning object
    'num_class': 7
}

In [15]:
# Train the model
num_round = 100
bst = xgb.train(params, dtrain, num_round)

In [16]:
# Predictions for the training set
train_preds = bst.predict(dtrain)
train_best_preds = np.asarray([np.argmax(line) for line in train_preds])
train_accuracy = accuracy_score(y_train, train_best_preds + 1)
print("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))

# Predictions for the testing set
test_preds = bst.predict(dtest)
test_best_preds = np.asarray([np.argmax(line) for line in test_preds])
test_accuracy = accuracy_score(y_test, test_best_preds + 1)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

Train Accuracy: 99.29%
Test Accuracy: 96.51%


In [17]:
# probability predictions
probabilities = bst.predict(dtest)

## Tell me the probabilities of being in each group

In [19]:
subjects_features = all_subgroups[all_subgroups['subject_id'].isin(subject_ids)].drop(['rounded_age'], axis=1)

X_subject_features = subjects_features.drop(['subject_id', 'subgroup', 'hadm_id'], axis = 1)

In [20]:
# Step 4: Convert the extracted features into DMatrix format.
dsubjects = xgb.DMatrix(X_subject_features)

# Step 5: Predict the probabilities using the trained model.
subjects_probabilities = bst.predict(dsubjects)

In [22]:
rounded_probabilities_df = pd.DataFrame(subjects_probabilities.round(2), columns=[f'prob_sub{i+1}' for i in range(subjects_probabilities.shape[1])])

combined_df = pd.concat([subjects_features.reset_index(drop=True), rounded_probabilities_df], axis=1) # match the index
combined_df

Unnamed: 0,subject_id,hadm_id,admission_type_encoded,congestive_heart_failure,cardiac_arrhythmias,valvular_disease,pulmonary_circulation,peripheral_vascular,hypertension,paralysis,...,depression,age,subgroup,prob_sub1,prob_sub2,prob_sub3,prob_sub4,prob_sub5,prob_sub6,prob_sub7
0,453,187961,1,1,0,1,0,0,1,0,...,0,80,1,1.00,0.00,0.00,0.00,0.0,0.00,0.00
1,1113,128609,1,1,0,0,0,0,1,0,...,0,46,1,0.74,0.01,0.24,0.00,0.0,0.01,0.00
2,1113,128609,1,1,0,0,0,0,1,0,...,0,46,1,0.74,0.01,0.24,0.00,0.0,0.01,0.00
3,3868,107365,1,0,1,0,0,0,1,0,...,0,65,1,0.94,0.00,0.00,0.06,0.0,0.00,0.00
4,22202,101241,1,1,1,0,0,0,1,0,...,0,66,1,1.00,0.00,0.00,0.00,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,17795,179589,1,1,1,1,0,0,0,0,...,0,59,7,0.00,0.12,0.04,0.00,0.0,0.00,0.84
82,17795,179589,1,1,1,1,0,0,0,0,...,0,59,7,0.00,0.12,0.04,0.00,0.0,0.00,0.84
83,17797,108144,1,1,1,1,1,0,0,0,...,0,57,7,0.00,0.00,0.00,0.00,0.0,0.00,1.00
84,17797,108144,1,1,1,1,1,0,0,0,...,0,57,7,0.00,0.00,0.00,0.00,0.0,0.00,1.00


In [17]:
# save csv
# combined_df.to_csv('../processed_data/subgroup_probabilities.csv', index=False)