In [2]:
# Prerequisite: xgboost must be installed in the kernel's environment, e.g. `%pip install xgboost`

In [3]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
# Load the seven subgroup tables and tag every row with its subgroup number.
#
# The data directory is defined once instead of being repeated in every path.
# It can be overridden with the SUBGROUP_DATA_DIR environment variable; when the
# variable is unset, the original location is used, so existing runs are unaffected.
DATA_DIR = os.environ.get(
    "SUBGROUP_DATA_DIR",
    "C:/Users/jyson/dsc180ab/dsc180b-wi24-quarter2/processed_data",
)


def load_subgroup(number):
    """Read ``subgroup_<number>.csv`` from DATA_DIR and add a ``subgroup`` column equal to ``number``."""
    csv_path = os.path.join(DATA_DIR, f"subgroup_{number}.csv")
    return pd.read_csv(csv_path).assign(subgroup=number)


# match your subgroups again
# NOTE(review): subgroups 4 and 7 carry the same label and subgroup 6 has none -- confirm.
sub1 = load_subgroup(1)  # Complicated Diabetes
sub2 = load_subgroup(2)  # Cardiac
sub3 = load_subgroup(3)  # Hepatic and Addiction
sub4 = load_subgroup(4)  # Cardiopulmonary
sub5 = load_subgroup(5)  # Young
sub6 = load_subgroup(6)  # (no label recorded)
sub7 = load_subgroup(7)  # Cardiopulmonary

In [5]:
# Subjects held out of model fitting: their rows are removed before the
# train/test split and scored separately once the model is trained.
subject_ids = [
    26494, 1113, 22202, 6702, 12613, 3011,
    11167, 25699, 6873, 3868, 17807, 25111,
    8566, 7354, 16847, 6659, 11952, 145,
    17797, 17795, 9467, 453,
]

In [6]:
# Stack the seven subgroup tables into one frame with a fresh 0..n-1 index.
all_subgroups = pd.concat(
    [sub1, sub2, sub3, sub4, sub5, sub6, sub7],
    ignore_index=True,
)

# Modelling frame: every row whose subject is NOT in the held-out list, with the
# identifier columns and 'rounded_age' removed so they are not used as features.
held_out_mask = all_subgroups['subject_id'].isin(subject_ids)
sub_df = all_subgroups.loc[~held_out_mask].drop(
    columns=['subject_id', 'hadm_id', 'rounded_age']
)

In [7]:
# Sizes noted from an earlier run (presumably row counts -- re-check if the data changes):
#   all_subgroups -> 51816
#   sub_df        -> 51730

In [8]:
# Separate the target from the predictors: 'subgroup' is what the model
# learns to predict, every remaining column of sub_df is a feature.
y = sub_df['subgroup']
X = sub_df.drop(columns=['subgroup'])

In [9]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is row-level and not stratified by subgroup -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Prepare the XGBoost inputs.
# Subgroup labels run 1..7, while the multi-class objective is configured with
# num_class=7 and expects class ids starting at 0, so shift every label down by one.
y_train_adj = y_train.sub(1)
y_test_adj = y_test.sub(1)

# Wrap features and shifted labels in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train_adj)
dtest = xgb.DMatrix(data=X_test, label=y_test_adj)

# Booster hyper-parameters.
params = dict(
    max_depth=6,                 # maximum depth of each tree
    eta=0.3,                     # learning rate
    objective='multi:softprob',  # multi-class objective; predict() returns one probability per class
    num_class=7,                 # one class per subgroup
)

In [11]:
# Fit the booster on the training DMatrix.
num_round = 100  # number of boosting rounds
bst = xgb.train(params, dtrain, num_boost_round=num_round)

In [12]:
# Evaluate the trained booster on the training and test splits.
#
# bst.predict() returns one row of class probabilities per sample; argmax over
# axis 1 picks the most probable 0-based class id for every row in a single
# vectorised call (instead of looping over the rows in Python). Adding 1 maps
# the ids back to the original 1-based subgroup labels held in y_train / y_test.

# Predictions for the training set
train_preds = bst.predict(dtrain)  # per-class probability vectors
train_best_preds = train_preds.argmax(axis=1)
train_accuracy = accuracy_score(y_train, train_best_preds + 1)
print("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))

# Predictions for the testing set
test_preds = bst.predict(dtest)
test_best_preds = test_preds.argmax(axis=1)
test_accuracy = accuracy_score(y_test, test_best_preds + 1)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

Train Accuracy: 99.29%
Test Accuracy: 96.51%


In [13]:
# Per-class probability predictions for the test split (one row per sample).
probabilities = bst.predict(data=dtest)

## Probability of belonging to each subgroup for the held-out subjects

In [14]:
# Rows of the held-out subjects, keeping the identifiers and true subgroup
# so they can be shown next to the predictions later.
is_held_out = all_subgroups['subject_id'].isin(subject_ids)
subjects_features = all_subgroups.loc[is_held_out].drop(columns=['rounded_age'])

# Model input for those rows: drop the identifier columns and the target.
X_subject_features = subjects_features.drop(
    columns=['subject_id', 'hadm_id', 'subgroup']
)

In [15]:
# Wrap the held-out subjects' features in a DMatrix (no labels are needed for prediction).
dsubjects = xgb.DMatrix(data=X_subject_features)

# Score them with the trained booster: one row of class probabilities per held-out row.
subjects_probabilities = bst.predict(data=dsubjects)

In [16]:
# Turn the probability matrix into a frame with one column per class,
# named prob_sub1 .. prob_subN, rounded to two decimals.
n_classes = subjects_probabilities.shape[1]
prob_columns = [f'prob_sub{k}' for k in range(1, n_classes + 1)]
rounded_probabilities_df = pd.DataFrame(
    subjects_probabilities.round(2),
    columns=prob_columns,
)

# Place the probabilities next to the held-out rows. The pairing is positional:
# subjects_features gets a fresh 0..n-1 index so that it lines up with
# rounded_probabilities_df, which must have the same number of rows.
combined_df = pd.concat(
    [subjects_features.reset_index(drop=True), rounded_probabilities_df],
    axis='columns',
)
combined_df

Unnamed: 0,subject_id,hadm_id,admission_type_encoded,congestive_heart_failure,cardiac_arrhythmias,valvular_disease,pulmonary_circulation,peripheral_vascular,hypertension,paralysis,...,depression,age,subgroup,prob_sub1,prob_sub2,prob_sub3,prob_sub4,prob_sub5,prob_sub6,prob_sub7
0,453,187961,1,1,0,1,0,0,1,0,...,0,80,1,1.00,0.00,0.00,0.00,0.0,0.00,0.00
1,1113,128609,1,1,0,0,0,0,1,0,...,0,46,1,0.74,0.01,0.24,0.00,0.0,0.01,0.00
2,1113,128609,1,1,0,0,0,0,1,0,...,0,46,1,0.74,0.01,0.24,0.00,0.0,0.01,0.00
3,3868,107365,1,0,1,0,0,0,1,0,...,0,65,1,0.94,0.00,0.00,0.06,0.0,0.00,0.00
4,22202,101241,1,1,1,0,0,0,1,0,...,0,66,1,1.00,0.00,0.00,0.00,0.0,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,17795,179589,1,1,1,1,0,0,0,0,...,0,59,7,0.00,0.12,0.04,0.00,0.0,0.00,0.84
82,17795,179589,1,1,1,1,0,0,0,0,...,0,59,7,0.00,0.12,0.04,0.00,0.0,0.00,0.84
83,17797,108144,1,1,1,1,1,0,0,0,...,0,57,7,0.00,0.00,0.00,0.00,0.0,0.00,1.00
84,17797,108144,1,1,1,1,1,0,0,0,...,0,57,7,0.00,0.00,0.00,0.00,0.0,0.00,1.00


In [18]:
# Save the combined table as CSV (disabled; uncomment the next line to write the file)
# combined_df.to_csv('../processed_data/subgroup_probabilities.csv', index=False)