In [1]:
import warnings

# Suppress every warning for the rest of the session; this is installed before
# the heavier imports below so anything they emit at import time is hidden too.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

# Location of the input CSV, relative to the notebook's working directory.
DATA_PATH = "preprocessed_hospital_stay_data.csv"

# Full dataset as a DataFrame; later cells slice features and target out of it.
df = pd.read_csv(DATA_PATH)

In [2]:
from sklearn.utils import shuffle

# Number of distinct values in the 'Stay' column, used later as the class count.
# NOTE(review): the target below is taken as the LAST column; presumably that is
# 'Stay' -- confirm the two refer to the same column.
class_num = df['Stay'].nunique()

# Features are every column from position 2 up to (not including) the last one;
# the last column is the target.
X_columns, y_columns = df.columns[2:-1], df.columns[-1]

# Shuffle features and target together (fixed seed) so rows stay aligned.
X, Y = shuffle(
    df[X_columns].to_numpy(),
    df[y_columns].to_numpy(),
    random_state=42,
)
print(X.shape, Y.shape, sep="\n")

(313793, 48)
(313793,)


In [3]:
# Named XGBoost parameter sets; each entry is handed as-is to the training routine.
# NOTE(review): 'n_estimators' is the scikit-learn-wrapper name for the number of
# trees; the native xgb.train API takes the round count via `num_boost_round`
# instead -- confirm whether this key has any effect where it is consumed.
configes = {
    'model_1': dict(
        objective='multi:softmax',   # multi-class classification
        num_class=class_num,         # number of classes in the dataset
        eval_metric='merror',        # multi-class classification error rate
        max_depth=20,                # depth of each tree
        learning_rate=0.01,          # step size of the boosting process
        n_estimators=500,            # number of boosted trees to fit
    ),
}

In [11]:
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score
import tqdm

def minmax_normalization(input_features):
    """Min-max scale ``input_features`` using a scaler fitted on that same data.

    A fresh ``MinMaxScaler`` is created on every call, fitted on
    ``input_features`` and then applied to it, so two separate calls (for
    example on a train split and a test split) each use their own ranges.

    Args:
        input_features: array-like accepted by ``MinMaxScaler``.

    Returns:
        The transformed features as returned by the scaler.
    """
    return MinMaxScaler().fit_transform(input_features)

def compute_multiclass_pr_auc(y_true, y_pred_probs, num_classes):
    """Compute one-vs-rest PR-AUC for every class plus their macro average.

    For each class index ``i`` in ``range(num_classes)`` the true labels are
    binarized (``y_true == i``), a precision-recall curve is built from column
    ``i`` of ``y_pred_probs``, and the area under that curve is taken.

    Args:
        y_true: 1-D array-like of class labels, compared against the integers
            ``0 .. num_classes - 1``.
        y_pred_probs: 2-D array-like of per-class scores, indexed as
            ``[sample, class]``.
        num_classes: number of classes to evaluate.

    Returns:
        Tuple ``(pr_aucs, macro_pr_auc)``: the list of per-class areas and
        their unweighted mean.
    """
    # Imported locally: the notebook's import cell does not bring these two
    # names into scope, so without this the function fails with NameError.
    from sklearn.metrics import auc, precision_recall_curve

    # Accept plain Python lists as well as arrays (needed for `== i` and `[:, i]`).
    y_true = np.asarray(y_true)
    y_pred_probs = np.asarray(y_pred_probs)

    pr_aucs = []
    for i in range(num_classes):
        # Binarize the true labels for the current class (one-vs-rest).
        y_true_bin = (y_true == i).astype(int)

        # Precision-recall curve from this class's score column.
        precision, recall, _ = precision_recall_curve(y_true_bin, y_pred_probs[:, i])

        # Area under the precision-recall curve.
        pr_aucs.append(auc(recall, precision))

    # Macro-average PR-AUC: every class weighs the same.
    macro_pr_auc = np.mean(pr_aucs)
    return pr_aucs, macro_pr_auc

    
def training(params,k,X,y,batch_size):
    """Run stratified k-fold CV with an XGBoost booster trained batch by batch.

    For every fold the training split is min-max scaled, cut into consecutive
    batches of ``batch_size`` rows, and a single booster is trained on the
    first batch and then continued (``xgb_model=``) on each following batch.
    The held-out split is scaled with the scaler fitted on the training split
    and scored.

    Args:
        params: XGBoost parameter dict passed to ``xgb.train``.
        k: number of stratified folds.
        X: 2-D NumPy feature array, indexed by row.
        y: 1-D NumPy label array aligned with ``X``.
            Assumes integer class ids ``0 .. num_class - 1`` -- TODO confirm
            against the preprocessing step.
        batch_size: number of rows per training batch; must be positive.

    Returns:
        ``(models, (accuracy_scores, precision_macro_scores,
        recall_macro_scores, f1_macro_scores))`` -- one booster and one value
        per metric for each fold. The per-fold macro average precision is
        printed but, as before, not returned.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    # Per-fold evaluation metrics.
    accuracy_scores = []
    precision_macro_scores = []
    recall_macro_scores = []
    f1_macro_scores = []
    pr_auc_scores = []

    models = []
    skf = StratifiedKFold(n_splits=k, shuffle=False)

    # Stratify on the `y` argument (previously the notebook-global `Y` was used,
    # which silently ignored the labels actually passed in).
    for index, (train_index, test_index) in enumerate(skf.split(X, y)):
        print('Fold ',str(index+1))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the scaler on the training split only and reuse it for the test
        # split, so both are mapped with the same per-feature ranges.
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Ceiling division: the final, possibly shorter, batch is trained on too.
        num_batches = -(-len(X_train) // batch_size)
        print('number of baches',num_batches)
        model = None

        for batch_start in range(0, len(X_train), batch_size):
            # Slicing past the end is safe, so the last batch is simply shorter.
            batch_end = batch_start + batch_size
            X_batch = X_train[batch_start:batch_end]
            y_batch = y_train[batch_start:batch_end]

            # Convert data to DMatrix format for XGBoost.
            dtrain = xgb.DMatrix(X_batch, label=y_batch)

            # First batch: xgb_model=None starts a new booster.
            # Later batches: boosting continues from the existing booster.
            model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, 'train')],
                              early_stopping_rounds=10, verbose_eval=False, xgb_model=model)

        models.append(model)

        # Booster.predict needs a DMatrix. Raw margins are requested so that
        # both hard labels (argmax) and class scores can be derived from a
        # single prediction pass.
        dtest = xgb.DMatrix(X_test)
        margins = np.asarray(model.predict(dtest, output_margin=True), dtype=float)
        margins = margins.reshape(len(X_test), -1)
        fold_predictions = margins.argmax(axis=1)

        # Numerically stable softmax turns margins into class probabilities.
        shifted = margins - margins.max(axis=1, keepdims=True)
        class_probs = np.exp(shifted)
        class_probs /= class_probs.sum(axis=1, keepdims=True)

        # Calculating evaluation metrics for this fold.
        accuracy_scores.append(accuracy_score(y_test, fold_predictions))
        precision_macro_scores.append(precision_score(y_test, fold_predictions, average='macro'))
        recall_macro_scores.append(recall_score(y_test, fold_predictions, average='macro'))
        f1_macro_scores.append(f1_score(y_test, fold_predictions, average='macro'))

        # average_precision_score does not accept multiclass labels directly;
        # one-hot encode the truth and score it against the probabilities.
        y_test_onehot = np.eye(class_probs.shape[1], dtype=int)[np.asarray(y_test).astype(int)]
        pr_auc_scores.append(average_precision_score(y_test_onehot, class_probs, average='macro'))
        print(pr_auc_scores[-1])

    return models, (accuracy_scores,precision_macro_scores,recall_macro_scores,f1_macro_scores)


In [12]:
# results[fold_count][model_name] -> (list of fitted models, tuple of metric lists)
results = {}
num_folds = [10]   # fold counts to evaluate
batch_size = 300   # rows per incremental training batch

for folds in num_folds:
    per_model = {}
    results[folds] = per_model
    # Train and evaluate every configured model with this fold count.
    for model_name, params in configes.items():
        models, metrics = training(params, folds, X, Y, batch_size)
        per_model[model_name] = (models, metrics)

TypeError: can only concatenate list (not "int") to list

In [None]:
from matplotlib import pyplot as plt

markers = ['o','s','d','x','^','v','p','*']
colors = ['red','green','blue','purple','pink','orange','brown','black']

# One entry per panel of the 2x2 figure:
# (metric display name, index into the metrics tuple, subplot position).
panel_specs = [
  ('Precision Macro', 1, (0, 0)),
  ('Recall Macro', 2, (0, 1)),
  ('F1 Macro', 3, (1, 0)),
  ('Accuracy', 0, (1, 1)),
]
# Order matches the metrics tuple stored in `results`.
summary_labels = ['Accuracy', 'Precision Macro', 'Recall Macro', 'F1 Macro']

for folds in num_folds:
  # ---- Per-fold curves: one panel per metric, one line per model ----
  fig1, axs1 = plt.subplots(2, 2, figsize=(10, 8))

  # Titles and axis labels do not depend on the model, so set them once.
  for metric_name, _, position in panel_specs:
    axs1[position].set_title(metric_name+' results over '+str(folds)+' folds')
    axs1[position].set_xlabel('Folds')
    axs1[position].set_ylabel(metric_name)

  for index,model_name in enumerate(results[folds].keys()):
    # Cycle through the style lists so more than eight models do not fail.
    marker = markers[index % len(markers)]
    color = colors[index % len(colors)]

    print('\n'+model_name+' with configes:')
    # 'repeat' runs show the 'model6' configuration; if that entry is not
    # defined, fall back to the model's own entry instead of raising KeyError.
    config_key = 'model6' if 'repeat' in model_name else model_name
    print(configes.get(config_key, configes.get(model_name)))

    metrics = results[folds][model_name][1]
    for _, metric_index, position in panel_specs:
      values = metrics[metric_index]
      x = np.arange(len(values))
      # The label only matters on the panel that carries the legend below.
      axs1[position].plot(x, values, marker+'-', color=color, label=model_name)

  axs1[1,1].legend(loc='upper center', bbox_to_anchor=(1.3, 0.75), fancybox=True, shadow=True)
  plt.tight_layout()
  plt.show()

  # ---- Summary: mean of each metric across folds, one line per model ----
  plt.figure()
  x = np.arange(len(summary_labels))
  for index,model_name in enumerate(results[folds].keys()):
    avg_metrics = [np.mean(metric) for metric in results[folds][model_name][1]]
    plt.plot(x, avg_metrics, markers[index % len(markers)]+'-',
             color=colors[index % len(colors)], label=model_name)

  # A space was missing between the fold count and the word 'Folds'.
  plt.title('Summary of '+str(folds)+' Folds Results')
  plt.xlabel('Different Metrics')
  plt.ylabel('Average of Folds results')
  # NOTE(review): rotation is in degrees, so 0.45 is visually flat; 45 may have
  # been intended -- confirm before changing the rendered figure.
  plt.xticks(x, summary_labels, rotation=0.45)
  plt.legend()
  plt.show()
