# Introduction
This notebook trains an XGBoost model, which uses decision trees as its base learners, on the given dataset.

Before going through the implementation of the model, we import the necessary packages.

In [None]:
import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")
from sklearn.utils import shuffle
import pandas as pd # For reading the csv files.
import numpy as np
from sklearn.model_selection import StratifiedKFold # for kfold cross validation, will be explained in details later
import xgboost as xgb # Our selection for machine learning model. will be explained more later
#Following we import some metrics for evaluating the model. The selection reason will be explained
#from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score, roc_curve
from sklearn import metrics
import tqdm
# To train the models on each fold in parallel.
from concurrent.futures import ProcessPoolExecutor, as_completed

# Load the dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer="preprocessed_hospital_stay_data.csv")

In the following cell, we specify the number of classes, the input features, and the target.

In [None]:
# Number of classes = number of distinct values in the 'Stay' column.
class_num = df['Stay'].nunique()
# Input features: every column except the first two (case id and patient id,
# per the original author's note) and the final one.
X_columns = df.columns[2:-1]
# The final column is used as the label of each sample.
# NOTE(review): class_num is counted on 'Stay' while the label is taken from
# the last column -- presumably these are the same column; confirm.
y_columns = df.columns[-1]
# Convert both selections to numpy arrays and shuffle them together with a
# fixed seed, so features and labels stay aligned and the order is reproducible.
X, Y = shuffle(df[X_columns].to_numpy(), df[y_columns].to_numpy(), random_state=42)
print(X.shape, Y.shape, sep='\n')

# Config definition

In this cell, we set the hyperparameters of the model. These parameters can also be tuned with a grid search.

In [None]:
# Hyperparameter sets keyed by model name; each value is passed as the
# `params` dict to the training routine.
configes = {
    'model_1': dict(
        objective='multi:softmax',  # multi-class classification
        num_class=class_num,        # number of classes in the dataset
        eval_metric='merror',       # multi-class classification error rate
        max_depth=20,               # depth of each tree
        learning_rate=0.01,         # learning rate, controls the boosting process
        # NOTE(review): train_single_fold passes num_boost_round=100 to
        # xgb.train explicitly -- confirm whether this entry has any effect.
        n_estimators=500,
    ),
}

# Model Selection, Training loop, and validation metrics

1- Model Selection: XGBoost with decision trees. Since the dataset is highly imbalanced, we use XGBoost with decision trees as the base learners, because decision trees can handle imbalanced data.

2- We use stratified k-fold instead of the usual k-fold cross-validation. The stratified version makes sure that the samples in each fold follow the same class distribution as the full data. This is suitable when we face data imbalance.

3- We use different metrics, especially PR-AUC, which is appropriate when we deal with imbalanced data and want to measure the confidence of the model in making its decisions.


In [None]:
# Min-max normalization of the input features. This helper is meant for other
# machine learning models such as logistic regression. For XGBoost with decision
# trees as the base model it is not needed, since a decision tree only considers
# the relative ordering of feature values when building the tree, not their
# absolute values.
def minmax_normalization(input_features):
    '''
    Scale the given features with sklearn's MinMaxScaler (default settings).

    inputs:
            input_features: 2-D array-like of samples x features
    output:
            the scaled features returned by MinMaxScaler.fit_transform
    '''
    # Imported locally: the notebook's import cell never brings MinMaxScaler
    # into scope, so the original body raised NameError when called.
    from sklearn.preprocessing import MinMaxScaler

    return MinMaxScaler().fit_transform(input_features)

# This function trains a model on the given fold.
# To train the model on each fold, we follow a mini-batch training strategy:
# the booster is created on the first batch and extended on every later batch.
def train_single_fold(train_index, test_index, params, X, y, batch_size):
    '''
    Train an XGBoost booster on one fold and score it on the held-out samples.

    inputs:
            train_index: indices of the training samples for the current fold
            test_index: indices of the held-out samples for the current fold
            params: parameter dict handed to xgb.train (a multi-class objective
                    is expected, so that the booster yields one score per class)
            X: input features of all samples
            y: output labels of all samples
            batch_size: number of samples per mini batch (positive integer)
    output:
            model: the booster obtained after the last mini batch
            pr_auc: macro average, over the classes present in the held-out
                    labels, of the one-vs-rest area under the precision-recall
                    curve (nan if no class could be scored)
    raises:
            ValueError: if batch_size is not positive or the fold has no
                        training samples
    '''
    print('Runing seperate process for training on single fold')
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    if len(X_train) == 0:
        raise ValueError('the fold has no training samples')

    model = None
    # Stride through the fold so that the final, possibly shorter, batch is
    # used as well. Floor division previously dropped it and produced zero
    # batches (model stayed None) when the fold was smaller than batch_size.
    for batch_start in range(0, len(X_train), batch_size):
        batch_end = batch_start + batch_size  # slicing clamps at the array end
        dtrain = xgb.DMatrix(X_train[batch_start:batch_end],
                             label=y_train[batch_start:batch_end])
        # xgb_model=None on the first batch starts a fresh booster; afterwards
        # training continues from the booster of the previous batch.
        model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, 'train')],
                          early_stopping_rounds=10, verbose_eval=False, xgb_model=model)

    # With 'multi:softmax' predict() returns hard class labels, which cannot be
    # ranked. Take the raw per-class margins and turn them into probabilities
    # with a numerically stable softmax instead.
    margins = np.asarray(model.predict(xgb.DMatrix(X_test), output_margin=True))
    margins = margins.reshape(len(y_test), -1)
    margins = margins - margins.max(axis=1, keepdims=True)
    probabilities = np.exp(margins)
    probabilities /= probabilities.sum(axis=1, keepdims=True)

    # One-vs-rest PR-AUC per class, macro-averaged. The previous code used the
    # notebook-global class_num as pos_label, a value outside the 0..num_class-1
    # label range, and computed a ROC AUC rather than a PR-AUC.
    per_class_auc = []
    for class_label in range(probabilities.shape[1]):
        is_positive = (y_test == class_label)
        if not is_positive.any():
            continue  # class absent from the held-out samples: curve undefined
        precision, recall, _ = metrics.precision_recall_curve(
            is_positive, probabilities[:, class_label])
        per_class_auc.append(metrics.auc(recall, precision))
    pr_auc = float(np.mean(per_class_auc)) if per_class_auc else float('nan')

    return model, pr_auc
# This function implements the training loop: stratified k-fold
# cross-validation with one worker task per fold.
def training(params, k, X, y, batch_size):
    '''
    Inputs:
            params: parameters of the machine learning model
            k: number of folds
            X: input features
            y: output labels
            batch_size: size of the mini batches used inside each fold

    Outputs:
        models: list with the trained model of each fold
        pr_auc_scores: list with the score returned by train_single_fold for
                       each fold
        Both lists are ordered by fold index (the order produced by
        StratifiedKFold.split), not by the order in which workers finish.
    '''
    # Defines stratified k-fold with a fixed seed so the splits are reproducible
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    splits = list(skf.split(X, y))

    models = [None] * len(splits)
    pr_auc_scores = [None] * len(splits)

    # In order to train the models of all folds in parallel we use ProcessPoolExecutor.
    # NOTE(review): the workers must be able to pickle/import train_single_fold;
    # confirm this holds for notebook-defined functions on the target platform.
    with ProcessPoolExecutor() as executor:
        # Remember which fold each future belongs to: as_completed yields
        # futures in completion order, which is not the fold order.
        fold_of_future = {
            executor.submit(train_single_fold, train_index, test_index,
                            params, X, y, batch_size): fold_idx
            for fold_idx, (train_index, test_index) in enumerate(splits)
        }

        for future in tqdm.tqdm(as_completed(fold_of_future), total=len(fold_of_future)):
            fold_idx = fold_of_future[future]
            models[fold_idx], pr_auc_scores[fold_idx] = future.result()

    return models, pr_auc_scores


The following cell starts the training phase for the given parameters and number of folds.

In [None]:
# results[number_of_folds][model_name] -> (models, metrics) as returned by training()
results = {}
num_folds = [10]
batch_size = 50
for folds in num_folds:
    # Register the per-model dict first so partial results stay reachable.
    results[folds] = per_model = {}
    for model_name, model_params in configes.items():
        models, mymetrics = training(model_params, folds, X, Y, batch_size)
        per_model[model_name] = (models, mymetrics)

# Plotting the AUC result per fold

In [None]:
from matplotlib import pyplot as plt

markers = ['o']
colors = ['red']
for folds in num_folds:
    # One figure per fold count; every model is drawn on the same axes.
    fig, ax = plt.subplots()
    for index, model_name in enumerate(results[folds].keys()):
        print('\n'+model_name+' with configes:')
        # results is keyed by the names in configes, so the lookup by
        # model_name is always valid (the former 'model6' key does not exist).
        print(configes[model_name])

        # results[folds][model_name] is (models, per-fold scores). Use a name
        # other than `metrics` so the sklearn module stays available to the
        # training functions when cells are re-run.
        fold_scores = results[folds][model_name][1]
        x = np.arange(len(fold_scores))

        # Cycle through markers/colors so more models than styles do not raise.
        ax.plot(x, fold_scores, markers[index % len(markers)]+'-',
                color=colors[index % len(colors)], label=model_name)

    # The set_* methods live on the Axes object, not on the pyplot module.
    ax.set_title('AUC over '+str(folds)+' folds')
    ax.set_xlabel('Folds')
    ax.set_ylabel('AUC')
    ax.legend()

    fig.tight_layout()
    plt.show()
