# Model Creation

## Import Libraries

In [1]:
import os
from operator import itemgetter    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


### Import Data from Pre-Processing
* In this scenario, we are using only the surgeries with the greatest volume count (id_category: 08R). 
* Missing values HAVE been imputed.
* No PCA performed yet, no 1hot encoding. 


In [2]:
######################
#
# Trial with SINGLE Operation Category
#
#####################

df= pd.read_csv('./_data/operations_imputed_CLEAN.csv', index_col=0)
df = df[df['category_id'] == '08R']
df.drop(['race'], axis=1, inplace=True)



In [None]:
df

### Create the X and y DataFrames

  * create y
  * create X (complete with all the features)
  * drop the features we identified as not meeting impact threshold. 



In [3]:
# Label = LOS

features_to_retain = ['age','sex',	'weight',	'height',	'art_mbp',	'art_sbp',	'bt',	'cvp',	'hr',	'pip',	'pmean',	'rr',	'spo2',	'vt',	'alp',	'alt',	'ast',	'chloride',	'creatinine',	'glucose',	'hb',	'hco3',	'lymphocyte',	'platelet',	'potassium',	'sodium',	'total_bilirubin',	'wbc',	'is_outlier',	'prolonged_LOS',	'icu_visit',	'or_duration',	'anesth_duration',	'asa','department','antype'] 

y = df['LOS']
X = df.drop('LOS', axis=1)
# Get a list of column names with data type 'object'

col_to_cast = ['asa','sex','department','antype'] #When restoring scope to full category list, add cat_id here.
# Convert the object data type columns to string
X[col_to_cast] = X[col_to_cast].astype(str)

X= X[features_to_retain]

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7181 entries, 172 to 102253
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              7181 non-null   int64  
 1   sex              7181 non-null   object 
 2   weight           7107 non-null   float64
 3   height           7160 non-null   float64
 4   art_mbp          7036 non-null   float64
 5   art_sbp          7036 non-null   float64
 6   bt               7037 non-null   float64
 7   cvp              7015 non-null   float64
 8   hr               7037 non-null   float64
 9   pip              7037 non-null   float64
 10  pmean            7037 non-null   float64
 11  rr               7037 non-null   float64
 12  spo2             7037 non-null   float64
 13  vt               7037 non-null   float64
 14  alp              7036 non-null   float64
 15  alt              7036 non-null   float64
 16  ast              7036 non-null   float64
 17  chloride       

## Training 

### Split data
- Training Set (80% of total): 
  - Used to train the models.
- Validation Set (20% of Traning Set ): 
  - Used to fine-tune hyperparameters, select models, and monitor training progress.  
- Testing Set (20% of total): 
  - Used to evaluate the final model's performance on unseen data and estimate its generalization performance.

In [4]:
from sklearn.model_selection import train_test_split

TEST_SPLIT = .2
TRAINING_SPLIT = 1-TEST_SPLIT
VALIDATION_SPLIT = .2

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=85100)

# Split the Training AGAIN into train and Validate
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=TEST_SPLIT, random_state=85100)

# Then, you can use X_train and y_train for model training and X_test and y_test for evaluation.

data_subset_dict = {
    'X_train': X_train,
    'X_validate': X_validate,
    'X_test': X_test,
    'y_train': y_train,
    'y_validate': y_validate,
    'y_test': y_test}

for key, value in data_subset_dict.items():
    shape = value.shape
    print(f"{key} shape: {shape}")


X_train shape: (4595, 36)
X_validate shape: (1149, 36)
X_test shape: (1437, 36)
y_train shape: (4595,)
y_validate shape: (1149,)
y_test shape: (1437,)


In [5]:
X_train

Unnamed: 0,age,sex,weight,height,art_mbp,art_sbp,bt,cvp,hr,pip,...,total_bilirubin,wbc,is_outlier,prolonged_LOS,icu_visit,or_duration,anesth_duration,asa,department,antype
34592,60,M,61.0,160.0,100.076553,139.701068,28.657895,1.350000,50.000000,14.242069,...,1.066259,9.695545,0,0,0,70.0,70.0,2.0,OT,MAC
94885,30,F,59.0,161.0,94.597087,133.039801,30.461647,2.041667,75.500000,14.500000,...,0.769130,9.342222,0,0,0,50.0,70.0,1.0,OT,General
77431,65,F,56.0,146.0,100.849638,139.747273,29.182886,4.968750,77.000000,3.000000,...,0.753823,8.694646,0,0,0,110.0,125.0,1.0,OT,General
83924,20,M,32.0,140.0,92.702290,140.826772,29.800000,0.272727,87.383832,21.500000,...,1.078906,10.044925,0,0,0,120.0,130.0,1.0,OT,General
35112,85,M,63.0,170.0,99.155797,147.262963,27.925000,0.800000,74.000000,12.467181,...,0.868367,8.634091,0,0,0,35.0,45.0,2.0,OT,MAC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75644,55,M,73.0,174.0,98.096004,136.113176,36.000000,1.425287,89.000000,13.500000,...,1.062437,10.155298,0,0,0,70.0,90.0,2.0,OT,General
70267,70,F,47.0,145.0,93.265258,140.169903,28.851351,6.023810,70.000000,12.790064,...,0.949612,8.450882,0,0,0,30.0,45.0,3.0,OT,MAC
80265,45,F,58.0,145.0,98.941909,135.781646,29.636617,3.142857,74.000000,13.930459,...,0.745297,9.067261,0,1,0,20.0,30.0,1.0,OT,MAC
88922,65,M,77.0,172.0,99.296077,140.548387,28.214066,1.875000,82.000000,13.912527,...,1.024095,9.696084,0,1,0,10.0,30.0,2.0,OT,MAC


## MODEL PIPELINE

### Linear Regression

In [6]:
#########################
#
#  SIMPLE LINEAR REGRESSION Pipeline -
#  -- No tuning. 
########################

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from math import sqrt

numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')),
                              ('scaling', StandardScaler())])

categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')),
                                  ('one-hot-encode', OneHotEncoder(sparse_output=False))])

preprocessing_df = ColumnTransformer([('numeric', numeric_transform, ['age','weight',	'height',	'art_mbp',	'art_sbp',	'bt',	'cvp',	'hr',	'pip',	'pmean',	'rr',	'spo2',	'vt',	'alp',	'alt',	'ast',	'chloride',	'creatinine',	'glucose',	'hb',	'hco3',	'lymphocyte',	'platelet',	'potassium',	'sodium',	'total_bilirubin',	'wbc',	'is_outlier',	'prolonged_LOS',	'icu_visit',	'or_duration',	'anesth_duration']),
                                      ('categorical', categorical_transform, ['sex','asa','department','antype'])])


pipeline_base = Pipeline([('proprocessing', preprocessing_df),
                    ('model', LinearRegression())])
pipeline_base.fit(X_train, y_train)


y_pred = pipeline_base.predict(X_validate)
r2 = pipeline_base.score(X_validate, y_validate)

mse = mean_squared_error(y_validate, y_pred, squared=False)
rmse = sqrt(mse)
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")




R-squared of base model: 0.4204646312467418
RMSE of the base model: 0.725


In [None]:
from sklearn import set_config
set_config(display='diagram')
pipeline_base

### Ensemble Methods


In [10]:
#########################
#
#  Ensemble Pipeline -
#  -- No tuning. Change the variable "model_name" for other models. 
########################

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from catboost import Pool, CatBoostRegressor

model_name=CatBoostRegressor()

numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')),
                              ('scaling', StandardScaler())])

categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')),
                                  ('one-hot-encode', OneHotEncoder(sparse_output=False))])

preprocessing_df = ColumnTransformer([('numeric', numeric_transform, ['age','weight',	'height',	'art_mbp',	'art_sbp',	'bt',	'cvp',	'hr',	'pip',	'pmean',	'rr',	'spo2',	'vt',	'alp',	'alt',	'ast',	'chloride',	'creatinine',	'glucose',	'hb',	'hco3',	'lymphocyte',	'platelet',	'potassium',	'sodium',	'total_bilirubin',	'wbc',	'is_outlier',	'prolonged_LOS',	'icu_visit',	'or_duration',	'anesth_duration']),
                                      ('categorical', categorical_transform, ['sex','asa','department','antype'])])


pipeline_base = Pipeline([('proprocessing', preprocessing_df),
                    ('model', model_name)])
pipeline_base.fit(X_train, y_train)


y_pred = pipeline_base.predict(X_validate)
r2 = pipeline_base.score(X_validate, y_validate)

rmse = sqrt(mean_squared_error(y_validate, y_pred, squared=False))
print(f'Model employed: {model_name}')
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")


Learning rate set to 0.052098
0:	learn: 0.6870033	total: 147ms	remaining: 2m 27s
1:	learn: 0.6680413	total: 150ms	remaining: 1m 14s
2:	learn: 0.6498389	total: 153ms	remaining: 50.9s
3:	learn: 0.6330048	total: 156ms	remaining: 38.9s
4:	learn: 0.6178612	total: 160ms	remaining: 31.8s
5:	learn: 0.6042441	total: 163ms	remaining: 27s
6:	learn: 0.5910820	total: 166ms	remaining: 23.6s
7:	learn: 0.5790153	total: 169ms	remaining: 21s
8:	learn: 0.5678499	total: 172ms	remaining: 19s
9:	learn: 0.5576967	total: 175ms	remaining: 17.4s
10:	learn: 0.5476696	total: 179ms	remaining: 16.1s
11:	learn: 0.5388409	total: 182ms	remaining: 15s
12:	learn: 0.5301971	total: 185ms	remaining: 14s
13:	learn: 0.5221065	total: 188ms	remaining: 13.2s
14:	learn: 0.5143164	total: 191ms	remaining: 12.5s
15:	learn: 0.5080155	total: 195ms	remaining: 12s
16:	learn: 0.5015730	total: 198ms	remaining: 11.5s
17:	learn: 0.4958584	total: 202ms	remaining: 11s
18:	learn: 0.4911988	total: 205ms	remaining: 10.6s
19:	learn: 0.4858347	to

### Optimize Model with Hyperparameter Tuning via Grid Search


In [13]:
#########################
#
#  STANDALONE TUNING  - CatBoost
# 
########################
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from tqdm import tqdm  # Import tqdm
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from catboost import Pool, CatBoostRegressor

In [12]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    mse = mean_squared_error(y, y_pred)
    rmse = sqrt(mse)
    return -rmse  # Return negative RMSE for grid search to minimize

In [14]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV, train_test_split


categorical_features_indices =[1,33,34,35]


# Define the parameter grid to search
param_grid = {
    'iterations': [100, 200, 300],      # Number of boosting iterations
    'depth': [6, 8, 10],                # Depth of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate  
    }

# Create a CatBoostRegressor model
catboost_model = CatBoostRegressor()

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=5, scoring=rmse_scorer, n_jobs=-1, error_score='raise')

# Perform the grid search
grid_search.fit(X_train, y_train, cat_features=categorical_features_indices)

# Print the best hyperparameters and corresponding MSE score
print("Best hyperparameters found:")
print(grid_search.best_params_)
print("Best RMSE score:", -grid_search.best_score_)

# Get the best trained model
best_catboost_model = grid_search.best_estimator_

# Evaluate the best model on the validation set
validation_predictions = best_catboost_model.predict(X_validate)

# Output - best settings for training the model


0:	learn: 0.6682624	total: 64.4ms	remaining: 12.8s
1:	learn: 0.6355766	total: 136ms	remaining: 13.5s
2:	learn: 0.6065928	total: 197ms	remaining: 13s
3:	learn: 0.5814759	total: 270ms	remaining: 13.2s
4:	learn: 0.5598538	total: 342ms	remaining: 13.3s
5:	learn: 0.5406256	total: 417ms	remaining: 13.5s
6:	learn: 0.5230752	total: 488ms	remaining: 13.4s
7:	learn: 0.5075717	total: 572ms	remaining: 13.7s
8:	learn: 0.4954915	total: 652ms	remaining: 13.8s
9:	learn: 0.4845436	total: 725ms	remaining: 13.8s
10:	learn: 0.4736284	total: 786ms	remaining: 13.5s
11:	learn: 0.4648395	total: 869ms	remaining: 13.6s
12:	learn: 0.4573200	total: 940ms	remaining: 13.5s
13:	learn: 0.4505681	total: 1.01s	remaining: 13.5s
14:	learn: 0.4443846	total: 1.09s	remaining: 13.5s
15:	learn: 0.4397321	total: 1.17s	remaining: 13.5s
16:	learn: 0.4359150	total: 1.24s	remaining: 13.4s
17:	learn: 0.4326991	total: 1.31s	remaining: 13.2s
18:	learn: 0.4289998	total: 1.4s	remaining: 13.3s
19:	learn: 0.4253524	total: 1.49s	remaining