# Model Creation

## Import Libraries

In [1]:
import os
from operator import itemgetter    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')




### Import Data from Pre-Processing
* In this scenario, we are using only the surgeries with the greatest volume count. 
* Missing values HAVE been imputed.
* No PCA performed yet, no 1hot. 


In [2]:
######################
#
# Trial with SINGLE Operation Category
#
#####################

df= pd.read_csv('./_data/operations_imputed_CLEAN.csv', index_col=0)
df = df[df['category_id'] == '08R']
df.drop(['race'], axis=1, inplace=True)



In [None]:
df

### Create the X and y DataFrames

  * create y
  * create X (complete with all the features)
  * drop the features we identified as not meeting impact threshold. 



In [3]:
# Label = LOS

features_to_retain = ['age','sex',	'weight',	'height',	'art_mbp',	'art_sbp',	'bt',	'cvp',	'hr',	'pip',	'pmean',	'rr',	'spo2',	'vt',	'alp',	'alt',	'ast',	'chloride',	'creatinine',	'glucose',	'hb',	'hco3',	'lymphocyte',	'platelet',	'potassium',	'sodium',	'total_bilirubin',	'wbc',	'is_outlier',	'prolonged_LOS',	'icu_visit',	'or_duration',	'anesth_duration',	'asa','department','antype'] 

y = df['LOS']
X = df.drop('LOS', axis=1)
# Get a list of column names with data type 'object'

col_to_cast = ['asa','sex','department','antype']
# Convert the object data type columns to string
X[col_to_cast] = X[col_to_cast].astype(str)

X= X[features_to_retain]

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7181 entries, 172 to 102253
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              7181 non-null   int64  
 1   sex              7181 non-null   object 
 2   weight           7107 non-null   float64
 3   height           7160 non-null   float64
 4   art_mbp          7036 non-null   float64
 5   art_sbp          7036 non-null   float64
 6   bt               7037 non-null   float64
 7   cvp              7015 non-null   float64
 8   hr               7037 non-null   float64
 9   pip              7037 non-null   float64
 10  pmean            7037 non-null   float64
 11  rr               7037 non-null   float64
 12  spo2             7037 non-null   float64
 13  vt               7037 non-null   float64
 14  alp              7036 non-null   float64
 15  alt              7036 non-null   float64
 16  ast              7036 non-null   float64
 17  chloride       

## Training 

### Split data
- Training Set (80% of total): 
  - Used to train the models.
- Validation Set (20% of Traning Set ): 
  - Used to fine-tune hyperparameters, select models, and monitor training progress.  
- Testing Set (20% of total): 
  - Used to evaluate the final model's performance on unseen data and estimate its generalization performance.

In [4]:
from sklearn.model_selection import train_test_split

TEST_SPLIT = .2
TRAINING_SPLIT = 1-TEST_SPLIT
VALIDATION_SPLIT = .2

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT, random_state=85100)

# Split the Training AGAIN into train and Validate
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=TEST_SPLIT, random_state=85100)

# Then, you can use X_train and y_train for model training and X_test and y_test for evaluation.

data_subset_dict = {
    'X_train': X_train,
    'X_validate': X_validate,
    'X_test': X_test,
    'y_train': y_train,
    'y_validate': y_validate,
    'y_test': y_test}

for key, value in data_subset_dict.items():
    shape = value.shape
    print(f"{key} shape: {shape}")


X_train shape: (4595, 36)
X_validate shape: (1149, 36)
X_test shape: (1437, 36)
y_train shape: (4595,)
y_validate shape: (1149,)
y_test shape: (1437,)


In [16]:
X_train

Unnamed: 0,age,sex,weight,height,art_mbp,art_sbp,bt,cvp,hr,pip,...,total_bilirubin,wbc,is_outlier,prolonged_LOS,icu_visit,or_duration,anesth_duration,asa,department,antype
34592,60,M,61.0,160.0,100.076553,139.701068,28.657895,1.350000,50.000000,14.242069,...,1.066259,9.695545,0,0,0,70.0,70.0,2.0,OT,MAC
94885,30,F,59.0,161.0,94.597087,133.039801,30.461647,2.041667,75.500000,14.500000,...,0.769130,9.342222,0,0,0,50.0,70.0,1.0,OT,General
77431,65,F,56.0,146.0,100.849638,139.747273,29.182886,4.968750,77.000000,3.000000,...,0.753823,8.694646,0,0,0,110.0,125.0,1.0,OT,General
83924,20,M,32.0,140.0,92.702290,140.826772,29.800000,0.272727,87.383832,21.500000,...,1.078906,10.044925,0,0,0,120.0,130.0,1.0,OT,General
35112,85,M,63.0,170.0,99.155797,147.262963,27.925000,0.800000,74.000000,12.467181,...,0.868367,8.634091,0,0,0,35.0,45.0,2.0,OT,MAC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75644,55,M,73.0,174.0,98.096004,136.113176,36.000000,1.425287,89.000000,13.500000,...,1.062437,10.155298,0,0,0,70.0,90.0,2.0,OT,General
70267,70,F,47.0,145.0,93.265258,140.169903,28.851351,6.023810,70.000000,12.790064,...,0.949612,8.450882,0,0,0,30.0,45.0,3.0,OT,MAC
80265,45,F,58.0,145.0,98.941909,135.781646,29.636617,3.142857,74.000000,13.930459,...,0.745297,9.067261,0,1,0,20.0,30.0,1.0,OT,MAC
88922,65,M,77.0,172.0,99.296077,140.548387,28.214066,1.875000,82.000000,13.912527,...,1.024095,9.696084,0,1,0,10.0,30.0,2.0,OT,MAC


## MODEL PIPELINE

### Linear Regression

In [None]:
#########################
#
#  SIMPLE LINEAR REGRESSION Pipeline -
#  -- No tuning. 
########################

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')),
                              ('scaling', StandardScaler())])

categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')),
                                  ('one-hot-encode', OneHotEncoder(sparse_output=False))])

preprocessing_df = ColumnTransformer([('numeric', numeric_transform, ['age','weight',	'height',	'art_mbp',	'art_sbp',	'bt',	'cvp',	'hr',	'pip',	'pmean',	'rr',	'spo2',	'vt',	'alp',	'alt',	'ast',	'chloride',	'creatinine',	'glucose',	'hb',	'hco3',	'lymphocyte',	'platelet',	'potassium',	'sodium',	'total_bilirubin',	'wbc',	'is_outlier',	'prolonged_LOS',	'icu_visit',	'or_duration',	'anesth_duration']),
                                      ('categorical', categorical_transform, ['sex','asa','department','antype'])])


pipeline_base = Pipeline([('proprocessing', preprocessing_df),
                    ('model', LinearRegression())])
pipeline_base.fit(X_train, y_train)


y_pred = pipeline_base.predict(X_validate)
r2 = pipeline_base.score(X_validate, y_validate)

rmse = mean_squared_error(y_validate, y_pred, squared=False)
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")




In [None]:
from sklearn import set_config
set_config(display='diagram')
pipeline_base

### Ensemble Methods


In [8]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    mse = mean_squared_error(y, y_pred)
    rmse = sqrt(mse)
    return -rmse  # Return negative RMSE for grid search to minimize

In [9]:
#########################
#
#  Ensemble Pipeline -
#  -- No tuning. Change the variable "model_name" for other models. 
########################

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from catboost import Pool, CatBoostRegressor

model_name=CatBoostRegressor()

numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')),
                              ('scaling', StandardScaler())])

categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')),
                                  ('one-hot-encode', OneHotEncoder(sparse_output=False))])

preprocessing_df = ColumnTransformer([('numeric', numeric_transform, ['age','weight',	'height',	'art_mbp',	'art_sbp',	'bt',	'cvp',	'hr',	'pip',	'pmean',	'rr',	'spo2',	'vt',	'alp',	'alt',	'ast',	'chloride',	'creatinine',	'glucose',	'hb',	'hco3',	'lymphocyte',	'platelet',	'potassium',	'sodium',	'total_bilirubin',	'wbc',	'is_outlier',	'prolonged_LOS',	'icu_visit',	'or_duration',	'anesth_duration']),
                                      ('categorical', categorical_transform, ['sex','asa','department','antype'])])


pipeline_base = Pipeline([('proprocessing', preprocessing_df),
                    ('model', model_name)])
pipeline_base.fit(X_train, y_train)


y_pred = pipeline_base.predict(X_validate)
r2 = pipeline_base.score(X_validate, y_validate)

rmse = sqrt(mean_squared_error(y_validate, y_pred, squared=False))
print(f'Model employed: {model_name}')
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")




Learning rate set to 0.052098
0:	learn: 0.6870033	total: 3.85ms	remaining: 3.84s
1:	learn: 0.6680413	total: 6.71ms	remaining: 3.35s
2:	learn: 0.6498389	total: 10.6ms	remaining: 3.52s
3:	learn: 0.6330048	total: 13.8ms	remaining: 3.43s
4:	learn: 0.6178612	total: 17.2ms	remaining: 3.42s
5:	learn: 0.6042441	total: 20.3ms	remaining: 3.36s
6:	learn: 0.5910820	total: 23.6ms	remaining: 3.35s
7:	learn: 0.5790153	total: 27.5ms	remaining: 3.41s
8:	learn: 0.5678499	total: 31.1ms	remaining: 3.42s
9:	learn: 0.5576967	total: 34.3ms	remaining: 3.39s
10:	learn: 0.5476696	total: 36.8ms	remaining: 3.31s
11:	learn: 0.5388409	total: 39.7ms	remaining: 3.27s
12:	learn: 0.5301971	total: 43.1ms	remaining: 3.27s
13:	learn: 0.5221065	total: 46ms	remaining: 3.24s
14:	learn: 0.5143164	total: 49.3ms	remaining: 3.23s
15:	learn: 0.5080155	total: 51.9ms	remaining: 3.19s
16:	learn: 0.5015730	total: 54.5ms	remaining: 3.15s
17:	learn: 0.4958584	total: 58.2ms	remaining: 3.17s
18:	learn: 0.4911988	total: 61.1ms	remaining: 

### Optimize Model with Hyperparameter Tuning via Grid Search


In [17]:
#########################
#
#  STANDALONE TUNING  - CatBoost
# 
########################
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from tqdm import tqdm  # Import tqdm
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from catboost import Pool, CatBoostRegressor

In [6]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV, train_test_split


categorical_features_indices =[1,33,34,35]


# Define the parameter grid to search
param_grid = {
    'iterations': [100, 200, 300],  # Number of boosting iterations
    'depth': [6, 8, 10],  # Depth of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    # Add more hyperparameters to search and their respective values
}

# Create a CatBoostRegressor model
catboost_model = CatBoostRegressor()

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=5, scoring=rmse_scorer, n_jobs=-1, error_score='raise')

# Perform the grid search
grid_search.fit(X_train, y_train, cat_features=categorical_features_indices)

# Print the best hyperparameters and corresponding MSE score
print("Best hyperparameters found:")
print(grid_search.best_params_)
print("Best RMSE score:", -grid_search.best_score_)

# Get the best trained model
best_catboost_model = grid_search.best_estimator_

# Evaluate the best model on the validation set
validation_predictions = best_catboost_model.predict(X_validate)

# You can now use the best model for predictions on your test data


0:	learn: 0.6682624	total: 51.1ms	remaining: 10.2s
1:	learn: 0.6355766	total: 112ms	remaining: 11.1s
2:	learn: 0.6065928	total: 164ms	remaining: 10.8s
3:	learn: 0.5814759	total: 223ms	remaining: 10.9s
4:	learn: 0.5598538	total: 283ms	remaining: 11s
5:	learn: 0.5406256	total: 345ms	remaining: 11.1s
6:	learn: 0.5230752	total: 408ms	remaining: 11.2s
7:	learn: 0.5075717	total: 468ms	remaining: 11.2s
8:	learn: 0.4954915	total: 528ms	remaining: 11.2s
9:	learn: 0.4845436	total: 638ms	remaining: 12.1s
10:	learn: 0.4736284	total: 711ms	remaining: 12.2s
11:	learn: 0.4648395	total: 771ms	remaining: 12.1s
12:	learn: 0.4573200	total: 835ms	remaining: 12s
13:	learn: 0.4505681	total: 892ms	remaining: 11.8s
14:	learn: 0.4443846	total: 945ms	remaining: 11.7s
15:	learn: 0.4397321	total: 1.01s	remaining: 11.6s
16:	learn: 0.4359150	total: 1.06s	remaining: 11.4s
17:	learn: 0.4326991	total: 1.12s	remaining: 11.3s
18:	learn: 0.4289998	total: 1.18s	remaining: 11.2s
19:	learn: 0.4253524	total: 1.24s	remaining: