# Model Creation

## Import Libraries

In [1]:
import os
from operator import itemgetter    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


### Import Data from Pre-Processing
* An earlier trial used only the surgery category with the greatest volume count (`category_id`: 08R); that filter is now commented out, so the notebook runs on all categories.
* Missing values HAVE been imputed.
* No PCA has been performed yet, and no one-hot encoding.


In [2]:
######################
#
# Load the imputed, cleaned operations table
#
#####################

# The first CSV column becomes the index; 'race' is removed straight after loading.
df = pd.read_csv('../_data/operations_imputed_CLEAN_v2.csv', index_col=0).drop(columns=['race'])

# Going for it - doing the whole deal (every category is kept).
# Re-enable the filter below to restrict the run to a single operation category.
# df = df[df['category_id']=='08R']



In [3]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76742 entries, 8 to 128030
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   op_id            76742 non-null  int64  
 1   subject_id       76742 non-null  int64  
 2   hadm_id          76742 non-null  int64  
 3   opdate           76742 non-null  int64  
 4   age              76742 non-null  int64  
 5   sex              76742 non-null  object 
 6   weight           76742 non-null  float64
 7   height           76742 non-null  float64
 8   asa              76742 non-null  float64
 9   department       76742 non-null  object 
 10  antype           76742 non-null  object 
 11  icd10_pcs        76742 non-null  object 
 12  category_desc    76742 non-null  object 
 13  desc_short       76742 non-null  object 
 14  category_id      76742 non-null  object 
 15  hr               76742 non-null  float64
 16  pip              76742 non-null  float64
 17  pmean           

### Create the X and y DataFrames

  * create y
  * create X (complete with all the features)
  * drop the features we identified as not meeting impact threshold. 



In [6]:
# Label = LOS
# When doing a Categorical Model, use 'prolonged_LOS' as the label instead of 'LOS'.

# Predictors kept for modelling. Neither label column ('LOS', 'prolonged_LOS')
# appears in this list, so selecting these columns also removes the labels from X.
features_to_retain = [
    'category_id', 'age', 'sex', 'weight', 'height',
    'hr', 'pip', 'pmean', 'rr', 'spo2', 'vt',
    'chloride', 'creatinine', 'glucose', 'hb', 'hco3',
    'lymphocyte', 'platelet', 'potassium', 'sodium',
    'total_bilirubin', 'wbc',
    'icu_visit', 'or_duration', 'anesth_duration',
    'asa', 'department', 'antype',
]

y = df['LOS']

# .copy() makes X an independent frame. Without it X is a column slice of
# another frame, and assigning into it later can raise a chained-assignment
# warning (hidden here by the global warnings filter).
X = df[features_to_retain].copy()
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76742 entries, 8 to 128030
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category_id      76742 non-null  object 
 1   age              76742 non-null  int64  
 2   sex              76742 non-null  object 
 3   weight           76742 non-null  float64
 4   height           76742 non-null  float64
 5   hr               76742 non-null  float64
 6   pip              76742 non-null  float64
 7   pmean            76742 non-null  float64
 8   rr               76742 non-null  float64
 9   spo2             76742 non-null  float64
 10  vt               76742 non-null  float64
 11  chloride         76742 non-null  float64
 12  creatinine       76742 non-null  float64
 13  glucose          76742 non-null  float64
 14  hb               76742 non-null  float64
 15  hco3             76742 non-null  float64
 16  lymphocyte       76742 non-null  float64
 17  platelet        

In [8]:
# Identify the columns that need to be either cast as str or scaled
############################################################

# Categorical predictors. 'category_id' is included because the notebook now
# runs on the full category list rather than a single category.
COLS_TO_CAST = ['category_id', 'asa', 'sex', 'department', 'antype']

# Store the categorical columns as strings ('asa' is float in the raw data).
X[COLS_TO_CAST] = X[COLS_TO_CAST].astype(str)

# Everything still numeric after the cast is scaled later in the Pipeline.
# 'number' matches every integer/float width; the 'int'/'float' aliases follow
# NumPy's platform defaults and can miss int64 columns where that default is 32-bit.
COLS_TO_SCALE = X.select_dtypes(include='number').columns

print(COLS_TO_SCALE)
X.info()

Index(['age', 'weight', 'height', 'hr', 'pip', 'pmean', 'rr', 'spo2', 'vt',
       'chloride', 'creatinine', 'glucose', 'hb', 'hco3', 'lymphocyte',
       'platelet', 'potassium', 'sodium', 'total_bilirubin', 'wbc',
       'icu_visit', 'or_duration', 'anesth_duration'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Index: 76742 entries, 8 to 128030
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category_id      76742 non-null  object 
 1   age              76742 non-null  int64  
 2   sex              76742 non-null  object 
 3   weight           76742 non-null  float64
 4   height           76742 non-null  float64
 5   hr               76742 non-null  float64
 6   pip              76742 non-null  float64
 7   pmean            76742 non-null  float64
 8   rr               76742 non-null  float64
 9   spo2             76742 non-null  float64
 10  vt               76742 non-null  float64
 11 

## Training 

### Split data
- Training Set (80% of total; 64% of total once the validation set is held out): 
  - Used to train the models.
- Validation Set (20% of the Training Set): 
  - Used to fine-tune hyperparameters, select models, and monitor training progress.  
- Testing Set (20% of total): 
  - Used to evaluate the final model's performance on unseen data and estimate its generalization performance.

In [9]:
from sklearn.model_selection import train_test_split

TEST_SPLIT = .2                  # share of ALL rows held out for final testing
TRAINING_SPLIT = 1-TEST_SPLIT
VALIDATION_SPLIT = .2            # share of the TRAINING rows held out for validation
SPLIT_SEED = 85100               # one seed for both splits so they are reproducible

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SPLIT, random_state=SPLIT_SEED)

# Split the training portion AGAIN into train and validate.
# VALIDATION_SPLIT (not TEST_SPLIT) sets the size, so the two fractions can be tuned independently.
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train, y_train, test_size=VALIDATION_SPLIT, random_state=SPLIT_SEED)

# The three subsets must partition the original rows exactly.
assert len(X_train) + len(X_validate) + len(X_test) == len(X)

# X_train/y_train fit the models, X_validate/y_validate compare and tune them,
# and X_test/y_test are kept for the final evaluation.
data_subset_dict = {
    'X_train': X_train,
    'X_validate': X_validate,
    'X_test': X_test,
    'y_train': y_train,
    'y_validate': y_validate,
    'y_test': y_test}

for key, value in data_subset_dict.items():
    print(f"{key} shape: {value.shape}")


X_train shape: (49114, 28)
X_validate shape: (12279, 28)
X_test shape: (15349, 28)
y_train shape: (49114,)
y_validate shape: (12279,)
y_test shape: (15349,)


In [None]:
# Check the dtypes and non-null counts of the training predictors before modelling.
X_train.info()

## MODEL PIPELINE

### Linear Regression

In [None]:
#########################
#
#  SIMPLE LINEAR REGRESSION Pipeline -
#  -- No tuning. 
########################

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from math import sqrt

# Numeric columns: fill gaps with the column mean, then standardize.
numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')),
                              ('scaling', StandardScaler())])

# Categorical columns: fill gaps with the most frequent level, then one-hot encode.
# handle_unknown='ignore' encodes a level unseen during fit as all zeros
# instead of raising when the validation/test data is transformed.
categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')),
                                  ('one-hot-encode', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))])

# The column lists come from the frame itself (COLS_TO_SCALE / COLS_TO_CAST).
# The previous hard-coded list named columns that are not in X and also named
# 'prolonged_LOS', which is derived from the label.
# Columns of X named in neither list are dropped (ColumnTransformer's default remainder).
preprocessing_df = ColumnTransformer([('numeric', numeric_transform, list(COLS_TO_SCALE)),
                                      ('categorical', categorical_transform, list(COLS_TO_CAST))])


pipeline_base = Pipeline([('proprocessing', preprocessing_df),
                    ('model', LinearRegression())])
pipeline_base.fit(X_train, y_train)


y_pred = pipeline_base.predict(X_validate)
r2 = pipeline_base.score(X_validate, y_validate)

# Take the square root exactly once: mean_squared_error returns the MSE by default.
# (Passing squared=False and then calling sqrt reported the square root of the RMSE.)
mse = mean_squared_error(y_validate, y_pred)
rmse = sqrt(mse)
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")




In [None]:
# Switch scikit-learn's estimator display to the diagram form, then show the
# pipeline built in the previous cell as the cell output.
from sklearn import set_config
set_config(display='diagram')
pipeline_base

### Ensemble Methods


In [None]:
#########################
#
#  Ensemble Pipeline -
#  -- No tuning. Change the variable "model_name" for other models. 
########################

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from catboost import Pool, CatBoostRegressor



### Optimize Model with Hyperparameter Tuning via Grid Search


In [None]:
#########################
#
#  STANDALONE TUNING  - CatBoost
# 
########################
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from tqdm import tqdm  # Import tqdm
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV
from catboost import Pool, CatBoostRegressor

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse_scorer(estimator, X, y):
    """Score ``estimator`` on ``(X, y)`` as the negated root-mean-squared error.

    The sign is flipped so that a larger score means a smaller RMSE.
    """
    squared_error = mean_squared_error(y, estimator.predict(X))
    return -sqrt(squared_error)

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV, train_test_split


# Every non-numeric column of the training frame is handed to CatBoost as a
# categorical feature. Names are used rather than positions: the previous
# positional list pointed at 'age' and at indices beyond the frame's last column.
categorical_feature_names = X_train.select_dtypes(exclude='number').columns.tolist()


# Define the parameter grid to search
param_grid = {
    'iterations': [100, 200, 300],      # Number of boosting iterations
    'depth': [6, 8, 10],                # Depth of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate  
    }

# Create a CatBoostRegressor model (verbose=0 silences the per-iteration training log)
catboost_model = CatBoostRegressor(verbose=0)

# Initialize the GridSearchCV object.
# rmse_scorer returns the negated RMSE, so the highest score is the lowest RMSE.
grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=5, scoring=rmse_scorer, n_jobs=-1, error_score='raise')

# Perform the grid search; cat_features is forwarded to every CatBoost fit call
grid_search.fit(X_train, y_train, cat_features=categorical_feature_names)

# Print the best hyperparameters and corresponding cross-validated RMSE score
print("Best hyperparameters found:")
print(grid_search.best_params_)
print("Best RMSE score:", -grid_search.best_score_)

# Get the best trained model (refit on all of X_train by GridSearchCV)
best_catboost_model = grid_search.best_estimator_

# Evaluate the best model on the validation set
validation_predictions = best_catboost_model.predict(X_validate)
validation_rmse = sqrt(mean_squared_error(y_validate, validation_predictions))
print(f"Validation RMSE of the tuned model: {validation_rmse:.3f}")

# Output - best settings for training the model


In [None]:
#########################
#
#  OPTIMIZED - CatBoost
# 
########################

# Best hyperparameters found (after hyperparameter tuning)
# NOTE(review): the grid search above fits on the raw frame with native
# categorical handling, whereas this pipeline one-hot encodes - confirm these
# settings are still the intended ones for the encoded features.
best_params = {'depth': 8, 'iterations': 200, 'learning_rate': 0.1}

# Instantiate CatBoostRegressor with the best hyperparameters
model_name = CatBoostRegressor(**best_params)

# Numeric columns: fill gaps with the column mean, then standardize.
numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')),
                              ('scaling', StandardScaler())])

# Categorical columns: fill gaps with the most frequent level, then one-hot encode.
# handle_unknown='ignore' encodes a level unseen during fit as all zeros instead of raising.
categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')),
                                  ('one-hot-encode', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))])

# The categorical branch must receive the string columns (COLS_TO_CAST).
# It was previously given COLS_TO_SCALE, which one-hot encoded every numeric column.
# Columns of X named in neither list are dropped (ColumnTransformer's default remainder).
preprocessing_df = ColumnTransformer([('numeric', numeric_transform, list(COLS_TO_SCALE)),
                                      ('categorical', categorical_transform, list(COLS_TO_CAST))])


pipeline_base = Pipeline([('proprocessing', preprocessing_df),
                    ('model', model_name)])

# No cat_features are forwarded to CatBoost. The model only ever sees the
# preprocessed all-numeric matrix, so positions taken from the raw frame would
# select scaled float columns there, not categorical ones.
pipeline_base.fit(X_train, y_train)



y_pred = pipeline_base.predict(X_validate)
r2 = pipeline_base.score(X_validate, y_validate)

# Take the square root exactly once: mean_squared_error returns the MSE by default.
rmse = sqrt(mean_squared_error(y_validate, y_pred))
print(f'Model employed: {model_name}')
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")
