# Model Creation

## Import Libraries

In [1]:
import os
from operator import itemgetter    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Silence library warnings so cell output stays readable.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn.
warnings.filterwarnings('ignore')
# `InteractiveShell.magic` is deprecated; `run_line_magic` is the supported call.
get_ipython().run_line_magic('matplotlib', 'inline')
plt.style.use('ggplot')



### Import Data from Pre-Processing
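* In this scenario, we are using only the surgeries with the greatest volume count (a single operation category).
* Missing values have been imputed upstream; note that `X.info()` further down still reports a few nulls (e.g. `weight`, `height`), so the modelling pipeline imputes again.
* No PCA and no one-hot encoding have been applied yet.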


In [19]:
######################
#
# Trial with SINGLE Operation Category
#
#####################

DATA_PATH = './_data/operations_imputed_CLEAN.csv'
SURGERY_CATEGORY_ID = '08R'  # the one operation category kept for this trial

# Build the working frame in a single chained expression: read the
# pre-processed CSV, keep only the selected category, and drop 'race'.
# No in-place mutation, so re-running the cell always yields the same `df`.
df = (
    pd.read_csv(DATA_PATH, index_col=0)
    .loc[lambda frame: frame['category_id'] == SURGERY_CATEGORY_ID]
    .drop(columns=['race'])
)



In [20]:
# Cell output: show the filtered frame (the notebook truncates long frames to head/tail rows).
df

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,asa,emop,...,potassium,sodium,total_bilirubin,wbc,LOS,is_outlier,prolonged_LOS,icu_visit,or_duration,anesth_duration
172,492906682,194038391,216766577,2880,60,F,54.0,150.0,1.0,0,...,3.900000,141.000000,0.600000,8.200501,1.357639,0,0,0,80.0,95.0
250,482249338,119650423,239907618,2880,30,F,53.0,165.0,1.0,0,...,3.828520,138.163265,0.769130,9.342222,2.354167,0,1,0,125.0,150.0
251,442852020,182792072,213043850,2880,45,F,58.0,155.0,2.0,0,...,3.753750,138.584978,0.766393,9.008460,2.281250,0,1,0,85.0,85.0
324,434799733,175117222,220073137,1440,75,M,73.0,168.0,1.0,0,...,4.200000,138.391775,0.940000,8.352611,1.378472,0,0,0,75.0,90.0
406,465710019,155181730,201087199,1440,55,F,57.0,157.0,1.0,0,...,4.000000,142.000000,0.600000,8.416539,1.461806,0,0,0,40.0,65.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102249,485124229,183871820,281514484,3407040,70,F,69.0,157.0,3.0,0,...,3.700000,142.000000,0.949612,8.450882,0.475694,0,0,0,25.0,30.0
102250,406022295,188973481,294228750,0,40,M,56.0,171.0,1.0,0,...,4.053019,139.306867,1.025291,10.639694,0.399306,0,0,0,30.0,35.0
102251,435410039,160162844,277120271,20160,70,F,49.0,152.0,1.0,0,...,3.878091,139.838269,0.748828,8.877578,0.361111,0,1,0,10.0,20.0
102252,489985637,140939310,245004066,10080,55,M,77.0,176.0,1.0,0,...,4.046623,138.998693,0.948347,10.473106,0.340278,0,0,0,15.0,20.0


### Create the X and y DataFrames

  * create y
  * create X (complete with all the features)
  * drop the features we identified as not meeting impact threshold. 



In [32]:
# Label = LOS

# Columns kept as model inputs (36 in total).
# NOTE(review): 'prolonged_LOS' and 'is_outlier' look like they may be derived
# from the LOS label -- confirm they do not leak the target before trusting scores.
features_to_retain = [
    'age', 'sex', 'weight', 'height',
    'art_mbp', 'art_sbp', 'bt', 'cvp', 'hr', 'pip', 'pmean', 'rr', 'spo2', 'vt',
    'alp', 'alt', 'ast', 'chloride', 'creatinine', 'glucose', 'hb', 'hco3',
    'lymphocyte', 'platelet', 'potassium', 'sodium', 'total_bilirubin', 'wbc',
    'is_outlier', 'prolonged_LOS', 'icu_visit', 'or_duration', 'anesth_duration',
    'asa', 'department', 'antype',
]

# Guard: the label itself must never be selected as a feature.
assert 'LOS' not in features_to_retain

y = df['LOS']
# Selecting the retained columns already excludes 'LOS', so no separate drop is needed.
X = df[features_to_retain]
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7181 entries, 172 to 102253
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              7181 non-null   int64  
 1   sex              7181 non-null   object 
 2   weight           7107 non-null   float64
 3   height           7160 non-null   float64
 4   art_mbp          7036 non-null   float64
 5   art_sbp          7036 non-null   float64
 6   bt               7037 non-null   float64
 7   cvp              7015 non-null   float64
 8   hr               7037 non-null   float64
 9   pip              7037 non-null   float64
 10  pmean            7037 non-null   float64
 11  rr               7037 non-null   float64
 12  spo2             7037 non-null   float64
 13  vt               7037 non-null   float64
 14  alp              7036 non-null   float64
 15  alt              7036 non-null   float64
 16  ast              7036 non-null   float64
 17  chloride       

## Training 

### Split data
- Training Set (80% of total before the validation rows are carved out; 64% of total afterwards):
  - Used to train the models.
- Validation Set (20% of the Training Set, i.e. 16% of total):
  - Used to fine-tune hyperparameters, select models, and monitor training progress.
- Testing Set (20% of total):
  - Used to evaluate the final model's performance on unseen data and estimate its generalization performance.

In [28]:
from sklearn.model_selection import train_test_split

TEST_SPLIT = .2                   # share of ALL rows held out as the final test set
TRAINING_SPLIT = 1 - TEST_SPLIT   # share of all rows left for training + validation
VALIDATION_SPLIT = .2             # share of the TRAINING rows held out for validation
SPLIT_SEED = 85100                # fixed seed so both splits are reproducible

# Split data into training and testing sets (80% training, 20% testing)
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=TEST_SPLIT, random_state=SPLIT_SEED)

# Split the training portion AGAIN into train and validate.
# Uses VALIDATION_SPLIT (not TEST_SPLIT) so the two fractions can be tuned independently.
X_train, X_validate, y_train, y_validate = train_test_split(
    X_train_full, y_train_full, test_size=VALIDATION_SPLIT, random_state=SPLIT_SEED)

# Invariant: the three subsets partition the original rows.
assert len(X_train) + len(X_validate) + len(X_test) == len(X)

# Use X_train / y_train for fitting, X_validate / y_validate for model selection,
# and X_test / y_test only for the final evaluation.

data_subset_dict = {
    'X_train': X_train,
    'X_validate': X_validate,
    'X_test': X_test,
    'y_train': y_train,
    'y_validate': y_validate,
    'y_test': y_test}

# Report the shape of every subset as a quick sanity check.
for key, value in data_subset_dict.items():
    shape = value.shape
    print(f"{key} shape: {shape}")


X_train shape: (4595, 36)
X_validate shape: (1149, 36)
X_test shape: (1437, 36)
y_train shape: (4595,)
y_validate shape: (1149,)
y_test shape: (1437,)


## MODEL PIPELINE

In [34]:
## Define an evaluation function: cross-validated Root Mean Squared Error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


def rmse_cv(X, y, estimator=None, cv=5):
    """Return the per-fold RMSE of an estimator under k-fold cross-validation.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``cross_val_score``.
    y : array-like
        Target values.
    estimator : scikit-learn estimator, optional
        Model (or pipeline) to evaluate. When omitted, a plain
        ``LinearRegression`` is used, which requires an all-numeric ``X``
        without missing values.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold; call ``.mean()`` on it for a single summary.
    """
    if estimator is None:
        estimator = LinearRegression()
    # scikit-learn maximises scores, so MSE comes back negated; flip the sign
    # before taking the square root.
    neg_mse = cross_val_score(estimator, X, y,
                              scoring="neg_mean_squared_error", cv=cv)
    return np.sqrt(-neg_mse)


# Held-out evaluation (predict + RMSE) is done in the cells below, once a model
# has actually been fitted; nothing is predicted here because no model exists yet.

In [49]:
#########################
#
#  SIMPLE Pipeline -
#  -- No tuning. 
########################
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Columns routed to the numeric branch (mean-impute, then standardise).
# NOTE(review): 'prolonged_LOS' and 'is_outlier' look like they may be derived
# from the LOS label -- confirm they do not leak the target.
NUMERIC_FEATURES = [
    'age', 'weight', 'height',
    'art_mbp', 'art_sbp', 'bt', 'cvp', 'hr', 'pip', 'pmean', 'rr', 'spo2', 'vt',
    'alp', 'alt', 'ast', 'chloride', 'creatinine', 'glucose', 'hb', 'hco3',
    'lymphocyte', 'platelet', 'potassium', 'sodium', 'total_bilirubin', 'wbc',
    'is_outlier', 'prolonged_LOS', 'icu_visit', 'or_duration', 'anesth_duration',
]
# Columns routed to the categorical branch (mode-impute, then one-hot encode).
CATEGORICAL_FEATURES = ['sex', 'asa', 'department', 'antype']

numeric_transform = Pipeline([('impute_mean', SimpleImputer(strategy='mean')),
                              ('scaling', StandardScaler())])

# handle_unknown='ignore' keeps predict() from raising when a validation/test
# row carries a category level that was absent from the training rows.
categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')),
                                  ('one-hot-encode', OneHotEncoder(sparse_output=False,
                                                                   handle_unknown='ignore'))])

preprocessing_df = ColumnTransformer([('numeric', numeric_transform, NUMERIC_FEATURES),
                                      ('categorical', categorical_transform, CATEGORICAL_FEATURES)])

# Step name 'proprocessing' is kept as-is so existing references to it keep working.
pipeline_base = Pipeline([('proprocessing', preprocessing_df),
                          ('model', LinearRegression())])
pipeline_base.fit(X_train, y_train)

# Evaluate the untuned baseline on the validation split.
y_pred = pipeline_base.predict(X_validate)
r2 = pipeline_base.score(X_validate, y_validate)

# sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
# argument is deprecated and removed in newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_validate, y_pred))
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")




R-squared of base model: 0.4212040803368613
RMSE of the base model: 0.525


In [50]:
from sklearn import set_config
# Ask scikit-learn to render estimators with its 'diagram' display mode.
set_config(display='diagram')
# Last expression of the cell: displays the baseline pipeline fitted earlier.
pipeline_base

### Optimize Models with Hyperparameter Tuning via Grid Search


In [None]:
#########################
#
#  Pipeline with TUNING 
# 
########################
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from tqdm import tqdm  # Import tqdm
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV

In [None]:
import numpy as np
import xgboost as xgb

# Smoke test: confirm that XGBoost can train on the GPU before the real tuning.
# tree_method="gpu_hist" is deprecated; use "hist" together with device="cuda".
xgb_model = xgb.XGBRegressor(
    tree_method="hist",
    device="cuda",
)

# Throw-away random data under dedicated names, so the notebook-wide feature
# matrix `X` and label `y` are NOT overwritten by this check.
smoke_rng = np.random.default_rng(0)  # seeded for reproducibility
X_smoke = smoke_rng.random((50, 2))
y_smoke = smoke_rng.integers(2, size=50)

xgb_model.fit(X_smoke, y_smoke)

xgb_model

In [None]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline

# Candidate models. To also tune the tree ensembles, append ExtraTreesRegressor()
# and RandomForestRegressor() here and their names to `model_names`.
model_list = [XGBRegressor()]
model_names = ["XGBRegressor"]

# Shared search space for the scikit-learn tree ensembles.
# max_features=1.0 (all features) replaces the removed 'auto' option, which
# meant "all features" for regressors.
_tree_ensemble_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': [1.0, 'sqrt', 'log2'],
    'model__bootstrap': [True, False],
}

# Hyperparameter grids keyed by model name; keys are prefixed with the
# pipeline step name ('model') as GridSearchCV requires.
param_grids = {
    'RandomForestRegressor': dict(_tree_ensemble_grid),
    'ExtraTreesRegressor': dict(_tree_ensemble_grid),
    'XGBRegressor': {
        'model__device': ['cuda'],
        'model__objective': ['reg:squarederror'],
        'model__n_estimators': [50, 100, 200],       # Number of boosting rounds (trees)
        'model__learning_rate': [0.01, 0.1, 0.2],    # Learning rate
        'model__max_depth': [3, 4, 5],               # Maximum depth of each tree
        'model__min_child_weight': [1, 2, 3],        # Minimum sum of instance weight (hessian) needed in a child
        'model__subsample': [0.8, 0.9, 1.0],         # Subsample ratio of the training rows
        'model__colsample_bytree': [0.8, 0.9, 1.0],  # Subsample ratio of columns when constructing each tree
    },
}

ModScores = {}    # model name -> RMSE on the validation split
best_models = {}  # model name -> tuned pipeline refit on the full training split

for model_name, model in tqdm(list(zip(model_names, model_list)), desc="Grid search"):
    # Reuse the baseline preprocessing (imputation, scaling, one-hot encoding) so
    # the regressors never see raw string columns or NaNs; clone() gives every
    # candidate its own unfitted copy.
    pipeline = Pipeline(steps=[('preprocessing', clone(preprocessing_df)),
                               ('model', model)])

    # Tune on the training split only, scoring by RMSE (negated, because
    # scikit-learn maximises scores).
    # NOTE(review): n_jobs=-1 together with device='cuda' may oversubscribe a
    # single GPU -- confirm, or lower n_jobs when training on GPU.
    grid = GridSearchCV(pipeline, param_grid=param_grids[model_name], cv=5,
                        scoring='neg_root_mean_squared_error', n_jobs=-1)
    grid.fit(X_train, y_train)

    # Score the TUNED pipeline (not the untuned one) on the validation split.
    best_model = grid.best_estimator_
    best_hyperparams = grid.best_params_
    y_pred = best_model.predict(X_validate)
    score = np.sqrt(mean_squared_error(y_validate, y_pred))

    ModScores[model_name] = score  # Use model_name as the key here
    best_models[model_name] = best_model

    # tqdm.write keeps messages from breaking the progress bar.
    tqdm.write("{}: {:.2f}".format(model_name, score))
    tqdm.write(f'Best CV RMSE: {-grid.best_score_:.3f}\nAchieved with hyperparameters: {best_hyperparams}')
    tqdm.write(f"{model_name} completed.")

print("_" * 100)
# Rank models by validation RMSE, best (lowest) first.
for key, value in sorted(ModScores.items(), key=itemgetter(1), reverse=False):
    print(key, round(value, 3))

# The test split is touched exactly once, for the model selected on validation.
best_name = min(ModScores, key=ModScores.get)
final_model = best_models[best_name]
test_rmse = np.sqrt(mean_squared_error(y_test, final_model.predict(X_test)))
test_r2 = final_model.score(X_test, y_test)
print(f"{best_name} on the test set: RMSE = {test_rmse:.3f}, R-squared = {test_r2:.3f}")

