##  Catboost Model
### Pipeline setup, Train, Pickle

#### 1.1 Import Data and Required Packages
##### Importing Pandas, NumPy, Matplotlib, Seaborn, and the Warnings library.

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, FunctionTransformer,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings

#### Import the CSV Data as Pandas DataFrame

In [2]:
# ---------------------------------------------------------------
# Load the output of the pre-processing stage.
# The first CSV column is used as the index, and the 'race' column
# is removed before any modelling takes place.
# ---------------------------------------------------------------
df = pd.read_csv(
    '../../Prolonged_LOS_Project/_data/operations_imputed_CLEAN_v2.csv',
    index_col=0,
).drop(columns=['race'])

#### Show Top 5 Records

In [3]:
# Preview the first rows of the loaded frame ('race' has already been dropped).
df.head()

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,asa,department,...,platelet,potassium,sodium,total_bilirubin,wbc,LOS,prolonged_LOS,icu_visit,or_duration,anesth_duration
8,467425045,134213281,225860669,1440,60,F,62.0,154.0,1.0,GS,...,217.282759,3.846584,140.033084,0.744921,8.200501,3.493056,1,0,70.0,90.0
9,461473883,134195201,265770645,1440,35,F,50.0,160.0,1.0,OS,...,124.0,3.9,138.0,0.6,6.31,4.236111,0,0,115.0,150.0
10,430539801,181420324,208290342,1440,20,M,62.0,179.0,1.0,OL,...,237.222222,4.041013,139.824013,1.078906,10.044925,1.572917,1,0,90.0,135.0
11,466389608,160947402,262240911,1440,60,F,52.0,152.0,1.0,OL,...,217.282759,3.846584,140.033084,0.744921,8.200501,1.607639,0,0,30.0,90.0
15,439560439,163619571,279388936,0,75,F,65.0,154.0,2.0,OT,...,207.27027,3.867044,139.499091,0.739452,8.552474,0.604167,0,0,15.0,25.0


#### Preparing X and Y variables

In [4]:
## Columns of X used for training. Identifier-style columns (operation id,
## subject id, ...) are deliberately excluded.
features_to_retain = [
    'category_id', 'age', 'sex', 'weight', 'height',
    'hr', 'pip', 'pmean', 'rr', 'spo2', 'vt',
    'chloride', 'creatinine', 'glucose', 'hb', 'hco3', 'lymphocyte',
    'platelet', 'potassium', 'sodium', 'total_bilirubin', 'wbc',
    'icu_visit', 'or_duration', 'anesth_duration',
    'department', 'antype',
]

## Target: LOS rounded to a whole number. Predicting fractional LOS would imply
## discharges at odd times of day (e.g. 2:45am), which is not realistic.
y = df['LOS'].round().astype(int)

## Feature matrix shared by the train / validation / test splits:
## drop the target, then keep only the retained columns.
X = df.drop(columns='LOS')[features_to_retain]

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76742 entries, 8 to 128030
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category_id      76742 non-null  object 
 1   age              76742 non-null  int64  
 2   sex              76742 non-null  object 
 3   weight           76742 non-null  float64
 4   height           76742 non-null  float64
 5   hr               76742 non-null  float64
 6   pip              76742 non-null  float64
 7   pmean            76742 non-null  float64
 8   rr               76742 non-null  float64
 9   spo2             76742 non-null  float64
 10  vt               76742 non-null  float64
 11  chloride         76742 non-null  float64
 12  creatinine       76742 non-null  float64
 13  glucose          76742 non-null  float64
 14  hb               76742 non-null  float64
 15  hco3             76742 non-null  float64
 16  lymphocyte       76742 non-null  float64
 17  platelet        

In [None]:
# Preview the first rows of the selected feature matrix.
X.head()

In [None]:
## Confirm what y looks like (should be plain integers after round().astype(int)).
y

## PreProcessor Pipeline (X - features)

In [5]:
# Build the preprocessing ColumnTransformer for CatBoost:
#   * numeric columns     -> mean-impute, then standard-scale
#   * categorical columns -> mode-impute, then cast to str
from sklearn.impute import SimpleImputer

# Columns CatBoost should treat as categorical. They are imputed and cast to
# strings, never scaled.
COLS_TO_CAST = ['category_id', 'antype', 'sex', 'department', 'icu_visit']


def cast_to_str(values):
    """Return ``values`` with every element converted to ``str``.

    This is a named function rather than a lambda because ``pickle`` stores
    functions by reference and cannot serialise a lambda. Whoever unpickles the
    preprocessor must have a function with this name available in the same
    module.
    """
    return values.astype(str)


string_features = COLS_TO_CAST

# Numeric columns are the number-typed columns that are NOT categorical.
# Selecting purely by dtype would also pick up categoricals that happen to be
# stored as numbers (e.g. icu_visit, if it is integer-coded) and scale them.
numeric_features = [
    col for col in X.select_dtypes(include='number').columns
    if col not in string_features
]

# Define transformers
numeric_transformer = Pipeline([('impute_mean', SimpleImputer(strategy='mean')),
                                ('scaling', StandardScaler())])
string_transformer = FunctionTransformer(cast_to_str, validate=False)
# Impute and cast inside ONE pipeline so each categorical column is emitted
# exactly once. Registering the imputer and the cast as two separate
# ColumnTransformer entries on the same columns duplicates every column.
categorical_transform = Pipeline([('impute_mode', SimpleImputer(strategy='most_frequent')),
                                  ('to_str', string_transformer)])
# onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Combine transformers into a preprocessor with ColumnTransformer.
# Output column order follows the order of `transformers`: all numeric columns
# first, then the categorical columns. After fitting,
# `preprocessor_catboost.output_indices_['categ']` gives the categorical slice.
preprocessor_catboost = ColumnTransformer(
                        transformers=[
                                    ('num', numeric_transformer, numeric_features),
                                    ('categ', categorical_transform, string_features)],
                        remainder='passthrough')  # Leaves the rest of the columns alone

# NOTE 1 - One-hot encoding has been REMOVED from the preprocessing pipeline
#          because it is handled within the `catboost` model.
#
# NOTE 2 - For a single model you could combine preprocessor and estimator in
#          one pipeline (snippet below). Here the preprocessing pipeline is kept
#          separate so it can be reused in production (i.e. predicting on
#          user-provided data).
#
#          pipeline_LR = Pipeline(steps=[
#              ('preprocessor', preprocessor),
#              ('model', LinearRegression())])
#
# TODO - If preprocessing fails on incomplete data, check that the categorical
#        transformer isn't the reason.
 

In [6]:
### Package (pickle the preprocessor)
# The preprocessor will be referenced again in production to scale/transform user-provided data.
# NOTE(review): in top-to-bottom cell order, fit_transform() is only called in a later cell,
# so the object written here appears to be the UNFITTED transformer. Confirm that a fitted
# copy is what production actually loads.

import pickle

with open('../output/preprocessor.pickle', 'wb') as f:
    pickle.dump(preprocessor_catboost, f)


In [9]:

# Reload the preprocessor from the pickle file.
# SECURITY: pickle.load can execute arbitrary code; only load files written by
# this project, never files from an untrusted source.
# Loading is best-effort: on failure a message is printed and whatever
# `preprocessor_catboost` already exists in the kernel stays in use.
PREPROCESSOR_PATH = '../output/preprocessor.pickle'
try:
    with open(PREPROCESSOR_PATH, 'rb') as f:
        preprocessor_catboost = pickle.load(f)
except FileNotFoundError:
    # Report the path that was actually opened, not the variable name.
    print(f"File '{PREPROCESSOR_PATH}' not found.")
except Exception as e:
    print(f"An error occurred while loading the preprocessor: {str(e)}")

In [17]:
# Fit the preprocessor on the feature frame and replace X with the transformed
# matrix (downstream cells expect the name `X`).
# NOTE(review): fitting happens on ALL rows before the train/validation/test
# split, so the imputation and scaling statistics also see held-out rows.
# Consider fitting on the training split only.
X = preprocessor_catboost.fit_transform(X)

# Persist the FITTED preprocessor. A transformer pickled before fit_transform()
# has no learned statistics and cannot transform() new data in production.
with open('../output/preprocessor.pickle', 'wb') as f:
    pickle.dump(preprocessor_catboost, f)

In [None]:
# Separate dataset into train, validation and test sets
from sklearn.model_selection import train_test_split

TEST_SPLIT = .2                # fraction of ALL rows held out for testing
TRAINING_SPLIT = 1-TEST_SPLIT  # rows left after the test hold-out (still includes the validation rows)
VALIDATION_SPLIT = .2          # fraction of ALL rows held out for validation


def split_data(X, y, test_split=0.2, validation_split=0.2, random_state=None):
    """
    Split data into training, validation, and test sets and print their shapes.

    Both fractions are expressed relative to the FULL dataset: with the
    defaults the result is 60% train / 20% validation / 20% test.

    Parameters:
    - X: Features data (must expose ``.shape``).
    - y: Target variable (must expose ``.shape``).
    - test_split: Fraction of all rows used as the test set, in (0, 1).
    - validation_split: Fraction of all rows used as the validation set;
      ``test_split + validation_split`` must be below 1 so training rows remain.
    - random_state: Seed for the random number generator (used for both splits).

    Returns:
    X_train, X_validate, X_test, y_train, y_validate, y_test

    Raises:
    ValueError: if the fractions are out of range or leave no training data.
    """
    # Fail early with a clear message. Otherwise test_split == 1 divides by zero
    # below, and oversized fractions surface as an obscure sklearn error.
    if not 0 < test_split < 1:
        raise ValueError(f"test_split must be in (0, 1); got {test_split!r}")
    if not 0 < validation_split < 1 - test_split:
        raise ValueError(
            "validation_split must be positive and test_split + validation_split "
            f"must be < 1; got test_split={test_split!r}, "
            f"validation_split={validation_split!r}")

    # Split data into (training + validation) and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_split, random_state=random_state)

    # Rescale the validation fraction: it is specified relative to the full
    # dataset but is carved out of what is left after removing the test rows.
    validation_size = validation_split / (1 - test_split)

    # Split the remaining data again into training and validation sets
    X_train, X_validate, y_train, y_validate = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=random_state)

    # Collect the splits so their shapes can be reported uniformly
    data_splits = {
        'X_train': X_train,
        'X_validate': X_validate,
        'X_test': X_test,
        'y_train': y_train,
        'y_validate': y_validate,
        'y_test': y_test
    }
    # Print the shapes of the splits
    for key, value in data_splits.items():
        shape = value.shape
        print(f"{key} shape: {shape}")

    return X_train, X_validate, X_test, y_train, y_validate, y_test


In [None]:
## Run the three-way split; the fixed random_state keeps the partitions reproducible.
X_train, X_validate, X_test, y_train, y_validate, y_test = split_data(
    X,
    y,
    test_split=TEST_SPLIT,
    validation_split=VALIDATION_SPLIT,
    random_state=85100,
)

## Model Evaluation and Selection

#### Create an Evaluate Function to give all metrics after model Training
`Note`: This has been run previously, and CatBoost was determined to be the most suitable model for this problem. Skip to "Catboost Parameter Tuning" below.

In [None]:
def evaluate_model(true, predicted):
    """Compute regression metrics for a set of predictions.

    Parameters:
    - true: Ground-truth target values.
    - predicted: Model predictions aligned with ``true``.

    Returns:
    (mae, rmse, r2) -- mean absolute error, root mean squared error and
    R^2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    # MSE is only needed to derive RMSE, so compute it once.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
####################
#
# Baseline Model Training (various models)
# Comment-out models to be excluded
#
#####################
# NOTE(review): the preprocessing step leaves categorical columns as strings
# (no one-hot encoding), and no cat_features are passed here. Presumably most
# of these estimators need numerically encoded inputs; confirm this cell still
# runs against the current preprocessor.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(n_jobs=-1),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(n_jobs=-1),
    "XGBRegressor": XGBRegressor(n_jobs=-1),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []  # model names, in training order
r2_list = []     # test-set R2 per model, aligned with model_list

# Iterate name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

### Results

In [None]:
# Pair each model name with its test-set R2 and rank best-first.
model_scores = pd.DataFrame(
    list(zip(model_list, r2_list)),
    columns=['Model Name', 'R2_Score'],
)
model_scores.sort_values(by=["R2_Score"], ascending=False)

## Catboost Parameter Tuning

* Catboost does NOT require 1-hot encoding in preprocessing. 
* Instead categorical features must be declared by index.
* PCA was not used for this problem because the large number of distinct values in 'category_id' would lead to problems with overfitting and/or interpretation.
* t-SNE will be considered at a later time.

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV, train_test_split


# Positions of the categorical columns in the TRANSFORMED matrix.
# A ColumnTransformer emits its outputs grouped by transformer, so positions
# from the raw DataFrame (e.g. [0, 2, 22, 25, 26]) no longer point at the
# categorical columns. Read them from the fitted preprocessor instead.
# This only needs to be done when training the model (not part of preprocessing).
CATEGORICAL_TRANSFORMERS = ('categ', 'str')
categorical_features_indices = sorted(
    col_idx
    for name, col_slice in preprocessor_catboost.output_indices_.items()
    if name in CATEGORICAL_TRANSFORMERS
    for col_idx in range(col_slice.start, col_slice.stop)
)

# Define the parameter grid to search
param_grid = {
    'iterations': [100, 200, 300],      # Number of boosting iterations
    'depth': [6, 8, 10],                # Depth of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    }

# Create a CatBoostRegressor model (verbose=False keeps the per-iteration
# training log of every CV fit out of the notebook output)
catboost_model = CatBoostRegressor(verbose=False)

# Initialize the GridSearchCV object.
# `scoring` must be a scorer name or a callable(estimator, X, y) returning a
# single number. evaluate_model(true, predicted) returns a 3-tuple, so it
# cannot be used here. The built-in negated RMSE matches the sign flip in the
# report below.
grid_search_lin = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=5,
                               scoring='neg_root_mean_squared_error', n_jobs=-1, error_score='raise')

# Perform the grid search (cat_features is forwarded to CatBoostRegressor.fit)
grid_search_lin.fit(X_train, y_train, cat_features=categorical_features_indices)

# Print the best hyperparameters and corresponding cross-validated RMSE score
print("Best hyperparameters found:")
print(grid_search_lin.best_params_)
print("Best RMSE score:", -grid_search_lin.best_score_)

# Get the best trained model
best_catboost_model_lin = grid_search_lin.best_estimator_

# Evaluate the best model on the validation set
validation_predictions = best_catboost_model_lin.predict(X_validate)
validation_mae, validation_rmse, validation_r2 = evaluate_model(y_validate, validation_predictions)
print("Validation MAE: {:.4f}".format(validation_mae))
print("Validation RMSE: {:.4f}".format(validation_rmse))
print("Validation R2: {:.4f}".format(validation_r2))

########################
## Output recorded from an earlier run (kept for reference):
########################
# Best hyperparameters found:
# {'depth': 10, 'iterations': 300, 'learning_rate': 0.1}
# Best RMSE score: 3.834154628241867