# Regression Model Creation

## Import Libraries

In [2]:
import os
from operator import itemgetter    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


### Import Data from Pre-Processing
* Missing values HAVE been imputed.
* No PCA performed yet, no 1hot encoding. 


In [3]:
######################
#
# Import Data from PreProcessing
#
#####################

df= pd.read_csv('../_data/operations_imputed_CLEAN_v2.csv', index_col=0)

df.drop(['race'], axis=1, inplace=True)

In [None]:
df.info()

### Create the X and y DataFrames for REGRESSION Model


  * create y
  * create X (complete with all the features)
  * drop the features we identified as not meeting impact threshold.
  *  * Target = `LOS` (continuous variable)  



In [4]:
# When doing a Categorical Model, reinsert 'prolonged_LOS' and instead, drop 'LOS'

## Features to retain are those in X that will be used in training. Exludued features are features such as Operation_ID, Subject_ID..
features_to_retain = ['category_id','age','sex',	'weight',	'height',	'hr',	'pip',	'pmean',	'rr',	'spo2',	'vt',	'chloride',	'creatinine',	'glucose',	'hb',	'hco3',	'lymphocyte',	'platelet',	'potassium',	'sodium',	'total_bilirubin',	'wbc',	'icu_visit',	'or_duration',	'anesth_duration',	'department','antype'] 

## Create the Y, the Target
y = df['LOS']

## Create X the Features for Train/Test/Validate
X = df.drop('LOS', axis=1)
X= X[features_to_retain]

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76742 entries, 8 to 128030
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category_id      76742 non-null  object 
 1   age              76742 non-null  int64  
 2   sex              76742 non-null  object 
 3   weight           76742 non-null  float64
 4   height           76742 non-null  float64
 5   hr               76742 non-null  float64
 6   pip              76742 non-null  float64
 7   pmean            76742 non-null  float64
 8   rr               76742 non-null  float64
 9   spo2             76742 non-null  float64
 10  vt               76742 non-null  float64
 11  chloride         76742 non-null  float64
 12  creatinine       76742 non-null  float64
 13  glucose          76742 non-null  float64
 14  hb               76742 non-null  float64
 15  hco3             76742 non-null  float64
 16  lymphocyte       76742 non-null  float64
 17  platelet        

# Define Pipeline


In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline


# Define the columns that you want to scale and cast to strings
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features = [col for col in numeric_features if col not in ['a', 'b', 'c']]
string_features = ['category_id','antype','sex','department', 'icu_visit'] 

# Define transformers
numeric_transformer = StandardScaler()
string_transformer = FunctionTransformer(lambda x: x.astype(str), validate=False)

# Combine transformers into a preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('str', string_transformer, string_features)
    ])

# Create a full pipeline by combining with an estimator, for example, a classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MODEL_NAME )],
    remainder='passthrough')  # Leaves the rest of the columns alone



# Split the data (example using a generic train_test_split function)
# X_train, X_test, y_train, y_test = ...

# Fit the model with the training data
# model.fit(X_train, y_train)

# Now you can use the model to make predictions or evaluate
# y_pred = model.predict(X_test)

## Training-Test-Validation Split

- Training Set (80% of total): 
  - Used to train the models.
- Validation Set (20% of Traning Set ): 
  - Used to fine-tune hyperparameters, select models, and monitor training progress.  
- Testing Set (20% of total): 
  - Used to evaluate the final model's performance on unseen data and estimate its generalization performance.

In [7]:
from sklearn.model_selection import train_test_split

TEST_SPLIT = .2
TRAINING_SPLIT = 1-TEST_SPLIT
VALIDATION_SPLIT = .2


def split_data(X, y, test_split=0.2, validation_split=0.2, random_state=None):
    """
    Splits data into training, validation, and test sets.

    Parameters:
    - X: Features data.
    - y: Target variable.
    - test_split: Fraction of the data to be used as test set.
    - validation_split: Fraction of the training data to be used as validation set.
    - random_state: Seed for the random number generator.

    Returns:
    A dictionary containing the split data.
    """

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_split, random_state=random_state)

    # Adjust validation split to account for the initial test split
    validation_size = validation_split / (1 - test_split)

    # Split the training data again into training and validation sets
    X_train, X_validate, y_train, y_validate = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=random_state)

    # Create a dictionary to hold the data splits
    data_splits = {
        'X_train': X_train,
        'X_validate': X_validate,
        'X_test': X_test,
        'y_train': y_train,
        'y_validate': y_validate,
        'y_test': y_test
    }

    return data_splits


'''# Example usage:
# Assuming X and y are your data and labels
data_splits = split_data(X, y, test_split=TEST_SPLIT, validation_split=VALIDATION_SPLIT, random_state=85100)

# Print the shapes of the splits
for key, value in data_splits.items():
    shape = value.shape
    print(f"{key} shape: {shape}")'''

## Call data split function:
data_splits = split_data(X, y, test_split=TEST_SPLIT, validation_split=VALIDATION_SPLIT, random_state=85100)
for key, value in data_splits.items():
    shape = value.shape
    print(f"{key} shape: {shape}")

X_train shape: (46044, 27)
X_validate shape: (15349, 27)
X_test shape: (15349, 27)
y_train shape: (46044,)
y_validate shape: (15349,)
y_test shape: (15349,)


## Standardize and/or Normalize X
* We do NOT scale Y, the target
* We fit the StandardScaler on X_training and then transform both your training and validation sets

### Class to modify Features: to be either cast as Str or Scaled

In [6]:
class RawFeats:
    def __init__(self, feats):
        self.feats = feats

    def fit(self, X, y=None):
        pass

    def transform(self, X, y=None):
        return X[self.feats]

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def cast_columns_as_string(self, X, columns_to_cast):
        X[columns_to_cast] = X[columns_to_cast].astype(str)
        return X
    
    def get_numeric_columns(self, X):
        numeric_columns = X.select_dtypes(include=['int', 'float']).columns
        return numeric_columns
    
COLS_TO_CAST = ['category_id','antype','sex','department', 'icu_visit'] 
raw_feats = RawFeats(feats=X_train)

In [None]:
##### 
## SCALE X_train and X_validate
#########
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Two preprocessors, one for scale and one for casting

# Get numerical columns for scaling
# COLS_TO_SCALE = RawFeats.get_numeric_columns(features_to_retain)

preprocessor_nums = Pipeline(steps=[("rawFeats", raw_feats,'scaler', StandardScaler())])
# preprocessor_cats = Pipeline(steps=[('caster', RawFeats.cast_columns_as_string(X_train,COLS_TO_CAST))])




# Create the preprocessor with separate transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('num', preprocessor_nums, X_train),
        # ('str', preprocessor_cats, )
    ],
    remainder='passthrough'  # Leaves the rest of the columns alone
)

# # Fit on the training data
# preprocessor.fit(X_train)

# # Transform the training and validation data
# X_train_scaled = preprocessor.transform(X_train)
# X_validate_scaled = preprocessor.transform(X_validate)

# Now X_train_scaled and X_validate_scaled have the specified columns scaled, and the rest are unchanged

## MODEL 

### Linear Regression - Baseline

In [None]:
#########################
#
#  SIMPLE LINEAR REGRESSION Pipeline -
#  -- No tuning. 
########################

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))])

preprocessing_df = ColumnTransformer([('categorical', categorical_transform, COLS_TO_CAST)])

pipeline_base = Pipeline([('preprocessing', preprocessing_df),
                          ('model', LinearRegression())])
pipeline_base.fit(X_train, y_train)

y_pred_lin = pipeline_base.predict(X_validate)
r2 = r2_score(y_validate, y_pred_lin)
rmse = mean_squared_error(y_validate, y_pred_lin, squared=False)
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")




In [None]:
from sklearn import set_config
set_config(display='diagram')
pipeline_base

## Ensemble Models - Baselines  

Models employed:
* ExtraTreesRegressor
* Random Forest
* XGBRegressor
* CatBoost

In [None]:
#########################
#
#  Ensemble Pipeline -
#  -- No tuning. 
########################

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

model_list= [LinearRegression(),ExtraTreesRegressor (n_jobs=-1),RandomForestRegressor(n_jobs=-1),XGBRegressor(n_jobs=-1)]
model_names = ["Linear Regression","ExtraTreesRegressor", "Random Forest","XGBRegressor"]


ModScores = {}

categorical_transform = Pipeline([('one-hot-encode', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))])

preprocessing_df = ColumnTransformer([('categorical', categorical_transform, COLS_TO_CAST)])

for model_names, model in zip(model_names, model_list):
    pipeline_base = Pipeline([('preprocessing', preprocessing_df),
                          ('model', model)])
    pipeline_base.fit(X_train, y_train)

    y_pred = pipeline_base.predict(X_validate)
    
    # Calculate the R-squared value
    r2 = r2_score(y_validate, y_pred)
    rmse = mean_squared_error(y_validate, y_pred, squared=False)
    
    ModScores[model_names] = rmse
    
    print(f"{model}: R2: {r2:.2f}, RMSE: {rmse:.2f}")

print("_"*100)
for key, value in sorted(ModScores.items(), key=itemgetter(1), reverse=False):
    print(f"{key}: RMSE: {value:.3f}")

########################
## Output:
########################
'''Random Forest: RMSE: 3.995
XGBRegressor: RMSE: 4.004
ExtraTreesRegressor: RMSE: 4.032
Linear Regression: RMSE: 9515250466.184'''

In [None]:
#########################
#
#  CatBoost - Baseline
# 
########################

from catboost import CatBoostRegressor
from catboost import Pool
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


###############
# Specify categorical feature indices
categorical_features_indices = [0, 2, 22,25, 26]


# Create the Pool for training data
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features_indices)

# If you have a validation dataset
validation_pool = Pool(data=X_validate, label=y_validate, cat_features=categorical_features_indices)


# Instantiate CatBoostRegressor with the best hyperparameters
cat_model = CatBoostRegressor()

cat_model.fit(
    train_pool,
    eval_set=validation_pool,  # Remove this if you don't have a validation set
    verbose=10,  # This will print the progress every 10 iterations
    plot=True    # This will plot the learning curve (only works in Jupyter notebooks)
)

predictions = cat_model.predict(X_validate)  # If you used Pool, the data here should not be the Pool object but raw data.

# Calculate the R-squared value
r2 = r2_score(y_validate, predictions)
rmse = mean_squared_error(y_validate, predictions, squared=False)

print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")

########################
## Output:
########################
'''Shrink model to first 619 iterations.
R-squared of base model: 0.5159662312506688
RMSE of the base model: 3.884'''

## Tuning -  Optimize Model with Hyperparameter Tuning via Grid Search

## CatBoost


In [None]:
#########################
#
#  STANDALONE TUNING  - CatBoost
# 
########################

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from catboost import Pool, CatBoostRegressor

### Define a Scorer for Grid Search

In [None]:
#########################
#
#  RMSE Scorer for GridSearch
# 
########################

def rmse_scorer(estimator, X, y):
    """
    Custom scoring function to calculate the negative RMSE (Root Mean Squared Error).

    Parameters:
        estimator: Scikit-learn estimator object
            The model to be evaluated.
        X: array-like or pd.DataFrame
            Features for prediction.
        y: array-like or pd.Series
            True target values.

    Returns:
        float
            Negative RMSE value.
    """
    y_pred = estimator.predict(X)
    mse = mean_squared_error(y, y_pred)
    rmse = sqrt(mse)
    
    # Return negative RMSE for grid search to minimize
    return -rmse

### CatBoost -  Perform GridSearch (Regression)

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV, train_test_split


categorical_features_indices =[0, 2, 22,25, 26]


# Define the parameter grid to search
param_grid = {
    'iterations': [100, 200, 300],      # Number of boosting iterations
    'depth': [6, 8, 10],                # Depth of trees
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate  
    }

# Create a CatBoostRegressor model
catboost_model = CatBoostRegressor()

# Initialize the GridSearchCV object
grid_search_lin = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=5, scoring=rmse_scorer, n_jobs=-1, error_score='raise')

# Perform the grid search
grid_search_lin.fit(X_train, y_train, cat_features=categorical_features_indices)

# Print the best hyperparameters and corresponding MSE score
print("Best hyperparameters found:")
print(grid_search_lin.best_params_)
print("Best RMSE score:", -grid_search_lin.best_score_)

# Get the best trained model
best_catboost_model_lin = grid_search_lin.best_estimator_

# Evaluate the best model on the validation set
validation_predictions = best_catboost_model_lin.predict(X_validate)

########################
## Output:
########################
'''Best hyperparameters found:
{'depth': 10, 'iterations': 300, 'learning_rate': 0.1}
Best RMSE score: 3.834154628241867'''

### CatBoostRegressor - Train Model on Optimized Hyperparameters

In [None]:
#########################
#
#  OPTIMIZED - CatBoost
# 
########################

from catboost import CatBoostRegressor
from catboost import Pool
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


###############
# Specify categorical feature indices
categorical_features_indices = [0, 2, 22,25, 26]


# Create the Pool for training data
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features_indices)

# If you have a validation dataset
validation_pool = Pool(data=X_validate, label=y_validate, cat_features=categorical_features_indices)

#########
# Best hyperparameters found (after hyperparameter tuning)
best_params = {'depth': 8, 'iterations': 200, 'learning_rate': 0.1}

# Instantiate CatBoostRegressor with the best hyperparameters
cat_model_lin = CatBoostRegressor(**best_params)

cat_model_lin.fit(
    train_pool,
    eval_set=validation_pool,  # Remove this if you don't have a validation set
    verbose=10,  # This will print the progress every 10 iterations
    plot=True    # This will plot the learning curve (only works in Jupyter notebooks)
)


y_pred_cat_lin_r2 = cat_model_lin.predict(X_validate)  # If you used Pool, the data here should not be the Pool object but raw data.


# Calculate the R-squared value
r2 = r2_score(y_validate, y_pred_cat_lin_r2)
rmse = mean_squared_error(y_validate, y_pred_cat_lin_r2, squared=False)


print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")


########################
## Output:
########################
'''bestTest = 3.863708828
bestIteration = 177

Shrink model to first 178 iterations.
R-squared of base model: 0.5210853587572815
RMSE of the base model: 3.864'''





### Evaluate Regression Model

In [None]:
import matplotlib.pyplot as plt

# Calculate residuals
residuals = y_validate - y_pred_cat_lin_r2

# Create a scatter plot of residuals vs. predicted values
plt.scatter(y_pred_cat_lin_r2, residuals)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.axhline(y=0, color='r', linestyle='--')
plt.show()

In [None]:
plt.scatter(y_validate,y_pred_cat_lin_r2)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.plot([min(y_validate), max(y_validate)], [min(y_validate), max(y_validate)], color='red', linestyle='--')
plt.show()

In [None]:
plt.hist(residuals, bins=30)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Histogram of Residuals')
plt.show()

## Model Deployment


In [None]:
import pickle

with open('../_output/catboost_model_regression_opt.pickle', 'wb') as f:
    pickle.dump(cat_model_lin, f)
