# Regression Model Creation

## Import Libraries

In [1]:
import os
from operator import itemgetter    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


### Import Data from Pre-Processing
* Missing values HAVE been imputed.
* No PCA has been performed yet, and no one-hot encoding has been applied.


In [2]:
# ---------------------------------------------------------------
# Import data from pre-processing
# ---------------------------------------------------------------
# The first CSV column becomes the DataFrame index. The 'race' column is
# removed as part of the same expression, so the cell yields the same `df`
# every time it is re-run.
df = (
    pd.read_csv('../_data/operations_imputed_CLEAN_v2.csv', index_col=0)
    .drop(columns=['race'])
)

In [None]:
# Print a summary of the loaded frame: index range, column names,
# non-null counts and dtypes.
df.info()

### Create the X and y DataFrames for REGRESSION Model


  * Create y.
  * Create X (complete with all the features).
  * Drop the features we identified as not meeting the impact threshold.
  * Target = `LOS` (continuous variable)  



In [3]:
# NOTE: when building a categorical model, reinsert 'prolonged_LOS' and drop
# 'LOS' instead.

# Columns of X that are used for training. Excluded columns are things such as
# Operation_ID, Subject_ID, ...
features_to_retain = [
    'category_id', 'age', 'sex', 'weight', 'height',
    'hr', 'pip', 'pmean', 'rr', 'spo2', 'vt',
    'chloride', 'creatinine', 'glucose', 'hb', 'hco3',
    'lymphocyte', 'platelet', 'potassium', 'sodium',
    'total_bilirubin', 'wbc',
    'icu_visit', 'or_duration', 'anesth_duration',
    'department', 'antype',
]

# y: the regression target (the continuous `LOS` column).
y = df['LOS']

# X: remove the target, then keep only the retained feature columns
# (used for train / validate / test).
X = df.drop(columns='LOS')[features_to_retain]

X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 76742 entries, 8 to 128030
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   category_id      76742 non-null  object 
 1   age              76742 non-null  int64  
 2   sex              76742 non-null  object 
 3   weight           76742 non-null  float64
 4   height           76742 non-null  float64
 5   hr               76742 non-null  float64
 6   pip              76742 non-null  float64
 7   pmean            76742 non-null  float64
 8   rr               76742 non-null  float64
 9   spo2             76742 non-null  float64
 10  vt               76742 non-null  float64
 11  chloride         76742 non-null  float64
 12  creatinine       76742 non-null  float64
 13  glucose          76742 non-null  float64
 14  hb               76742 non-null  float64
 15  hco3             76742 non-null  float64
 16  lymphocyte       76742 non-null  float64
 17  platelet        

## Training-Test-Validation Split

- Training Set (60% of total, i.e. what remains of the initial 80% once the validation set is removed): 
  - Used to train the models.
- Validation Set (20% of total, carved out of the initial 80% training portion): 
  - Used to fine-tune hyperparameters, select models, and monitor training progress.  
- Testing Set (20% of total): 
  - Used to evaluate the final model's performance on unseen data and estimate its generalization performance.

In [4]:
from sklearn.model_selection import train_test_split

# Split fractions used when calling split_data().
TEST_SPLIT = 0.2                   # share held out as the test set
VALIDATION_SPLIT = 0.2             # share used as the validation set
TRAINING_SPLIT = 1 - TEST_SPLIT    # share left after removing the test set


def split_data(X, y, test_split=0.2, validation_split=0.2, random_state=None):
    """
    Splits data into training, validation, and test sets.

    Both fractions are expressed relative to the FULL dataset. The validation
    fraction is rescaled internally before the second split, so the defaults
    produce 60% train / 20% validation / 20% test.

    Parameters:
    - X: Features data.
    - y: Target variable.
    - test_split: Fraction of the full data to be used as test set.
    - validation_split: Fraction of the full data to be used as validation set.
    - random_state: Seed for the random number generator (used for both splits).

    Returns:
    A 6-tuple: (X_train, X_validate, X_test, y_train, y_validate, y_test).

    Raises:
    ValueError: if the two fractions are not in (0, 1) or together leave no
    room for a training set.
    """
    # Fail early with a readable message instead of an error raised from deep
    # inside the second train_test_split call.
    if not (0 < test_split < 1) or not (0 < validation_split < 1 - test_split):
        raise ValueError(
            "test_split and validation_split must each be in (0, 1) and sum "
            f"to less than 1; got test_split={test_split}, "
            f"validation_split={validation_split}"
        )

    # Split data into (training + validation) and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_split, random_state=random_state)

    # Rescale the validation fraction: it is given relative to the full data,
    # but the second split only sees the (1 - test_split) remainder.
    validation_size = validation_split / (1 - test_split)

    # Split the remainder again into training and validation sets
    X_train, X_validate, y_train, y_validate = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=random_state)

    return X_train, X_validate, X_test, y_train, y_validate, y_test


## Call data split function:
# split_data returns a 6-tuple (not a dict), so it is unpacked directly.
X_train, X_validate, X_test, y_train, y_validate, y_test = split_data(
    X, y,
    test_split=TEST_SPLIT,
    validation_split=VALIDATION_SPLIT,
    random_state=85100,  # fixed seed so the split is reproducible
)

# Sanity check: every row ends up in exactly one of the three splits.
assert len(X_train) + len(X_validate) + len(X_test) == len(X)
assert len(y_train) + len(y_validate) + len(y_test) == len(y)

# Print the shapes of the splits
for split_name, split_values in [
    ('X_train', X_train), ('X_validate', X_validate), ('X_test', X_test),
    ('y_train', y_train), ('y_validate', y_validate), ('y_test', y_test),
]:
    print(f"{split_name} shape: {split_values.shape}")

# Define Pipeline
* This section is limited to establishing the pre-processing pipeline. 
* The pipeline is only called (fitted) after the data has been split. 
  
### Linear Regression Pipeline

In [6]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer,OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression

# Columns treated as categorical: they are one-hot encoded, never scaled.
COLS_TO_CAST = ['category_id','antype','sex','department', 'icu_visit']
string_features = COLS_TO_CAST

# Numeric columns to standardise. Anything already listed as categorical is
# excluded so that no column is both scaled AND one-hot encoded (this would
# happen for e.g. `icu_visit` if it is stored as a number -- dtype not
# confirmed here).
numeric_features = [
    col for col in X.select_dtypes(include=['int', 'float']).columns
    if col not in string_features
]

# Define transformers
numeric_transformer = StandardScaler()
# NOTE(review): string_transformer is NOT used in the ColumnTransformer below.
# It is kept only so the name stays defined; wiring a lambda-based transformer
# into the pipeline would also prevent pickling the fitted pipeline.
string_transformer = FunctionTransformer(lambda x: x.astype(str), validate=False)
# Categories unseen during fit are encoded as all-zero rows instead of raising.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Combine transformers into a preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('one_hot', onehot_encoder, string_features)],
    remainder='passthrough')  # columns in neither list are passed through unchanged

# Full pipeline: preprocessing followed by the estimator (a linear regressor)
pipeline_LR = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())])


## MODEL - Fit

### Linear Regression - Baseline

In [7]:
#########################
#
#  SIMPLE LINEAR REGRESSION Pipeline -
#  -- No tuning. 
########################

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Fit the untuned linear-regression pipeline on the training split.
pipeline_LR.fit(X_train, y_train)

# Score it on the validation split.
y_pred_lin = pipeline_LR.predict(X_validate)
r2 = r2_score(y_validate, y_pred_lin)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated (scikit-learn 1.4, scheduled for removal in
# 1.6), whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_validate, y_pred_lin))
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")

R-squared of base model: -30263841350018.305
RMSE of the base model: 31113658.038


In [8]:
from sklearn import set_config
# Ask scikit-learn to render estimators as a diagram in the notebook.
set_config(display='diagram')
# Bare last expression: the cell output is the rendered pipeline_LR.
pipeline_LR

------------------------------
**Pipeline above this line is functional :)**


## XGBoost - Linear

### Define Pipeline - XGB

In [9]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, FunctionTransformer,OneHotEncoder
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor

# Columns treated as categorical: they are one-hot encoded, never scaled.
COLS_TO_CAST = ['category_id','antype','sex','department', 'icu_visit']
string_features = COLS_TO_CAST

# Numeric columns to standardise. Anything already listed as categorical is
# excluded so that no column is both scaled AND one-hot encoded (this would
# happen for e.g. `icu_visit` if it is stored as a number -- dtype not
# confirmed here).
numeric_features = [
    col for col in X.select_dtypes(include=['int', 'float']).columns
    if col not in string_features
]

# Define transformers
numeric_transformer = StandardScaler()
# NOTE(review): string_transformer is NOT used in the ColumnTransformer below.
# It is kept only so the name stays defined; wiring a lambda-based transformer
# into the pipeline would also prevent pickling the fitted pipeline.
string_transformer = FunctionTransformer(lambda x: x.astype(str), validate=False)
# Categories unseen during fit are encoded as all-zero rows instead of raising.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Combine transformers into a preprocessor with ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('one_hot', onehot_encoder, string_features)],
    remainder='passthrough')  # columns in neither list are passed through unchanged

# MODEL DECLARATION: XGB_pipeline
# Full pipeline: preprocessing followed by the estimator (an XGBoost regressor
# with default hyperparameters).
XGB_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor())])


### XGB - Baseline

In [10]:
#########################
#
#  XGB  Pipeline -
#  -- No tuning. 
########################

from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Fit the untuned XGBoost pipeline on the training split.
XGB_pipeline.fit(X_train, y_train)

# Score it on the validation split.
y_pred_XGB = XGB_pipeline.predict(X_validate)
r2 = r2_score(y_validate, y_pred_XGB)
# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated (scikit-learn 1.4, scheduled for removal in
# 1.6), whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_validate, y_pred_XGB))
print(f'R-squared of base model: {r2}')
print(f"RMSE of the base model: {rmse:.3f}")

R-squared of base model: 0.5060832787913404
RMSE of the base model: 3.975


### Define a Scorer for Grid Search

In [12]:
from sklearn.model_selection import cross_val_score, GridSearchCV


# Define the parameter grid to search. The `model__` prefix routes each
# parameter to the 'model' step of XGB_pipeline.
param_grid = {
    'model__n_estimators': [100, 200, 300],      # Number of boosting iterations (trees)
    'model__max_depth': [6, 8, 10],               # Depth of trees
    'model__learning_rate': [0.01, 0.1, 0.2],     # Learning rate
}

# Initialize the GridSearchCV object
grid_search_XGB = GridSearchCV(
    estimator=XGB_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',  # negated RMSE: higher is better
    n_jobs=-1,
    error_score='raise'  # surface fit failures instead of scoring them as NaN
)

# Perform the grid search (cross-validated on the training split only)
grid_search_XGB.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding cross-validated RMSE
# (best_score_ is the negated RMSE, hence the sign flip).
print("Best hyperparameters found:")
print(grid_search_XGB.best_params_)
print("Best RMSE score:", -grid_search_XGB.best_score_)

# Get the best trained model (refit on the whole training split by GridSearchCV)
best_XGB_lin = grid_search_XGB.best_estimator_

# Evaluate the best model on the validation set and report the metrics, so the
# tuned model can be compared with the untuned baseline on the same data.
validation_predictions = best_XGB_lin.predict(X_validate)
validation_rmse = np.sqrt(mean_squared_error(y_validate, validation_predictions))
validation_r2 = r2_score(y_validate, validation_predictions)
print(f"Validation RMSE of tuned model: {validation_rmse:.3f}")
print(f"Validation R-squared of tuned model: {validation_r2}")

Best hyperparameters found:
{'model__learning_rate': 0.1, 'model__max_depth': 6, 'model__n_estimators': 300}
Best RMSE score: 3.7674818923960727


## Model Deployment


In [None]:
import pickle

# Persist the tuned pipeline (preprocessing + model) to disk.
output_path = '../_output/XGB_linear.pickle'
# Create the output directory first so open() does not raise FileNotFoundError
# when it does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as f:
    pickle.dump(best_XGB_lin, f)
