In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = None

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from sklearn import preprocessing
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, f1_score, roc_auc_score
from sklearn.feature_selection import SequentialFeatureSelector as SFS, SelectFromModel
from xgboost import XGBClassifier as XGBC

from sklearn.neural_network import MLPRegressor

In [2]:
merged=pd.read_csv('Mmerged.csv')
mergedT=pd.read_csv('MmergedTest.csv')
mergedH=pd.read_csv('MmergedHoldout.csv')

In [3]:
reg_merged=merged.drop(['id','hosp_admittime','hosp_dischtime','icu_intime','icu_outtime','icu_outcome', 'icu_death'], axis=1)
reg_mergedT=mergedT.drop(['id','hosp_admittime','hosp_dischtime','icu_intime','icu_outtime','icu_outcome', 'icu_death'],axis=1)
reg_mergedH=mergedH.drop(['id','hosp_admittime','hosp_dischtime','icu_intime','icu_outtime','icu_outcome', 'icu_death'],axis=1)

In [4]:
#split X, y

regX_train=reg_merged.drop(['los_icu'], axis=1)
regy_train=reg_merged['los_icu']

regX_test=reg_mergedT.drop(['los_icu'], axis=1)
regy_test=reg_mergedT['los_icu']

regX_holdout=reg_mergedH.drop(['los_icu'], axis=1)
regy_holdout=reg_mergedH['los_icu']

In [5]:
#normalise
num_cols = regX_train.columns[regX_train.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
scaler = preprocessing.StandardScaler()
regX_train[num_cols] = scaler.fit_transform(regX_train[num_cols])

num_cols = regX_test.columns[regX_test.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
scaler = preprocessing.StandardScaler()
regX_test[num_cols] = scaler.fit_transform(regX_test[num_cols])

num_cols = regX_holdout.columns[regX_holdout.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
scaler = preprocessing.StandardScaler()
regX_holdout[num_cols] = scaler.fit_transform(regX_holdout[num_cols])

In [6]:
#select frm model

sel = SelectFromModel(RandomForestRegressor())
sel.fit(regX_train, regy_train)

selected_feats=sel.get_feature_names_out()
print(selected_feats)
print(len(selected_feats))

['admission_age' 'weight_admit' 'height' 'charlson_score' 'aniongap'
 'bicarbonate' 'bun' 'calcium' 'chloride' 'creatinine' 'glucose' 'sodium'
 'potassium' 'hematocrit' 'hemoglobin' 'mch' 'mchc' 'mcv' 'platelet' 'rbc'
 'rdw' 'wbc' 'inr' 'pt' 'ptt']
25


In [7]:
rfr = RandomForestRegressor()

parameters = {'max_depth':np.arange(start=1,stop=10,step=1),
             'n_estimators':np.arange(start=100,stop=500,step=100),
             }
KfoldCV = KFold(n_splits=5, shuffle =True, random_state=1)
Best_rfr = GridSearchCV(
    rfr,
    param_grid=parameters,
    cv=KfoldCV,
)

Best_rfr.fit(regX_train,regy_train)


In [8]:
Best_rfr.best_estimator_

In [9]:
rfr_pred = Best_rfr.predict(regX_test)
rfr_MSE = mean_squared_error(regy_test, rfr_pred)

rfr_MSE

34.292493549573585

In [10]:
rfr_predH = Best_rfr.predict(regX_holdout)
rfr_MSE_H = mean_squared_error(regy_holdout, rfr_predH)

rfr_MSE_H

29.136792281990783