In [None]:
import pandas as pd
import numpy as np

In [None]:
sofa_score = pd.read_csv('sofa_scores.csv')

In [None]:
all_subgroups = pd.read_csv('all_subgroups.csv')
all_subgroups = all_subgroups[['subject_id', 'hadm_id', 'subgroup']]
temp = pd.read_csv('chartevents_temp_cleaned.csv')
temp = temp[temp['hadm_id'].isin(all_subgroups['hadm_id'].value_counts().index)][['hadm_id', 'Final_Temperature_F', 'charttime']]
#temp = temp[temp['charttime'].isin(all_subgroups['charttime'].value_counts().index)][['hadm_id', 'Final_Temperature_F', 'charttime']]
bp = pd.read_csv('chartevents_bp_cleaned.csv')
bp = bp[bp['hadm_id'].isin(all_subgroups['hadm_id'].value_counts().index)][['hadm_id', 'Combined_Blood_Pressure', 'charttime']].dropna()
def impute(heart_rate, max_rate):
    if pd.isna(heart_rate) == True:
        return max_rate
    else:
        return heart_rate
heart_rate = pd.read_csv('chartevents_hr_cleaned.csv')
heart_rate['heart_rate'] = heart_rate.apply(lambda x: impute(x["Heart Rate"], x["Heart rate Alarm - High"]), axis=1)
heart_rate = heart_rate[['hadm_id', 'charttime', 'heart_rate']]
o2_state = pd.read_csv('chartevents_o2sat.csv')
o2_state = o2_state[['hadm_id', 'charttime', 'SpO2']]
respiritory_rate = pd.read_csv('chartevents_rr_cleaned.csv')
respiritory_rate = respiritory_rate[['hadm_id', 'charttime', 'RespiratoryRate_combined']]
white_blood_cell = pd.read_csv('chartevents_wbc_cleaned.csv')
white_blood_cell = white_blood_cell[['hadm_id', 'charttime', 'WBC_combined']]

In [None]:
df = all_subgroups.merge(bp, on = 'hadm_id', how = 'outer')

In [None]:
df = df.merge(temp, on = ['charttime', 'hadm_id'], how = 'left')

In [None]:
df = df.merge(heart_rate, on = ['charttime', 'hadm_id'], how = 'left')


In [None]:
df = df.merge(respiritory_rate, on = ['charttime', 'hadm_id'], how = 'left')

In [None]:
df = df.merge(o2_state, on = ['charttime', 'hadm_id'], how = 'left')

In [None]:
df = df.dropna()

In [None]:
df

In [None]:

df['date'] = pd.to_datetime(df['charttime']).dt.date
df['charttime'] = pd.to_datetime(df['charttime'])

In [None]:
df.isna().sum()

In [None]:
#accounts for readmission 
df['hadm_number'] = df.groupby('subject_id')['hadm_id'].rank(method='dense').astype(int)

In [None]:
df_sorted = df.sort_values(by=['subject_id', 'hadm_id', 'charttime'])

# Define a function to assign hours
def assign_hours(group):
    group['hours_in_hospital'] = (group['charttime'] - group['charttime'].iloc[0]).dt.total_seconds() / 3600
    return group

# Apply the function to each group of subject_id and hadm_id
df_with_hours = df_sorted.groupby(['subject_id', 'hadm_id']).apply(assign_hours)

# Convert hours_in_hospital to integers
df_with_hours['hours_in_hospital'] = df_with_hours['hours_in_hospital'].astype(int)

# Display the DataFrame with the new column
df_with_hours = df_with_hours.reset_index(drop = True)

In [None]:
sofa_score = sofa_score[['hadm_id', 'sofa']]

final_df = df_with_hours.merge(sofa_score, on = 'hadm_id', how = 'right')
subgroup_1 = final_df[final_df['subgroup'] == 1]

In [None]:
import seaborn as sns

In [None]:
def groups(x):
    if 0<=x<=3:
        return 0 
    elif 4<=x<=6:
        return 1 
    else:
        return 2

In [None]:
subgroup_1['group'] = subgroup_1['sofa'].apply(lambda x: groups(x))

In [None]:
subgroup_1

In [None]:
subgroup_1 = subgroup_1.sample(frac = 0.4)

In [None]:
subgroup_1

In [20]:
X = subgroup_1.iloc[:,:-1][['Final_Temperature_F', 'Combined_Blood_Pressure', 'heart_rate', 'RespiratoryRate_combined', 'SpO2', 'hours_in_hospital']]
y = subgroup_1.iloc[:,-1]

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3960 candidates, totalling 19800 fits
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   3.8s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=  11.2s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=  14.9s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1200; total time=  22.4s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  29.6s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1800; total time=  33.6s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200

[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   7.3s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=  10.7s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  17.3s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1400; total time=  24.2s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  27.8s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=  34.9s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   7.1s
[CV] END bootstrap=True, max_depth=10, max_f

[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=400; total time=   7.6s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=  14.9s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  18.7s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1400; total time=  26.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1800; total time=  33.3s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=  37.4s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=800; total time=  14.9s
[CV] END bootstrap=True, max_depth=10, max_f

[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.0s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=600; total time=  11.2s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1000; total time=  18.5s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1200; total time=  22.6s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=1600; total time=  29.6s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=  37.2s
[CV] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=400; total time=   7.4s
[CV] END bootstrap=True, max_depth=10, max_f

In [None]:
grid_search.best_params_

In [None]:
sum(grid_search.predict(X_test) == y_test)/len(y_test)