## Model for Sepsis Risk Categorization for Entire Population

#### Create Usable DataFrame - same construction as the DataFrame built for the comorbidity groups

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the SOFA scores and the subgroup table from the processed data directory.
sofa_score = pd.read_csv('../processed_data/sofa_scores.csv')
all_subgroups = pd.read_csv('../processed_data/all_subgroups.csv')

# Distinct admission ids present in the subgroup table (value_counts drops
# NaN by default); used to restrict the vitals tables below.
known_admissions = all_subgroups['hadm_id'].value_counts().index

# Temperature readings, limited to known admissions.
temp = pd.read_csv('../processed_data/chartevents_temp.csv')
temp = temp.loc[
    temp['hadm_id'].isin(known_admissions),
    ['hadm_id', 'Final_Temperature_F', 'charttime'],
]

# Blood-pressure readings, limited to known admissions; rows with any
# missing value are discarded.
bp = pd.read_csv('../processed_data/chartevents_bp.csv')
bp = bp.loc[
    bp['hadm_id'].isin(known_admissions),
    ['hadm_id', 'Combined_Blood_Pressure', 'charttime'],
].dropna()
def impute(heart_rate, max_rate):
    """Return *heart_rate*, or *max_rate* when the heart rate is missing.

    Args:
        heart_rate: A scalar heart-rate reading; may be NaN/None.
        max_rate: Fallback value used when ``heart_rate`` is missing.

    Returns:
        ``max_rate`` if ``pd.isna(heart_rate)`` is true, else ``heart_rate``.
    """
    return max_rate if pd.isna(heart_rate) else heart_rate
# Heart rate: where the measured value is missing, fall back to the high
# alarm threshold recorded on the same row. This is the same rule as
# impute(), applied column-wise instead of one Python call per row.
# NOTE(review): an alarm threshold is not a measurement - confirm this
# imputation is intended.
heart_rate = pd.read_csv('../processed_data/chartevents_hr.csv')
heart_rate['heart_rate'] = heart_rate['Heart Rate'].fillna(heart_rate['Heart rate Alarm - High'])
heart_rate = heart_rate[['hadm_id', 'charttime', 'heart_rate']]

# Oxygen saturation readings.
o2_state = pd.read_csv('../processed_data/chartevents_o2sat.csv')
o2_state = o2_state[['hadm_id', 'charttime', 'SpO2']]

# Respiratory-rate readings.
respiritory_rate = pd.read_csv('../processed_data/chartevents_rr.csv')
respiritory_rate = respiritory_rate[['hadm_id', 'charttime', 'RespiratoryRate_combined']]

# White-blood-cell readings.
# NOTE(review): not merged into the modelling frame in the cells shown here.
white_blood_cell = pd.read_csv('../processed_data/chartevents_wbc.csv')
white_blood_cell = white_blood_cell[['hadm_id', 'charttime', 'WBC_combined', 'subject_id']]

# Demographic / subgroup columns carried into the model frame.
model_subgroups = all_subgroups[['subject_id', 'hadm_id', 'subgroup', 'rounded_age']]

# Blood pressure provides the (hadm_id, charttime) rows; the outer join keeps
# admissions from either side.
df = model_subgroups.merge(bp, on='hadm_id', how='outer')

# Attach the remaining vitals recorded at the same charttime of the same
# admission, in the same order as before.
for vitals in (temp, heart_rate, respiritory_rate, o2_state):
    df = df.merge(vitals, on=['charttime', 'hadm_id'], how='left')

# Keep only rows where every column is populated.
df = df.dropna()

# Parse the timestamp once, then derive the calendar date from it.
df['charttime'] = pd.to_datetime(df['charttime'])
df['date'] = df['charttime'].dt.date

df.isna().sum()

# Accounts for readmission: dense rank of hadm_id within each subject.
# NOTE(review): this orders admissions by hadm_id value - confirm hadm_id
# increases chronologically for a given subject.
df['hadm_number'] = df.groupby('subject_id')['hadm_id'].rank(method='dense').astype(int)

df_sorted = df.sort_values(by=['subject_id', 'hadm_id', 'charttime'])

# Define a function to assign hours
def assign_hours(group):
    """Return a copy of *group* with an ``hours_in_hospital`` column added.

    ``hours_in_hospital`` is the time elapsed, in fractional hours, between
    each row's ``charttime`` and the ``charttime`` of the group's first row.
    The caller is expected to pass rows already ordered by ``charttime``.

    The input frame is left untouched: pandas does not support functions
    that mutate the object passed in by ``groupby(...).apply``.

    Args:
        group: DataFrame with a datetime ``charttime`` column and at least
            one row.

    Returns:
        A new DataFrame with the extra ``hours_in_hospital`` float column.
    """
    elapsed = group['charttime'] - group['charttime'].iloc[0]
    return group.assign(hours_in_hospital=elapsed.dt.total_seconds() / 3600)

# Apply the function to each group of subject_id and hadm_id
# Hours since the first charted row of each (subject_id, hadm_id) admission.
# A grouped transform is used instead of groupby(...).apply(assign_hours):
# newer pandas deprecates apply operating on the grouping columns, and once
# they are excluded the reset_index(drop=True) below would discard
# subject_id / hadm_id, which the later merge on hadm_id needs.
first_charttime = df_sorted.groupby(['subject_id', 'hadm_id'])['charttime'].transform('first')

df_with_hours = df_sorted.copy()
elapsed_hours = (df_with_hours['charttime'] - first_charttime).dt.total_seconds() / 3600

# Truncate to whole hours.
df_with_hours['hours_in_hospital'] = elapsed_hours.astype(int)

# df_sorted is already ordered by subject_id, hadm_id, charttime, so only the
# index needs renumbering.
df_with_hours = df_with_hours.reset_index(drop=True)

import seaborn as sns

def groups(x):
    """Map a SOFA score to a group label 0, 1 or 2.

    Scores in [0, 5) map to 0, scores in [5, 12) map to 1, and everything
    else (12 and above, negative values, NaN) maps to 2. Half-open ranges
    are used so that a non-integer score such as 4.5 does not fall between
    the buckets and end up in group 2; integer scores are labelled exactly
    as with the closed ranges 0-4 and 5-11.

    Args:
        x: Numeric score.

    Returns:
        The integer group label.
    """
    if 0 <= x < 5:
        return 0
    if 5 <= x < 12:
        return 1
    return 2

# Only the admission id and its SOFA score are needed for labelling.
sofa_score = sofa_score.loc[:, ['hadm_id', 'sofa']]

# Inner join: rows whose admission has no SOFA score are dropped.
final_df = pd.merge(df_with_hours, sofa_score, how='inner', on='hadm_id')

# Target label derived from the SOFA score.
final_df['group'] = final_df['sofa'].map(groups)

#### Sample from entire population

In [None]:
# Draw a 5% random sample of rows. The seed makes the sample (and therefore
# every metric computed below) reproducible, matching the seeded
# train/test split.
sample = final_df.sample(frac=0.05, random_state=42)
sample

In [None]:
# Model inputs, selected by name ('group', the last column, is not among them).
feature_columns = [
    'Final_Temperature_F',
    'heart_rate',
    'RespiratoryRate_combined',
    'SpO2',
    'Combined_Blood_Pressure',
    'hadm_number',
    'hours_in_hospital',
    'rounded_age',
]
X = sample[feature_columns]

# Target: the SOFA-derived group label.
y = sample['group']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the sampled rows for evaluation; the fixed seed keeps the
# split stable between runs.
# NOTE(review): the split is row-wise, while the label comes from the SOFA
# score merged on hadm_id. Rows of one admission can therefore land in both
# train and test - consider a group-wise split on hadm_id.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#### Create Random Forest Classifier for entire population

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid based on the results of random search.
# - min_samples_split must be an int >= 2 (or a float fraction); the former
#   value 1 is rejected by scikit-learn, so those candidates never fitted.
# - max_features='auto' has been removed from scikit-learn and meant the same
#   as 'sqrt' for classifiers; 'log2' replaces the duplicate so the grid
#   still compares two settings.
param_grid = {
    'bootstrap': [True],
    'max_depth': [90, 100, 500, 1000],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4, 10],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [100, 600, 1000],
}

# Base model; seeded so repeated searches select the same parameters.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search with 3-fold cross-validation on all available cores.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Accuracy of the refitted best estimator on the held-out rows.
predictions = grid_search.predict(X_test)
print((predictions == y_test).mean())

In [None]:
# Show the parameter combination selected by the grid search.
grid_search.best_params_

#### Look at other models for entire population

In [None]:
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic-regression baseline. The features are on very different scales
# (e.g. SpO2 vs. hours_in_hospital), so they are standardised first and the
# solver gets more iterations than the default 100, which tends not to
# converge on unscaled inputs. The name `regr` is kept, so later cells can
# still call regr.predict(...).
regr = make_pipeline(
    StandardScaler(),
    linear_model.LogisticRegression(max_iter=1000),
)
regr.fit(X_train, y_train)

In [None]:
# Held-out accuracy of the logistic-regression model: share of test rows
# whose predicted group equals the true group.
(regr.predict(X_test) == y_test).mean()

In [27]:
# Random forest with hand-picked hyper-parameters.
# NOTE(review): n_estimators=500 and max_features=2 are not values from the
# grid searched above - confirm where they come from.
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=90,
    max_features=2,
    min_samples_split=5,
    min_samples_leaf=2,
    bootstrap=True,
)

In [28]:
# Train the configured random forest on the training split.
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=90, max_features=2, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=500)

In [29]:
# Held-out accuracy of the random forest: share of test rows whose predicted
# group equals the true group.
(rf.predict(X_test) == y_test).mean()

0.6195337944157793