# Data Cleaning & One-Hot Encoding

In [31]:
import preprocessing as pp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_datasets as tfds

ModuleNotFoundError: No module named 'tensorflow_datasets'

In [2]:
df = pp.import_df()
df = pp.impute_missing_values(df)
df = pp.one_hot_encode(df)

## Creating Diff Columns

In [3]:
# all_cols = list(df.columns)
# max_cols = pp.find_columns(all_cols, 'max')
# min_cols = pp.find_columns(all_cols, 'min')

In [4]:
# cols_to_find_diff = [col[:-4] for col in max_cols]

In [5]:
# def find_diff(df, col_list):
#     """
#     """
#     new_df = df.copy(deep=True)
#     for col in col_list:
#         col_name = col + '_diff'
#         min_col_name = col + '_min'
#         max_col_name = col + '_max'
#         new_df[col_name] = new_df[max_col_name] - new_df[min_col_name]
#     return new_df

In [6]:
# df = find_diff(df, cols_to_find_diff)

## Identifying features with high correlation to target

In [7]:
x = df.corr()['hospital_death']
x = list(zip(list(x.index),list(x)))
above_5 = [i for i, j in x if abs(j) > 0.1]
below_5 = [i for i, j in x if abs(j) <= 0.1]

In [8]:
# above_5

## Train/Test Split and Scaling Features

In [9]:
X = df.drop('hospital_death',axis=1)
y = df['hospital_death']

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=2,stratify=y)

In [11]:
continuous_col = ['age', 'bmi', 'height', 'pre_icu_los_days', 'weight', 'map_apache', 'resprate_apache',
                  'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_max', 'd1_mbp_min', 'd1_resprate_max',
                  'd1_resprate_min', 'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max', 'd1_sysbp_min', 
                  'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'd1_temp_max', 'd1_temp_min', 
                  'h1_diasbp_max', 'h1_diasbp_min', 'h1_heartrate_max', 'h1_heartrate_min', 'h1_mbp_max', 
                  'h1_mbp_min', 'h1_resprate_max', 'h1_resprate_min', 'h1_spo2_max', 'h1_spo2_min', 
                  'h1_sysbp_max', 'h1_sysbp_min', 'd1_bun_max', 'd1_bun_min', 'd1_calcium_max', 
                  'd1_calcium_min', 'd1_creatinine_max', 'd1_creatinine_min', 'd1_glucose_max', 'd1_glucose_min',
                  'd1_hco3_max', 'd1_hco3_min', 'd1_hematocrit_max', 'd1_hematocrit_min', 'd1_platelets_max',
                  'd1_platelets_min', 'd1_potassium_max', 'd1_potassium_min', 'd1_sodium_max', 'd1_sodium_min',
                  'd1_wbc_max', 'd1_wbc_min', 'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob']

In [12]:
#continuous_col = [x for x in continuous_col if x not in below_5]

In [13]:
ss = StandardScaler()

In [14]:
x_train.loc[:,continuous_col] = ss.fit_transform(x_train[continuous_col])
x_test.loc[:,continuous_col] = ss.transform(x_test[continuous_col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [15]:
x_train = pd.DataFrame(x_train)
y_train = pd.Series(y_train)

# Modeling

## Tree-Based Methods
### Gradient Boosting Classifier

In [17]:
GBC = GradientBoostingClassifier()
GBC.fit(x_train, y_train)
y_pred = GBC.predict(x_test)

In [18]:
accuracy_score(y_test, y_pred)

0.926293408929837

In [19]:
f1_score(y_test, y_pred)

0.405452946350044

In [20]:
roc_auc_score(y_test,y_pred)

0.6387480268424806

In [21]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96     16760
           1       0.67      0.29      0.41      1583

    accuracy                           0.93     18343
   macro avg       0.80      0.64      0.68     18343
weighted avg       0.91      0.93      0.91     18343



### RANDOM FOREST

In [22]:
rfc = RandomForestClassifier(max_depth=20, n_estimators=100, min_samples_leaf=2,min_samples_split=5)

In [23]:
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=20, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [24]:
y_pred = rfc.predict(x_test)

In [25]:
accuracy_score(y_test, y_pred)

0.9271656762797797

In [26]:
f1_score(y_test, y_pred)

0.3721804511278196

In [27]:
print(classification_report(y_test,y_pred))
confusion_matrix(y_test, y_pred)

              precision    recall  f1-score   support

           0       0.93      0.99      0.96     16760
           1       0.73      0.25      0.37      1583

    accuracy                           0.93     18343
   macro avg       0.83      0.62      0.67     18343
weighted avg       0.92      0.93      0.91     18343



array([[16611,   149],
       [ 1187,   396]])

In [28]:
roc_auc_score(y_test, y_pred)

0.6206338565938514

## Neural Network

In [30]:
train_dataset = tf.data.Dataset.from_tensor_slices((x_train.values, y_train.values))
test_dataset = tf.data.Dataset.from_tensor_slices((x_test.values,y_test.values))

NameError: name 'tf' is not defined

In [None]:
BATCH_SIZE = 128
SHUFFLE_BUFFER_SIZE = 120000

train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, kernel_regularizer=tf.keras.regularizers.l2(0.1), activation='relu'),
    tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Dense(64, kernel_regularizer=tf.keras.regularizers.l2(0.1), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    
    tf.keras.layers.Dense(32, kernel_regularizer=tf.keras.regularizers.l2(0.1), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    
    tf.keras.layers.Dense(1, activation='sigmoid')
    
])

In [None]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
             optimizer=tf.keras.optimizers.Adam(learning_rate=5e-4),
             metrics=['acc'])

In [None]:
model.fit(train_dataset,validation_data=test_dataset,epochs=50)

In [None]:
y_pred = model.predict(test_dataset)

In [None]:
f1_score(y_test,np.round(y_pred))

In [None]:
roc_auc_score(y_test,np.round(y_pred))

In [None]:
print(classification_report(y_test,np.round(y_pred)))

In [None]:
model.summary()