## Build a classification model for Survival.csv dataset
Find the dataset description here:
https://physionet.org/content/challenge-2012/1.0.0/


In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): a blanket 'ignore' also hides deprecation, convergence and
# failed-fit warnings raised by pandas / scikit-learn in later cells; consider
# narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import classification_report, matthews_corrcoef, roc_auc_score, f1_score, precision_score, recall_score, precision_recall_curve, auc, roc_curve,average_precision_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier, AdaBoostRegressor, StackingClassifier

In [3]:
# Read the raw dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Survival_dataset.csv')
df.head()

Unnamed: 0,recordid,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death,Age,Gender,Height,Weight,...,SysABP_last,TroponinI_last,TroponinT_last,WBC_last,Weight_last,pH_last,MechVentStartTime,MechVentDuration,MechVentLast8Hour,UrineOutputSum
0,132539,6,1,5,-1,0,54.0,0.0,,,...,,,,9.4,,,,,,
1,132540,16,8,8,-1,0,76.0,1.0,175.3,76.0,...,103.0,,,13.3,81.6,7.37,71.0,360.0,0.0,5.0
2,132541,21,11,19,-1,0,44.0,0.0,,56.7,...,126.0,,,6.2,56.7,7.47,617.0,2160.0,1.0,14.0
3,132543,7,1,9,575,0,68.0,1.0,180.3,84.6,...,,,,7.9,84.6,,,,,
4,132545,17,2,4,918,0,88.0,0.0,,,...,,,,4.8,,,,,,


In [4]:
# Remove the record identifier and the 'Length_of_stay' / 'Survival' columns,
# leaving 'In-hospital_death' as the label used later in the notebook.
# NOTE(review): presumably these are dropped because they are identifier /
# outcome-related fields rather than predictors -- confirm against the
# dataset description.
df = df.drop(columns=['Length_of_stay', 'Survival', 'recordid'])

# Print per-column non-null counts and dtypes for the full (wide) frame.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and
# removed in pandas 2.0 (where the old spelling raises a TypeError).
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 117 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SAPS-I             4000 non-null   int64  
 1   SOFA               4000 non-null   int64  
 2   In-hospital_death  4000 non-null   int64  
 3   Age                4000 non-null   float64
 4   Gender             3997 non-null   float64
 5   Height             2106 non-null   float64
 6   Weight             3669 non-null   float64
 7   CCU                4000 non-null   int64  
 8   CSRU               4000 non-null   int64  
 9   SICU               4000 non-null   int64  
 10  DiasABP_first      2779 non-null   float64
 11  GCS_first          3936 non-null   float64
 12  Glucose_first      3887 non-null   float64
 13  HR_first           3937 non-null   float64
 14  MAP_first          2792 non-null   float64
 15  NIDiasABP_first    3482 non-null   float64
 16  NIMAP_first        3480

In [5]:
# Keep only the columns that have at least 3000 non-null values, then fill the
# remaining gaps in each column with that column's median.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed further down, so test rows influence the imputed
# values -- consider fitting the imputation on the training rows only.
df = df.dropna(axis="columns", thresh=3000)
df_final = df.fillna(value=df.median())

In [6]:
# Give the label column a name without a hyphen so it can be used with attribute access.
df_final = df_final.rename({'In-hospital_death': 'InHospitalDeath'}, axis="columns")

In [7]:
# Feature matrix: every column except the label. Target vector: the label column.
X = df_final.drop(columns="InHospitalDeath").to_numpy()
y = df_final["InHospitalDeath"].to_numpy()

In [8]:
# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the positive class is a
# minority (102 of the 800 test rows in the recorded reports) -- consider
# passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=24,
)

### Support Vector Machine

In [9]:
# Tune the regularisation strength C of a polynomial-kernel SVM using
# 3-fold cross-validation on the training set, scored by F1.
clf_svc_poly = SVC(kernel="poly", random_state=24)

# Candidate values: 50, 100, 150, 200, 250.
params = {"C": np.linspace(50, 250, 5)}

clf_svc_poly_cv = GridSearchCV(
    estimator=clf_svc_poly,
    param_grid=params,
    scoring="f1",
    cv=3,
)
clf_svc_poly_cv.fit(X_train, y_train)

print("Best score:", clf_svc_poly_cv.best_score_)
print("Best parameter:", clf_svc_poly_cv.best_params_)

Best score: 0.319319520174482
Best parameter: {'C': 200.0}


In [25]:
# Refit the polynomial-kernel SVM with the C chosen by the grid search and
# report per-class precision / recall / F1 on the held-out test set.
# The seed is aligned with the one used in the rest of the notebook (24); SVC
# only uses it for probability estimates, which are disabled here, so the
# fitted model is unaffected.
clf_svc_poly = SVC(kernel = "poly", random_state = 24, C=200)
clf_svc_poly.fit(X_train, y_train)
y_pred = clf_svc_poly.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       698
           1       0.41      0.27      0.33       102

    accuracy                           0.86       800
   macro avg       0.66      0.61      0.62       800
weighted avg       0.84      0.86      0.84       800



### Logistic regression

In [11]:
# Baseline: logistic regression with default settings, evaluated on the test set.
clf_log = LogisticRegression(random_state=24)
clf_log.fit(X_train, y_train)
y_pred = clf_log.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93       698
           1       0.47      0.20      0.28       102

    accuracy                           0.87       800
   macro avg       0.68      0.58      0.60       800
weighted avg       0.84      0.87      0.84       800



### Decision tree

In [12]:
# Tune tree depth and leaf count with 5-fold cross-validation, scored by F1.
clf_tree = DecisionTreeClassifier(random_state=24)

# max_leaf_nodes must be None or >= 2 in scikit-learn; a value of 1 makes every
# fit for that candidate raise, which GridSearchCV records as a NaN score
# (and the resulting warning is hidden by the global warnings filter).
# Starting the range at 2 removes those wasted, always-failing candidates.
params = {
    "max_depth": np.arange(1, 20),
    "max_leaf_nodes": np.arange(2, 20)
}

clf_cv = GridSearchCV(clf_tree, params, cv = 5, scoring = "f1")
clf_cv.fit(X_train, y_train)

print("Best score:", clf_cv.best_score_)
print("Best parameter:", clf_cv.best_params_)

Best score: 0.29869115690986536
Best parameter: {'max_depth': 7, 'max_leaf_nodes': 14}


In [13]:
# Refit the tree with the selected depth / leaf limits and score it on the test set.
clf_tree = DecisionTreeClassifier(
    random_state=24,
    max_depth=7,
    max_leaf_nodes=14,
).fit(X_train, y_train)
y_pred = clf_tree.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.90      0.97      0.93       698
           1       0.53      0.25      0.34       102

    accuracy                           0.88       800
   macro avg       0.71      0.61      0.64       800
weighted avg       0.85      0.88      0.86       800



### QDA and LDA

In [14]:
# Quadratic discriminant analysis with default settings, evaluated on the test set.
clf_qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred = clf_qda.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.89      0.90       698
           1       0.37      0.45      0.41       102

    accuracy                           0.83       800
   macro avg       0.65      0.67      0.66       800
weighted avg       0.85      0.83      0.84       800



In [15]:
# Linear discriminant analysis with default settings, evaluated on the test set.
clf_lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
y_pred = clf_lda.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.91      0.96      0.93       698
           1       0.54      0.33      0.41       102

    accuracy                           0.88       800
   macro avg       0.72      0.65      0.67       800
weighted avg       0.86      0.88      0.87       800



### Random forest

In [16]:
# Tune the number of trees and the number of features tried per split
# (both over 1..19) with 5-fold cross-validation, scored by F1.
# NOTE(review): the recorded run selected n_estimators=1, i.e. a single tree;
# the grid may be too small to show the benefit of an ensemble.
clf_forest = RandomForestClassifier(random_state=24)

params = {
    "n_estimators": np.arange(1, 20),
    "max_features": np.arange(1, 20),
}

forest_reg_cv = GridSearchCV(
    estimator=clf_forest,
    param_grid=params,
    scoring="f1",
    cv=5,
)
forest_reg_cv.fit(X_train, y_train)

print("Best score:", forest_reg_cv.best_score_)
print("Best parameter:", forest_reg_cv.best_params_)

Best score: 0.3437502956648246
Best parameter: {'max_features': 13, 'n_estimators': 1}


In [17]:
# Refit the forest with the grid-search winner and score it on the test set.
clf_forest = RandomForestClassifier(
    n_estimators=1,
    max_features=13,
    random_state=24,
).fit(X_train, y_train)
y_pred = clf_forest.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90       698
           1       0.31      0.26      0.29       102

    accuracy                           0.83       800
   macro avg       0.60      0.59      0.60       800
weighted avg       0.82      0.83      0.83       800



### Naive Bayes

In [18]:
# Gaussian naive Bayes with default settings, evaluated on the test set.
clf_NB = GaussianNB().fit(X_train, y_train)
y_pred = clf_NB.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.83      0.88       698
           1       0.33      0.57      0.42       102

    accuracy                           0.80       800
   macro avg       0.63      0.70      0.65       800
weighted avg       0.85      0.80      0.82       800



### XGboost

In [19]:
# Tune the positive-class weight of XGBoost with 3-fold cross-validation, scored by F1.
# NOTE(review): the XGBoost docs suggest sum(negative) / sum(positive) as a
# typical scale_pos_weight (roughly 7 given the class counts in the recorded
# reports); the grid below is several orders of magnitude larger -- confirm
# this is intended.
clf_xg = XGBClassifier(random_state=24)

# Five evenly spaced candidates between 5000 and 100000.
params = {"scale_pos_weight": np.linspace(5000, 100000, 5)}

clf_xg_cv = GridSearchCV(
    estimator=clf_xg,
    param_grid=params,
    scoring="f1",
    cv=3,
)
clf_xg_cv.fit(X_train, y_train)

print("Best score:", clf_xg_cv.best_score_)
print("Best parameter:", clf_xg_cv.best_params_)

Best score: 0.40690365507637866
Best parameter: {'scale_pos_weight': 52500.0}


In [20]:
# Refit XGBoost with the selected positive-class weight and score it on the test set.
clf_xg = XGBClassifier(random_state=24, scale_pos_weight=52500)
clf_xg.fit(X_train, y_train)
y_pred = clf_xg.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.80      0.86       698
           1       0.28      0.53      0.37       102

    accuracy                           0.77       800
   macro avg       0.60      0.67      0.61       800
weighted avg       0.84      0.77      0.80       800



### Stacking 

In [21]:
# Stacked ensemble: four base learners (configured as in the earlier cells)
# whose predictions feed a logistic-regression meta-learner.
estimators = [
    ("xgb", XGBClassifier(scale_pos_weight=52500, random_state=24)),
    ("qda", QuadraticDiscriminantAnalysis()),
    ("NB", GaussianNB()),
    ("dt", DecisionTreeClassifier(random_state=24, max_depth=7, max_leaf_nodes=14)),
]
clf_stacking = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=estimators,
)

In [22]:
# Train the stacked ensemble and report its test-set metrics.
clf_stacking.fit(X_train, y_train)
y_pred = clf_stacking.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.90      0.97      0.93       698
           1       0.51      0.25      0.33       102

    accuracy                           0.87       800
   macro avg       0.70      0.61      0.63       800
weighted avg       0.85      0.87      0.85       800



### Stacking 2

In [23]:
# Second stacked ensemble: LDA, naive Bayes and XGBoost as base learners,
# again combined by a logistic-regression meta-learner.
estimators = [
    ("lda", LinearDiscriminantAnalysis()),
    ("NB", GaussianNB()),
    ("xgb", XGBClassifier(random_state=24, scale_pos_weight=52500)),
]
clf_stacking = StackingClassifier(
    final_estimator=LogisticRegression(),
    estimators=estimators,
)

In [24]:
# Train the second stacked ensemble and report its test-set metrics.
clf_stacking.fit(X_train, y_train)
y_pred = clf_stacking.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94       698
           1       0.67      0.32      0.44       102

    accuracy                           0.89       800
   macro avg       0.79      0.65      0.69       800
weighted avg       0.88      0.89      0.88       800

