In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv('survival.csv',header=0)
# Dropping irrelevant columns
df.drop(['encounter_id','patient_id','hospital_id','icu_id','Unnamed: 83'],inplace=True,axis=1)
# Dealing with negative probabilities
df.drop(df[(df['apache_4a_hospital_death_prob'] <0)].index, inplace=True)
df.drop(df[(df['apache_4a_icu_death_prob'] <0)].index, inplace=True)
df=df.fillna(df.median()) #Mean imputation for numeric features
df = df.fillna(df.mode().iloc[0]) # Mode imputation for categorical features
# Dropping more columns
df.drop(['aids','leukemia','lymphoma'],inplace=True,axis=1)


  df=df.fillna(df.median()) #Mean imputation for numeric features


In [4]:
# Outlier treatment
for col in df.columns:
    if df[col].dtype=='int64' or df[col].dtype=='float64':
        uq=np.percentile(df[col],[99])[0] #Upper Quartile
        df[col][(df[col] > 3*uq)] = 3*uq
        lq=np.percentile(df[col],[1])[0] #Lower quartile
        df[col][(df[col] < 0.3*lq)] = 0.3*lq

#Generating dummy variables
df=pd.get_dummies(df, columns=['ethnicity','gender','icu_admit_source','icu_stay_type','icu_type','apache_3j_bodysystem','apache_2_bodysystem'],drop_first=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][(df[col] > 3*uq)] = 3*uq
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][(df[col] < 0.3*lq)] = 0.3*lq


In [5]:
#Basic model
X=df.loc[:,df.columns!='hospital_death']
Y=df['hospital_death']

In [6]:
from sklearn.model_selection import train_test_split,KFold,cross_val_score,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
scaler.fit(X)
X=scaler.transform(X)
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=42)


In [7]:
def cross_val(model,X,Y):
    cfv = StratifiedKFold(n_splits=10, random_state=5, shuffle=True)
    scores = cross_val_score(model, X, Y, scoring='f1', cv=cfv, n_jobs=-1)
    scores1 = cross_val_score(model, X, Y, scoring='precision', cv=cfv, n_jobs=-1)
    scores2 = cross_val_score(model, X, Y, scoring='recall', cv=cfv, n_jobs=-1)
    print("Precison Score: ",np.mean(scores1))
    print("Recall Score: ",np.mean(scores2))
    print("F1 Score: ",np.mean(scores),'\n')

In [16]:
#Logistic Regression
lrm=LogisticRegression(random_state=42)
print('Stratified 10 fold cross validation scores:')
cross_val(lrm,X,Y)
lrm.fit(X_train,Y_train)
print(classification_report(Y_test,lrm.predict(X_test)))


Stratified 10 fold cross validation scores:
Precison Score:  0.6562712698379758
Recall Score:  0.2848587797563325
F1 Score:  0.39716064511894594 

              precision    recall  f1-score   support

           0       0.94      0.99      0.96     24443
           1       0.64      0.28      0.39      2264

    accuracy                           0.93     26707
   macro avg       0.79      0.63      0.67     26707
weighted avg       0.91      0.93      0.91     26707



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
#Undersampling
from imblearn.under_sampling import ClusterCentroids
cc=ClusterCentroids(sampling_strategy='majority',random_state=52)
X_under,Y_under=cc.fit_resample(X,Y)

In [18]:
lrm1=LogisticRegression(random_state=42)
print('Stratified 10 fold cross validation scores:')
cross_val(lrm1,X_under,Y_under)
X_train,X_test,Y_train,Y_test=train_test_split(X_under,Y_under,test_size=0.3,random_state=42)
lrm1.fit(X_train,Y_train)
print(classification_report(Y_test,lrm1.predict(X_test)))

Stratified 10 fold cross validation scores:
Precison Score:  0.7685631082321673
Recall Score:  0.7542709890012629
F1 Score:  0.7613182873773129 

              precision    recall  f1-score   support

           0       0.75      0.78      0.76      2282
           1       0.77      0.75      0.76      2319

    accuracy                           0.76      4601
   macro avg       0.76      0.76      0.76      4601
weighted avg       0.76      0.76      0.76      4601



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
#Oversampling
from imblearn.over_sampling import SMOTE
X_new=X.astype(np.uint8)
Y_new=Y.astype(np.uint8)
smt=SMOTE(sampling_strategy='minority',k_neighbors=5,random_state=42)
X_over,Y_over=smt.fit_resample(X_new,Y_new)

In [9]:
lrm2=LogisticRegression(random_state=42,max_iter=150)
print('Stratified 10 fold cross validation scores:')
cross_val(lrm2,X_over,Y_over)
X_train,X_test,Y_train,Y_test=train_test_split(X_over,Y_over,test_size=0.3,random_state=42)
lrm2.fit(X_train,Y_train)
print(classification_report(Y_test,lrm2.predict(X_test)))

Stratified 10 fold cross validation scores:
Precison Score:  0.9248638821213564
Recall Score:  0.8776947863577934
F1 Score:  0.9006453857958698 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.89      0.93      0.91     24265
           1       0.92      0.89      0.90     24548

    accuracy                           0.91     48813
   macro avg       0.91      0.91      0.91     48813
weighted avg       0.91      0.91      0.91     48813



In [10]:
#Combined resampling
from imblearn.combine import SMOTETomek
stmk=SMOTETomek(random_state=42)
X_comb,Y_comb=stmk.fit_resample(X_new,Y_new)

In [None]:
lrm3=LogisticRegression(random_state=42,max_iter=150)
print('Stratified 10 fold cross validation scores:')
cross_val(lrm3,X_comb,Y_comb)
X_train,X_test,Y_train,Y_test=train_test_split(X_comb,Y_comb,test_size=0.3,random_state=42)
lrm3.fit(X_train,Y_train)
print(classification_report(Y_test,lrm3.predict(X_test)))