In [1]:
import os
import sys
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
!pip install sklearn-genetic

Collecting sklearn-genetic
  Downloading sklearn_genetic-0.5.1-py3-none-any.whl (11 kB)
Collecting deap>=1.0.2
  Downloading deap-1.3.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)
[K     |████████████████████████████████| 160 kB 2.7 MB/s 
Installing collected packages: deap, sklearn-genetic
Successfully installed deap-1.3.1 sklearn-genetic-0.5.1


In [2]:
# Upload the data file
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [3]:
import io
train_data = pd.read_csv(io.BytesIO(uploaded['train.csv']))

In [4]:
cols = [
        
        # 'icustay id',
        # 'Capillary refill rate',
        'Diastolic blood pressure',
        # 'Fraction inspired oxygen',
        # 'Glascow coma scale eye opening',
        # 'Glascow coma scale motor response',
        # 'Glascow coma scale total',
        # 'Glascow coma scale verbal response',
        'Glucose',
        'Heart Rate',
        # 'Height',
        'Mean blood pressure',
        'Oxygen saturation',
        'Respiratory rate',
        'Systolic blood pressure',
        'Temperature',
        'Weight',
        'pH',
        # 'label'

]

df = train_data[cols]
df.describe(include='all')

Unnamed: 0,Diastolic blood pressure,Glucose,Heart Rate,Mean blood pressure,Oxygen saturation,Respiratory rate,Systolic blood pressure,Temperature,Weight,pH
count,17689.0,17886.0,17689.0,17687.0,17783.0,17683.0,17689.0,17554.0,13052.0,14776.0
mean,71.121262,155.349435,89.817288,82.512712,96.600045,18.210202,123.949799,36.60408,713.5584,7.092986
std,757.341034,113.680344,21.932248,50.767863,6.34325,6.940607,25.984069,1.284341,72123.64,2.137607
min,0.0,0.0,0.0,-34.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,53.0,109.0,76.0,69.333298,96.0,14.0,106.0,36.01389,66.0,7.23
50%,63.0,135.0,88.0,80.0,98.0,18.0,121.0,36.611112,78.6,7.35
75%,74.0,173.0,102.0,93.0,100.0,22.0,140.0,37.166698,94.0,7.42
max,100105.01,9999.0,941.0,6350.0,100.0,115.0,281.0,60.0,8239872.0,100.0


In [5]:
# Fill invalid data
# Nan data is going to be filled with the mean of each feature
from sklearn.impute import SimpleImputer as SI
fill_mean = SI(missing_values = np.nan, strategy = 'mean')
data = fill_mean.fit_transform(df)

df2 = pd.DataFrame(data, columns=df.columns)

In [6]:
# Remove outlier
# Use 3-standard deviation filtering
index_list = []
for name in df2.columns:
  index_list.extend(df2.index[(np.abs(df2[name] - df2[name].mean()) <= (3*df2[name].std())) == False].tolist())
index_list.sort()
index_list = np.unique(index_list)
print("From 3 standard deviations filtering, ",np.size(index_list), "outliers are filtered")
df3 = df2.drop(index_list)

# Remove outlier from the index list
label_list = train_data['label'].drop(index_list)
label_list = pd.DataFrame(label_list, columns=['label'])

# Add the label
df3 = pd.concat([df3, label_list], axis=1)
cols.append('label')

From 3 standard deviations filtering,  943 outliers are filtered


In [7]:
# min-max standardization to change the range to [0, 1]
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
df3[cols[:-1]] = min_max_scaler.fit_transform(df3[cols[:-1]])

In [9]:
# Divide the label 0 data into 3, keep all label 1 data

label1_index = label_list.index[label_list['label'] == 1].tolist()
label0_index = label_list.index[label_list['label'] == 0].tolist()

# Assign a random seed
np.random.seed(10)

# Randomly shuffle the index of label 0
np.random.shuffle(label0_index)

label_index1, label_index2, label_index3 = [], [], []

divide_index  = int(np.size(label0_index)/3)

# Data List 1
label0_index1 = label0_index[0:divide_index]
label_index1.extend(label0_index1)
label_index1.extend(label1_index)
label_index1.sort()
data_list1  = df3.loc[label_index1]

# Data List 2
label0_index2 = label0_index[divide_index:divide_index*2]
label_index2.extend(label0_index2)
label_index2.extend(label1_index)
label_index2.sort()
data_list2  = df3.loc[label_index2]

# Data List 3
label0_index3 = label0_index[divide_index*2:divide_index*3]
label_index3.extend(label0_index3)
label_index3.extend(label1_index)
label_index3.sort()
data_list3  = df3.loc[label_index3]


In [10]:
# Choose the data type

# data = data_list1
# data = data_list2
# data = data_list3
# data = df3

# Use 2/3 of label 0 data + all label 1 data
label_index12 = []
label_index12.extend(label0_index1)
label_index12.extend(label0_index2)
label_index12.extend(label1_index)
label_index12.sort()
data_list12 = df3.loc[label_index12]

data = data_list12

In [11]:
# Split the data: Divide the data into 80% train, 20% test
from sklearn.model_selection import train_test_split

input = data[cols[:-1]]
label = data['label']

x_train, x_test, y_train, y_test = train_test_split(input,label,test_size=0.2,random_state=10)

# Check for the label distribution
print("---------- Train Set Breakdown ----------")
label0_size    = np.size(y_train.index[y_train == 0].tolist())
label1_size    = np.size(y_train.index[y_train == 1].tolist())
y_train_size    = np.size(y_train)
label0_percent = label0_size / y_train_size * 100 
label1_percent = label1_size / y_train_size * 100
txt1 = "Label 0: {:.2f} %".format(label0_percent)
txt2 = "Label 1: {:.2f} %\n".format(label1_percent)
print(txt1)
print(txt2)

print("---------- Test Set Breakdown ----------")
label0_size    = np.size(y_test.index[y_test == 0].tolist())
label1_size    = np.size(y_test.index[y_test == 1].tolist())
y_test_size    = np.size(y_test)
label0_percent = label0_size / y_test_size * 100 
label1_percent = label1_size / y_test_size * 100
txt1 = "Label 0: {:.2f} %".format(label0_percent)
txt2 = "Label 1: {:.2f} %\n".format(label1_percent)
print(txt1)
print(txt2)

---------- Train Set Breakdown ----------
Label 0: 81.12 %
Label 1: 18.88 %

---------- Test Set Breakdown ----------
Label 0: 81.48 %
Label 1: 18.52 %



In [12]:
# Feature Selection: Genetic Algorithm
from genetic_selection import GeneticSelectionCV
from sklearn import linear_model

x = pd.DataFrame(x_train, columns=df.columns)
y = pd.DataFrame(y_train, columns=['label'])
y = y.values.ravel()

estimator = linear_model.LogisticRegression(solver="liblinear", multi_class="ovr")
model = GeneticSelectionCV(
    estimator, cv=3, verbose=1,
    scoring="accuracy", max_features=10,
    n_jobs=-1, n_population=200, crossover_proba=0.8,
    mutation_proba=0.1, n_generations=40,
    crossover_independent_proba=0.1,
    mutation_independent_proba=0.05,
    tournament_size=3, n_gen_no_change=None,
    caching=True)
model = model.fit(x, y)
print('Feature Selection:', x.columns[model.support_])


Selecting features with genetic algorithm.
gen	nevals	avg                            	std                            	min                            	max                               
0  	200   	[ 0.811196  5.32      0.000182]	[ 0.000093  2.977852  0.000103]	[ 0.810881  1.        0.000028]	[  0.811503  10.         0.000521]
1  	169   	[-149.200928    5.76      150.000194]	[ 1215.623186     3.142038  1215.524554]	[-10000.            0.            0.000028]	[     0.811503     10.        10000.      ]
2  	146   	[ 0.811307  7.175     0.000212]      	[ 0.000108  2.141115  0.000135]         	[ 0.810984  1.        0.000028]            	[  0.811503  10.         0.000521]         
3  	161   	[ 0.81139   6.755     0.000208]      	[ 0.00009   1.554019  0.000128]         	[ 0.811088  3.        0.000028]            	[  0.811503  10.         0.000521]         
4  	161   	[ 0.81144   6.08      0.000195]      	[ 0.00009   1.213919  0.000111]         	[ 0.810985  4.        0.000028]            	[ 0.8

In [13]:
# Use only data of the selected features from Genetic Algorithm
new_col = []
selected_features = x.columns[model.support_]
for i in range(np.size(selected_features)):
  new_col.append(selected_features[i])
x_train = x_train[new_col]
x_test  = x_test[new_col]

In [14]:
# function to print result
def print_result(acc_train, acc_test, auroc):
  txt1 = "Train Accuracy: {:.2f} %".format(acc_train*100)
  txt2 = "Test Accuracy : {:.2f} %".format(acc_test*100)
  txt3 = "AUROC         : {:.2f}\n".format(auroc)
  print(txt1)
  print(txt2)
  print(txt3)

In [15]:
# Model 1: Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score 
print("---------- Logistic Regression ----------")

# ----------- without regularization -----------  #
print("Without regularization")

clf = LogisticRegression(penalty='none',solver='newton-cg').fit(x_train, y_train)
acc_train   = clf.score(x_train, y_train)
acc_test    = clf.score(x_test , y_test )
auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
print_result(acc_train, acc_test, auroc)

print("--------------------------------------")

# ----------- with regularization -----------  #
# with various C values (smaller C, higher regularization)
c_list = [0.001, 0.1, 1]

# with L1 regularization (Lasso Regression)
print("With L1/Lasso Regularization")
for c in c_list:
  clf = LogisticRegression(penalty='l1', solver='liblinear', C=c).fit(x_train, y_train)
  acc_train   = clf.score(x_train, y_train)
  acc_test    = clf.score(x_test , y_test )
  auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
  txt1 = "When C is {}".format(c)
  print(txt1)
  print_result(acc_train, acc_test, auroc)

print("--------------------------------------")
# with L2 regularization (Ridge Regression)
print("With L2/Ridge Regularization")
for c in c_list:
  clf = LogisticRegression(penalty='l2',solver='newton-cg', C=c).fit(x_train, y_train)
  acc_train   = clf.score(x_train, y_train)
  acc_test    = clf.score(x_test , y_test )
  auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
  txt1 = "When C is {}".format(c)
  print(txt1)
  print_result(acc_train, acc_test, auroc)


---------- Logistic Regression ----------
Without regularization
Train Accuracy: 81.16 %
Test Accuracy : 81.52 %
AUROC         : 0.62

--------------------------------------
With L1/Lasso Regularization
When C is 0.001
Train Accuracy: 81.12 %
Test Accuracy : 81.48 %
AUROC         : 0.50

When C is 0.1
Train Accuracy: 81.12 %
Test Accuracy : 81.48 %
AUROC         : 0.62

When C is 1
Train Accuracy: 81.16 %
Test Accuracy : 81.48 %
AUROC         : 0.62

When C is 10
Train Accuracy: 81.16 %
Test Accuracy : 81.52 %
AUROC         : 0.62

When C is 100
Train Accuracy: 81.16 %
Test Accuracy : 81.52 %
AUROC         : 0.62

--------------------------------------
With L2/Ridge Regularization
When C is 0.001
Train Accuracy: 81.12 %
Test Accuracy : 81.48 %
AUROC         : 0.62

When C is 0.1
Train Accuracy: 81.12 %
Test Accuracy : 81.48 %
AUROC         : 0.62

When C is 1
Train Accuracy: 81.16 %
Test Accuracy : 81.48 %
AUROC         : 0.62

When C is 10
Train Accuracy: 81.16 %
Test Accuracy : 81.52

In [15]:
# Model 2: Decision Tree
from sklearn.tree import DecisionTreeClassifier
print("---------- Decision Tree ----------")
clf = DecisionTreeClassifier().fit(x_train, y_train)
acc_train   = clf.score(x_train, y_train)
acc_test    = clf.score(x_test , y_test )
auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
print_result(acc_train, acc_test, auroc)


---------- Decision Tree ----------
Train Accuracy: 99.93 %
Test Accuracy : 72.52 %
AUROC         : 0.55



In [16]:
# Model 3: Random Forest
from sklearn.ensemble import RandomForestClassifier
print("---------- Random Forest ----------")
clf = DecisionTreeClassifier(max_depth=None, min_samples_leaf=1, max_features="auto").fit(x_train, y_train)
acc_train   = clf.score(x_train, y_train)
acc_test    = clf.score(x_test , y_test )
auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
print_result(acc_train, acc_test, auroc)

---------- Random Forest ----------
Train Accuracy: 99.93 %
Test Accuracy : 70.95 %
AUROC         : 0.53



In [24]:
# # Model 4: SVM
from sklearn import svm
print("---------- Support Vector Machine ----------")
clf = svm.SVC(C=1, kernel='rbf', degree=3, probability=True).fit(x_train, y_train)
acc_train   = clf.score(x_train, y_train)
acc_test    = clf.score(x_test , y_test )
auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
print_result(acc_train, acc_test, auroc)

---------- Support Vector Machine ----------
Train Accuracy: 81.15 %
Test Accuracy : 81.48 %
AUROC         : 0.56



In [25]:
clf = svm.SVC(C=1, kernel='poly', degree=1, probability=True).fit(x_train, y_train)
acc_train   = clf.score(x_train, y_train)
acc_test    = clf.score(x_test , y_test )
auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
print_result(acc_train, acc_test, auroc)

Train Accuracy: 81.12 %
Test Accuracy : 81.48 %
AUROC         : 0.41



In [26]:
clf = svm.SVC(C=1, kernel='poly', degree=2, probability=True).fit(x_train, y_train)
acc_train   = clf.score(x_train, y_train)
acc_test    = clf.score(x_test , y_test )
auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
print_result(acc_train, acc_test, auroc)

Train Accuracy: 81.12 %
Test Accuracy : 81.48 %
AUROC         : 0.44



In [27]:
clf = svm.SVC(C=1, kernel='poly', degree=3, probability=True).fit(x_train, y_train)
acc_train   = clf.score(x_train, y_train)
acc_test    = clf.score(x_test , y_test )
auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
print_result(acc_train, acc_test, auroc)

Train Accuracy: 81.12 %
Test Accuracy : 81.48 %
AUROC         : 0.57



In [21]:
# Model 5: Gradient Boost
from sklearn.ensemble import GradientBoostingClassifier
print("---------- Gradient Boost ----------")
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0).fit(x_train, y_train)
acc_train   = clf.score(x_train, y_train)
acc_test    = clf.score(x_test , y_test )
auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
print_result(acc_train, acc_test, auroc)

---------- Gradient Boost ----------
Train Accuracy: 81.16 %
Test Accuracy : 81.43 %
AUROC         : 0.64



In [22]:
# Model 6: Ada-Boost
from sklearn.ensemble import AdaBoostClassifier
print("---------- Ada-Boost ----------")
clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=0).fit(x_train, y_train)
acc_train   = clf.score(x_train, y_train)
acc_test    = clf.score(x_test , y_test )
auroc       = roc_auc_score(y_test, clf.predict_proba(x_test)[:, 1])
print_result(acc_train, acc_test, auroc)

---------- Ada-Boost ----------
Train Accuracy: 81.14 %
Test Accuracy : 81.48 %
AUROC         : 0.64

