In [1]:
# import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV, StratifiedKFold, RepeatedKFold
from sklearn.linear_model import SGDClassifier, LinearRegression, LogisticRegression, RidgeCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score, roc_curve, auc, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.datasets import make_classification

In [2]:
# load data set
df_train_features = pd.read_csv('train_features.csv')
df_train_labels = pd.read_csv('train_labels.csv')
df_test_features = pd.read_csv('test_features.csv')

# add labels with matching pid to features
labels_copied = pd.DataFrame()
labels_copied = df_train_labels.loc[df_train_labels.index.repeat(12)]
labels_copied = labels_copied.drop(columns=['pid'])
labels = df_train_labels.drop(columns=['pid'])

LABELS1 = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
LABELS2 = ['LABEL_Sepsis']
LABELS3 = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

# impute the missing data on the test set
TEST_X = df_test_features.drop(columns=['pid', 'Time']).reset_index(drop=True).fillna(df_test_features.mean())

**Sub-Task 1**

In [3]:
# split training data into training and validation set and impute missing data
X = df_train_features.drop(columns=['pid', 'Time']).reset_index(drop=True).fillna(df_train_features.mean())
y_1 = labels_copied[LABELS1]
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X, y_1, test_size=0.2, shuffle=True)

In [4]:
# use a random forest classifier with every hour for every patient
clf = RandomForestClassifier(n_estimators=300, class_weight=None, n_jobs=-1)
clf.fit(X_train_1, y_train_1)
y_pred_proba=clf.predict_proba(X_val_1)

err = np.empty(10)
list_proba=list()
for i,label in enumerate(LABELS1):
    err[i] = roc_auc_score(y_val_1[label], y_pred_proba[i][:,1])
    list_proba.append(y_pred_proba[i][:,1])
print(err)
print('Average score: ', np.mean(err))

[0.84302875 0.74672505 0.71108954 0.71374089 0.71146423 0.75940477
 0.81020476 0.78476432 0.78900523 0.90380658]
Average score:  0.7773234137324464


In [5]:
err = np.empty(10)
for i, labels in enumerate(LABELS1):
    y_val_reduced = y_val_1[labels][0:len(y_val_1[labels]):12]
    y_pred_proba_reduced = np.empty(int(len(y_pred_proba[i][:,1])/12))
    counter = 0
    for splits in np.split(np.array(y_pred_proba[i][:,1]), int(len(y_pred_proba[i][:,1])/12)):
        y_pred_proba_reduced[counter] = splits.mean() 
        counter = counter+1
    err[i] = roc_auc_score(y_val_reduced, y_pred_proba_reduced)
    print(err[i])
print('Average Score: ', np.mean(err))

0.6163849342185341
0.5757274397853795
0.566577834241885
0.5615395784524944
0.5600832861647934
0.5949277906930227
0.5949529645965884
0.5975596268326568
0.6769628863276752
0.7045208264773636
Average Score:  0.6049237167790393


In [6]:
TEST_y_pred_proba = clf.predict_proba(TEST_X)

TEST_list_proba = list()
for j in range(10):
    TEST_y_pred_proba_reduced = np.empty(int(len(TEST_y_pred_proba[j][:,1])/12))
    counter = 0
    for splits in np.split(np.array(TEST_y_pred_proba[j][:,1]), int(len(TEST_y_pred_proba[j][:,1])/12)):
        TEST_y_pred_proba_reduced[counter] = splits.mean() 
        counter = counter+1
    TEST_list_proba.append(TEST_y_pred_proba_reduced)
proba_subtask1 = TEST_list_proba

**Sub-Task 2**

In [7]:
# split training data into training and validation set and impute missing data
y_2 = labels_copied[LABELS2]
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(X, y_2, test_size=0.01, shuffle=True)

In [8]:
# use a random forest classifier with every hour for every patient
clf = RandomForestClassifier(n_estimators=300, class_weight=None, n_jobs=-1)

clf.fit(X_train_2, y_train_2)
y_pred_proba = clf.predict_proba(X_val_2)
err = roc_auc_score(y_val_2, y_pred_proba[:,1])
print(err)

  after removing the cwd from sys.path.


0.7499531661392891


In [9]:
TEST_y_pred_proba = clf.predict_proba(TEST_X)

In [10]:
TEST_y_pred_proba_reduced = np.empty(int(len(TEST_y_pred_proba[:,1])/12))
counter = 0
for splits in np.split(np.array(TEST_y_pred_proba[:,1]), int(len(TEST_y_pred_proba[:,1])/12)):
    TEST_y_pred_proba_reduced[counter] = splits.mean() 
    counter = counter+1
proba_subtask2 = TEST_y_pred_proba_reduced

**Sub-Task 3**

In [11]:
# split training data into training and validation set and impute missing data
y_3 = labels_copied[LABELS3]
X_train_3, X_val_3, y_train_3, y_val_3 = train_test_split(X, y_3, test_size=0.2, shuffle=True)

In [12]:
kf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
model = RidgeCV(alphas=(0.1, 1, 10), scoring='r2', cv=kf)

model.fit(X_train_3, y_train_3)
y_pred = model.predict(X_val_3)
print(r2_score(y_val_3, y_pred))

0.29355174609634227


In [13]:
for i, labels in enumerate(LABELS3):
    y_test_reduced = y_val_3[labels][0:len(y_val_3[labels]):12]
    y_pred_reduced = np.empty(int(len(y_pred[:,i])/12))
    counter = 0
    for splits in np.split(np.array(y_pred[:,i]), int(len(y_pred[:,i])/12)):
        y_pred_reduced[counter] = splits.mean() 
        counter = counter+1
    print(r2_score(y_test_reduced, y_pred_reduced))

0.02034172374344556
0.03348940686420532
0.02133598799664138
0.03279945487074065


In [14]:
TEST_y_pred_values = model.predict(TEST_X)
print(TEST_y_pred_values)

[[18.43489729 84.75657179 97.17780483 85.23306489]
 [16.54211997 80.69237988 98.10692822 80.703822  ]
 [16.4183435  83.4694375  98.01750034 83.04454697]
 ...
 [18.15093592 77.11419336 97.74942134 87.73480906]
 [18.44904101 79.13707552 97.75810323 87.91817959]
 [17.45656539 77.23737213 97.7670354  86.62373098]]


In [15]:
TEST_list_values = list()
for j in range(4):
    TEST_y_pred_values_reduced = np.empty(int(len(TEST_y_pred_values[:,j])/12))
    counter = 0
    for splits in np.split(np.array(TEST_y_pred_values[:,j]), int(len(TEST_y_pred_values[:,j])/12)):
        TEST_y_pred_values_reduced[counter] = splits.mean() 
        counter = counter+1
    TEST_list_values.append(TEST_y_pred_values_reduced)
proba_subtask3=TEST_list_values

**Submission**

In [16]:
filename = 'sample.zip'
df_submission = pd.read_csv(filename)

for i,label in enumerate(LABELS1):
    # round classification labels
    df_submission[label] = proba_subtask1[i]
df_submission[LABELS2[0]] = proba_subtask2

for i,label in enumerate(LABELS3):
    # round classification labels
    df_submission[label]=proba_subtask3[i]

df_submission.to_csv('submission.csv',index=False)
df_submission.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')