In [13]:
# import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV
from sklearn.linear_model import SGDClassifier, LinearRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score, roc_curve, auc, mean_squared_error
import parfit.parfit as pf
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


In [14]:
# load data set
df_train_features = pd.read_csv('train_features.csv')
df_train_labels = pd.read_csv('train_labels.csv')
df_test_features = pd.read_csv('test_features.csv')

# add labels with matching pid to features
labels_copied = pd.DataFrame()
labels_copied = df_train_labels.loc[df_train_labels.index.repeat(12)]
labels_copied = labels_copied.drop(columns=['pid'])
labels = df_train_labels.drop(columns=['pid'])

LABELS1 = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
LABELS2 = ['LABEL_Sepsis']
LABELS3 = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

# impute the missing data on the test set
TEST_X = df_test_features.drop(columns=['pid', 'Time']).reset_index(drop=True).fillna(df_test_features.mean())

**Sub-Task 1**

In [30]:
# split training data into training and validation set and impute missing data
X = df_train_features.drop(columns=['pid', 'Time']).reset_index(drop=True).fillna(df_train_features.mean())
y_1 = labels_copied[LABELS1]
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X, y_1, test_size=0.2, shuffle=True)

In [31]:
# use a random forest classifier with every hour for every patient
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1)

for i, label in enumerate(LABELS1):
    clf.fit(X_train_1, y_train_1[label])
    y_pred_proba = clf.predict_proba(X_val_1)

In [32]:
err = np.empty(10)
list_proba = list()
for i,label in enumerate(LABELS1):
    err[i] = roc_auc_score(y_val_1[label], y_pred_proba[:,1])
    list_proba.append(y_pred_proba[:,1])
print(err)
print('Average score: ', np.mean(err))

[0.35541453 0.52504998 0.547615   0.5507502  0.54549137 0.49915421
 0.64785733 0.48366763 0.59570544 0.88577012]
Average score:  0.5636475819306217


In [33]:
err = np.empty(10)
for i, labels in enumerate(LABELS1):
    y_val_reduced = y_val_1[labels][0:len(y_val_1[labels]):12]
    y_pred_proba_reduced = np.empty(int(len(y_pred_proba[:,1])/12))
    counter = 0
    for splits in np.split(np.array(y_pred_proba[:,1]), int(len(y_pred_proba[:,1])/12)):
        y_pred_proba_reduced[counter] = splits.mean() 
        counter = counter+1
    err[i] = roc_auc_score(y_val_reduced, y_pred_proba_reduced)
    print(err[i])
print('Average Score: ', np.mean(err))

0.4702586443092772
0.5164655075800315
0.49806408127351276
0.49789758509082677
0.49446048672532833
0.5264744185072168
0.5232551449086882
0.5015396393331065
0.5175795372378886
0.6083089895100252
Average Score:  0.5154304034475902


In [None]:
TEST_y_pred_proba = clf.predict_proba(TEST_X)

TEST_list_proba = list()
for j in range(10):
    TEST_y_pred_proba_reduced = np.empty(int(len(TEST_y_pred_proba[j][:,1])/12))
    counter=0
    for splits in np.split(np.array(TEST_y_pred_proba[j][:,1]), int(len(TEST_y_pred_proba[j][:,1])/12)):
        TEST_y_pred_proba_reduced[counter] = splits.mean() 
        counter = counter+1
    TEST_list_proba.append(TEST_y_pred_proba_reduced)
proba_subtask1 = TEST_list_proba

**Sub-Task 2**

In [None]:
# split training data into training and validation set and impute missing data
y_2 = labels_copied[LABELS2]
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(X, y_2, test_size=0.2, shuffle=True)

In [None]:
# use a random forest classifier with every hour for every patient
clf = RandomForestClassifier(n_estimators=300, class_weight=None, n_jobs=-1)

clf.fit(X_train_2, y_train_2)
y_pred_proba = clf.predict_proba(X_val_2)
err = roc_auc_score(y_val_2, y_pred_proba[:,1])
print(err)

In [None]:
TEST_y_pred_proba = clf.predict_proba(TEST_X)

In [None]:
TEST_y_pred_proba_reduced = np.empty(int(len(TEST_y_pred_proba[:,1])/12))
counter = 0
for splits in np.split(np.array(TEST_y_pred_proba[:,1]), int(len(TEST_y_pred_proba[:,1])/12)):
    TEST_y_pred_proba_reduced[counter] = splits.mean() 
    counter = counter+1
proba_subtask2 = TEST_y_pred_proba_reduced

**Sub-Task 3**

In [None]:
# split training data into training and validation set and impute missing data
y_3 = labels_copied[LABELS3]
X_train_3, X_val_3, y_train_3, y_val_3 = train_test_split(X, y_3, test_size=0.2, shuffle=True)

In [None]:
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

model = RidgeCV(alphas=(0.1, 1, 10), cv=cv)
model.fit(X_train_3, y_train_3)
y_pred = model.predict(X_val_3)
print(mean_squared_error(y_val_3, y_pred))

In [None]:
for i, labels in enumerate(LABELS3):
    y_test_reduced = y_val_3[labels][0:len(y_val_3[labels]):12]
    y_pred_reduced = np.empty(int(len(y_pred[:,i])/12))
    counter = 0
    for splits in np.split(np.array(y_pred[:,i]), int(len(y_pred[:,i])/12)):
        y_pred_reduced[counter] = splits.mean() 
        counter = counter+1
    print(mean_squared_error(y_test_reduced, y_pred_reduced))

In [None]:
TEST_y_pred_values = model.predict(TEST_X)
print(TEST_y_pred_values)

In [None]:
TEST_list_values = list()
for j in range(4):
    TEST_y_pred_values_reduced = np.empty(int(len(TEST_y_pred_values[:,j])/12))
    counter = 0
    for splits in np.split(np.array(TEST_y_pred_values[:,j]), int(len(TEST_y_pred_values[:,j])/12)):
        TEST_y_pred_values_reduced[counter] = splits.mean() 
        counter = counter+1
    TEST_list_values.append(TEST_y_pred_values_reduced)
proba_subtask3=TEST_list_values

**Submission**

In [None]:
filename = 'sample.zip'
df_submission = pd.read_csv(filename)
for i,label in enumerate(LABELS1):
    # round classification labels
    df_submission[label] = proba_subtask1[i]
df_submission[LABELS2[0]] = proba_subtask2
for i,label in enumerate(LABELS3):
    # round classification labels
    df_submission[label]=proba_subtask3[i]
df_submission.to_csv('submission.csv',index=False)
df_submission.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')