In [30]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [31]:
train_features_data=pd.read_csv('train_features.csv')
train_labels_data=pd.read_csv('train_labels.csv')
test_features_data=pd.read_csv('test_features.csv')
# add labels with matching pid to features
labels_copied = pd.DataFrame()
labels_copied = train_labels_data.loc[train_labels_data.index.repeat(12)]
labels_copied = labels_copied.drop(columns=['pid'])
labels = train_labels_data.drop(columns=['pid'])
LABELS1 = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
LABELS2 = ['LABEL_Sepsis']
LABELS3 = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

**Sub-Task 1**

In [32]:
X = train_features_data.drop(columns=['pid', 'Time']).reset_index(drop=True).fillna(test_features_data.mean())
y = labels_copied[LABELS1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

TEST_X = test_features_data.drop(columns=['pid', 'Time']).reset_index(drop=True).fillna(test_features_data.mean())

In [33]:
clf = RandomForestClassifier(n_estimators=300, class_weight=None, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred_proba=clf.predict_proba(X_test)
err=np.empty(10)
list_proba=list()
for i,label in enumerate(LABELS1):
    err[i]=roc_auc_score(y_test[label], y_pred_proba[i][:,1])
    list_proba.append(y_pred_proba[i][:,1])
display(err)
display('Average score: ', np.mean(err))

array([0.83805999, 0.74313657, 0.71377258, 0.71558142, 0.71343182,
       0.75722364, 0.81129012, 0.78336141, 0.78300249, 0.91290222])

'Average score: '

0.777176226325439

In [34]:
err=np.empty(10)
for i, labels in enumerate(LABELS1):
    y_test_reduced=y_test[labels][0:len(y_test[labels]):12]
    y_pred_proba_reduced=np.empty(int(len(y_pred_proba[i][:,1])/12))
    counter=0
    for splits in np.split(np.array(y_pred_proba[i][:,1]), int(len(y_pred_proba[i][:,1])/12)):
        y_pred_proba_reduced[counter]=splits.mean() 
        counter=counter+1
    err[i]=roc_auc_score(y_test_reduced, y_pred_proba_reduced)
    display(err[i])
display('Average Score: ', np.mean(err))

0.5927854807202311

0.5990483441187666

0.5667199468119817

0.5677165524379841

0.5734232537970856

0.5764141299655984

0.5848663213038594

0.5806972914949041

0.6778188822078667

0.7057045852880367

'Average Score: '

0.6025194788146315

In [35]:
TEST_y_pred_proba=clf.predict_proba(TEST_X)

In [36]:
TEST_list_proba=list()
for j in range(10):
    TEST_y_pred_proba_reduced=np.empty(int(len(TEST_y_pred_proba[j][:,1])/12))
    counter=0
    for splits in np.split(np.array(TEST_y_pred_proba[j][:,1]), int(len(TEST_y_pred_proba[j][:,1])/12)):
        TEST_y_pred_proba_reduced[counter]=splits.mean() 
        counter=counter+1
    TEST_list_proba.append(TEST_y_pred_proba_reduced)
proba_subtask1=TEST_list_proba

**Sub-Task 2**

In [37]:
y = labels_copied[LABELS2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [38]:
clf=RandomForestClassifier(n_estimators=300, class_weight=None, n_jobs=-1)

clf.fit(X_train, y_train)
y_pred_proba=clf.predict_proba(X_test)
err=roc_auc_score(y_test, y_pred_proba[:,1])
display(err)

  This is separate from the ipykernel package so we can avoid doing imports until


0.7531643474813929

In [39]:
TEST_y_pred_proba=clf.predict_proba(TEST_X)

In [40]:
TEST_y_pred_proba_reduced=np.empty(int(len(TEST_y_pred_proba[:,1])/12))
counter=0
for splits in np.split(np.array(TEST_y_pred_proba[:,1]), int(len(TEST_y_pred_proba[:,1])/12)):
    TEST_y_pred_proba_reduced[counter]=splits.mean() 
    counter=counter+1
proba_subtask2=TEST_y_pred_proba_reduced

**Sub-Task 3**

In [41]:
y = labels_copied[LABELS3]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

In [42]:
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error


cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

model = RidgeCV(alphas=(0.1, 1, 10), cv=cv)
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
print(mean_squared_error(y_test, y_pred))

59.1472790830782


In [43]:
for i, labels in enumerate(LABELS3):
    y_test_reduced=y_test[labels][0:len(y_test[labels]):12]
    y_pred_reduced=np.empty(int(len(y_pred[:,i])/12))
    counter=0
    for splits in np.split(np.array(y_pred[:,i]), int(len(y_pred[:,i])/12)):
        y_pred_reduced[counter]=splits.mean() 
        counter=counter+1
    display(mean_squared_error(y_test_reduced, y_pred_reduced))

12.178231632838566

158.76495455996073

4.081079608443603

210.5812593355625

In [44]:
TEST_y_pred_values=model.predict(TEST_X)
display(TEST_y_pred_values)

array([[18.42979709, 84.63189056, 97.16603678, 85.22237716],
       [16.44438825, 81.04771302, 97.90423776, 80.56801682],
       [16.39009215, 83.05933386, 98.02093075, 82.87670078],
       ...,
       [18.14305129, 77.00011097, 97.72903206, 87.64944768],
       [18.44245049, 79.01844308, 97.73463876, 87.83224154],
       [17.45013289, 77.12264536, 97.74997756, 86.53530667]])

In [45]:
TEST_list_values=list()
for j in range(4):
    TEST_y_pred_values_reduced=np.empty(int(len(TEST_y_pred_values[:,j])/12))
    counter=0
    for splits in np.split(np.array(TEST_y_pred_values[:,j]), int(len(TEST_y_pred_values[:,j])/12)):
        TEST_y_pred_values_reduced[counter]=splits.mean() 
        counter=counter+1
    TEST_list_values.append(TEST_y_pred_values_reduced)
proba_subtask3=TEST_list_values

**Submission**

In [46]:
filename = 'sample.zip'
df_submission = pd.read_csv(filename)
for i,label in enumerate(LABELS1):
    # round classification labels
    df_submission[label]=proba_subtask1[i]
df_submission[LABELS2[0]]=proba_subtask2
for i,label in enumerate(LABELS3):
    # round classification labels
    df_submission[label]=proba_subtask3[i]
df_submission.to_csv('submission.csv',index=False)
df_submission.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')