In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

In [2]:
train_features_data=pd.read_csv('train_features.csv')
train_labels_data=pd.read_csv('train_labels.csv')
test_features_data=pd.read_csv('test_features.csv')
# add labels with matching pid to features
labels_copied = pd.DataFrame()
labels_copied = train_labels_data.loc[train_labels_data.index.repeat(12)]
labels_copied = labels_copied.drop(columns=['pid'])
labels = train_labels_data.drop(columns=['pid'])
LABELS1 = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
LABELS2 = ['LABEL_Sepsis']
LABELS3 = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

## Subtask 1

In [3]:
X = train_features_data.drop(columns=['pid', 'Time']).reset_index(drop=True).fillna(test_features_data.mean())
y = labels_copied[LABELS1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, shuffle=True, random_state=42)

TEST_X = test_features_data.drop(columns=['pid', 'Time']).reset_index(drop=True).fillna(test_features_data.mean())

### RandomForest model and Roc_Auc_Score using every hour per patient as sample for training

In [4]:
clf=RandomForestClassifier(n_estimators=300, class_weight=None, n_jobs=-1 )
clf.fit(X_train, y_train)
y_pred_proba=clf.predict_proba(X_test)
err=np.empty(10)
list_proba=list()
for i,label in enumerate(LABELS1):
    err[i]=roc_auc_score(y_test[label], y_pred_proba[i][:,1])
    list_proba.append(y_pred_proba[i][:,1])
display(err)
display('Average score: ', np.mean(err))

array([0.84444609, 0.74624438, 0.7218315 , 0.72349666, 0.71586261,
       0.76938312, 0.81472183, 0.80230192, 0.78972144, 0.91998747])

'Average score: '

0.7847997017963831

### Reduce values by 12 for test-labels and y_pred_proba

In [5]:
err=np.empty(10)
for i, labels in enumerate(LABELS1):
    y_test_reduced=y_test[labels][0:len(y_test[labels]):12]
    y_pred_proba_reduced=np.empty(int(len(y_pred_proba[i][:,1])/12))
    counter=0
    for splits in np.split(np.array(y_pred_proba[i][:,1]), int(len(y_pred_proba[i][:,1])/12)):
        y_pred_proba_reduced[counter]=splits.mean() 
        counter=counter+1
    err[i]=roc_auc_score(y_test_reduced, y_pred_proba_reduced)
    display(err[i])
display('Average Score: ', np.mean(err))

0.5973368070632509

0.5763888888888888

0.6168573943661972

0.6093016431924883

0.6217542270531401

0.5546757164404224

0.6191904047976013

0.5534909909909911

0.7298956414978515

0.7138798701298701

'Average Score: '

0.6192771584420702

### Predictions for Test_features

In [6]:
TEST_y_pred_proba=clf.predict_proba(TEST_X)

### Reduce values by 12 for TEST_y_pred_proba

In [7]:
TEST_list_proba=list()
for j in range(10):
    TEST_y_pred_proba_reduced=np.empty(int(len(TEST_y_pred_proba[j][:,1])/12))
    counter=0
    for splits in np.split(np.array(TEST_y_pred_proba[j][:,1]), int(len(TEST_y_pred_proba[j][:,1])/12)):
        TEST_y_pred_proba_reduced[counter]=splits.mean() 
        counter=counter+1
    TEST_list_proba.append(TEST_y_pred_proba_reduced)
proba_subtask1=TEST_list_proba

## Subtask 2

In [8]:
y = labels_copied[LABELS2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, shuffle=True, random_state=42)

### RandomForest model and Roc_Auc_Score using every hour per patient as sample

In [9]:
clf=RandomForestClassifier(n_estimators=300, class_weight=None, n_jobs=-1)

clf.fit(X_train, y_train)
y_pred_proba=clf.predict_proba(X_test)
err=roc_auc_score(y_test, y_pred_proba[:,1])
display(err)

  This is separate from the ipykernel package so we can avoid doing imports until


0.7263875992984627

### Prediction for Test features

In [10]:
TEST_y_pred_proba=clf.predict_proba(TEST_X)

### Reduce values by 12 for TEST_y_pred_proba

In [11]:
TEST_y_pred_proba_reduced=np.empty(int(len(TEST_y_pred_proba[:,1])/12))
counter=0
for splits in np.split(np.array(TEST_y_pred_proba[:,1]), int(len(TEST_y_pred_proba[:,1])/12)):
    TEST_y_pred_proba_reduced[counter]=splits.mean() 
    counter=counter+1
proba_subtask2=TEST_y_pred_proba_reduced

## Subtask 3

In [6]:
y = labels_copied[LABELS3]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False, random_state=42)

### Implement MultiTaskLassoCV model

In [7]:
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

model = RidgeCV(alphas=(0.1, 1, 10), cv=cv)
model.fit(X_train, y_train)
y_pred=model.predict(X_test)
print(mean_squared_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

58.9649507805526
0.29397840307348955


### Reduce values by 12 for test-labels and y_pred

In [15]:
for i, labels in enumerate(LABELS3):
    y_test_reduced=y_test[labels][0:len(y_test[labels]):12]
    y_pred_reduced=np.empty(int(len(y_pred[:,i])/12))
    counter=0
    for splits in np.split(np.array(y_pred[:,i]), int(len(y_pred[:,i])/12)):
        y_pred_reduced[counter]=splits.mean() 
        counter=counter+1
    display(mean_squared_error(y_test_reduced, y_pred_reduced))

9.286756647275718

166.05384745478807

3.06387575457702

180.9739500447106

### Predictions for Test features

In [16]:
TEST_y_pred_values=model.predict(TEST_X)
display(TEST_y_pred_values)

array([[18.43060338, 84.67101428, 97.16760692, 85.19986706],
       [16.51291492, 80.6929088 , 98.10689491, 80.5983686 ],
       [16.37222118, 83.14702545, 98.04809191, 82.99084645],
       ...,
       [18.14853251, 77.00880845, 97.73806771, 87.66333252],
       [18.44695555, 79.02849377, 97.74488846, 87.84801953],
       [17.45441408, 77.13508889, 97.75643728, 86.5513912 ]])

### Reduce values by 12 for TEST_y_pred_values

In [17]:
TEST_list_values=list()
for j in range(4):
    TEST_y_pred_values_reduced=np.empty(int(len(TEST_y_pred_values[:,j])/12))
    counter=0
    for splits in np.split(np.array(TEST_y_pred_values[:,j]), int(len(TEST_y_pred_values[:,j])/12)):
        TEST_y_pred_values_reduced[counter]=splits.mean() 
        counter=counter+1
    TEST_list_values.append(TEST_y_pred_values_reduced)
proba_subtask3=TEST_list_values

### Write values into sample dataframe

In [18]:
filename = 'sample.zip'
df_submission = pd.read_csv(filename)
for i,label in enumerate(LABELS1):
    # round classification labels
    df_submission[label]=proba_subtask1[i]
df_submission[LABELS2[0]]=proba_subtask2
for i,label in enumerate(LABELS3):
    # round classification labels
    df_submission[label]=proba_subtask3[i]
df_submission.to_csv('submission.csv',index=False)
df_submission.to_csv('prediction.zip', index=False, float_format='%.3f', compression='zip')