In [1]:
# import packages
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

# import Sklearn packages
from sklearn.datasets import load_linnerud

from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Raw measurement columns (vitals and lab values) as named in the feature CSV.
features = [
    'EtCO2', 'PTT', 'BUN', 'Lactate', 'Temp', 'Hgb', 'HCO3', 'BaseExcess',
    'RRate', 'Fibrinogen', 'Phosphate', 'WBC', 'Creatinine', 'PaCO2', 'AST',
    'FiO2', 'Platelets', 'SaO2', 'Glucose', 'ABPm', 'Magnesium', 'Potassium',
    'ABPd', 'Calcium', 'Alkalinephos', 'SpO2', 'Bilirubin_direct', 'Chloride',
    'Hct', 'Heartrate', 'Bilirubin_total', 'TroponinI', 'ABPs', 'pH',
]

# Full column list: the three leading non-measurement columns followed by
# every measurement column, in the same order as above.
all_features = ['pid', 'Time', 'Age'] + features

# Label columns, kept as two separate groups.
labels_test = [
    'LABEL_BaseExcess',
    'LABEL_Fibrinogen',
    'LABEL_AST',
    'LABEL_Alkalinephos',
    'LABEL_Bilirubin_total',
    'LABEL_Lactate',
    'LABEL_TroponinI',
    'LABEL_SaO2',
    'LABEL_Bilirubin_direct',
    'LABEL_EtCO2',
]
# The four targets regressed in this notebook.
label_measure = [
    'LABEL_RRate',
    'LABEL_ABPm',
    'LABEL_SpO2',
    'LABEL_Heartrate',
]

In [3]:
## import data
# Both tables are ordered by patient id. A stable algorithm (mergesort) is
# requested explicitly: the default quicksort is not stable, so rows sharing
# the same pid could otherwise end up in a different relative order than in
# the CSV file.
df_features = pd.read_csv("train_features.csv").sort_values(by=['pid'], kind='mergesort')
df_labels = pd.read_csv("train_labels.csv").sort_values(by=['pid'], kind='mergesort')

In [4]:
# Load the object stored in X_eng.pkl (used below as the training feature
# frame) and keep only the rows that contain no missing value.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('X_eng.pkl', 'rb') as pickle_file:
    X = pickle.load(pickle_file).dropna()

In [5]:
# Build the target frame row-for-row aligned with X.
# Filtering df_labels with isin() keeps df_labels' own row order, which only
# matches X when both happen to be ordered identically. A left merge on pid
# follows X's row order instead, so features and targets cannot drift apart.
unlabeled = ~X['pid'].isin(df_labels['pid'])
if unlabeled.any():
    raise ValueError(
        f"{int(unlabeled.sum())} row(s) of X have a pid that is missing from df_labels"
    )
# validate='many_to_one' raises if a pid occurs more than once in df_labels,
# which would otherwise duplicate rows and break the alignment.
Y = X[['pid']].merge(df_labels, on='pid', how='left', validate='many_to_one')[label_measure]

In [6]:
# Train/test split
# Remove the pid column by name instead of slicing off the first column by
# position: re-running this cell is then a no-op rather than silently
# discarding one more feature column each time.
# NOTE(review): assumes 'pid' is the leading column that the positional slice
# used to remove -- confirm against X_eng.pkl.
X = X.drop(columns=['pid'], errors='ignore')
# Fixed seed so the hold-out rows (and any score computed on them) are the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=123)

In [7]:
# Fit a Ridge regressor for every target column of y_train via the
# multi-output wrapper; fit() returns the fitted wrapper itself.
clf = MultiOutputRegressor(
    estimator=Ridge(random_state=123),
).fit(X_train, y_train)

In [8]:
# Predict the targets for the held-out rows of the train/test split.
y_pred = clf.predict(X_test)

In [9]:
# Give the predictions the target column names so they can be selected by
# label; the frame gets a default 0..n-1 row index.
y_pred = pd.DataFrame(y_pred, columns=label_measure)

In [10]:
# Score each target as 0.5 + 0.5 * max(0, R^2) and average over the targets.
per_label_scores = []
for label in label_measure:
    r2 = r2_score(y_test[label], y_pred[label])
    # Negative R^2 values are clipped to 0 before rescaling.
    per_label_scores.append(0.5 + 0.5 * np.maximum(0, r2))
task3 = np.mean(per_label_scores)
task3

0.7531804621227678

In [11]:
# Load the hand-in feature frame and keep only rows without missing values.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
# NOTE(review): any row removed by dropna() gets no prediction further down;
# confirm that the hand-in is allowed to omit rows.
with open('X_handin_eng_nopid.pkl', 'rb') as pickle_file:
    X_handin = pickle.load(pickle_file).dropna()

In [12]:
# Show the hand-in feature frame for a visual sanity check.
X_handin

Unnamed: 0,Age,EtCO2_min,EtCO2_max,EtCO2_med,EtCO2_std,EtCO2_a,EtCO2_b,PTT_min,PTT_max,PTT_med,...,ABPs_med,ABPs_std,ABPs_a,ABPs_b,pH_min,pH_max,pH_med,pH_std,pH_a,pH_b
0,39,34.0,34.0,34.0,0.0,-1.234403e-15,34.0,30.60,44.20,30.60,...,120.5,8.357087,-0.500000,129.000000,7.34,7.40,7.38,1.337116e-02,1.188811e-03,7.370606
1,84,30.0,30.0,30.0,0.0,-2.254118e-15,30.0,32.90,32.90,32.90,...,121.0,24.196168,1.804196,105.272727,7.38,7.38,7.38,9.276721e-16,-2.732902e-16,7.380000
2,62,33.0,33.0,33.0,0.0,-7.468214e-16,33.0,31.50,31.50,31.50,...,106.5,10.823655,-1.923077,121.166667,7.37,7.37,7.37,9.276721e-16,-4.989413e-16,7.370000
3,71,33.0,33.0,33.0,0.0,-7.468214e-16,33.0,32.60,150.00,32.60,...,129.5,15.863767,1.451049,118.818182,7.36,7.40,7.37,9.374369e-03,-3.496503e-04,7.373939
4,51,33.0,33.0,33.0,0.0,-7.468214e-16,33.0,32.50,35.40,32.50,...,126.0,8.404094,1.947552,113.424242,7.37,7.37,7.37,9.276721e-16,-4.989413e-16,7.370000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,29,34.0,34.0,34.0,0.0,-1.234403e-15,34.0,31.75,31.75,31.75,...,108.5,5.166911,-0.181818,109.015152,7.37,7.37,7.37,9.276721e-16,-4.989413e-16,7.370000
12660,83,30.0,30.0,30.0,0.0,-2.254118e-15,30.0,28.80,33.00,33.00,...,119.0,12.099023,-0.989510,125.181818,7.26,7.38,7.38,4.923660e-02,3.636364e-03,7.329697
12661,74,31.5,31.5,31.5,0.0,-1.615910e-15,31.5,32.40,32.80,32.80,...,119.0,3.270622,0.153846,117.166667,7.33,7.41,7.37,1.729862e-02,-1.643357e-03,7.381515
12662,40,35.0,35.0,35.0,0.0,-1.654362e-15,35.0,30.70,30.70,30.70,...,125.0,17.385774,-2.646853,145.787879,7.38,7.38,7.38,9.276721e-16,-2.732902e-16,7.380000


In [13]:
# Predict the targets for the hand-in rows. The predictions carry X_handin's
# row labels (instead of a fresh 0..n-1 index) so every predicted row can be
# traced back to its input row even if dropna() left gaps in the labels.
y_handin = pd.DataFrame(data=clf.predict(X_handin), index=X_handin.index, columns=label_measure)

In [14]:
# Show the hand-in predictions for a visual sanity check.
y_handin

Unnamed: 0,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,16.587936,84.061050,98.831290,86.616088
1,17.852835,86.517402,97.432785,90.200357
2,18.953483,73.590349,95.557363,69.400579
3,16.522814,89.934676,98.113115,91.377605
4,18.643589,90.765926,96.649855,89.913472
...,...,...,...,...
12659,17.312794,73.648424,96.983956,70.015875
12660,16.381503,82.533943,96.761890,91.403075
12661,18.379722,72.551383,97.942238,92.024140
12662,18.291592,90.234583,97.312935,109.956853


In [15]:
# y_handin is already a DataFrame, so it is written out directly; the row
# index is left out of the file.
y_handin.to_csv("y_handin_task3.csv", index=False)