In [157]:
import pandas as pd
import numpy as np
import math
from sklearn import svm
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

##load the training features (X) from CSV
X = pd.read_csv("train_features.csv")

##load the training labels (y) from CSV
y = pd.read_csv("train_labels.csv")

##load the test features (X_test) from CSV
X_test = pd.read_csv("test_features.csv")

##flatten each run of `time_series_amount` consecutive rows into a single wide row
def widen(X, time_series_amount):
    """Reshape a long frame into one row per time series.

    Rows are taken in order, `time_series_amount` at a time, and laid out
    side by side (row-major), so the result has
    ``X.shape[1] * time_series_amount`` columns. The returned frame has a
    fresh default index and default integer column labels.

    Parameters
    ----------
    X : pandas.DataFrame
        Long-format frame whose rows are grouped in consecutive series.
    time_series_amount : int
        Number of consecutive rows that make up one series.

    Returns
    -------
    pandas.DataFrame
        The widened frame.
    """
    n_series = int(X.shape[0] / time_series_amount)
    n_wide_columns = X.shape[1] * time_series_amount
    flattened = X.values.reshape(n_series, n_wide_columns)
    return pd.DataFrame(data=flattened)

def fill_time_series(X, time_series_amount=12, median=None):
    """Impute missing values series by series, then with a global fallback.

    The rows of ``X`` are processed in consecutive chunks of
    ``time_series_amount`` rows. Inside each chunk a missing value is replaced
    by that chunk's column mean. Values that are still missing afterwards
    (the column was entirely missing in the chunk) are replaced by ``median``.

    Parameters
    ----------
    X : pandas.DataFrame
        Long-format feature frame.
    time_series_amount : int, default 12
        Number of consecutive rows that form one series.
    median : pandas.Series or None, default None
        Per-column fallback values. When None, ``X.median()`` is used, so the
        caller can reuse the returned value for another data set.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The imputed frame (original index and column dtypes preserved) and the
        fallback values that were applied.
    """
    if median is None:
        median = X.median()

    ##collect the imputed chunks and concatenate once: DataFrame.append was
    ##removed in pandas 2.0 and growing a frame inside a loop is quadratic
    filled_chunks = []
    for start in range(0, X.shape[0], time_series_amount):
        chunk = X.iloc[start:start + time_series_amount, :]
        filled_chunks.append(chunk.fillna(chunk.mean()))

    if filled_chunks:
        X_filled = pd.concat(filled_chunks)
    else:
        ##pd.concat rejects an empty list, so keep an empty frame with X's columns
        X_filled = X.iloc[0:0].copy()

    return X_filled.fillna(median), median

def drop_features(X, featureNames):
    """Return ``X`` without the columns named in ``featureNames``.

    The input frame is not modified; a new frame is returned.
    """
    return X.drop(columns=featureNames)


def write_out(target_label_predictions, target_label, y_df, folder_path):
    """Write the predictions for one label to ``<folder_path>/<target_label>.csv``.

    The file has the same columns as ``y_df``; only the ``target_label``
    column is populated, all other columns are left empty.

    Parameters
    ----------
    target_label_predictions : array-like
        One prediction per row to write.
    target_label : str
        Column to fill; also used as the file name (without extension).
    y_df : pandas.DataFrame
        Frame whose column names define the layout of the output file.
    folder_path : str
        Output directory. It is created when it does not exist yet.
    """
    import os  ##local import so the function does not rely on another cell

    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    ##take the layout from the y_df argument rather than the notebook-global `y`
    df = pd.DataFrame(columns=y_df.columns.values)
    df[target_label] = target_label_predictions
    df.to_csv(folder_path + "/" + target_label + ".csv", index=False)
    

def pp_X_and_X_test(X, X_test, drop_features_list=[], time_series_amount=12):
    """Preprocess the train and test features with statistics taken from train.

    Both frames go through the same steps: drop the listed columns, impute
    missing values with ``fill_time_series``, standardise with a
    ``StandardScaler`` and widen each series into a single row. The fallback
    median and the scaler are both obtained from the training frame and then
    reused for the test frame.

    Parameters
    ----------
    X, X_test : pandas.DataFrame
        Long-format train and test features.
    drop_features_list : list, default []
        Column names removed from both frames before any other step.
    time_series_amount : int, default 12
        Number of consecutive rows that form one series.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The widened train and test frames.
    """
    ##train set: drop -> impute -> fit scaler -> standardise -> widen
    train_kept = drop_features(X, drop_features_list)
    train_filled, train_median = fill_time_series(train_kept, time_series_amount)
    scaler = StandardScaler().fit(train_filled)
    train_scaled = pd.DataFrame(data=scaler.transform(train_filled), columns=train_filled.columns)
    train_wide = widen(train_scaled, time_series_amount)

    ##test set: same steps, reusing the train median and the fitted scaler
    test_kept = drop_features(X_test, drop_features_list)
    test_filled, _ = fill_time_series(test_kept, time_series_amount, train_median)
    test_scaled = pd.DataFrame(data=scaler.transform(test_filled), columns=test_filled.columns)
    test_wide = widen(test_scaled, time_series_amount)

    return train_wide, test_wide
    
def train_GB_w_writeout_train_and_test(X, y, target_label, X_test, random_state=0):
    """Fit a gradient-boosting classifier for one label and write its predictions.

    The positive-class probabilities for the training rows are written to
    ``predict_train_labels`` and those for the test rows to
    ``predict_test_labels``. The ROC AUC on the training rows is printed.

    Parameters
    ----------
    X, X_test : preprocessed train / test feature matrices.
    y : pandas.DataFrame holding the ``target_label`` column.
    target_label : str, label column to learn.
    random_state : seed passed to the classifier.

    Returns
    -------
    The fitted ``GradientBoostingClassifier``.
    """
    print("start calculation: " + target_label)
    classifier = ensemble.GradientBoostingClassifier(random_state=random_state)
    classifier.fit(X, y[target_label])

    ##score on the training rows (in-sample) and store those probabilities
    train_probabilities = classifier.predict_proba(X)[:, 1]
    train_truth = y[target_label]
    print(roc_auc_score(train_truth, train_probabilities))
    write_out(train_probabilities, target_label, y, "predict_train_labels")

    ##store the probabilities for the test rows
    test_probabilities = classifier.predict_proba(X_test)[:, 1]
    write_out(test_probabilities, target_label, y, "predict_test_labels")
    print("finish calculation: " + target_label)

    return classifier

def train_KR_w_writeout_train_and_test(X, y, target_label, X_test, alpha=1, degree=3):
    """Fit a polynomial kernel ridge model on a single feature column.

    Only the column ``target_label`` of ``X`` / ``X_test`` is used as input;
    it is preprocessed with ``pp_X_and_X_test``. The model is trained against
    the label column ``"LABEL_" + target_label`` of ``y``. Train and test
    predictions are written to ``predict_train_labels`` and
    ``predict_test_labels``; the model's score on the training rows is printed.

    Parameters
    ----------
    X, X_test : raw long-format train / test features.
    y : pandas.DataFrame with the label columns.
    target_label : str, feature column name (without the ``LABEL_`` prefix).
    alpha, degree : passed to ``KernelRidge``.

    Returns
    -------
    (model, X_preprocessed, X_test_preprocessed)
    """
    print("start calculation: " + target_label)
    train_features, test_features = pp_X_and_X_test(
        X[target_label].to_frame(),
        X_test[target_label].to_frame(),
    )

    ##from here on the name refers to the label column, not the feature column
    label_column = "LABEL_" + target_label

    model = KernelRidge(alpha=alpha, kernel="poly", degree=degree)
    model.fit(train_features, y[label_column])

    ##score on the training rows (in-sample) and store those predictions
    train_predictions = model.predict(train_features)
    train_truth = y[label_column]
    print(model.score(train_features, train_truth))
    write_out(train_predictions, label_column, y, "predict_train_labels")

    ##store the predictions for the test rows
    test_predictions = model.predict(test_features)
    write_out(test_predictions, label_column, y, "predict_test_labels")
    print("finish calculation: " + label_column)

    return model, train_features, test_features

def train_GBR_w_writeout_train_and_test(X, y, target_label, X_test, random_state=0):
    """Fit a gradient-boosting regressor for one label and write its predictions.

    Train predictions go to ``predict_train_labels`` and test predictions to
    ``predict_test_labels``. The regressor's score on the training rows is
    printed.

    Parameters
    ----------
    X, X_test : preprocessed train / test feature matrices.
    y : pandas.DataFrame holding the ``target_label`` column.
    target_label : str, label column to learn.
    random_state : seed passed to the regressor.

    Returns
    -------
    The fitted ``GradientBoostingRegressor``.
    """
    print("start calculation: " + target_label)
    regressor = GradientBoostingRegressor(random_state=random_state)
    regressor.fit(X, y[target_label])

    ##score on the training rows (in-sample) and store those predictions
    train_predictions = regressor.predict(X)
    train_truth = y[target_label]
    print(regressor.score(X, train_truth))
    write_out(train_predictions, target_label, y, "predict_train_labels")

    ##store the predictions for the test rows
    test_predictions = regressor.predict(X_test)
    write_out(test_predictions, target_label, y, "predict_test_labels")
    print("finish calculation: " + target_label)

    return regressor

##random-forest regressor
def train_RFR_w_writeout_train_and_test(X, y, target_label, X_test, random_state=0):
    """Fit a random-forest regressor for one label and write its predictions.

    Train predictions go to ``predict_train_labels`` and test predictions to
    ``predict_test_labels``. The R^2 on the training rows is printed.

    Parameters
    ----------
    X, X_test : preprocessed train / test feature matrices.
    y : pandas.DataFrame holding the ``target_label`` column.
    target_label : str, label column to learn.
    random_state : seed passed to the regressor.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    print("start calculation: " + target_label)
    forest = RandomForestRegressor(random_state=random_state)
    forest.fit(X, y[target_label])

    ##score on the training rows (in-sample) and store those predictions
    train_predictions = forest.predict(X)
    train_truth = y[target_label]
    print(r2_score(train_truth, train_predictions))
    write_out(train_predictions, target_label, y, "predict_train_labels")

    ##store the predictions for the test rows
    test_predictions = forest.predict(X_test)
    write_out(test_predictions, target_label, y, "predict_test_labels")
    print("finish calculation: " + target_label)

    return forest

def test_R(X, y, m, target_label, test_size=0.2, random_state=None):
    """Hold-out evaluation of a regressor for one label.

    Splits ``X`` / ``y`` into a train and a hold-out part, fits ``m`` on the
    train part (the passed estimator is fitted in place), prints the R^2 on
    the train part and returns the R^2 on the hold-out part.

    Parameters
    ----------
    X : feature matrix with one row per row of ``y``.
    y : pandas.DataFrame holding the ``target_label`` column.
    m : unfitted regressor exposing ``fit`` and ``predict``.
    target_label : str, label column to evaluate.
    test_size : float, default 0.2
        Fraction of rows held out.
    random_state : int or None, default None
        Seed for the split. The default keeps the split random on every call;
        pass an int to make scores comparable between runs and models.

    Returns
    -------
    float
        R^2 on the hold-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    m.fit(X_train, y_train[target_label])

    ##train-part score, printed so over-fitting is visible next to the return value
    print(r2_score(y_train[target_label], m.predict(X_train)))

    return r2_score(y_test[target_label], m.predict(X_test))

def test_C(X,y, m, target_label, test_size=0.2, random_state=None):
    """Hold-out evaluation of a probabilistic classifier for one label.

    Splits ``X`` / ``y`` into a train and a hold-out part, fits ``m`` on the
    train part (the passed estimator is fitted in place), prints the ROC AUC
    on the train part and returns the ROC AUC on the hold-out part. The score
    is computed from column 1 of ``predict_proba``.

    Parameters
    ----------
    X : feature matrix with one row per row of ``y``.
    y : pandas.DataFrame holding the ``target_label`` column.
    m : unfitted classifier exposing ``fit`` and ``predict_proba``.
    target_label : str, label column to evaluate.
    test_size : float, default 0.2
        Fraction of rows held out.
    random_state : int or None, default None
        Seed for the split. The default keeps the split random on every call;
        pass an int to make scores comparable between runs and models.

    Returns
    -------
    float
        ROC AUC on the hold-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    m.fit(X_train, y_train[target_label])

    ##train-part score, printed so over-fitting is visible next to the return value
    print(roc_auc_score(y_train[target_label], m.predict_proba(X_train)[:,1]))

    return roc_auc_score(y_test[target_label], m.predict_proba(X_test)[:,1])

In [3]:
##preprocess train and test features after dropping the "pid", "Time" and "Age" columns
X_PP, X_test_PP = pp_X_and_X_test(X, X_test, ["pid", "Time","Age"])

In [26]:
##fit one gradient-boosting classifier per label and write its train/test probabilities to CSV
LABELS = ["LABEL_BaseExcess","LABEL_Fibrinogen","LABEL_AST", "LABEL_Alkalinephos","LABEL_Bilirubin_total",
          "LABEL_Lactate","LABEL_TroponinI","LABEL_SaO2","LABEL_Bilirubin_direct","LABEL_EtCO2","LABEL_Sepsis"]
for label in LABELS:
    train_GB_w_writeout_train_and_test(X_PP, y, label, X_test_PP, 0)    

start calculation: LABEL_BaseExcess
0.9322069153750262
finish calculation: LABEL_BaseExcess
start calculation: LABEL_Fibrinogen
0.8518248284821175
finish calculation: LABEL_Fibrinogen
start calculation: LABEL_AST,LABEL_Alkalinephos


KeyError: 'LABEL_AST,LABEL_Alkalinephos'

In [27]:
##same per-label classifier loop, restricted to the labels from LABEL_AST onwards
##NOTE(review): this overwrites LABELS from the previous cell; presumably a resume after that cell's failed run - confirm
LABELS = ["LABEL_AST", "LABEL_Alkalinephos","LABEL_Bilirubin_total",
          "LABEL_Lactate","LABEL_TroponinI","LABEL_SaO2","LABEL_Bilirubin_direct","LABEL_EtCO2","LABEL_Sepsis"]
for label in LABELS:
    train_GB_w_writeout_train_and_test(X_PP, y, label, X_test_PP, 0)  

start calculation: LABEL_AST
0.79361433770905
finish calculation: LABEL_AST
start calculation: LABEL_Alkalinephos
0.7962776421963176
finish calculation: LABEL_Alkalinephos
start calculation: LABEL_Bilirubin_total
0.7949649397585792
finish calculation: LABEL_Bilirubin_total
start calculation: LABEL_Lactate
0.8371177614413499
finish calculation: LABEL_Lactate
start calculation: LABEL_TroponinI
0.9258444660463825
finish calculation: LABEL_TroponinI
start calculation: LABEL_SaO2
0.853620202988562
finish calculation: LABEL_SaO2
start calculation: LABEL_Bilirubin_direct
0.864475838810551
finish calculation: LABEL_Bilirubin_direct
start calculation: LABEL_EtCO2
0.9507724427876676
finish calculation: LABEL_EtCO2
start calculation: LABEL_Sepsis
0.817322660132909
finish calculation: LABEL_Sepsis


In [64]:
##fit a gradient-boosting regressor for LABEL_RRate, write its predictions and keep the fitted model
m = train_GBR_w_writeout_train_and_test(X_PP, y, "LABEL_RRate", X_test_PP, 0)

start calculation: LABEL_RRate
0.49632463396521376
finish calculation: LABEL_RRate


In [96]:
##hold-out R^2 of a random forest on LABEL_RRate (prints the train-part score, returns the hold-out score)
##test_R is the helper that accepts (X, y, model, label); test_RFR has a different signature
test_R(X_PP, y, RandomForestRegressor(n_estimators=200, max_features=10, random_state=0), "LABEL_RRate")

0.917827747111051


0.4090262891615185

In [None]:
##fit the random-forest regressor on the preprocessed (imputed, scaled, widened) features;
##the raw X / X_test hold 12 rows per series, so their row count does not match y
train_RFR_w_writeout_train_and_test(X_PP, y, "LABEL_RRate", X_test_PP, random_state=0)

In [106]:
##hold-out R^2 of a gradient-boosting regressor on LABEL_RRate
##test_GBR is not defined in this notebook; test_R provides the same (X, y, model, label) check
test_R(X_PP, y, GradientBoostingRegressor(random_state=0, subsample=1), "LABEL_RRate")

0.5089099135097332


0.43138793826861255

In [None]:
def test_RFR(X,y, target_label, random_state=0):
    """Hold-out R^2 of a default random-forest regressor for one label.

    Parameters
    ----------
    X : feature matrix with one row per row of ``y``.
    y : pandas.DataFrame holding the ``target_label`` column.
    target_label : str, label column to evaluate.
    random_state : seed passed to the ``RandomForestRegressor``
        (the train/hold-out split itself is not seeded).

    Returns
    -------
    float
        R^2 on the 20% of rows held out from training.
    """
    m = RandomForestRegressor(random_state=random_state)
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
    m.fit(X_train, y_train[target_label])

    ##predict with the forest fitted above (the original used an unrelated name, `gbr`)
    return r2_score(y_test[target_label], m.predict(X_test))

In [119]:
##fit one gradient-boosting regressor per regression label and write its train/test predictions to CSV
LABELS_REGRESSION = ["LABEL_RRate", "LABEL_ABPm", "LABEL_SpO2", "LABEL_Heartrate"]
for label in LABELS_REGRESSION:
    train_GBR_w_writeout_train_and_test(X_PP, y, label, X_test_PP, random_state=0)

start calculation: LABEL_RRate
0.49632463396521376
finish calculation: LABEL_RRate
start calculation: LABEL_ABPm
0.6655000904291237
finish calculation: LABEL_ABPm
start calculation: LABEL_SpO2
0.5732118245740526
finish calculation: LABEL_SpO2
start calculation: LABEL_Heartrate
0.7031271276684545
finish calculation: LABEL_Heartrate


In [123]:
import sklearn.metrics as metrics

##label columns scored with R^2 in get_score (task 3)
VITALS = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
##label columns scored with ROC AUC in get_score (task 1)
TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']


def get_score(df_true, df_submission):
    """Compute the combined score of a submission against the true labels.

    Both frames are sorted by ``pid`` before comparison. Three sub-scores are
    computed, printed, and averaged:

    * task 1 - mean ROC AUC over the columns in ``TESTS``
    * task 2 - ROC AUC of ``LABEL_Sepsis``
    * task 3 - mean of ``0.5 + 0.5 * max(0, R^2)`` over the columns in ``VITALS``

    Parameters
    ----------
    df_true : pandas.DataFrame with ``pid`` and the true label columns.
    df_submission : pandas.DataFrame with ``pid`` and the predicted columns.

    Returns
    -------
    The mean of the three sub-scores.
    """
    submission_sorted = df_submission.sort_values('pid')
    truth_sorted = df_true.sort_values('pid')

    test_aucs = []
    for entry in TESTS:
        test_aucs.append(metrics.roc_auc_score(truth_sorted[entry], submission_sorted[entry]))
    task1 = np.mean(test_aucs)

    task2 = metrics.roc_auc_score(truth_sorted['LABEL_Sepsis'], submission_sorted['LABEL_Sepsis'])

    vital_scores = []
    for entry in VITALS:
        r2 = metrics.r2_score(truth_sorted[entry], submission_sorted[entry])
        ##negative R^2 is clipped to 0 before rescaling into [0.5, 1]
        vital_scores.append(0.5 + 0.5 * np.maximum(0, r2))
    task3 = np.mean(vital_scores)

    score = np.mean([task1, task2, task3])
    print(task1, task2, task3)
    return score



##take one id per series: column 0 (presumably pid - confirm) of every 12th row of the training features
X_pid = pd.read_csv("train_features.csv")
pid = X_pid.values[::12,0]
pid = pd.Series(pid.astype("int64"))

##reload the labels; their column names define the layout of the combined frame
y = pd.read_csv("train_labels.csv")
df = pd.DataFrame(columns=y.columns.values)

##collect the per-label train-set prediction files (every column after the first)
for i in y.columns.values[1:]:
    df[i] = pd.read_csv("predict_train_labels/" + i + ".csv")[i]
    
df["pid"] = pid
##NOTE(review): these are train-set predictions but they are written to "test_labels.csv",
##the same file name the test-prediction cell writes - confirm this is intended
df.to_csv("test_labels" + ".csv", index=False, float_format="%.4f")
df

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,1,0.579958,0.055937,0.075863,0.066241,0.074032,0.130279,0.011235,0.123093,0.010962,0.009420,0.038141,18.175508,72.727117,98.330778,76.352025
1,10,0.045994,0.027821,0.222830,0.185259,0.194169,0.080819,0.139353,0.081541,0.020144,0.074904,0.036744,17.681424,99.786287,97.254178,79.011318
2,100,0.148693,0.027449,0.103513,0.106858,0.108417,0.116313,0.016931,0.102715,0.011324,0.009526,0.036139,16.622710,82.750322,96.598218,101.350109
3,1000,0.028951,0.037321,0.118134,0.109295,0.116196,0.712392,0.047374,0.630952,0.020676,0.858167,0.033221,16.686595,80.852156,96.677929,86.013362
4,10000,0.568221,0.029558,0.073086,0.076091,0.087201,0.141442,0.009345,0.489074,0.011677,0.006205,0.042732,16.087969,71.270272,97.204971,84.864697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18990,9993,0.032034,0.060150,0.461343,0.461353,0.464584,0.097716,0.820460,0.077706,0.041463,0.032963,0.040451,18.611987,79.809352,98.665491,104.743554
18991,9995,0.351892,0.068722,0.169915,0.135952,0.144181,0.180746,0.020337,0.169814,0.020936,0.011002,0.085228,20.315793,97.409080,95.857317,66.818651
18992,9996,0.488265,0.030718,0.143223,0.145342,0.122289,0.098703,0.035872,0.684148,0.017451,0.015713,0.040187,18.672412,71.521867,97.949213,93.711040
18993,9998,0.070815,0.045381,0.117262,0.125406,0.119229,0.106495,0.764195,0.152955,0.016894,0.019318,0.030239,19.775108,84.057145,98.587018,95.726379


In [124]:
##score the assembled predictions in df against the labels in y
get_score(y,df)

0.8600719375595605 0.817322660132909 0.8047704595796055


0.827388352424025

In [143]:
##drop all labels except RRate, ABPm, SpO2, Heartrate
##feature columns to drop before preprocessing for the regression targets
##(per the original note, this keeps only RRate, ABPm, SpO2 and Heartrate - TODO confirm against the CSV header)
drop_LABELS_RR = ["pid","Time","Age","EtCO2","PTT","BUN",
                  "Lactate","Temp","Hgb","HCO3","BaseExcess","Fibrinogen",
                  "Phosphate","WBC","Creatinine","PaCO2","AST","FiO2","Platelets",
                  "SaO2","Glucose","Magnesium","Potassium","ABPd","Calcium","Alkalinephos",
                  "Bilirubin_direct","Chloride","Hct","Bilirubin_total","TroponinI","ABPs","pH"]
X_PP_reg, X_test_PP_reg = pp_X_and_X_test(X, X_test, drop_LABELS_RR, time_series_amount=12)

In [144]:
##hold-out R^2 of a gradient-boosting regressor on the reduced feature set
##test_GBR is not defined in this notebook; test_R provides the same (X, y, model, label) check
test_R(X_PP_reg, y, GradientBoostingRegressor(random_state=0, subsample=1), "LABEL_RRate")

0.49029208973865235


0.3959344003963461

In [154]:
##max_features
##hold-out R^2 of a random forest on the reduced feature set
##test_R is the helper that accepts (X, y, model, label); test_RFR has a different signature
test_R(X_PP_reg, y, RandomForestRegressor(n_estimators=400, min_samples_leaf=5, random_state=0), "LABEL_RRate")

0.7926224824279963


0.4228478847534495

In [161]:
from sklearn.kernel_ridge import KernelRidge
##hold-out R^2 of an RBF kernel ridge model on the reduced feature set
##NOTE(review): the original passed a global `target_label` that no cell defines;
##"LABEL_RRate" is assumed from the neighbouring cells - confirm
test_R(X_PP_reg, y, KernelRidge(alpha=1,kernel="rbf", coef0=1), "LABEL_RRate")

0.311544294390816


0.09563814739104937

In [171]:
from sklearn.neural_network import MLPRegressor
##hold-out R^2 of an MLP with hidden layers of 1000 and 100 units on LABEL_RRate
test_R(X_PP, y, MLPRegressor(random_state=0, hidden_layer_sizes=(1000,100), max_iter=2000), "LABEL_RRate")

0.9263274922850223


0.02683282203802495

In [None]:
##hold-out R^2 of a gradient-boosting regressor on the full preprocessed feature set
##test_GBR is not defined in this notebook; test_R provides the same (X, y, model, label) check
test_R(X_PP, y, GradientBoostingRegressor(random_state=0, subsample=1), "LABEL_RRate")

In [145]:
##display the widened regression feature matrix
X_PP_reg

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,-0.431680,0.100662,0.844154,0.551371,-0.431680,-0.085332,0.844154,0.836659,-0.025517,-0.519317,...,0.844154,-1.046239,-0.025517,-0.829306,0.844154,-0.989181,-0.025517,-1.821271,0.844154,-0.875066
1,-0.007055,1.199714,0.137771,-0.314865,0.989890,1.092627,0.490963,-0.133319,0.989890,1.588610,...,0.490963,-0.475664,-0.634761,1.030629,0.490963,-0.589779,-0.431680,1.216623,0.137771,-0.532721
2,-0.837843,1.278621,0.844154,0.551371,-1.040924,-0.581315,0.490963,0.380199,-1.040924,-0.705310,...,-0.568612,1.635464,-0.228598,-0.333323,-0.215420,1.293119,-1.040924,0.100662,-0.215420,1.635464
3,-1.244005,0.066845,0.426746,0.115660,-1.244005,-0.023334,0.844154,1.464291,-1.244005,-0.147330,...,0.844154,-0.304491,-1.244005,-0.457319,0.844154,-0.304491,-1.244005,-0.705310,-0.921803,-0.304491
4,-1.225543,-0.823670,0.330421,-0.413419,-1.447087,-0.891304,0.844154,-1.160354,-1.244005,-1.449284,...,0.137771,0.323141,-0.431680,-0.643312,-0.215420,0.323141,-2.056331,-0.953302,0.490963,0.323141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18990,0.202949,-0.175511,0.587288,1.625090,0.202949,0.658642,-0.921803,2.833671,1.396053,0.596644,...,0.844154,1.121946,-0.431680,-0.085332,0.844154,1.236061,-0.025517,-0.395321,0.844154,1.578406
18991,0.288336,1.278621,-1.082345,-1.238159,-0.228598,0.720640,-0.568612,-1.160354,-0.025517,0.968632,...,-1.274995,-1.388584,1.192971,1.464614,-1.274995,-1.274469,0.583727,2.518578,-0.568612,-1.160354
18992,-1.650168,0.100662,0.844154,0.094911,-1.650168,-0.271325,0.844154,0.551371,-0.431680,-0.581315,...,0.844154,0.893716,-0.228598,-0.643312,0.844154,0.665486,-0.025517,-0.891304,0.844154,0.608429
18993,0.472955,0.083753,0.844154,0.748479,0.177564,-0.643312,0.844154,1.236061,0.177564,-0.395321,...,0.844154,0.950774,-0.025517,-0.333323,0.844154,0.665486,0.380646,0.782638,0.844154,0.893716


In [126]:
##take one id per series: column 0 (presumably pid - confirm) of every 12th row of the test features
X_pid = pd.read_csv("test_features.csv")
pid = X_pid.values[::12,0]
pid = pd.Series(pid.astype("int64"))

##the training labels are loaded only for their column names, which define the output layout
y = pd.read_csv("train_labels.csv")
df = pd.DataFrame(columns=y.columns.values)

##collect the per-label test-set prediction files (every column after the first)
for i in y.columns.values[1:]:
    df[i] = pd.read_csv("predict_test_labels/" + i + ".csv")[i]
    
df["pid"] = pid
##write the combined predictions with four decimal places
df.to_csv("test_labels" + ".csv", index=False, float_format="%.4f")
df

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0,0.892975,0.501668,0.606478,0.677912,0.703910,0.405850,0.022601,0.425893,0.068013,0.008743,0.056529,14.531436,81.937926,98.632684,83.674579
1,10001,0.062226,0.034116,0.289485,0.293442,0.295673,0.083600,0.062619,0.084182,0.023482,0.020238,0.026843,17.892099,90.426451,94.661882,97.605498
2,10003,0.028638,0.026378,0.179561,0.173847,0.182386,0.279887,0.039660,0.302964,0.021239,0.031942,0.049623,16.920881,82.148005,97.969036,86.836794
3,10004,0.040161,0.027720,0.252851,0.242700,0.281055,0.059366,0.040277,0.078581,0.014772,0.024869,0.025658,16.802586,76.819207,95.545708,91.943939
4,10005,0.124222,0.044688,0.159199,0.153118,0.150913,0.102789,0.013206,0.125934,0.016624,0.008792,0.038589,19.215745,73.614120,95.916934,62.478848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12659,9989,0.242122,0.028755,0.140995,0.139686,0.147240,0.200412,0.078046,0.073129,0.011749,0.007377,0.030815,20.674633,73.787970,95.518546,101.956974
12660,9991,0.737262,0.102193,0.179199,0.177297,0.173635,0.291565,0.013167,0.168148,0.022731,0.008707,0.059218,18.996400,93.765079,98.605032,73.273979
12661,9992,0.833868,0.065943,0.100991,0.096709,0.107772,0.163054,0.011007,0.673659,0.012367,0.008323,0.059499,18.531658,64.699076,96.997185,82.990448
12662,9994,0.976865,0.893502,0.903025,0.907083,0.915331,0.896697,0.013208,0.964530,0.038671,0.012899,0.224177,16.390786,90.960492,98.436110,96.381440
