In [27]:
import pandas as pd
import numpy as np
import math
from sklearn import svm
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

##init X train         
X = pd.read_csv("train_features.csv")

##init y train
y = pd.read_csv("train_labels.csv")

##init X train
X_test = pd.read_csv("test_features.csv")

##rearange 12 samples to one, because it's a time series
def widen(X, time_series_amount):
    return pd.DataFrame(data=X.values.reshape(int(X.shape[0] / time_series_amount), X.shape[1] * time_series_amount))

def fill_time_series(X, time_series_amount=12, median=None):
    
    if median is None: 
        median = X.median()   
    
    X_filled = pd.DataFrame(columns=X.columns)
    for i in range(0, X.shape[0], time_series_amount):
         series = X.iloc[i:i + time_series_amount,:].mean()
         X_filled_12 = X.iloc[i:i + time_series_amount,:].fillna(series)
         X_filled = X_filled.append(X_filled_12)
        
    return X_filled.fillna(median), median

def drop_features(X, featureNames):
    return X.drop(featureNames, axis = 1)


def write_out(target_label_predictions, target_label, y_df, folder_path):
    df = pd.DataFrame(columns=y.columns.values)
    df[target_label] = target_label_predictions
    df.to_csv(folder_path + "/" + target_label + ".csv", index=False)
    

def pp_X_and_X_test(X, X_test, drop_features_list, time_series_amount=12):
    ##train set pp
    X = drop_features(X, drop_features_list)
    X, median = fill_time_series(X, time_series_amount)
    scaler = StandardScaler().fit(X)
    X = pd.DataFrame(data=scaler.transform(X), columns=X.columns)
    X = widen(X, time_series_amount)
    
    ##test set pp
    X_test = drop_features(X_test, drop_features_list)
    X_test, _ = fill_time_series(X_test, time_series_amount, median)
    X_test = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns)
    X_test = widen(X_test, time_series_amount)
    
    return X, X_test
    
def train_GB_w_writeout_train_and_test(X, y, target_label, X_test, random_state):
    print("start calculation: " + target_label)
    gb = ensemble.GradientBoostingClassifier(random_state = random_state).fit(X, y[target_label])
    
    ##compare with train_labels & save file
    y_predict = gb.predict_proba(X)
    y_true = y[target_label]
    print(roc_auc_score(y_true, y_predict))
    write_out(y_predict, target_label, y, "predict_train_labels")
    
    ##predict and save to file
    y_test_predict = gb.predict_proba(X_test)
    write_out(y_test_predict, target_label, y, "predict_test_labels")
    print("finish calculation: " + target_label)
    
    return gb


#def train_RandomForest(X,)

X, X_test = pp_X_and_X_test(X, X_test, [])

In [41]:
from sklearn.metrics import r2_score

X_pre_test = pd.DataFrame(X.iloc[15196:,:], columns=X.columns)
y_pre_test = pd.DataFrame(y.iloc[15196:,:], columns=y.columns)
X_pre_train = pd.DataFrame(X.iloc[:15195,:], columns=X.columns)
y_pre_train = pd.DataFrame(y.iloc[:15195,:], columns=y.columns)

def train_RF_Classifier_w_writeout_train_and_test(X, y, target_label, X_test, random_state=0, n_estimators=100):
    print("start calculation: " + target_label)
    rf = RandomForestClassifier(random_state = random_state, n_estimators = n_estimators).fit(X, y[target_label])
    
    ##compare with train_labels & save file
    y_predict = rf.predict_proba(X)[:,1]
    y_true = y[target_label]
    print("train score")
    print(roc_auc_score(y_true, y_predict))
    write_out(y_predict, target_label, y, "predict_train_labels")
    
    ##predict and save to file
    y_test_predict = rf.predict_proba(X_test)[:,1]
    write_out(y_test_predict, target_label, y, "predict_test_labels")
    print("finish calculation: " + target_label)
    
    return rf

def train_RF_Regressor_w_writeout_train_and_test(X, y, target_label, X_test, random_state=0, n_estimators=100):
    print("start calculation: " + target_label)
    rf = RandomForestRegressor(random_state = random_state, n_estimators = n_estimators).fit(X, y[target_label])
    
    ##compare with train_labels & save file
    y_predict = rf.predict(X)
    y_true = y[target_label]
    print(r2_score(y_true, y_predict))
    write_out(y_predict, target_label, y, "predict_train_labels")
    
    ##predict and save to file
    y_test_predict = rf.predict(X_test)
    write_out(y_test_predict, target_label, y, "predict_test_labels")
    print("finish calculation: " + target_label)
    
    return rf

def mymodel(X_train, y_train, X_pre_test, y_pre_test, X_test, random_state=0, n_estimators=100):
    LABELS = ["LABEL_BaseExcess","LABEL_Fibrinogen","LABEL_AST","LABEL_Alkalinephos","LABEL_Bilirubin_total","LABEL_Lactate","LABEL_TroponinI","LABEL_SaO2","LABEL_Bilirubin_direct","LABEL_EtCO2","LABEL_Sepsis"]
    for label in LABELS:

        classifier_model = train_RF_Classifier_w_writeout_train_and_test(X_train, y_train, label, X_test, random_state, n_estimators)
        y_pred_pretest = classifier_model.predict(X_pre_test)
        print("test score")
        print(roc_auc_score(y_pre_test[label], y_pred_pretest))

mymodel(X_pre_train, y_pre_train, X_pre_test, y_pre_test, X_test)






#classifier_model = train_RF_Classifier_w_writeout_train_and_test(X_pre_train, y_pre_train, "LABEL_BaseExcess", X_test, 0, 100)

#regressor_model = train_RF_Regressor_w_writeout_train_and_test(X, y, "LABEL_RRate", X_test, 0, 10)

start calculation: LABEL_BaseExcess




inhouse score
0.9993511460018227
finish calculation: LABEL_BaseExcess
outside score
0.8043177358959832
start calculation: LABEL_Fibrinogen




inhouse score
0.9997460536425471
finish calculation: LABEL_Fibrinogen
outside score
0.5734553193222548
start calculation: LABEL_AST




inhouse score
0.9993867349157162
finish calculation: LABEL_AST
outside score
0.6134294735429524
start calculation: LABEL_Alkalinephos




inhouse score
0.9992746648733615
finish calculation: LABEL_Alkalinephos
outside score
0.6072644935900395
start calculation: LABEL_Bilirubin_total




inhouse score
0.9993090232747809
finish calculation: LABEL_Bilirubin_total
outside score
0.5891479654093056
start calculation: LABEL_Lactate




inhouse score
0.9993028853620072
finish calculation: LABEL_Lactate
outside score
0.6242306707629288
start calculation: LABEL_TroponinI




inhouse score
0.9996092321755028
finish calculation: LABEL_TroponinI
outside score
0.6908002336448598
start calculation: LABEL_SaO2




inhouse score
0.9993289176496845
finish calculation: LABEL_SaO2
outside score
0.6667923803437822
start calculation: LABEL_Bilirubin_direct




inhouse score
0.9998603350090461
finish calculation: LABEL_Bilirubin_direct
outside score
0.5657999226242594
start calculation: LABEL_EtCO2




inhouse score
0.9997929158531755
finish calculation: LABEL_EtCO2
outside score
0.7530677814512493
start calculation: LABEL_Sepsis




inhouse score
0.9998248903947964
finish calculation: LABEL_Sepsis
outside score
0.49902696691687515
