In [None]:
import pandas as pd
import numpy as np
import math
from sklearn import svm
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

##init X train         
X = pd.read_csv("train_features.csv")

##init y train
y = pd.read_csv("train_labels.csv")

##init X train
X_test = pd.read_csv("test_features.csv")

##rearange 12 samples to one, because it's a time series
def widen(X, time_series_amount):
    return pd.DataFrame(data=X.values.reshape(int(X.shape[0] / time_series_amount), X.shape[1] * time_series_amount))

def fill_time_series(X, time_series_amount=12, median=None):
    
    if median is None: 
        median = X.median()   
    
    X_filled = pd.DataFrame(columns=X.columns)
    for i in range(0, X.shape[0], time_series_amount):
         series = X.iloc[i:i + time_series_amount,:].mean()
         X_filled_12 = X.iloc[i:i + time_series_amount,:].fillna(series)
         X_filled = X_filled.append(X_filled_12)
        
    return X_filled.fillna(median), median

def drop_features(X, featureNames):
    return X.drop(featureNames, axis = 1)


def write_out(target_label_predictions, target_label, y_df, folder_path):
    df = pd.DataFrame(columns=y.columns.values)
    df[target_label] = target_label_predictions
    df.to_csv(folder_path + "/" + target_label + ".csv", index=False)
    

def pp_X_and_X_test(X, X_test, drop_features_list, time_series_amount=12):
    ##train set pp
    X = drop_features(X, drop_features_list)
    X, median = fill_time_series(X, time_series_amount)
    scaler = StandardScaler().fit(X)
    X = pd.DataFrame(data=scaler.transform(X), columns=X.columns)
    X = widen(X, time_series_amount)
    
    ##test set pp
    X_test = drop_features(X_test, drop_features_list)
    X_test, _ = fill_time_series(X_test, time_series_amount, median)
    X_test = pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns)
    X_test = widen(X_test, time_series_amount)
    
    return X, X_test
    
def train_GB_w_writeout_train_and_test(X, y, target_label, X_test, random_state):
    print("start calculation: " + target_label)
    gb = ensemble.GradientBoostingClassifier(random_state = random_state).fit(X, y[target_label])
    
    ##compare with train_labels & save file
    y_predict = gb.predict_proba(X)
    y_true = y[target_label]
    print(roc_auc_score(y_true, y_predict))
    write_out(y_predict, target_label, y, "predict_train_labels")
    
    ##predict and save to file
    y_test_predict = gb.predict_proba(X_test)
    write_out(y_test_predict, target_label, y, "predict_test_labels")
    print("finish calculation: " + target_label)
    
    return gb


#def train_RandomForest(X,)

X, X_test = pp_X_and_X_test(X, X_test, [])

In [26]:
from sklearn.metrics import r2_score

X_pre_test = pd.DataFrame(X.iloc[15196:,:], columns=X.columns)
y_pre_test = pd.DataFrame(y.iloc[15196:,:], columns=y.columns)
X_pre_train = X.iloc[:15195,:]
y_pre train = y.iloc[:15195,:]

print(X_pretest)
print(y_pretest)

def train_RF_Classifier_w_writeout_train_and_test(X, y, target_label, X_test, random_state=0, n_estimators=100, max_depth=3):
    print("start calculation: " + target_label)
    rf = RandomForestClassifier(random_state = random_state).fit(X, y[target_label])
    
    ##compare with train_labels & save file
    y_predict = rf.predict_proba(X)[:,1]
    y_true = y[target_label]
    print(roc_auc_score(y_true, y_predict))
    write_out(y_predict, target_label, y, "predict_train_labels")
    
    ##predict and save to file
    y_test_predict = rf.predict_proba(X_test)[:,1]
    write_out(y_test_predict, target_label, y, "predict_test_labels")
    print("finish calculation: " + target_label)
    
    return rf

def train_RF_Regressor_w_writeout_train_and_test(X, y, target_label, X_test, random_state=0, n_estimators=100, max_depth=3):
    print("start calculation: " + target_label)
    rf = RandomForestRegressor(random_state = random_state).fit(X, y[target_label])
    
    ##compare with train_labels & save file
    y_predict = rf.predict(X)
    y_true = y[target_label]
    print(r2_score(y_true, y_predict))
    write_out(y_predict, target_label, y, "predict_train_labels")
    
    ##predict and save to file
    y_test_predict = rf.predict(X_test)
    write_out(y_test_predict, target_label, y, "predict_test_labels")
    print("finish calculation: " + target_label)
    
    return rf

classifier_model = train_RF_Classifier_w_writeout_train_and_test(X, y, "LABEL_BaseExcess", X_test, 0, 10)

y_pred_pretest = classifier_model.predict(X_pretest)
print(r2_score(y_pretest, y_pred_pretest["LABEL_BaseExcess"]))

#regressor_model = train_RF_Regressor_w_writeout_train_and_test(X, y, "LABEL_RRate", X_test, 0, 10)

Empty DataFrame
Columns: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, ...]
Index: []

[0 rows x 444 columns]
Empty DataFrame
Columns: [pid, LABEL_BaseExcess, LABEL_Fibrinogen, LABEL_AST, LABEL_Alkalinephos, LABEL_Bilirubin_total, LABEL_Lactate, LABEL_TroponinI, LABEL_SaO2, LABEL_Bilirubin_direct, LABEL_EtCO2, LABEL_Sepsis, LABEL_RRate, LABEL_ABPm, LABEL_SpO2, LABEL_Heartrate]
Index: []
start calculation: LABEL_BaseExcess




0.9994107158946778
finish calculation: LABEL_BaseExcess


ValueError: Found array with 0 sample(s) (shape=(0, 444)) while a minimum of 1 is required.