In [60]:
import pandas as pd
import numpy as np
import math
from sklearn import svm
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.kernel_ridge import KernelRidge

##init X train         
X = pd.read_csv("train_features.csv")

##init y train
y = pd.read_csv("train_labels.csv")

##init X train
X_test = pd.read_csv("test_features.csv")


##sometimes overflows
def sigmoid(x):
    sig = (1 / (1 + np.exp(-x)))
    return sig

sigmoid_v = np.vectorize(sigmoid)

##returns indices for all the specified labels
def label_to_ind(df, label_list):
    index = []
    for i in label_list:
        index += [df.columns.get_loc(i)]
    return index

##random oversampling function, assumption more 0 than 1. X: numpy array, y: pandas dataframe with labels
def ros(X, y, label_name, random_seed):
    y_ind0 = np.array(y.index[y[label_name] == 0].tolist())
    y_ind1 = np.array(y.index[y[label_name] == 1].tolist())

    np.random.seed(random_seed)
    copy_ind = np.random.choice(y_ind1, len(y_ind0) - len(y_ind1), replace=True)
    
    os_samples = X[copy_ind[0]]
    os_samples = os_samples.reshape(1,os_samples.shape[0])    
    for i in copy_ind[1:]:
        copy_sample = X[i]
        os_samples = np.concatenate((os_samples, copy_sample.reshape(1, copy_sample.shape[0])), axis=0)
    
    X_val_os = np.concatenate((X, os_samples), axis=0)
    y_val_os = np.concatenate((y[label_name].values, np.ones(len(os_samples), dtype = float)), axis=0)
    return X_val_os, y_val_os


def test_roc_auc(model, X, y, label_target, label_list):
    X_test = X.values[:, label_to_ind(X, label_list)]

    y_dec = model.decision_function(X_test)
    y_dec_prob = sigmoid_v(y_dec)
    
    df = pd.DataFrame(columns=y.columns.values)
    df[label_target] = y_dec_prob
    df.to_csv("Labels_Training_New/" + label_target + ".csv", index=False)
    
    y_true = y[label_target]
    
    return roc_auc_score(y_true, y_dec_prob)

def train_model(X_PP, y, target_LABEL, C_LABELS,  C, degree):
    X_val_CL = X_PP.values[:,label_to_ind(X, C_LABELS)]
    X_val_CL, y_val_CL = ros(X_val_CL, y, target_LABEL, 10)
    m = svm.SVC(C=C, kernel="poly", degree=degree, decision_function_shape="ovo").fit(X_val_CL, y_val_CL)
    print(test_roc_auc(m, X_PP, y, target_LABEL, C_LABELS))
    return m

def write_out(model, X_test, y, label_name, label_list):
    X_test = X_test.values[:, label_to_ind(X_test, label_list)]
    
    y_dec_prob = sigmoid_v(model.decision_function(X_test))
    
    df = pd.DataFrame(columns=y.columns.values)
    df[label_name] = y_dec_prob
    df.to_csv("Labels_New/" + label_name + ".csv", index=False)    
    return 

In [2]:
arr1 = np.array([[1,2,3,1,2,3,1,2,3,1,2,3],
                 [np.nan, np.nan, np.nan,np.nan, np.nan, np.nan,np.nan, np.nan, np.nan,np.nan, np.nan, np.nan],
                 [np.nan, np.nan, np.nan,1,2,3,np.nan, np.nan, np.nan,1,2,3]])
arr1 = np.transpose(arr1)
df = pd.DataFrame(data=arr1, columns=["a","b","c"])
df

Unnamed: 0,a,b,c
0,1.0,,
1,2.0,,
2,3.0,,
3,1.0,,1.0
4,2.0,,2.0
5,3.0,,3.0
6,1.0,,
7,2.0,,
8,3.0,,
9,1.0,,1.0


In [3]:
from sklearn.preprocessing import StandardScaler

##PP
def get_mean_dict(X):
    mean_dict = {}
    for ind in X.columns.values:
        mean_dict.update({ind : X[ind].mean()})
    return mean_dict

def preprocessing_train(X):
    
    mean_dict = get_mean_dict(X)    
    
    X_mean = pd.DataFrame(columns=X.columns)
    for i in range(0, X.shape[0], 12):
        X_mean = X_mean.append(X.iloc[i:i + 12,:].mean(axis=0).to_frame().T)
        
    for i in X.columns.values:
        series = X_mean[i].fillna(mean_dict[i])
        X_mean[i] = series
        
    scaler = StandardScaler().fit(X_mean)
    scaler.transform(X_mean, copy=False)
    
    return (X_mean, mean_dict, scaler)

def preprocessing_test(X, mean_dict, scaler):
    
    X_mean = pd.DataFrame(columns=X.columns)
    for i in range(0, X.shape[0], 12):
        X_mean = X_mean.append(X.iloc[i:i + 12,:].mean(axis=0).to_frame().T)
        
    for i in X.columns.values:
        series = X_mean[i].fillna(mean_dict[i])
        X_mean[i] = series
        
    scaler.transform(X_mean, copy=False)
    
    return X_mean

In [4]:
X_PP, mean_dict, scaler = preprocessing_train(X)

In [23]:
X_test_PP = preprocessing_test(X_test, mean_dict, scaler)

In [24]:
X_test_PP

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,-1.725202,-0.160088,-1.402508,0.001645,0.117163,-0.268126,0.128106,-1.170812,-0.603510,-3.359407,...,0.298311,1.073610,0.015158,-1.615035,-0.664814,0.010096,14.522767,0.013876,0.312972,-0.015287
0,-0.632420,-0.160088,-0.004486,0.001645,0.034751,0.008320,0.128106,0.715458,-0.050198,-0.047657,...,0.005064,-1.531297,0.015158,0.038826,-0.109148,1.677564,0.015526,0.013876,-0.579549,-0.080393
0,-0.632202,-0.160088,0.603349,0.001645,0.034751,0.008320,-0.762227,0.201021,-0.050198,-0.047657,...,0.005064,0.910293,0.015158,0.038826,-0.109148,0.413016,0.015526,0.013876,0.119932,2.101206
0,-0.632092,-0.160088,-1.098590,0.001645,0.034751,-0.768864,0.128106,-0.793558,1.121687,-0.047657,...,0.461187,-0.845365,0.015158,0.038826,1.110201,0.204512,-0.915237,0.013876,-1.447997,-0.080393
0,-0.631983,-0.160088,1.575886,0.001645,0.022225,1.116266,0.128106,-1.170812,-1.084302,0.509416,...,0.005064,-0.763707,0.015158,0.498463,-0.439259,-1.829810,0.015526,0.013876,0.389597,-0.080393
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-0.633731,0.151126,-1.159374,0.001645,-0.861683,-0.827774,0.128106,0.338204,-0.235845,0.877876,...,0.005064,-1.209426,0.015158,-0.558286,-0.223512,1.678597,0.015526,0.013876,-0.011978,-0.080393
0,-0.633513,-0.160088,-0.004486,0.001645,0.683519,-0.385947,0.128106,0.338204,-1.763068,-1.517110,...,0.005064,1.073610,0.015158,-3.068064,-1.616064,-0.840512,0.015526,0.013876,0.776910,-0.250453
0,-0.633404,0.462339,-0.673105,0.001645,0.113889,0.114791,0.128106,0.115075,0.442921,0.509416,...,0.005064,0.474781,0.015158,1.291024,0.610059,-0.150039,0.015526,0.013876,-1.109145,0.572627
0,-0.633185,0.151126,-0.612322,0.001645,5.469717,0.203156,6.282485,-0.667807,0.216666,-0.411732,...,0.461187,1.073610,0.015158,-0.954567,0.031463,0.986402,-0.204701,0.013876,-0.139421,2.042414


In [34]:
mv_row_indices = (X.isnull().sum()/ X.shape[0])*100 < 90
LS_LABELS = []
for i in range(len(mv_row_indices.values)):
    if mv_row_indices.values[i]: LS_LABELS += [X.columns.values[i]]
LABELS = LS_LABELS[1:]

In [36]:
##LABEL_Sepsis
target_LABEL = "LABEL_Sepsis"
C_LABELS = LABELS

m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.8, 4)

0.807122183979975


  sig = (1 / (1 + np.exp(-x)))


In [37]:
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

  sig = (1 / (1 + np.exp(-x)))


In [38]:
##LABEL_BaseExcess
target_LABEL = "LABEL_BaseExcess"
C_LABELS = LABELS
m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.8, 4)

0.7909706411911092


  sig = (1 / (1 + np.exp(-x)))


In [39]:
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

In [40]:
##LABEL_Fibrinogen
target_LABEL = "LABEL_Fibrinogen"
C_LABELS = ["Time", "PTT", "Hgb", "Fibrinogen", "Platelets"]
m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.8, 4)

0.7101342102058215


  sig = (1 / (1 + np.exp(-x)))


In [41]:
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

  sig = (1 / (1 + np.exp(-x)))


In [46]:
##LABEL_AST
target_LABEL = "LABEL_AST"
C_LABELS = LABELS
m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.8, 4)
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

0.691917762572571


  sig = (1 / (1 + np.exp(-x)))


In [49]:
##LABEL_Alkalinephos
target_LABEL = "LABEL_Alkalinephos"
C_LABELS = X.columns.values
m = train_model(X_PP, y, target_LABEL, C_LABELS, 1, 4)
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

0.8402159127839768


In [51]:
##LABEL_Bilirubin_total
target_LABEL = "LABEL_Bilirubin_total"
C_LABELS = X.columns.values
m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.8, 4)
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

0.8357826833883855


In [53]:
##LABEL_Lactate
target_LABEL = "LABEL_Lactate"
C_LABELS = LABELS
m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.8, 4)
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

  sig = (1 / (1 + np.exp(-x)))


0.7549161944569412


  sig = (1 / (1 + np.exp(-x)))


In [54]:
##LABEL_TroponinI
target_LABEL = "LABEL_TroponinI"
C_LABELS = LABELS
m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.8, 4)
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

  sig = (1 / (1 + np.exp(-x)))


0.7820898640620901


  sig = (1 / (1 + np.exp(-x)))


In [56]:
##LABEL_SaO2
target_LABEL = "LABEL_SaO2"
C_LABELS = LABELS
m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.7, 4)
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

  sig = (1 / (1 + np.exp(-x)))


0.7669870859733924


  sig = (1 / (1 + np.exp(-x)))


In [57]:
##LABEL_Bilirubin_direct
target_LABEL = "LABEL_Bilirubin_direct"
C_LABELS = LABELS
m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.8, 4)
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

  sig = (1 / (1 + np.exp(-x)))


0.8675690325742569


  sig = (1 / (1 + np.exp(-x)))


In [58]:
##LABEL_EtCO2
target_LABEL = "LABEL_EtCO2"
C_LABELS = LABELS
m = train_model(X_PP, y, target_LABEL, C_LABELS, 0.8, 4)
write_out(m, X_test_PP, y, target_LABEL, C_LABELS)

  sig = (1 / (1 + np.exp(-x)))


0.8452968987487601


  sig = (1 / (1 + np.exp(-x)))


In [None]:
##beginning of last part

In [63]:
##"LABEL_RRate"
label_name = "LABEL_RRate"
m = KernelRidge(alpha=1.5, kernel="poly", degree=5).fit(X_PP.values, y[label_name].values)
m.score(X_PP.values, y[label_name].values)

0.7657347615569133

In [64]:
df = pd.DataFrame(columns=y.columns.values)
df[label_name] = m.predict(X_PP.values)
df.to_csv("Labels_Training_New/" + label_name + ".csv")
##training
df = pd.DataFrame(columns=y.columns.values)
df[label_name] = m.predict(X_test_PP.values)
df.to_csv("Labels_New/" + label_name + ".csv")

In [65]:
##"LABEL_ABPm"
label_name = "LABEL_ABPm"
m = KernelRidge(alpha=2.0, kernel="poly", degree=5).fit(X_PP.values, y[label_name].values)
m.score(X_PP.values, y[label_name].values)

0.8147858679844178

In [66]:
df = pd.DataFrame(columns=y.columns.values)
df[label_name] = m.predict(X_PP.values)
df.to_csv("Labels_Training_New/" + label_name + ".csv")
##training
df = pd.DataFrame(columns=y.columns.values)
df[label_name] = m.predict(X_test_PP.values)
df.to_csv("Labels_New/" + label_name + ".csv")

In [69]:
##"LABEL_SpO2"
label_name = "LABEL_SpO2"
m = KernelRidge(alpha=1.5, kernel="poly", degree=5).fit(X_PP.values, y[label_name].values)
m.score(X_PP.values, y[label_name].values)

0.7943989074154093

In [70]:
df = pd.DataFrame(columns=y.columns.values)
df[label_name] = m.predict(X_PP.values)
df.to_csv("Labels_Training_New/" + label_name + ".csv")
##training
df = pd.DataFrame(columns=y.columns.values)
df[label_name] = m.predict(X_test_PP.values)
df.to_csv("Labels_New/" + label_name + ".csv")

In [72]:
##LABEL_Heartrate
label_name = "LABEL_Heartrate"
m = KernelRidge(alpha=2.0, kernel="poly", degree=4).fit(X_PP.values, y[label_name].values)
m.score(X_PP.values, y[label_name].values)

0.772508907539619

In [73]:
df = pd.DataFrame(columns=y.columns.values)
df[label_name] = m.predict(X_PP.values)
df.to_csv("Labels_Training_New/" + label_name + ".csv")
##training
df = pd.DataFrame(columns=y.columns.values)
df[label_name] = m.predict(X_test_PP.values)
df.to_csv("Labels_New/" + label_name + ".csv")