In [7]:
import pandas as pd
import numpy as np
import math
from sklearn import svm
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

##init X train         
X = pd.read_csv("train_features.csv")

##init y train
y = pd.read_csv("train_labels.csv")

##init X train
X_test = pd.read_csv("test_features.csv")

##rearange 12 samples to one, because it's a time series
def reara12(arr):
    old_arr = arr
    rows, cols = len(old_arr), len(old_arr[0])
    new_arr = np.arange(rows * cols, dtype=float).reshape((int)(rows / 12), cols * 12)
    
    i_new = 0
    for i_old in range (0, rows, 12):        
        new_row = old_arr[i_old]
        for i in range (1, 12):
            new_row = np.append(new_row, old_arr[i_old + i])                    
        new_arr[i_new] = new_row
        i_new += 1
        
    return new_arr

##sometimes overflows
def sigmoid(x):
    sig = (1 / (1 + np.exp(-x)))
    return sig

sigmoid_v = np.vectorize(sigmoid)

##returns indices for all the specified labels
def label_to_ind(df, label_list):
    index = []
    for i in label_list:
        index += [df.columns.get_loc(i)]
    return index

##random oversampling function, assumption more 0 than 1. X: numpy array, y: pandas dataframe with labels
def ros(X, y, label_name):
    y_ind0 = np.array(y.index[y[label_name] == 0].tolist())
    y_ind1 = np.array(y.index[y[label_name] == 1].tolist())

    copy_ind = np.random.choice(y_ind1, len(y_ind0) - len(y_ind1), replace=True)
    
    os_samples = X[copy_ind[0]]
    os_samples = os_samples.reshape(1,os_samples.shape[0])    
    for i in copy_ind[1:]:
        copy_sample = X[i]
        os_samples = np.concatenate((os_samples, copy_sample.reshape(1, copy_sample.shape[0])), axis=0)
    
    X_val_os = np.concatenate((X, os_samples), axis=0)
    y_val_os = np.concatenate((y[label_name].values, np.ones(len(os_samples), dtype = float)), axis=0)
    return X_val_os, y_val_os

def test_roc_auc(model, X, y, label_target, label_list, imputer, scaler):
    X_test = imp.transform(X)
    X_test = scaler.transform(X_test)
    X_test = X_test[:, label_to_ind(X, label_list)]
    X_test = reara12(X_test)

    y_dec = model.decision_function(X_test)
    y_dec_prob = sigmoid_v(y_dec)
    y_true = y[label_target]
    
    return roc_auc_score(y_true, y_dec_prob)

def write_out(model, X_test, y, label_name, label_list, imputer, scaler):
    X_test = imp.transform(X_test)
    X_test = scaler.transform(X_test)
    X_test = X_test[:, label_to_ind(X, label_list)]
    X_test = reara12(X_test)
    
    y_dec_prob = sigmoid_v(model.decision_function(X_test))
    
    df = pd.DataFrame(columns=y.columns.values)
    df[label_name] = y_dec_prob
    df.to_csv(label_name + ".csv")    
    return
    




In [8]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(X)
X_val = imp.transform(X)
scaler = preprocessing.StandardScaler().fit(X_val)
X_val = scaler.transform(X_val)
X_val

array([[-1.72509230e+00, -8.51212785e-01, -1.70642596e+00, ...,
        -8.22522762e-16,  9.19839542e-01, -1.51001649e+00],
       [-1.72509230e+00, -6.39172854e-01, -1.70642596e+00, ...,
        -8.22522762e-16,  1.23243794e-01, -1.51001649e+00],
       [-1.72509230e+00, -4.27132924e-01, -1.70642596e+00, ...,
        -8.22522762e-16, -5.79634807e-01,  1.12283693e-01],
       ...,
       [-6.32638716e-01,  6.33066729e-01,  1.39353544e+00, ...,
        -8.22522762e-16,  7.32405248e-01, -1.08066901e-13],
       [-6.32638716e-01,  8.45106660e-01,  1.39353544e+00, ...,
        -8.22522762e-16,  1.23243794e-01, -1.08066901e-13],
       [-6.32638716e-01,  1.05714659e+00,  1.39353544e+00, ...,
        -8.22522762e-16,  2.63819514e-01, -1.08066901e-13]])

In [9]:
##LABEL_BaseExcess
LBE_LABELS = ["Time", "HCO3", "BaseExcess", "pH"]
X_val_LBE = X_val[:,label_to_ind(X, LBE_LABELS)]
X_val_LBE = reara12(X_val_LBE)
X_val_LBE, y_val_LBE = ros(X_val_LBE, y, "LABEL_BaseExcess")
m = svm.SVC(C=0.6, kernel="poly", degree=3, decision_function_shape="ovo").fit(X_val_LBE, y_val_LBE)
test_roc_auc(m, X, y, "LABEL_BaseExcess", LBE_LABELS, imp, scaler)

0.8751790925405677

In [11]:
write_out(m, X_test, y, "LABEL_BaseExcess", LBE_LABELS, imp, scaler)

In [14]:
##LABEL_Fibrinogen
LF_LABELS = ["Time", "PTT", "Hgb", "Fibrinogen", "Platelets"]
X_val_LF = X_val[:, label_to_ind(X, LF_LABELS)]
X_val_LF = reara12(X_val_LF)
X_val_LF, y_val_LF = ros(X_val_LF, y, "LABEL_Fibrinogen")

In [15]:
m = svm.SVC(C=0.7, kernel="poly", degree=4, decision_function_shape="ovo")
m.fit(X_val_LF, y_val_LF)
test_roc_auc(m, X, y, "LABEL_Fibrinogen", LF_LABELS, imp, scaler)

  sig = (1 / (1 + np.exp(-x)))


0.84437441643324

In [19]:
##LABEL_AST: creat is maybe bad: mag, pot, cal,
LA_LABELS = ["RRate", "Heartrate", "Time", "BUN", "Hgb", "Phosphate", "Creatinine", "AST", "Glucose", "Magnesium", "Potassium", "Calcium", "Alkalinephos", "Bilirubin_direct", "Bilirubin_total",  ]
X_val_LA = X_val[:, label_to_ind(X, LA_LABELS)]
X_val_LA = reara12(X_val_LA)
X_val_LA, y_val_LA = ros(X_val_LA, y, "LABEL_AST")

m = svm.SVC(C=0.7, kernel="poly", degree=4, decision_function_shape="ovo")
m.fit(X_val_LA, y_val_LA)
test_roc_auc(m, X, y, "LABEL_AST", LA_LABELS, imp, scaler)

0.8357326741065071