In [1]:
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from joblib import dump, load
import os
from typing import Tuple
from tqdm import tqdm

In [2]:
MODEL_PATH = 'models'

In [3]:
PID = 'pid'

VITALS = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
TESTS = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
SEPSIS = 'LABEL_Sepsis'

# Data

In [4]:
features = pd.read_csv('train_features.csv')
labels = pd.read_csv('train_labels.csv')
sub_features = pd.read_csv('test_features.csv')

# Preprocessing

In [5]:
def preprocess_labels(labels: pd.DataFrame) -> pd.DataFrame:
  return labels.sort_values(PID)

def preprocess_features(features: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
  features = features.copy()

  # Sort features and remove time
  features.sort_values('Time', inplace=True)
  features.sort_values(PID, kind='stable', inplace=True)
  features.drop('Time', axis='columns', inplace=True)

  pids = features[PID].unique()

  features_mean = features.groupby(PID).mean().add_suffix('_mean')
  features_std = features.groupby(PID).std().add_suffix('_std')
  features_count = features.groupby(PID).count().add_suffix('_count')

  full_features = pd.concat([features_mean, features_std, features_count], axis='columns')
  full_features.fillna(full_features.mean(axis='rows'), inplace=True)

  return StandardScaler().fit_transform(full_features), pids

In [6]:
X, pids = preprocess_features(features)
Y = preprocess_labels(labels)
X_sub, sub_pids = preprocess_features(sub_features)

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Training

In [8]:
def train_clf(entry: str, X_train: np.ndarray, Y_train: pd.DataFrame, version='', force_new=False) -> SVC:
  file_name = f'{MODEL_PATH}/{entry}{version}.joblib'

  if not force_new and os.path.isfile(file_name):
    return load(file_name)
  else:
    y_train = Y_train[entry]

    clf = SVC(probability=True, class_weight='balanced')
    clf.fit(X_train, y_train)

    dump(clf, file_name)

    return clf

def train_regr(entry: str, X_train: np.ndarray, Y_train: pd.DataFrame, version='', force_new=False) -> SVR:
  file_name = f'{MODEL_PATH}/{entry}{version}.joblib'

  if not force_new and os.path.isfile(file_name):
    return load(file_name)
  else:
    y_train = Y_train[entry]

    regr = SVR(epsilon=0.5)
    regr.fit(X_train, y_train)

    dump(regr, file_name)

    return regr

In [13]:
# Task 1
for entry in tqdm(TESTS):
  train_clf(entry, X_train, Y_train, '_train')

# Task 2
train_clf(SEPSIS, X_train, Y_train, '_train')

# Task 3
for entry in tqdm(VITALS):
  train_regr(entry, X_train, Y_train, '_train')

100%|██████████| 10/10 [16:34<00:00, 99.44s/it]
100%|██████████| 4/4 [01:03<00:00, 15.85s/it]


# Prepare Submission

In [9]:
# Task 1
for entry in tqdm(TESTS):
  train_clf(entry, X, Y)

# Task 2
train_clf(SEPSIS, X, Y)

# Task 3
for entry in tqdm(VITALS):
  train_regr(entry, X, Y)

100%|██████████| 10/10 [27:25<00:00, 164.59s/it]
100%|██████████| 4/4 [01:41<00:00, 25.32s/it]


In [10]:
def prepare_clf(entry: str, X_test: np.ndarray, version='') -> pd.DataFrame:
  clf = load(f'{MODEL_PATH}/{entry}{version}.joblib')
  y_pred = clf.predict_proba(X_test)[:,1]
  return pd.DataFrame(y_pred, columns=[entry])

def prepare_regr(entry: str, X_test: np.ndarray, version='') -> pd.DataFrame:
  regr = load(f'{MODEL_PATH}/{entry}{version}.joblib')
  y_pred = regr.predict(X_test)
  return pd.DataFrame(y_pred, columns=[entry])

In [11]:
y_pred = [pd.DataFrame(sub_pids, columns=[PID])]

# Task 1
for entry in tqdm(TESTS):
  y_pred.append(prepare_clf(entry, X_sub))

# Task 2
y_pred.append(prepare_clf(SEPSIS, X_sub))

# Task 3
for entry in tqdm(VITALS):
  y_pred.append(prepare_regr(entry, X_sub))

100%|██████████| 10/10 [03:03<00:00, 18.36s/it]
100%|██████████| 4/4 [01:45<00:00, 26.35s/it]


In [12]:
submission = pd.concat(y_pred, axis='columns')
submission.to_csv('submission.zip', index=False, float_format='%.3f', compression='zip')

# Analysis

In [14]:
import sklearn.metrics as metrics

In [15]:
def evaluate_clf(entry: str, Y_test: pd.DataFrame, y_pred: np.ndarray, verbose=False) -> float:
  roc_auc = metrics.roc_auc_score(Y_test[entry], y_pred)

  if verbose:
    print(entry, '---')
    print('accuracy', metrics.accuracy_score(Y_test[entry], np.around(y_pred)))
    print('recall', metrics.recall_score(Y_test[entry], np.around(y_pred)))
    print('precision', metrics.precision_score(Y_test[entry], np.around(y_pred)))
    print('ROC AUC', roc_auc)

  return roc_auc

def evaluate_regr(entry: str, Y_test: pd.DataFrame, y_pred: np.ndarray, verbose=False) -> float:
  r2 = 0.5 + 0.5 * np.maximum(0, metrics.r2_score(Y_test[entry], y_pred))
  
  if verbose:
    print(entry, '---')
    print('R2', r2)

  return r2

In [17]:
# Task 1
task1 = []
for entry in tqdm(TESTS):
  task1.append(evaluate_clf(entry, Y_test, prepare_clf(entry, X_test, '_train')))
print('TASK 1:', np.mean(task1))

# Task 2
task2 = evaluate_clf(entry, Y_test, prepare_clf(entry, X_test, '_train'))
print('TASK 2:', task2)

# Task 3
task3 = []
for entry in tqdm(VITALS):
  task3.append(evaluate_regr(entry, Y_test, prepare_regr(entry, X_test, '_train')))
print('TASK 3:', np.mean(task3))

100%|██████████| 10/10 [00:44<00:00,  4.44s/it]


TASK 1: 0.807753368162541
TASK 2: 0.9222469348913379


100%|██████████| 4/4 [00:25<00:00,  6.48s/it]

TASK 3: 0.7115516606663133



