In [15]:
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from joblib import dump, load
import os
from typing import Tuple

In [2]:
# Directory in which fitted models are cached as `<label>.joblib` files.
MODEL_PATH: str = 'models'

In [3]:
# Name of the patient-id column used to order the feature and label tables.
PID = 'pid'

# Task 3: targets fitted with a regressor.
VITALS = [
  'LABEL_RRate',
  'LABEL_ABPm',
  'LABEL_SpO2',
  'LABEL_Heartrate',
]

# Task 1: targets fitted with a classifier.
TESTS = [
  'LABEL_BaseExcess',
  'LABEL_Fibrinogen',
  'LABEL_AST',
  'LABEL_Alkalinephos',
  'LABEL_Bilirubin_total',
  'LABEL_Lactate',
  'LABEL_TroponinI',
  'LABEL_SaO2',
  'LABEL_Bilirubin_direct',
  'LABEL_EtCO2',
]

# Task 2: single target fitted with a classifier.
SEPSIS = 'LABEL_Sepsis'

# Data

In [4]:
# Read the three input tables from the working directory.
train_features, train_labels, test_features = map(
  pd.read_csv,
  ('train_features.csv', 'train_labels.csv', 'test_features.csv'),
)

# Preprocessing

In [16]:
def preprocess_labels(labels: pd.DataFrame) -> pd.DataFrame:
  """Return a new frame holding the rows of ``labels`` ordered by ascending ``PID``.

  The input frame is left untouched and the original index values are kept.
  """
  ordered = labels.sort_values(by=PID)
  return ordered

def preprocess_features(features: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
  """Aggregate the per-row measurements into one standardized feature row per patient.

  Every column except ``PID`` and ``'Time'`` is summarized per patient by its
  mean (``<col>_mean``) and its standard deviation (``<col>_std``). Aggregates
  that are missing are replaced by the column mean over all patients, and the
  result is standardized with a freshly fitted ``StandardScaler``.

  Args:
    features: Table with a ``PID`` column, a ``'Time'`` column and the
      measurement columns. It is not modified.

  Returns:
    ``(X, pids)`` where row ``i`` of ``X`` belongs to patient ``pids[i]`` and
    ``pids`` is in ascending order.

  NOTE(review): the fill values and the scaler are fitted on whatever frame is
  passed in, so separate calls (e.g. one for training data and one for test
  data) are imputed and standardized with their own statistics.
  """
  # Mean and std do not depend on row order, so no sorting is needed up front;
  # groupby orders the groups by ascending key by default.
  per_patient = features.drop(columns='Time').groupby(PID)

  full_features = pd.concat(
    [
      per_patient.mean().add_suffix('_mean'),
      per_patient.std().add_suffix('_std'),
    ],
    axis='columns',
  )
  full_features = full_features.fillna(full_features.mean(axis='rows'))

  # Take the ids from the aggregated index so they always match the rows of X.
  pids = full_features.index.to_numpy()

  return StandardScaler().fit_transform(full_features), pids

In [17]:
X_train, train_pids = preprocess_features(train_features)
y_train = preprocess_labels(train_labels)
X_test, test_pids = preprocess_features(test_features)

# Row i of X_train must describe the same patient as row i of y_train;
# fail loudly here rather than silently fit models on shuffled targets.
assert np.array_equal(train_pids, y_train[PID].to_numpy()), \
  'feature rows and label rows are not aligned by pid'

# Training

In [7]:
def train_clf(entry: str, X_train: np.ndarray, train_labels: pd.DataFrame, force_new=False) -> SVC:
  """Return a fitted classifier for the label column ``entry``, using the on-disk cache.

  Args:
    entry: Name of the target column in ``train_labels``; also the cache file stem.
    X_train: Feature matrix whose rows correspond to the rows of ``train_labels``.
    train_labels: Table containing the target column ``entry``.
    force_new: When true, ignore any cached model and fit (and overwrite) a new one.

  Returns:
    The cached model if ``{MODEL_PATH}/{entry}.joblib`` exists and ``force_new``
    is false; otherwise a newly fitted ``SVC(probability=True,
    class_weight='balanced')``, which is also written to that path.

  Note:
    The cache is keyed by ``entry`` only, so a cached model is returned even if
    ``X_train`` or ``train_labels`` have changed since it was fitted.
  """
  file_name = f'{MODEL_PATH}/{entry}.joblib'

  if not force_new and os.path.isfile(file_name):
    # joblib.load unpickles the file: only load model files you produced yourself.
    return load(file_name)

  # Create the cache directory before the (slow) fit so the final dump cannot
  # fail on a missing directory and throw the fitted model away.
  os.makedirs(MODEL_PATH, exist_ok=True)

  clf = SVC(probability=True, class_weight='balanced')
  clf.fit(X_train, train_labels[entry])

  dump(clf, file_name)

  return clf

def train_regr(entry: str, X_train: np.ndarray, train_labels: pd.DataFrame, force_new=False) -> SVR:
  """Return a fitted regressor for the label column ``entry``, using the on-disk cache.

  Args:
    entry: Name of the target column in ``train_labels``; also the cache file stem.
    X_train: Feature matrix whose rows correspond to the rows of ``train_labels``.
    train_labels: Table containing the target column ``entry``.
    force_new: When true, ignore any cached model and fit (and overwrite) a new one.

  Returns:
    The cached model if ``{MODEL_PATH}/{entry}.joblib`` exists and ``force_new``
    is false; otherwise a newly fitted default ``SVR()``, which is also written
    to that path.

  Note:
    The cache is keyed by ``entry`` only, so a cached model is returned even if
    ``X_train`` or ``train_labels`` have changed since it was fitted.
  """
  file_name = f'{MODEL_PATH}/{entry}.joblib'

  if not force_new and os.path.isfile(file_name):
    # joblib.load unpickles the file: only load model files you produced yourself.
    return load(file_name)

  # Create the cache directory before the (slow) fit so the final dump cannot
  # fail on a missing directory and throw the fitted model away.
  os.makedirs(MODEL_PATH, exist_ok=True)

  regr = SVR()
  regr.fit(X_train, train_labels[entry])

  dump(regr, file_name)

  return regr

In [8]:
# Fit on `y_train` (labels sorted by pid), not on the raw `train_labels`:
# the rows of `X_train` are ordered by pid, so the targets must be as well.
# Models already cached under MODEL_PATH are reused as-is; delete them or pass
# `force_new=True` to refit.

# Task 1
for entry in TESTS:
  train_clf(entry, X_train, y_train)

# Task 2
train_clf(SEPSIS, X_train, y_train)

# Task 3
for entry in VITALS:
  train_regr(entry, X_train, y_train)

# Prepare Submission

In [9]:
def prepare_clf(entry: str, X_test: np.ndarray) -> pd.DataFrame:
  """Load the cached classifier for ``entry`` and score ``X_test`` with it.

  Returns a one-column frame named ``entry`` holding, for each row of
  ``X_test``, the second column of ``predict_proba`` (index 1).
  """
  model = load(f'{MODEL_PATH}/{entry}.joblib')
  class_probabilities = model.predict_proba(X_test)
  return pd.DataFrame(class_probabilities[:, 1], columns=[entry])

def prepare_regr(entry: str, X_test: np.ndarray) -> pd.DataFrame:
  """Load the cached regressor for ``entry`` and predict on ``X_test``.

  Returns a one-column frame named ``entry`` with one prediction per row of
  ``X_test``.
  """
  model = load(f'{MODEL_PATH}/{entry}.joblib')
  return pd.DataFrame(model.predict(X_test), columns=[entry])

In [21]:
# One single-column frame per output column, starting with the patient ids.
y_pred = [pd.DataFrame(test_pids, columns=[PID])]

# Task 1
y_pred.extend(prepare_clf(label, X_test) for label in TESTS)

# Task 2
y_pred.append(prepare_clf(SEPSIS, X_test))

# Task 3
y_pred.extend(prepare_regr(label, X_test) for label in VITALS)

In [None]:
# Stitch the single-column frames side by side into the submission table.
submission = pd.concat(y_pred, axis='columns')

# Name the archive member explicitly so the zip contains `submission.csv`
# instead of a member name derived from the archive's own file name.
submission.to_csv(
  'submission.zip',
  index=False,
  float_format='%.3f',
  compression={'method': 'zip', 'archive_name': 'submission.csv'},
)

# Analysis

In [34]:
import sklearn.metrics as metrics

In [35]:
def print_report(clf: SVC, X_test: np.ndarray, y_test: pd.DataFrame):
  """Print accuracy, recall, precision and ROC AUC of ``clf`` on ``(X_test, y_test)``.

  Args:
    clf: Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_test: Feature matrix to evaluate on.
    y_test: True labels for the rows of ``X_test`` (a single label column).

  Accuracy, recall and precision use the hard predictions from ``predict``;
  ROC AUC uses the second column of ``predict_proba`` (index 1) as the score.
  """
  y_pred = clf.predict(X_test)
  y_pred_proba = clf.predict_proba(X_test)[:,1]

  print('accuracy', metrics.accuracy_score(y_test, y_pred))
  print('recall', metrics.recall_score(y_test, y_pred))
  print('precision', metrics.precision_score(y_test, y_pred))
  print('ROC AUC', metrics.roc_auc_score(y_test, y_pred_proba))

In [36]:
# Fetch the Sepsis model explicitly (train_clf returns the cached model when one
# exists) instead of relying on a `clf` variable left over in the kernel.
sepsis_clf = train_clf(SEPSIS, X_train, y_train)

# Scored on the training data itself, against the pid-sorted labels that line
# up with the rows of X_train.
print_report(sepsis_clf, X_train, y_train[SEPSIS])

accuracy 0.9147143985259278
recall 0.8520220588235294
precision 0.38851634534786256
ROC AUC 0.9475991560973527
