In [1]:
import os
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
%matplotlib inline

In [152]:
def calculate_metric(prob, target, threshold=0.5):
    pred = prob.copy()
    pred[pred >= threshold] = 1
    pred[pred < threshold] = 0
    tn, fp, fn, tp = confusion_matrix(target, pred).ravel()
    m = {}
    m['pred'] = pred
    m['auc'] = roc_auc_score(target, prob)
    m['acc'] = (tp + tn) / (tp + tn + fp + fn)
    m['tpr'] = tp / (tp + fn)
    m['tnr'] = tn / (tn + fp)
    m['ppv'] = tp / (tp + fp)
    m['f1'] = 2 * tp / (2 * tp + fp + fn)
    return m

def scale(x):
    xx = np.zeros_like(x)
    xx -= np.clip(x, a_min=-np.inf, a_max=0) / x.min()
    xx += np.clip(x, a_min=0, a_max=np.inf) / x.max()
    return (xx + 1) / 2
# scale = lambda x: (x - x.min()) / (x.max() - x.min())

In [234]:
findings = pickle.load(open('tak_npy/findings.pkl', "rb"))
query = list(findings['0000-1'].keys())
_ = query.remove('label'), query.remove('zone')
images = np.stack([np.stack([findings[f][q] for q in query]) for f in findings])
labels = np.stack([findings[f]['label'] for f in findings])
images.shape, images.min(), images.max(), np.isfinite(images).all()

((317, 3, 5, 5), 0.0, 3028.0625, True)

In [235]:
images = images.reshape(317, -1)
mean = np.tile(images.mean(axis=-1)[..., np.newaxis], (1, 75))
std = np.tile(images.std(axis=-1)[..., np.newaxis], (1, 75))
images = (images - mean) / std

In [236]:
features = np.hstack((images.reshape(317, -1), pd.get_dummies([v['zone'] for v in findings.values()]).values))
features.shape

(317, 78)

In [237]:
X, y = features, labels.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
len(X_train), len(X_test), len(y_train), len(y_test)

(253, 64, 253, 64)

In [238]:
svc = SVC(C=30, gamma=10e-3, class_weight='balanced')
svc.fit(X_train, y_train)
prob = scale(svc.decision_function(X_test))
calculate_metric(prob, y_test)

{'pred': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0.,
        1., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1.,
        1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1.]),
 'auc': 0.6517006802721088,
 'acc': 0.75,
 'tpr': 0.5333333333333333,
 'tnr': 0.8163265306122449,
 'ppv': 0.47058823529411764,
 'f1': 0.5}

In [239]:
pipe = Pipeline([
    ('clf', SVC(class_weight='balanced'))
])

param_grid = {
    'clf__C': [0.1, 0.5, 1, 2, 5, 10, 20, 30, 50],
    'clf__gamma': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
}

auc_scorer = lambda estimator, x, y: calculate_metric(scale(estimator.decision_function(x)), y)['auc']

search = GridSearchCV(pipe, 
                      param_grid, 
                      cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=0), 
#                       cv=StratifiedShuffleSplit(n_splits=3, random_state=0),
                      scoring=auc_scorer,
                      n_jobs=-1)

In [240]:
import time
np.random.seed(0)
begin = time.time()
search.fit(X, y)
print(time.time() - begin)

1.0775988101959229


In [241]:
pipe = search.best_estimator_
search.best_params_, search.best_score_

({'clf__C': 0.5, 'clf__gamma': 0.1}, 0.7169358292348357)

In [232]:
ms = []
for train_index, test_index in search.cv.split(X, y):
    prob = scale(pipe.decision_function(X[test_index]))
    ms.append(calculate_metric(prob, y[test_index]))
[m['auc'] for m in ms], np.mean([m['auc'] for m in ms])

([0.8261728395061728, 0.8048780487804877, 0.7078189300411524],
 0.7796232727759377)