In [1]:
import numpy as np
from sklearn import svm
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, GridSearchCV
%matplotlib inline

Loads dataset

In [2]:
data = datasets.load_breast_cancer()

Splits and normalizes data.

`X_full` -> complete set

`X` -> test + train

In [3]:
X_full, Y_full = data['data'], data['target']
X_full = (X_full - X_full.mean(axis=0))/np.std(X_full, axis=0)
until = int(0.8*Y_full.shape[0])
X = X_full[:until]
Y = Y_full[:until]
X_valid = X_full[until:]
Y_valid = Y_full[until:]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [4]:
def optimize_kernel(kernel):
    """
    optimizes freedom parameters over a 2D grid based on kernel type
    """
    C_range = np.logspace(-2, 2, 20)
    param_grid = dict(C=C_range)
    if kernel == 'rbf':
        gamma_range = np.logspace(-7, 1, 20)
        param_grid = dict(gamma=gamma_range, C=C_range)
        
    cv = ShuffleSplit(n_splits=10, test_size=0.20)
    grid = GridSearchCV(svm.SVC(kernel=kernel), param_grid=param_grid, cv=cv, n_jobs=-1)
    grid.fit(X_train, Y_train)
    return grid.best_params_

def evaluate_kernel(kernel):
    best_params = optimize_kernel(kernel)
    if kernel == 'rbf':
        svc = svm.SVC(kernel=kernel, C=best_params['C'], gamma=best_params['gamma'])
    else:
        svc = svm.SVC(kernel=kernel, C=best_params['C'])
    
    svc.fit(X_train, Y_train)
    ys_trues = [Y_train, Y_test, Y_valid]
    xs = [X_train, X_test, X_valid]
    labels = ['Train', 'Test', 'Valid']
    print(best_params)
    for (x, y, label) in zip(xs, ys_trues, labels):
        y_pred = svc.predict(x)
        conf = confusion_matrix(y, y_pred)
        print("%s: FP = %d, FN = %d, TP+TN = %d, Total = %d"%(label, conf[0,1], conf[1,0], np.trace(conf), y.shape[0]))

In [5]:
evaluate_kernel('rbf')

{'C': 37.92690190732246, 'gamma': 0.001623776739188721}
Train: FP = 4, FN = 0, TP+TN = 337, Total = 341
Test: FP = 3, FN = 1, TP+TN = 110, Total = 114
Valid: FP = 1, FN = 1, TP+TN = 112, Total = 114


In [6]:
evaluate_kernel('linear')

{'C': 0.016237767391887217}
Train: FP = 4, FN = 0, TP+TN = 337, Total = 341
Test: FP = 3, FN = 1, TP+TN = 110, Total = 114
Valid: FP = 1, FN = 1, TP+TN = 112, Total = 114


In [7]:
evaluate_kernel('poly')

{'C': 100.0}
Train: FP = 1, FN = 0, TP+TN = 340, Total = 341
Test: FP = 7, FN = 2, TP+TN = 105, Total = 114
Valid: FP = 2, FN = 3, TP+TN = 109, Total = 114


Best results were with `rbf` and `linear` because total missed score was the lowest with 10 missclassifications over the whole dataset.