In [1]:
import pandas as pd
import numpy as np
import timeit
import sys

from sklearn import preprocessing, neighbors
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

df = pd.read_csv('transfusion.data') # Import the datafile
X = preprocessing.scale(df.values[:, :4].astype("float64")) # The predictors, standardized
#X = df.values[:, :4].astype("float64") # The predictors, not standardized
y = df.values[:, 4] # The responses
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) # Set aside 25% of the data for test
print("{0} is 0, {1} is 1".format(1 - sum(y) / len(y), sum(y) / len(y)))

cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=1/3, random_state=42) # Generate the cross validation labels

0.7620320855614973 is 0, 0.23796791443850268 is 1


In [2]:
C_range = np.logspace(-4, 4, 9) # Range of Parameter C for all 3 svcs
k_range = np.arange(1, 15) # Range of parameter n_neighbors for all 3 knncs

gamma_range = np.logspace(-4, 4, 9) #  Range of parameter gamma for rbf kernel
l_range = np.logspace(-3, 3, 7) # Range of parameter l for Epanechnikov kernel

classifiers = dict() # The set of classifiers
params_grid = dict() # The set of cross validation parameters for each classifier

# The svm classifier with linear kernel
classifiers["svc_linear"] = SVC(kernel='linear')
params_grid["svc_linear"] = dict(C=C_range)

# The svm classifier with rbf kernel
classifiers["svc_rbf"] = SVC(kernel='rbf')
params_grid["svc_rbf"] = dict(gamma=gamma_range, C=C_range)

# The svm classifier with user defined Epanechnikov kernel
def epa(X, Y, l=1.0):
    """ Epanechnikov kernel with bandwidth parameter
    return a n_row_X-by-n-row_Y matrix
    """
    n_sample_X = len(X)
    n_sample_Y = len(Y)
    t = np.empty([n_sample_X, n_sample_Y], dtype="float64")
    for i in range(n_sample_X):
        for j in range(n_sample_Y):
            t[i, j] = np.linalg.norm(X[i] - Y[j]) / l
    
    return 3 / 4  * (1 - t ** 2) * (abs(t) < 1)

class EpaKernel(BaseEstimator, TransformerMixin):
    def __init__(self, l=1.0):
        super(EpaKernel, self).__init__()
        self.l = l

    def transform(self, X):
        return epa(X, self.X_train_, l=self.l)

    def fit(self, X, y=None, **fit_params):
        self.X_train_ = X
        return self
    
classifiers["svc_epa"] = Pipeline([('epa', EpaKernel()),('svm', SVC()),])
params_grid["svc_epa"] = dict([('epa__l', l_range), ('svm__kernel', ['precomputed']), ('svm__C', C_range),])

# The knn classifier with linear kernel
classifiers["knnc_linear"] = neighbors.KNeighborsClassifier()
params_grid["knnc_linear"] = dict(n_neighbors=k_range)

# The knn classifier with rbf kernel
def kernel_rbf(x, y, gamma):
    return np.exp(-gamma * np.linalg.norm(x-y)  ** 2)

def dist_rbf(x, y, gamma):
    return kernel_rbf(x, x, gamma) + kernel_rbf(y, y, gamma) - 2 * kernel_rbf(x, y, gamma)

classifiers["knnc_rbf"] = neighbors.KNeighborsClassifier(metric=dist_rbf)
params_grid["knnc_rbf"] = dict(metric_params=[{'gamma': gamma} for gamma in gamma_range], n_neighbors=k_range)

# The knn classifier with user defined Epanechnikov kernel
def kernel_epa(x, y, l):
    t = np.linalg.norm(x - y) / l
    return 3 / 4  * (1 - t ** 2) if abs(t) < 1 else 0.0

def dist_epa(x, y, l):
    return kernel_epa(x, x, l) + kernel_epa(y, y, l) - 2 * kernel_epa(x, y, l)

classifiers["knnc_epa"]  = neighbors.KNeighborsClassifier(metric=dist_epa)
params_grid["knnc_epa"] = dict(metric_params=[{'l': l} for l in l_range], n_neighbors=k_range)

In [3]:
for classifier in classifiers.keys():
    start_time = timeit.default_timer()
    
    print("Tuning {0}...".format(classifier))
    
    grid = GridSearchCV(classifiers[classifier], param_grid=params_grid[classifier], cv=cv, verbose=0, n_jobs=4)
    grid.fit(X_train, y_train)
    
    print(" - The best parameters are {0} with a score of {1}".format(grid.best_params_, grid.best_score_))
    print(" - The score on the test set is {0}".format(grid.score(X_test, y_test)))
    print(" - Elapsed time is {0}".format(timeit.default_timer() - start_time))
    sys.stdout.flush()

Tuning knnc_linear...
 - The best parameters are {'n_neighbors': 2} with a score of 0.7700534759358288
 - The score on the test set is 0.7593582887700535
 - Elapsed time is 0.24574940296588466
Tuning svc_rbf...
 - The best parameters are {'C': 1.0, 'gamma': 0.01} with a score of 0.7839572192513369
 - The score on the test set is 0.7700534759358288
 - Elapsed time is 3.2635654539917596
Tuning svc_epa...
 - The best parameters are {'svm__C': 1.0, 'svm__kernel': 'precomputed', 'epa__l': 10.0} with a score of 0.7754010695187166
 - The score on the test set is 0.7593582887700535
 - Elapsed time is 118.96837743703509
Tuning svc_linear...


Process ForkPoolWorker-16:
Process ForkPoolWorker-14:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/wenhaowu/anaconda3/lib/python3.5/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/home/wenhaowu/anaconda3/lib/python3.5/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/home/wenhaowu/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/wenhaowu/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/wenhaowu/anaconda3/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/wenhaowu/anaconda3/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/wenhaowu/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    return recv()
  File "/home/wenha

KeyboardInterrupt: 