In [1]:
# IPython line magic: use matplotlib's 'notebook' backend in this session.
%matplotlib notebook

import pandas as pd
import numpy as np
from collections import OrderedDict as od
import warnings
# Suppress every warning raised from here on; note that this also hides
# deprecation notices emitted by the libraries used in later cells.
warnings.filterwarnings('ignore')

In [2]:
# Reading in training data
#
# Both files are whitespace-delimited text without a header row.
# X becomes a 2-D array with one row per sample; y is flattened to a
# 1-D array holding one label per sample.
# `sep=r'\s+'` is the documented replacement for the deprecated
# `delim_whitespace=True` keyword of `pd.read_csv`.

X = pd.read_csv('X_train.txt', sep=r'\s+', header=None).values
y = pd.read_csv('Y_train.txt', sep=r'\s+', header=None).values.flatten()

# Fail early if the feature and label files are out of step.
assert X.shape[0] == y.shape[0], 'X_train.txt and Y_train.txt have different numbers of rows'

In [3]:
# Standardizing data

from sklearn.preprocessing import StandardScaler

#X = StandardScaler().fit_transform(X)
# Left disabled on purpose: the classifier works better without standardization.

In [4]:
# Creating kernel-based Naive Bayes Classifier

from sklearn.neighbors import KernelDensity
from sklearn.base import BaseEstimator, ClassifierMixin

class KDEClassifier(BaseEstimator, ClassifierMixin):
    """Bayesian generative classification based on KDE.

    One ``KernelDensity`` model is fitted per class. At prediction time each
    model's log-density is combined with the log of that class's share of the
    training rows, and the result is normalised across classes.

    Parameters
    ----------
    bandwidth : float
        the kernel bandwidth within each class
    kernel : str
        the kernel name, passed to KernelDensity

    Attributes
    ----------
    classes_ : ndarray
        the sorted unique labels seen in ``fit``
    models_ : list of KernelDensity
        one fitted density model per entry of ``classes_``
    logpriors_ : ndarray
        log of the fraction of training rows belonging to each class
    """
    def __init__(self, bandwidth=1.0, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel

    def fit(self, X, y):
        """Fit one kernel density model per class and record the class priors.

        Parameters
        ----------
        X : array-like, one row per sample
        y : array-like, one label per row of ``X``

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # np.unique already returns its result sorted.
        self.classes_ = np.unique(y)
        training_sets = [X[y == yi] for yi in self.classes_]
        self.models_ = [KernelDensity(bandwidth=self.bandwidth,
                                      kernel=self.kernel).fit(Xi)
                        for Xi in training_sets]
        class_counts = np.array([Xi.shape[0] for Xi in training_sets],
                                dtype=float)
        self.logpriors_ = np.log(class_counts / X.shape[0])
        return self

    def predict_proba(self, X):
        """Return the posterior probability of each class for every row of X.

        The result has one row per sample and one column per entry of
        ``classes_``; every row sums to 1.
        """
        logprobs = np.array([model.score_samples(X)
                             for model in self.models_]).T
        log_joint = logprobs + self.logpriors_

        # Subtract the per-row maximum before exponentiating. The shift
        # cancels in the normalisation below, but it stops np.exp from
        # underflowing to 0 for every class at once (which would turn the
        # row into 0/0 = NaN) when the log-densities are very negative.
        row_max = log_joint.max(axis=1, keepdims=True)
        unsupported = np.isneginf(row_max)
        result = np.exp(log_joint - np.where(unsupported, 0.0, row_max))

        # A row whose density is exactly zero under every class carries no
        # evidence at all; fall back to the class priors instead of NaN.
        unsupported = unsupported.ravel()
        if unsupported.any():
            result[unsupported] = np.exp(self.logpriors_)

        return result / result.sum(1, keepdims=True)

    def predict(self, X):
        """Return, for every row of X, the class with the highest posterior."""
        return self.classes_[np.argmax(self.predict_proba(X), 1)]

In [None]:
# Do a grid search on sample to determine best hyperparameters

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Stratified subsample: search on one tenth of the training rows,
# drawn with a fixed seed; the remaining rows are discarded.
sample_frac = 1/10.
X_sam, _, y_sam, _ = train_test_split(
    X, y,
    train_size=sample_frac,
    stratify=y,
    random_state=0,
)

# Hyperparameter grid: every kernel below crossed with ten bandwidths
# spaced evenly in log10 between 10**-1 and 10**1.1.
#kernels = ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']
kernels = ['gaussian']
bandwidths = np.power(10, np.linspace(-1, 1.1, 10))
grid = GridSearchCV(
    KDEClassifier(),
    od([('kernel', kernels), ('bandwidth', bandwidths)]),
    n_jobs=8,
    verbose=10,
)
grid.fit(X_sam, y_sam)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# plotting accuracy vs hyperparameters

import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

cv_results = grid.cv_results_

# Arrange the mean CV scores as a (bandwidth x kernel) matrix for the heatmap.
# The matrix starts out as NaN so that a combination missing from the results
# is visibly absent rather than showing whatever uninitialised memory
# np.empty happened to return.
scores_shaped = np.full((len(bandwidths), len(kernels)), np.nan)
for dic_ix, dic in enumerate(cv_results['params']):
    ker_ix = kernels.index(dic['kernel'])
    # Locate the bandwidth by nearest value instead of float equality.
    band_ix = np.argmin(np.abs(bandwidths - dic['bandwidth']))
    scores_shaped[band_ix, ker_ix] = cv_results['mean_test_score'][dic_ix]

# One row label per bandwidth, formatted to two decimals.
trace = go.Heatmap(x = kernels,
                   y = ['bw {0:.2f}'.format(b) for b in bandwidths],
                   z = scores_shaped)
data=[trace]
figure = go.Figure(data = data, layout = go.Layout(title = 'Accuracy (3-fold cross-validated)'))
plotly.offline.iplot(figure)

In [16]:
# Report the hyperparameter combination the grid search selected as best.
print('Best parameters =', grid.best_params_)

Best parameters = {'kernel': 'gaussian', 'bandwidth': 0.85769589859089412}
