In [9]:
%matplotlib notebook

import pandas as pd
import numpy as np
from collections import OrderedDict as od

In [4]:
# Reading in training data

X = pd.read_csv('X_train.txt',delim_whitespace=True, header=None).values
y = pd.read_csv('Y_train.txt',delim_whitespace=True, header=None).values.flatten()

In [5]:
# Standardizing data

from sklearn.preprocessing import StandardScaler

#X = StandardScaler().fit_transform(X)

In [6]:
# Creating kernel-based Naive Bayes Classifier

from sklearn.neighbors import KernelDensity
from sklearn.base import BaseEstimator, ClassifierMixin

class KDEClassifier(BaseEstimator, ClassifierMixin):
    """Bayesian generative classification based on KDE
    
    Parameters
    ----------
    bandwidth : float
        the kernel bandwidth within each class
    kernel : str
        the kernel name, passed to KernelDensity
    """
    def __init__(self, bandwidth=1.0, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel
        
    def fit(self, X, y):
        self.classes_ = np.sort(np.unique(y))
        training_sets = [X[y == yi] for yi in self.classes_]
        self.models_ = [KernelDensity(bandwidth=self.bandwidth,
                                      kernel=self.kernel).fit(Xi)
                        for Xi in training_sets]
        self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0])
                           for Xi in training_sets]
        return self
        
    def predict_proba(self, X):
        logprobs = np.array([model.score_samples(X)
                             for model in self.models_]).T
        result = np.exp(logprobs + self.logpriors_)
        return result / result.sum(1, keepdims=True)
        
    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), 1)]

In [28]:
# Do a grid search on sample to determine best hyperparameters

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# sampling data
sample_frac = 1/10.
X_sam, _, y_sam, _ = train_test_split(X, y, stratify = y, train_size = sample_frac,
                                           random_state = 0)

# grid search
kernels = ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']
bandwidths = 10 ** np.linspace(-1, 1.1, 10)
grid = GridSearchCV(KDEClassifier(), od([('kernel', kernels), ('bandwidth', bandwidths)]))
grid.fit(X_sam, y_sam)


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in 


overflow encountered in exp


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


invalid value encountered in true_divide


overflow encountered in exp


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


overflow en


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


overflow encountered in exp


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value e


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide


invalid value encountered in true_divide



GridSearchCV(cv=None, error_score='raise',
       estimator=KDEClassifier(bandwidth=1.0, kernel='gaussian'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=OrderedDict([('kernel', ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear', 'cosine']), ('bandwidth', array([  0.1    ,   0.17113,   0.29286,   0.50119,   0.8577 ,   1.4678 ,
         2.51189,   4.29866,   7.35642,  12.58925]))]),
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [30]:
import plotly
import plotly.graph_objs as go

plotly.offline.init_notebook_mode(connected=True)

scores_shaped = np.empty(shape = (len(bandwidths), len(kernels)))
for dic_ix, dic in enumerate(cv_results['params']):
    ker_ix = kernels.index(dic['kernel'])
    band_ix = np.argmin(np.abs(bandwidths - dic['bandwidth']))
    scores_shaped[band_ix, ker_ix] = cv_results['mean_test_score'][dic_ix]

trace = go.Heatmap(x = kernels,
                   y = list(np.vectorize(lambda b: 'bw ' + '{0:.2f}'.format(b))
                            (bandwidths)),
                   z = scores_shaped)
data=[trace]
plotly.offline.iplot(data, filename='labelled-heatmap')