In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, RandomizedSearchCV, KFold
from sklearn.metrics.pairwise import kernel_metrics
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDClassifier

`sklearn.metrics.pairwise.kernel_metrics()` to list available kernels

In [2]:
kernel_metrics()

{'additive_chi2': <function sklearn.metrics.pairwise.additive_chi2_kernel(X, Y=None)>,
 'chi2': <function sklearn.metrics.pairwise.chi2_kernel(X, Y=None, gamma=1.0)>,
 'linear': <function sklearn.metrics.pairwise.linear_kernel(X, Y=None, dense_output=True)>,
 'polynomial': <function sklearn.metrics.pairwise.polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1)>,
 'poly': <function sklearn.metrics.pairwise.polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1)>,
 'rbf': <function sklearn.metrics.pairwise.rbf_kernel(X, Y=None, gamma=None)>,
 'laplacian': <function sklearn.metrics.pairwise.laplacian_kernel(X, Y=None, gamma=None)>,
 'sigmoid': <function sklearn.metrics.pairwise.sigmoid_kernel(X, Y=None, gamma=None, coef0=1)>,
 'cosine': <function sklearn.metrics.pairwise.cosine_similarity(X, Y=None, dense_output=True)>}

Each of these are interesting in their own ways, but the most applicable to our case will be `additive_chi2` and `chi2`, `linear`, `rbf` and `laplacian`. We will do trials of each, but `rbf` will be approximated using the `sklearn.kernel_approximation.Nystroem` method since it is intractable with so many samples especially if we need to enable probability prediction.

## Data Read

In [3]:
d_full = pd.read_csv('../data/d_full.csv')

x, y = *(d_full.drop('label', axis=1),
         d_full['label']),

x, y = map(lambda j: j.to_numpy(), (x,y))

## SGD Initialization

In [4]:
sgd = SGDClassifier(loss='log',
                    penalty='l1',
                    #early_stopping=True,
                    class_weight='balanced',
                    n_jobs=-1)

## Example Nystroem Transform

In [5]:
chi2_approx = Nystroem(kernel='chi2').fit_transform(x, y)
chi2_approx

array([[0.00975688, 0.03916245, 0.18957861, ..., 0.01379283, 0.05415589,
        0.24549443],
       [0.0053321 , 0.04324769, 0.1575751 , ..., 0.02392627, 0.03166662,
        0.2417667 ],
       [0.0093064 , 0.04698138, 0.13397684, ..., 0.01111804, 0.06420053,
        0.22089316],
       ...,
       [0.05752698, 0.01542689, 0.24630702, ..., 0.00609525, 0.03573549,
        0.06505543],
       [0.07907208, 0.01581673, 0.26961472, ..., 0.00760727, 0.04351479,
        0.07217415],
       [0.00566504, 0.0174066 , 0.10650837, ..., 0.0110558 , 0.11778056,
        0.20424286]])

In [6]:
scores = cross_val_score(sgd, chi2_approx, y, scoring='precision_micro', cv=5)
scores

array([0.70678165, 0.66091729, 0.60133281, 0.64014112, 0.58392157])

In [7]:
scores = cross_val_score(sgd, chi2_approx, y, scoring='recall_micro', cv=5)
scores

array([0.6750294 , 0.65895727, 0.59349275, 0.63582909, 0.63333333])

In [8]:
def train(clf, x, y, kernel='rbf', scoring=None, verbose=0, **kwargs):
    approx = Nystroem(kernel=kernel, kernel_params=dict(**kwargs)).fit_transform(x, y)
    return cross_val_score(clf, approx, y, scoring=scoring, cv=KFold(shuffle=True), verbose=verbose, n_jobs=-1)

## Trials

In [16]:
np.set_printoptions(precision=2)
kernels_to_try = ('additive_chi2', 'chi2', 'linear', 'rbf', 'laplacian')
scoring = 'recall_weighted'
for kernel in kernels_to_try:
    sgd = SGDClassifier(loss='log',
                    penalty='l1',
                    #early_stopping=True,
                    class_weight='balanced',
                    n_jobs=-1)
    scores = train(sgd, x, y, kernel=kernel, scoring=scoring)
    print(f"{kernel.ljust(len(max(kernels_to_try, key=len))+1)!s}: {scores!s}, Mean: {np.mean(scores)!s}")

additive_chi2 : [0.66 0.69 0.7  0.66 0.68], Mean: 0.6764937702246716
chi2          : [0.67 0.72 0.71 0.7  0.68], Mean: 0.6978191097685644
linear        : [0.66 0.66 0.65 0.64 0.66], Mean: 0.6532075848763653
rbf           : [0.64 0.64 0.66 0.65 0.64], Mean: 0.6445820093619572
laplacian     : [0.65 0.66 0.67 0.66 0.65], Mean: 0.6597141912821577
