# Multiplicity determination with Scikit-learn classifiers

In [1]:
import numpy as np
import pandas as pd
import time
import signal
import sklearn

In [2]:
from sklearn.experimental import enable_hist_gradient_boosting

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV

# from sklearn.utils.mocking import CheckingClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    MaxAbsScaler,
    RobustScaler,
    PowerTransformer,
    QuantileTransformer,
    Normalizer,
)

In [3]:
# Classifiers to benchmark. Each entry is a 3-tuple:
#   (display name, unfitted estimator, speed tag)
# The speed tag is read by train_model: 'slow' estimators are fitted on the
# first SLOWHEAD training rows only, 'fast' estimators on the whole training
# sample. Commented-out entries are estimators that were deliberately excluded;
# the note on each line gives the reason recorded by the author.
models = [
    ('AdaBoostClassifier', AdaBoostClassifier(), 'fast'),
    ('BaggingClassifier', BaggingClassifier(n_jobs=-1), 'fast'),
    ('BernoulliNB', BernoulliNB(), 'fast'),
    ('CalibratedClassifierCV', CalibratedClassifierCV(cv=5), 'slow'),
    # dummy ('CheckingClassifier', CheckingClassifier(), 'fast'),
    ('ComplementNB', ComplementNB(), 'fast'),
    ('DecisionTreeClassifier', DecisionTreeClassifier(), 'fast'),
    # dummy ('DummyClassifier', DummyClassifier(), 'fast'),
    ('ExtraTreeClassifier', ExtraTreeClassifier(), 'fast'),
    ('ExtraTreesClassifier', ExtraTreesClassifier(n_estimators=100, n_jobs=-1), 'fast'),
    ('GaussianNB', GaussianNB(), 'fast'),
    # crashes ('GaussianProcessClassifier', GaussianProcessClassifier(), 'slow'),
    ('GradientBoostingClassifier', GradientBoostingClassifier(), 'slow'),
    ('HistGradientBoostingClassifier', HistGradientBoostingClassifier(), 'slow'),
    ('KNeighborsClassifier', KNeighborsClassifier(n_jobs=-1), 'fast'),
    # ('LabelPropagation', LabelPropagation(), 'slow'),  # requires too much memory to train with larger datasets
    # ('LabelSpreading', LabelSpreading(), 'slow'),  # bit slow
    ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis(), 'fast'),
    ('LogisticRegression', LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=20000), 'slow'),  # slow with unscaled data
    ('LogisticRegressionCV', LogisticRegressionCV(cv=5, solver='lbfgs', multi_class='auto', max_iter=20000), 'slow'),  # slow
    ('MLPClassifier', MLPClassifier(), 'slow'),
    ('MultinomialNB', MultinomialNB(), 'fast'),
    ('NearestCentroid', NearestCentroid(), 'fast'),
    # nu infeasible ('NuSVC', NuSVC(), 'fast'),
    ('PassiveAggressiveClassifier', PassiveAggressiveClassifier(max_iter=1000, tol=1e-3, n_jobs=-1), 'fast'),
    ('Perceptron', Perceptron(n_jobs=-1), 'fast'),
    ('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis(), 'fast'),
    ('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2, outlier_label=0, n_jobs=-1), 'fast'),
    ('RandomForestClassifier', RandomForestClassifier(n_estimators=100, n_jobs=-1), 'fast'),
    ('RidgeClassifier', RidgeClassifier(), 'fast'),
    ('RidgeClassifierCV', RidgeClassifierCV(), 'fast'),
    ('SGDClassifier', SGDClassifier(max_iter=1000, tol=1e-3, n_jobs=-1), 'fast'),
    ('LinearSVC', LinearSVC(max_iter=20000), 'slow'),  # slow with unscaled data
    ('SVC', SVC(gamma='scale'), 'slow'),
]

In [4]:
# Feature scalers to compare. Each entry is a 2-tuple:
#   (display name, scaler instance)
# When `trainscaled` is built, every scaler is fitted on the training features
# and then applied to the test features. The unscaled baseline is not listed
# here; it is prepended separately when `trainscaled` is assembled.
scalers = [
    # ('Unscaled data', ),
    ('standard scaling', StandardScaler()),
    ('min-max scaling', MinMaxScaler()),
    ('max-abs scaling', MaxAbsScaler()),
    ('robust scaling', RobustScaler(quantile_range=(25, 75))),
    ('power transformation (Yeo-Johnson)', PowerTransformer(method='yeo-johnson')),
    # ('power transformation (Box-Cox)', PowerTransformer(method='box-cox')),  # disabled; presumably because Box-Cox only accepts strictly positive data
    ('quantile transformation (gaussian pdf)', QuantileTransformer(output_distribution='normal')),
    ('quantile transformation (uniform pdf)', QuantileTransformer(output_distribution='uniform')),
    ('sample-wise L2 normalizing', Normalizer()),
]

In [5]:
num_dp = 30

# Feature columns fed to every classifier.
FEATURES = ["nHits", "nClus", "Edep"]
# Fixed seed so the 10 % training subsample (and therefore every score printed
# below) is the same on each Restart & Run All.
SAMPLE_SEED = 42

# NOTE(review): read_pickle executes arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
traindata = pd.read_pickle(f"data/training_600AMeV_{num_dp}dp.pkl").sample(
    frac=0.1, random_state=SAMPLE_SEED
)
testdata = pd.read_pickle(f"data/test_600AMeV_{num_dp}dp.pkl")

# List of (scaling name, train features, test features).
# The first entry is the untouched baseline (DataFrames); the remaining entries
# hold the arrays returned by each scaler, which is fitted on the training
# features only and then applied to the test features.
trainscaled = [
    (
        "Unscaled data",
        traindata[FEATURES],
        testdata[FEATURES],
    )
] + [
    (
        sname,
        scaler.fit_transform(traindata[FEATURES]),
        scaler.transform(testdata[FEATURES]),
    )
    for sname, scaler in scalers
]

In [7]:
from joblib import Parallel, delayed
from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    plot_confusion_matrix,
)

# Number of leading training rows used for estimators tagged 'slow'.
SLOWHEAD = 10000
# Name of the target column.
label = "nPH"

# 1-D label vectors: full training sample, its first SLOWHEAD rows, and test set.
y_train_fast = traindata[[label]].to_numpy().ravel()
y_train_slow = traindata.iloc[:SLOWHEAD][[label]].to_numpy().ravel()
y_test = testdata[[label]].to_numpy().ravel()


def train_model(mname, modelorg, speed, sname, x_train, x_test, timeout=5 * 60):
    """Fit a fresh clone of one classifier on one scaled dataset and score it.

    Parameters
    ----------
    mname : str
        Display name of the classifier.
    modelorg : estimator
        Unfitted estimator. It is cloned, so the object passed in is never fitted.
    speed : str
        "slow" fits on the first SLOWHEAD rows with ``y_train_slow``;
        "fast" fits on all rows with ``y_train_fast``.
    sname : str
        Display name of the scaling that produced ``x_train`` / ``x_test``.
    x_train, x_test : array-like
        Training and test feature matrices.
    timeout : int, optional
        Maximum number of seconds allowed for ``fit`` (default 5 minutes).
        A falsy value disables the timeout.

    Returns
    -------
    tuple
        ``(mname, sname, balanced_accuracy, fit_seconds_as_str)`` on success,
        ``(mname, sname, nan, 'Skipped')`` for the skipped combination, and
        ``(mname, sname, nan, exception)`` if anything raised (including a
        timeout or an unknown ``speed``).

    Notes
    -----
    The timeout relies on SIGALRM, so it is only enforced on platforms that
    provide it and when this function runs in the main thread of its process;
    otherwise the fit runs without a time limit. Python runs signal handlers
    between bytecode instructions, so a fit that spends a long time inside a
    single native call may only be interrupted once that call returns.
    """
    # These get killed without error?
    # NOTE(review): root cause never identified; the skip is kept unchanged.
    if mname == "RadiusNeighborsClassifier" and sname != "Unscaled data":
        return (mname, sname, np.nan, 'Skipped')

    def _raise_timeout(signum, frame):
        # Turn the alarm into an ordinary exception so that it is reported as a
        # failed result instead of terminating the worker process.
        raise TimeoutError(f"fit exceeded {timeout} s")

    try:
        model = sklearn.base.clone(modelorg)

        if speed == "slow":
            fit_x, fit_y = x_train[0:SLOWHEAD], y_train_slow
        elif speed == "fast":
            fit_x, fit_y = x_train, y_train_fast
        else:
            raise ValueError(f"unknown speed tag: {speed!r}")

        armed = False
        previous_handler = None
        if timeout and hasattr(signal, "SIGALRM"):
            try:
                previous_handler = signal.signal(signal.SIGALRM, _raise_timeout)
            except ValueError:
                # Handlers can only be installed from the main thread; fall
                # back to an unlimited fit rather than arming an alarm whose
                # default action would kill the process.
                pass
            else:
                armed = True

        start = time.time()
        try:
            if armed:
                signal.alarm(int(timeout))  # Timeout
            model.fit(fit_x, fit_y)
        finally:
            # Always cancel the alarm, also when fit raised, so that no pending
            # SIGALRM is left behind in a worker that will be reused.
            if armed:
                signal.alarm(0)
                if previous_handler is not None:
                    signal.signal(signal.SIGALRM, previous_handler)
        end = time.time()

        y_pred = model.predict(x_test)
        y_true = y_test

        bac = balanced_accuracy_score(y_true, y_pred)
        return (mname, sname, bac, str(end - start))
    except Exception as err:
        return (mname, sname, np.nan, err)


# Benchmark every classifier against every scaling. The scalings of one
# classifier are evaluated in parallel; each result tuple is stored in
# `results` and echoed as soon as the classifier's batch has finished.
results = []
for mname, modelorg, speed in models:
    jobs = [
        delayed(train_model)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
    ]
    out = Parallel(n_jobs=10)(jobs)
    results.extend(out)
    for row in out:
        print(row)

('AdaBoostClassifier', 'Unscaled data', 0.539658957878153, '1.7389185428619385')
('AdaBoostClassifier', 'standard scaling', 0.539658957878153, '1.7516717910766602')
('AdaBoostClassifier', 'min-max scaling', 0.539658957878153, '1.787578821182251')
('AdaBoostClassifier', 'max-abs scaling', 0.539658957878153, '1.883124589920044')
('AdaBoostClassifier', 'robust scaling', 0.539658957878153, '1.8192639350891113')
('AdaBoostClassifier', 'power transformation (Yeo-Johnson)', 0.539658957878153, '1.7480335235595703')
('AdaBoostClassifier', 'quantile transformation (gaussian pdf)', 0.539658957878153, '1.7963471412658691')
('AdaBoostClassifier', 'quantile transformation (uniform pdf)', 0.539658957878153, '1.8270554542541504')
('AdaBoostClassifier', 'sample-wise L2 normalizing', 0.3630045864033875, '2.4161384105682373')
('BaggingClassifier', 'Unscaled data', 0.7274498120211931, '0.16216015815734863')
('BaggingClassifier', 'standard scaling', 0.7280732486022977, '0.19832205772399902')
('BaggingClass