# Multiplicity determination with Scikit-learn classifiers

Here we test out scikit-learn classification models for multiplicity reconstruction with the three main features: number of hits, number of clusters, and total deposited energy.

In [1]:
import numpy as np
import pandas as pd
import h5py
import time
import multiprocessing
import functools
from joblib import Parallel, delayed

import sklearn
from sklearn import *
from sklearn.experimental import enable_hist_gradient_boosting

Load ALL the classification models from scikit-learn.
Note that some models are very slow to train on large datasets or crash outright, so we give them a reduced number of (shuffled) rows to learn from.
Note that most models use `n_jobs=1`, because parallelism across the model/scaler combinations is introduced later.

In [33]:
# First batch of models that are trained in parallel.
# Each entry is a (name, estimator, speed) tuple. The speed label controls how
# many training rows `train_model` fits on: 'fast' uses all rows, 'medi' the
# first MEDIHEAD rows and 'slow' the first SLOWHEAD rows.
models_a1 = [
    ('BaggingClassifier', sklearn.ensemble.BaggingClassifier(n_jobs=1), 'medi'),
    ('BernoulliNB', sklearn.naive_bayes.BernoulliNB(), 'fast'),
    ('CalibratedClassifierCV', sklearn.calibration.CalibratedClassifierCV(cv=5), 'slow'),
    ('ComplementNB', sklearn.naive_bayes.ComplementNB(), 'fast'),
    ('GaussianNB', sklearn.naive_bayes.GaussianNB(), 'fast'),
]

# Second batch of models that are trained in parallel; same
# (name, estimator, speed) layout as the other model lists.
# The iterative linear models get a high `max_iter` cap.
models_a2 = [
    ('LinearDiscriminantAnalysis', sklearn.discriminant_analysis.LinearDiscriminantAnalysis(), 'fast'),
    ('LinearSVC', sklearn.svm.LinearSVC(max_iter=20000), 'slow'),  # slow with unscaled data
    ('LogisticRegression', sklearn.linear_model.LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=20000), 'slow'),  # slow with unscaled data
    ('LogisticRegressionCV', sklearn.linear_model.LogisticRegressionCV(cv=5, solver='lbfgs', multi_class='auto', max_iter=20000), 'slow'),
    ('MLPClassifier', sklearn.neural_network.MLPClassifier(), 'slow'),
    ('MultinomialNB', sklearn.naive_bayes.MultinomialNB(), 'fast'),
]

# Third batch of models that are trained in parallel; same
# (name, estimator, speed) layout as the other model lists.
# Estimators that accept `n_jobs` are pinned to one job here.
models_b = [
    ('NearestCentroid', sklearn.neighbors.NearestCentroid(), 'fast'),
    ('PassiveAggressiveClassifier', sklearn.linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=1e-3, n_jobs=1), 'fast'),
    ('Perceptron', sklearn.linear_model.Perceptron(n_jobs=1), 'fast'),
    ('QuadraticDiscriminantAnalysis', sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(), 'fast'),
    ('RidgeClassifier', sklearn.linear_model.RidgeClassifier(), 'fast'),
    ('RidgeClassifierCV', sklearn.linear_model.RidgeClassifierCV(), 'fast'),
    ('SGDClassifier', sklearn.linear_model.SGDClassifier(max_iter=5000, tol=1e-3, n_jobs=1), 'medi'),

]

# Run these sequentially due to "buffer source array is read-only" with LokyBackend.
# Same (name, estimator, speed) layout as the other model lists. Because this
# list is dispatched with Parallel(n_jobs=1), the two forest ensembles use
# n_jobs=-1 to parallelise internally instead.
models_s = [
    ('AdaBoostClassifier', sklearn.ensemble.AdaBoostClassifier(), 'fast'),
    ('DecisionTreeClassifier', sklearn.tree.DecisionTreeClassifier(), 'medi'),    
    ('ExtraTreeClassifier', sklearn.tree.ExtraTreeClassifier(), 'fast'),
    ('ExtraTreesClassifier', sklearn.ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1), 'medi'),
    ('RandomForestClassifier', sklearn.ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1), 'medi'),
]

# Models that fail a lot. They are run one model at a time so that a failure
# only loses that model's results.
# Same (name, estimator, speed) layout as the other model lists.
# The neighbour-based estimators use n_jobs=1: every model in this list is
# fanned out over all scalers with Parallel(n_jobs=10), so estimator-level
# parallelism on top of that would oversubscribe the CPUs.
models_o = [
    # ('NuSVC', sklearn.svm.NuSVC(), 'fast'),  # nu infeasible
    ('RadiusNeighborsClassifier', sklearn.neighbors.RadiusNeighborsClassifier(radius=3, n_jobs=1), 'medi'),
    ('GaussianProcessClassifier', sklearn.gaussian_process.GaussianProcessClassifier(), 'slow'),
    # ('GradientBoostingClassifier', sklearn.ensemble.GradientBoostingClassifier(), 'slow'),  # crashes
    # ('HistGradientBoostingClassifier', sklearn.ensemble.HistGradientBoostingClassifier(), 'slow'),  # crashes?
    ('KNeighborsClassifier', sklearn.neighbors.KNeighborsClassifier(n_jobs=1), 'slow'),
    ('LabelPropagation', sklearn.semi_supervised.LabelPropagation(), 'slow'),  # requires too much memory to train with larger datasets
    ('LabelSpreading', sklearn.semi_supervised.LabelSpreading(), 'slow'),  # bit slow
    ('SVC', sklearn.svm.SVC(gamma='scale'), 'slow'),  # slow, but does not hit the timeout
]

Some models only work with properly scaled data, so we prepare ALL available scalers.

In [3]:
# (label, transformer) pairs. The label becomes the "Scaler" column of the
# results table. The unscaled baseline is not listed here; it is added
# separately when the scaled feature matrices are built.
scalers = [
    # ('Unscaled data', ),
    ('standard scaling', sklearn.preprocessing.StandardScaler()),
    ('min-max scaling', sklearn.preprocessing.MinMaxScaler()),
    ('max-abs scaling', sklearn.preprocessing.MaxAbsScaler()),
    ('robust scaling', sklearn.preprocessing.RobustScaler(quantile_range=(25, 75))),
    ('power transformation (Yeo-Johnson)', sklearn.preprocessing.PowerTransformer(method='yeo-johnson')),
    # ('power transformation (Box-Cox)', sklearn.preprocessing.PowerTransformer(method='box-cox')), # 'strictly zero' meh.
    ('quantile transformation (gaussian pdf)', sklearn.preprocessing.QuantileTransformer(output_distribution='normal')),
    ('quantile transformation (uniform pdf)', sklearn.preprocessing.QuantileTransformer(output_distribution='uniform')),
    ('sample-wise L2 normalizing', sklearn.preprocessing.Normalizer()),
]

Train and test data are passed through all of the scalers; the unscaled data is kept alongside them as an additional "unscaled" variant.

In [4]:
num_dp = 30
NUM_ROWS = 100000  # number of leading rows read from the input file

infile = f"data/600AMeV_{num_dp}dp.bars-shuffled.h5"
# Open the file in a context manager so the HDF5 handle is released as soon as
# the slices have been copied into in-memory arrays.
with h5py.File(infile, "r") as h5in:
    # Feature matrix: the first NUM_ROWS rows of "flatfeatures".
    x = np.array(h5in["flatfeatures"][0:NUM_ROWS], dtype=np.float32)
    # Class labels: column 2 of "multiplicity" for the same rows.
    y = np.array(h5in["multiplicity"][0:NUM_ROWS, 2], dtype=np.int8)

In [5]:
# Map every scaler label to the correspondingly transformed feature matrix.
# The raw features are stored under "Unscaled data" as the baseline.
# NOTE(review): each scaler is fitted on the complete `x`, i.e. before the
# train/test split, so the later test rows influence the fitted scaling.
x_scalers = {"Unscaled data": x}
x_scalers.update((sname, scaler.fit_transform(x)) for sname, scaler in scalers)

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)


In [6]:
# One fixed seed is used for every scaler. All feature matrices have the same
# number of rows, so train_test_split selects the same rows each time: every
# scaler is evaluated on the same test rows and the split is reproducible.
SPLIT_SEED = 42

# Maps scaler label -> (x_train, x_test, y_train, y_test).
x_scalers_split = {}
for sname, x_scaled in x_scalers.items():
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x_scaled, y, test_size=0.2, random_state=SPLIT_SEED
    )
    # The tuple order matches the unpacking done in `train_model`.
    x_scalers_split[sname] = (x_train, x_test, y_train, y_test)

Run all model/scaler combinations in parallel. Note that we use a timeout per task, as setting a timeout in joblib would abort the whole batch and throw away all results.

In [7]:
def with_timeout(timeout):
    """Build a decorator that limits how long a call is waited for.

    The decorated function is executed in a single worker thread. The wrapper
    waits at most ``timeout`` seconds for the result and returns ``None`` if
    the limit is exceeded; otherwise it returns the function's own return
    value. Exceptions raised by the function propagate to the caller.

    A timed-out call is abandoned, not interrupted: the worker thread keeps
    running in the background until the function finishes on its own.

    Parameters
    ----------
    timeout : float or None
        Seconds to wait for the result; ``None`` waits indefinitely.

    Returns
    -------
    callable
        A decorator to apply to the function that should be time-limited.
    """
    # Import the submodule explicitly: a plain `import multiprocessing` does
    # not guarantee that `multiprocessing.pool` is available as an attribute.
    from multiprocessing.pool import ThreadPool

    def decorator(decorated):
        @functools.wraps(decorated)
        def inner(*args, **kwargs):
            # The context manager terminates the pool on exit, so its helper
            # threads are released after every call instead of piling up.
            with ThreadPool(1) as pool:
                async_result = pool.apply_async(decorated, args, kwargs)
                try:
                    return async_result.get(timeout)
                except multiprocessing.TimeoutError:
                    return None

        return inner

    return decorator

In [8]:
MEDIHEAD = 10000  # training rows used for 'medi' models
SLOWHEAD = 1000  # training rows used for 'slow' models


@with_timeout(3000)
def train_model(mname, modelorg, speed, sname, data):
    """Fit a fresh clone of one model on one scaled dataset and score it.

    Parameters
    ----------
    mname : str
        Model label, passed through to the result tuple.
    modelorg : estimator
        Estimator prototype; it is cloned so the original is never fitted.
    speed : str
        'fast' fits on all training rows, 'medi' on the first MEDIHEAD rows
        and 'slow' on the first SLOWHEAD rows.
    sname : str
        Scaler label, passed through to the result tuple.
    data : tuple
        ``(x_train, x_test, y_train, y_test)``.

    Returns
    -------
    tuple or None
        ``(mname, sname, balanced_accuracy, speed, fit_seconds, "ok")`` on
        success. If anything raises, ``(mname, sname, nan, speed, nan, err)``
        with ``err`` being the caught exception. ``None`` if the surrounding
        timeout expires first.
    """
    try:
        x_train, x_test, y_train, y_test = data
        model = sklearn.base.clone(modelorg)

        # Number of leading training rows per speed class (None = all rows).
        train_rows = {"slow": SLOWHEAD, "medi": MEDIHEAD, "fast": None}
        if speed not in train_rows:
            # Fail explicitly instead of skipping the fit and failing later
            # with an unrelated "not fitted" error from predict().
            raise ValueError(f"unknown speed class: {speed!r}")
        head = train_rows[speed]

        # Only the fit is timed; prediction and scoring are excluded.
        start = time.time()
        model.fit(x_train[0:head], y_train[0:head])
        end = time.time()

        y_pred = model.predict(x_test)
        y_true = y_test

        bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
        return (mname, sname, bac, speed, (end - start), "ok")
    except Exception as err:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return (mname, sname, np.nan, speed, np.nan, err)


def train_model_wrap(mname, modelorg, speed, sname, data):
    """Run `train_model` with progress output and turn a timeout into a row.

    Prints the model/scaler pair before training and the result tuple after
    it. `train_model` returns ``None`` when its timeout expires; in that case
    a placeholder tuple with NaN score/time and status "Timeout" is returned,
    so every task yields a tuple of the same shape.

    Parameters are forwarded unchanged to `train_model`.
    """
    print(mname, sname)
    ret = train_model(mname, modelorg, speed, sname, data)
    if ret is None:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return (mname, sname, np.nan, speed, np.nan, "Timeout")
    print(ret)
    return ret

In [12]:
# Start from an empty list so `results_a1` is always defined, and never stale
# from an earlier execution, even if the parallel run below raises.
results_a1 = []
try:
    # One task per (model, scaler) combination.
    results_a1 = Parallel(n_jobs=10, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, data)
        for mname, modelorg, speed in models_a1
        for sname, data in x_scalers_split.items()
    )
except Exception as err:
    print(err)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  4.1min
[Parallel(n_jobs=10)]: Done  54 out of  54 | elapsed:  8.8min finished


In [15]:
# Start from an empty list so `results_a2` is always defined, and never stale
# from an earlier execution, even if the parallel run below raises.
results_a2 = []
try:
    # One task per (model, scaler) combination.
    results_a2 = Parallel(n_jobs=10, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, data)
        for mname, modelorg, speed in models_a2
        for sname, data in x_scalers_split.items()
    )
except Exception as err:
    print(err)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 21.9min
[Parallel(n_jobs=10)]: Done  54 out of  54 | elapsed: 36.7min finished


In [18]:
# Start from an empty list so `results_b` is always defined, and never stale
# from an earlier execution, even if the parallel run below raises.
results_b = []
try:
    # One task per (model, scaler) combination.
    results_b = Parallel(n_jobs=10, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, data)
        for mname, modelorg, speed in models_b
        for sname, data in x_scalers_split.items()
    )
except Exception as err:
    print(err)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 11.6min
[Parallel(n_jobs=10)]: Done  63 out of  63 | elapsed: 42.2min finished


In [26]:
# Start from an empty list so `results_s` is always defined, and never stale
# from an earlier execution, even if the run below raises.
results_s = []
try:
    # n_jobs=1: these models are run one task at a time.
    results_s = Parallel(n_jobs=1, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, data)
        for mname, modelorg, speed in models_s
        for sname, data in x_scalers_split.items()
    )
except Exception as err:
    print(err)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


AdaBoostClassifier Unscaled data
('AdaBoostClassifier', 'Unscaled data', 0.2843571933549634, 'fast', 518.7534177303314, 'ok')
AdaBoostClassifier standard scaling
('AdaBoostClassifier', 'standard scaling', 0.2805385624827795, 'fast', 484.35504055023193, 'ok')
AdaBoostClassifier min-max scaling
('AdaBoostClassifier', 'min-max scaling', 0.26770355228655024, 'fast', 475.6151866912842, 'ok')
AdaBoostClassifier max-abs scaling
('AdaBoostClassifier', 'max-abs scaling', 0.27805130079086665, 'fast', 461.894330739975, 'ok')
AdaBoostClassifier robust scaling




('AdaBoostClassifier', 'robust scaling', 0.24878782578251554, 'fast', 462.9791250228882, 'ok')
AdaBoostClassifier power transformation (Yeo-Johnson)
('AdaBoostClassifier', 'power transformation (Yeo-Johnson)', 0.2619868112466336, 'fast', 429.33453392982483, 'ok')
AdaBoostClassifier quantile transformation (gaussian pdf)
('AdaBoostClassifier', 'quantile transformation (gaussian pdf)', 0.28005826374303855, 'fast', 466.0326180458069, 'ok')
AdaBoostClassifier quantile transformation (uniform pdf)
('AdaBoostClassifier', 'quantile transformation (uniform pdf)', 0.2686092391975205, 'fast', 456.00977778434753, 'ok')
AdaBoostClassifier sample-wise L2 normalizing
('AdaBoostClassifier', 'sample-wise L2 normalizing', 0.28515800281099757, 'fast', 465.5604405403137, 'ok')
DecisionTreeClassifier Unscaled data
('DecisionTreeClassifier', 'Unscaled data', 0.43912872207068965, 'medi', 19.098145961761475, 'ok')
DecisionTreeClassifier standard scaling
('DecisionTreeClassifier', 'standard scaling', 0.438788

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 77.2min finished


In [34]:
# These models are dispatched one model at a time, each fanned out over all
# scaled datasets, so an exception only costs that single model's results.
results_o = []
for mname, modelorg, speed in models_o:
    try:
        results_o += Parallel(n_jobs=10, verbose=1)(
            delayed(train_model_wrap)(mname, modelorg, speed, sname, data)
            for sname, data in x_scalers_split.items()
        )
    except Exception as err:
        print(err)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:  8.4min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed: 16.1min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:  1.9min finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:   24.0s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:   22.2s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   9 out of   9 | elapsed:  3.7min finished


In [35]:
results = results_a1 + results_a2 + results_b + results_s + results_o
pd.options.display.max_rows = 999  # show the whole ranking instead of a truncated view

# Passing `columns=` to the constructor also works for an empty `results`
# list, whereas assigning `.columns` afterwards raises on a length mismatch.
# Best balanced accuracy first; ties are broken by the shorter training time.
resultsdf = pd.DataFrame(
    results, columns=["Model", "Scaler", "BAC", "Speed", "Time", "Status"]
).sort_values(by=["BAC", "Time"], ascending=[False, True])

styler = resultsdf.style
# `Styler.hide_index()` was deprecated in pandas 1.4 and removed in 2.0 in
# favour of `Styler.hide(axis="index")`; support both.
if hasattr(styler, "hide"):
    styler = styler.hide(axis="index")
else:
    styler = styler.hide_index()
styler.format({"BAC": "{:.2%}", "Time": "{:.2f}"}).bar(
    subset=["BAC"], color="lightgreen"
).bar(subset=["Time"], color="lightblue")

Model,Scaler,BAC,Speed,Time,Status
BernoulliNB,min-max scaling,65.25%,fast,6.23,ok
BernoulliNB,Unscaled data,64.90%,fast,6.16,ok
BernoulliNB,power transformation (Yeo-Johnson),64.79%,fast,9.97,ok
NearestCentroid,power transformation (Yeo-Johnson),64.57%,fast,2.4,ok
NearestCentroid,standard scaling,64.57%,fast,2.36,ok
BernoulliNB,quantile transformation (uniform pdf),64.49%,fast,6.88,ok
BernoulliNB,quantile transformation (gaussian pdf),64.49%,fast,8.43,ok
BernoulliNB,max-abs scaling,64.38%,fast,12.56,ok
BernoulliNB,standard scaling,64.34%,fast,10.75,ok
BernoulliNB,robust scaling,64.22%,fast,11.74,ok
