# Multiplicity determination with Scikit-learn classifiers

Here we test out scikit-learn classification models for multiplicity reconstruction.

In [1]:
import numpy as np
import pandas as pd
import time
import multiprocessing
import functools
from joblib import Parallel, delayed

import sklearn
from sklearn import *
from sklearn.experimental import enable_hist_gradient_boosting

import sys

sys.path.append("..")
from helpers import filename_for

Welcome to JupyROOT 6.16/00


Load ALL the classification models from scikit-learn.
Note that some models are very slow to train with large datasets or crash outright, so we give them a reduced number of (shuffled) rows to learn.
Note that `n_jobs=1` is used, as parallelism is introduced later.

In [2]:
models_a1 = [
    ("BaggingClassifier", sklearn.ensemble.BaggingClassifier(n_jobs=1), "medi"),
    ("BernoulliNB", sklearn.naive_bayes.BernoulliNB(), "fast"),
    ("CalibratedClassifierCV", sklearn.calibration.CalibratedClassifierCV(cv=5), "slow"),
    ("ComplementNB", sklearn.naive_bayes.ComplementNB(), "fast"),
    ("GaussianNB", sklearn.naive_bayes.GaussianNB(), "fast"),
]

models_a2 = [
    ("LinearDiscriminantAnalysis", sklearn.discriminant_analysis.LinearDiscriminantAnalysis(), "fast"),
    ("LinearSVC", sklearn.svm.LinearSVC(max_iter=20000), "slow"),  # slow with unscaled data
    (
        "LogisticRegression",
        sklearn.linear_model.LogisticRegression(solver="lbfgs", multi_class="auto", max_iter=20000),
        "slow",
    ),  # slow with unscaled data
    (
        "LogisticRegressionCV",
        sklearn.linear_model.LogisticRegressionCV(cv=5, solver="lbfgs", multi_class="auto", max_iter=20000),
        "slow",
    ),
    ("MLPClassifier", sklearn.neural_network.MLPClassifier(), "slow"),
    ("MultinomialNB", sklearn.naive_bayes.MultinomialNB(), "fast"),
]

models_b = [
    ("NearestCentroid", sklearn.neighbors.NearestCentroid(), "fast"),
    (
        "PassiveAggressiveClassifier",
        sklearn.linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=1e-3, n_jobs=1),
        "fast",
    ),
    ("Perceptron", sklearn.linear_model.Perceptron(n_jobs=1), "fast"),
    ("QuadraticDiscriminantAnalysis", sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(), "fast"),
    ("RidgeClassifier", sklearn.linear_model.RidgeClassifier(), "fast"),
    ("RidgeClassifierCV", sklearn.linear_model.RidgeClassifierCV(), "fast"),
    ("SGDClassifier", sklearn.linear_model.SGDClassifier(max_iter=5000, tol=1e-3, n_jobs=1), "medi"),
]

# Run these sequential due to "buffer source array is read-only" with LokyBackend
models_s = [
    ("AdaBoostClassifier", sklearn.ensemble.AdaBoostClassifier(), "fast"),
    ("DecisionTreeClassifier", sklearn.tree.DecisionTreeClassifier(), "medi"),
    ("ExtraTreeClassifier", sklearn.tree.ExtraTreeClassifier(), "fast"),
    ("ExtraTreesClassifier", sklearn.ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1), "medi"),
    ("RandomForestClassifier", sklearn.ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1), "medi"),
]

# Models that fail alot
models_o = [
    # ('NuSVC', sklearn.svm.NuSVC(), 'fast'),  # nu infeasible
    ("RadiusNeighborsClassifier", sklearn.neighbors.RadiusNeighborsClassifier(radius=3, n_jobs=-1), "medi"),
    ("GaussianProcessClassifier", sklearn.gaussian_process.GaussianProcessClassifier(), "slow"),
    # ('GradientBoostingClassifier', sklearn.ensemble.GradientBoostingClassifier(), 'slow'),  # crashes
    # ('HistGradientBoostingClassifier', sklearn.ensemble.HistGradientBoostingClassifier(), 'slow'),  # crashes?
    ("KNeighborsClassifier", sklearn.neighbors.KNeighborsClassifier(n_jobs=10), "slow"),
    (
        "LabelPropagation",
        sklearn.semi_supervised.LabelPropagation(),
        "slow",
    ),  # requires too much memory to train with larger datasets
    ("LabelSpreading", sklearn.semi_supervised.LabelSpreading(), "slow"),  # bit slow
    ("SVC", sklearn.svm.SVC(gamma="scale"), "slow"),  # slow, not timeouted
]

Some models only work with properly scaled data, so we prepare ALL available scalers.

In [3]:
scalers = [
    # ('Unscaled data', ),
    ("standard scaling", sklearn.preprocessing.StandardScaler()),
    ("min-max scaling", sklearn.preprocessing.MinMaxScaler()),
    ("max-abs scaling", sklearn.preprocessing.MaxAbsScaler()),
    ("robust scaling", sklearn.preprocessing.RobustScaler(quantile_range=(25, 75))),
    ("power transformation (Yeo-Johnson)", sklearn.preprocessing.PowerTransformer(method="yeo-johnson")),
    # ('power transformation (Box-Cox)', sklearn.preprocessing.PowerTransformer(method='box-cox')), # 'strictly zero' meh.
    ("quantile transformation (gaussian pdf)", sklearn.preprocessing.QuantileTransformer(output_distribution="normal")),
    ("quantile transformation (uniform pdf)", sklearn.preprocessing.QuantileTransformer(output_distribution="uniform")),
    ("sample-wise L2 normalizing", sklearn.preprocessing.Normalizer()),
]

Train and Test data is passed through the scalers, including the "unscaled" scaler.

In [4]:
files = [filename_for(15, 30, 600, 500, n, "inclxx", s, "bars.pkl") for n in [1, 2, 3, 4] for s in range(20)]
dfs = [pd.read_pickle(file) for file in files]
data = pd.concat(dfs, ignore_index=True).sample(frac=0.1)
display(data)

        nPN  nPP  nPH  nHits  nClus  Edep    0    1    2    3  ...  5990  \
556923    3    3    3     55     19   799  0.0  0.0  0.0  0.0  ...   0.0   
141807    1    1    1     10      4   219  0.0  0.0  0.0  0.0  ...   0.0   
697949    4    4    4     59     22   842  0.0  0.0  0.0  0.0  ...   0.0   
70999     1    1    1     11     10   191  0.0  0.0  0.0  0.0  ...   0.0   
110834    1    1    1      9      2   272  0.0  0.0  0.0  0.0  ...   0.0   
...     ...  ...  ...    ...    ...   ...  ...  ...  ...  ...  ...   ...   
298235    2    2    2     27      4   503  0.0  0.0  0.0  0.0  ...   0.0   
552333    3    3    3     56     13  1112  0.0  0.0  0.0  0.0  ...   0.0   
575335    3    3    3     51     16   757  0.0  0.0  0.0  0.0  ...   0.0   
637192    4    4    4     52     25  1065  0.0  0.0  0.0  0.0  ...   0.0   
530039    3    3    3     36     19   710  0.0  0.0  0.0  0.0  ...   0.0   

        5991  5992  5993  5994  5995  5996  5997  5998  5999  
556923   0.0   0.0   0.0

In [6]:
msk = np.random.rand(data.shape[0]) < 0.8
msk[0] = True
msk[1] = False
data.loc[0] = data.loc[1] = [0 for i in range(0, data.shape[1])]

traindata = data[msk]
testdata = data[~msk]

print(traindata.shape)
print(testdata.shape)

(63955, 6006)
(16047, 6006)


In [7]:
features = ["nHits", "nClus", "Edep"] + [i for i in range(0, 30 * 100 * 2)]

In [8]:
# TODO: The scaling should probably be the same for all Times and Energies
trainscaled = [("Unscaled data", traindata[features], testdata[features])] + [
    (sname, scaler.fit_transform(traindata[features]), scaler.transform(testdata[features]),)
    for sname, scaler in scalers
]

Run all model/scaler combinations, in parallel. Note that we use timeouts per task, as setting at timeout in joblib will throw everything.

In [9]:
def with_timeout(timeout):
    def decorator(decorated):
        @functools.wraps(decorated)
        def inner(*args, **kwargs):
            pool = multiprocessing.pool.ThreadPool(1)
            async_result = pool.apply_async(decorated, args, kwargs)
            try:
                return async_result.get(timeout)
            except multiprocessing.TimeoutError:
                return

        return inner

    return decorator

In [10]:
MEDIHEAD = 50000
SLOWHEAD = 5000
label = "nPN"

y_train_fast = traindata[[label]].values.ravel()
y_train_medi = traindata.head(MEDIHEAD)[[label]].values.ravel()
y_train_slow = traindata.head(SLOWHEAD)[[label]].values.ravel()
y_test = testdata[[label]].values.ravel()


@with_timeout(1200*2)
def train_model(mname, modelorg, speed, sname, x_train, x_test):
    # These get killed without error?
    if mname == "RadiusNeighborsClassifier" and sname != "Unscaled data":
        return (mname, sname, np.NaN, speed, np.NaN, "Skipped")
    try:
        model = sklearn.base.clone(modelorg)
        start = time.time()
        if speed == "slow":
            model.fit(x_train[0:SLOWHEAD], y_train_slow)
        elif speed == "medi":
            model.fit(x_train[0:MEDIHEAD], y_train_medi)
        elif speed == "fast":
            model.fit(x_train, y_train_fast)
        end = time.time()

        y_pred = model.predict(x_test)
        y_true = y_test

        bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
        return (mname, sname, bac, speed, (end - start), "ok")
    except Exception as err:
        return (mname, sname, np.NaN, speed, np.NaN, err)


def train_model_wrap(mname, modelorg, speed, sname, x_train, x_test):
    ret = train_model(mname, modelorg, speed, sname, x_train, x_test)
    if ret:
        return ret
    else:
        return (mname, sname, np.NaN, speed, np.NaN, "Timeout")

In [11]:
try:
    results_a1 = Parallel(n_jobs=10, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
        for mname, modelorg, speed in models_a1
    )
except Exception as err:
    print(err)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  45 out of  45 | elapsed: 28.8min finished
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]


In [12]:
try:
    results_a2 = Parallel(n_jobs=10, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
        for mname, modelorg, speed in models_a2
    )
except Exception as err:
    print(err)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 20.6min
[Parallel(n_jobs=10)]: Done  54 out of  54 | elapsed: 59.1min finished


In [13]:
try:
    results_b = Parallel(n_jobs=10, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
        for mname, modelorg, speed in models_b
    )
except Exception as err:
    print(err)

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 15.9min
[Parallel(n_jobs=10)]: Done  63 out of  63 | elapsed: 57.4min finished
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  overwrite_a=True).T
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarray([np.sum(np.log(s)) for s in self.scalings_])
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  X2 = np.dot(Xm, R * (S ** (-0.5)))
  u = np.asarr

In [14]:
try:
    results_s = Parallel(n_jobs=1, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
        for mname, modelorg, speed in models_s
    )
except Exception as err:
    print(err)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed: 65.6min finished


In [15]:
results_o = []
for mname, modelorg, speed in models_o:
    try:
        tmp = Parallel(n_jobs=10, verbose=1)(
            delayed(train_model_wrap)(mname, modelorg, speed, sname, data) for sname, data in x_scalers_split.items()
        )
        results_o.extend(tmp)
    except Exception as err:
        print(err)

name 'x_scalers_split' is not defined
name 'x_scalers_split' is not defined
name 'x_scalers_split' is not defined
name 'x_scalers_split' is not defined
name 'x_scalers_split' is not defined
name 'x_scalers_split' is not defined


In [16]:
results = results_a1 + results_a2 + results_b + results_s + results_o
resultsdf = pd.DataFrame(results)
pd.options.display.max_rows = 999
resultsdf.columns = ["Model", "Scaler", "BAC", "Speed", "Time", "Status"]
resultsdf.sort_values(by=["BAC", "Time"], ascending=[False, True], inplace=True)
resultsdf.style.hide_index().format({"BAC": "{:.2%}", "Time": "{:.2f}"}).bar(subset=["BAC"], color="lightgreen").bar(
    subset=["Time"], color="lightblue"
)

Model,Scaler,BAC,Speed,Time,Status
BaggingClassifier,power transformation (Yeo-Johnson),70.52%,medi,975.8,ok
BaggingClassifier,max-abs scaling,69.90%,medi,1479.21,ok
BaggingClassifier,quantile transformation (uniform pdf),69.64%,medi,1431.04,ok
BaggingClassifier,min-max scaling,69.63%,medi,1490.15,ok
BaggingClassifier,Unscaled data,69.61%,medi,1501.65,ok
BaggingClassifier,standard scaling,69.59%,medi,1566.72,ok
BaggingClassifier,quantile transformation (gaussian pdf),69.53%,medi,1517.59,ok
BaggingClassifier,robust scaling,69.53%,medi,1520.23,ok
LinearDiscriminantAnalysis,power transformation (Yeo-Johnson),67.53%,fast,793.28,ok
LinearDiscriminantAnalysis,quantile transformation (uniform pdf),67.39%,fast,772.57,ok
