# Cluster classification with Scikit-learn classifiers

In [1]:
import numpy as np
import pandas as pd
import time
import multiprocessing
import functools
from joblib import Parallel, delayed
import sys

import sklearn
from sklearn import *
from sklearn.experimental import enable_hist_gradient_boosting

sys.path.append("..")
from helpers import filename_for

Welcome to JupyROOT 6.16/00


Load ALL the classification models from scikit-learn.
Note that some models are very slow to train with large datasets or crash outright, so we give them a reduced number of (shuffled) rows to learn.
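Note that most models use `n_jobs=1`, because parallelism across the model/scaler combinations is introduced later; a few models that are run sequentially or separately keep their own `n_jobs` setting.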

In [2]:
# Every entry below is a (name, estimator, speed) tuple.
#   name      -- label shown in the results table; train_model also matches on it
#                to skip known-bad model/scaler combinations.
#   estimator -- unfitted scikit-learn classifier; it is cloned before every fit.
#   speed     -- 'fast', 'medi' or 'slow'; train_model uses it to decide whether the
#                model is fitted on all training rows, the first MEDIHEAD rows or
#                the first SLOWHEAD rows.
# The lists are kept separate because each one is submitted to joblib in its own cell.
models_a1 = [
    ('BaggingClassifier', sklearn.ensemble.BaggingClassifier(n_jobs=1), 'medi'),
    ('BernoulliNB', sklearn.naive_bayes.BernoulliNB(), 'fast'),
    ('CalibratedClassifierCV', sklearn.calibration.CalibratedClassifierCV(cv=5), 'medi'),
    ('ComplementNB', sklearn.naive_bayes.ComplementNB(), 'fast'),
    ('GaussianNB', sklearn.naive_bayes.GaussianNB(), 'fast'),
]

# max_iter is raised far above the defaults for the iterative linear solvers below.
models_a2 = [
    ('LinearDiscriminantAnalysis', sklearn.discriminant_analysis.LinearDiscriminantAnalysis(), 'fast'),
    ('LinearSVC', sklearn.svm.LinearSVC(max_iter=200000), 'medi'),  # slow with unscaled data
    ('LogisticRegression', sklearn.linear_model.LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=200000), 'medi'),
    ('LogisticRegressionCV', sklearn.linear_model.LogisticRegressionCV(cv=5, solver='lbfgs', multi_class='auto', max_iter=200000), 'medi'),
    ('MLPClassifier', sklearn.neural_network.MLPClassifier(), 'slow'),
    ('MultinomialNB', sklearn.naive_bayes.MultinomialNB(), 'fast'),
]

# All of these are tagged 'fast', i.e. they are trained on the full training set.
models_b = [
    ('NearestCentroid', sklearn.neighbors.NearestCentroid(), 'fast'),
    ('PassiveAggressiveClassifier', sklearn.linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=1e-3, n_jobs=1), 'fast'),
    ('Perceptron', sklearn.linear_model.Perceptron(n_jobs=1), 'fast'),
    ('QuadraticDiscriminantAnalysis', sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis(), 'fast'),
    ('RidgeClassifier', sklearn.linear_model.RidgeClassifier(), 'fast'),
    ('RidgeClassifierCV', sklearn.linear_model.RidgeClassifierCV(), 'fast'),
    ('SGDClassifier', sklearn.linear_model.SGDClassifier(max_iter=5000, tol=1e-3, n_jobs=1), 'fast'),

]

# Run these sequentially due to "buffer source array is read-only" with LokyBackend.
# The two ensembles use n_jobs=-1 themselves, since the outer joblib loop for this
# list runs with a single worker.
models_s = [
    ('AdaBoostClassifier', sklearn.ensemble.AdaBoostClassifier(), 'medi'),
    ('DecisionTreeClassifier', sklearn.tree.DecisionTreeClassifier(), 'fast'),  # medi
    ('ExtraTreeClassifier', sklearn.tree.ExtraTreeClassifier(), 'fast'),  # medi
    ('ExtraTreesClassifier', sklearn.ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=-1), 'fast'),  # medi
    ('RandomForestClassifier', sklearn.ensemble.RandomForestClassifier(n_estimators=100, n_jobs=-1), 'fast'),  # medi
]

# Models that fail a lot. Commented-out entries are kept together with the reason
# they were disabled.
# NOTE(review): RadiusNeighborsClassifier and KNeighborsClassifier request their own
# worker processes/threads although this list is also dispatched with joblib
# n_jobs=-1 -- confirm this oversubscription is intended.
models_o = [
    # ('NuSVC', sklearn.svm.NuSVC(), 'fast'),  # nu infeasible
    ('RadiusNeighborsClassifier', sklearn.neighbors.RadiusNeighborsClassifier(radius=5, n_jobs=-1, outlier_label=0.), 'medi'),
    # ('GaussianProcessClassifier', sklearn.gaussian_process.GaussianProcessClassifier(), 'slow'),
    # ('GradientBoostingClassifier', sklearn.ensemble.GradientBoostingClassifier(), 'slow'),  # crashes
    # ('HistGradientBoostingClassifier', sklearn.ensemble.HistGradientBoostingClassifier(), 'slow'),  # crashes?
    ('KNeighborsClassifier', sklearn.neighbors.KNeighborsClassifier(n_jobs=2), 'medi'),  # medi
    # ('LabelPropagation', sklearn.semi_supervised.LabelPropagation(), 'slow'),  # requires too much memory to train
    # ('LabelSpreading', sklearn.semi_supervised.LabelSpreading(), 'slow'),  # requires too much memory to train
    ('SVC', sklearn.svm.SVC(gamma='scale'), 'slow'), # slow, not timeouted
]

Some models only work with properly scaled data, so we prepare most of the available scalers (the power transformations are currently disabled).

In [3]:
# (display name, unfitted transformer) pairs; each transformer is later fitted on the
# training features and applied to the test features.
prep = sklearn.preprocessing
scalers = [
    # ('Unscaled data', ),
    ("standard scaling", prep.StandardScaler()),
    ("min-max scaling", prep.MinMaxScaler()),
    ("max-abs scaling", prep.MaxAbsScaler()),
    ("robust scaling", prep.RobustScaler(quantile_range=(25, 75))),
    # ('power transformation (Yeo-Johnson)', prep.PowerTransformer(method='yeo-johnson')),
    # ('power transformation (Box-Cox)', prep.PowerTransformer(method='box-cox')), # 'strictly zero' meh.
    ("quantile transformation (gaussian pdf)", prep.QuantileTransformer(output_distribution="normal")),
    ("quantile transformation (uniform pdf)", prep.QuantileTransformer(output_distribution="uniform")),
    ("sample-wise L2 normalizing", prep.Normalizer()),
]

Train and test data are passed through each scaler; the unscaled data is kept as an additional "Unscaled data" entry.

In [4]:
# One pickle per (n, s) combination: n in 1..4, s in 0..19.
files = [
    filename_for(15, 30, 600, 500, n, "inclxx", s, "clusterfeature.pkl")
    for n in (1, 2, 3, 4)
    for s in range(20)
]
dfs = list(map(pd.read_pickle, files))

# Stack all tables with a fresh index, then shuffle the rows with a fixed seed.
data = pd.concat(dfs, ignore_index=True).sample(frac=1, random_state=1337)

print(data["prim"].value_counts())
display(data)

0.0    8670780
1.0    1782525
Name: prim, dtype: int64


Unnamed: 0,i_event,prim,T,E,Size,EToF,EnergyMoment,TSpawn,MaxEHit,X,Y,Z
3499177,1731.0,1.0,63.755749,149.914871,6.0,607.207764,6.798460e+00,1.218181,64.293854,-0.682722,-22.500000,1522.5
8869071,3484.0,0.0,74.362877,15.335159,4.0,580.952515,3.743494e+00,0.394498,9.338829,37.500000,-1.970521,1757.5
2585062,5713.0,0.0,68.605553,37.585819,1.0,600.481689,2.278178e-13,0.000000,37.585819,-76.964607,2.500000,1632.5
2894202,5004.0,0.0,74.553833,34.140911,2.0,625.573181,2.255625e+00,0.020336,23.147533,-6.335686,17.500000,1792.5
3878350,5814.0,0.0,86.484261,7.983827,1.0,264.821777,0.000000e+00,0.000000,7.983827,98.367348,67.500000,1622.5
...,...,...,...,...,...,...,...,...,...,...,...,...
480729,1884.0,0.0,79.410904,2.277636,1.0,296.172211,0.000000e+00,0.000000,2.277636,-72.500000,-76.763123,1547.5
3361959,3033.0,1.0,63.699387,613.164795,38.0,609.370972,3.672803e+01,5.340049,52.407722,3.392983,12.500000,1522.5
7087336,7733.0,0.0,70.274147,246.902069,11.0,568.768555,1.244804e+01,2.275502,52.906334,6.802186,37.500000,1652.5
8315069,6834.0,0.0,78.157318,2.550865,1.0,425.412720,0.000000e+00,0.000000,2.550865,41.877541,82.500000,1702.5


In [5]:
# Balance the classes: keep every prim == 1 row and draw an equally sized,
# seeded random subset of the prim == 0 rows.
prim1 = data.loc[data["prim"] == 1]
prim0 = data.loc[data["prim"] == 0].sample(n=prim1.shape[0], random_state=1337)

# Recombine with a fresh index and shuffle so the two classes are interleaved.
balanced_data = pd.concat([prim0, prim1], ignore_index=True).sample(frac=1, random_state=1337)

print(balanced_data["prim"].value_counts())

1.0    1782525
0.0    1782525
Name: prim, dtype: int64


(3565050, 12)

In [6]:
# Random ~80/20 train/test split. The mask is drawn from a dedicated, seeded
# generator (same seed as the sampling steps above) so that a kernel restart
# reproduces exactly the same split and therefore the same scores.
split_rng = np.random.RandomState(1337)
msk = split_rng.rand(len(balanced_data)) < 0.8
traindata = balanced_data[msk]
testdata = balanced_data[~msk]

print(traindata.shape, testdata.shape)

(2851914, 12) (713136, 12)


In [7]:
# Feature columns fed to the models (everything except the event index and the label).
feature_columns = ["T", "E", "Size", "EToF", "EnergyMoment", "TSpawn", "MaxEHit", "X", "Y", "Z"]
x_train_raw = traindata[feature_columns]
x_test_raw = testdata[feature_columns]

# List of (scaler name, train features, test features). The first entry carries the
# untouched DataFrames; every scaler is fitted on the training features only and
# then applied to the test features.
trainscaled = [("Unscaled data", x_train_raw, x_test_raw)]
trainscaled.extend(
    (sname, scaler.fit_transform(x_train_raw), scaler.transform(x_test_raw))
    for sname, scaler in scalers
)

Run all model/scaler combinations in parallel. Note that we use a timeout per task, as setting a timeout in joblib itself would throw away everything when it triggers.

In [8]:
def with_timeout(timeout):
    """Build a decorator that stops waiting for a call after ``timeout`` seconds.

    The decorated function is executed in a single-thread pool. If it finishes
    in time, its return value is passed through (an exception it raises is
    re-raised in the caller). If it does not finish in time, the wrapper
    returns ``None``.

    Note that the timed-out call is not interrupted: it keeps running in its
    worker thread until it finishes on its own; only the caller stops waiting.

    Parameters
    ----------
    timeout : float or None
        Maximum number of seconds to wait for the result; ``None`` waits forever.

    Returns
    -------
    callable
        A decorator preserving the wrapped function's metadata.
    """
    # Imported explicitly: `import multiprocessing` alone does not guarantee that
    # the `multiprocessing.pool` submodule has been loaded.
    from multiprocessing.pool import ThreadPool

    def decorator(decorated):
        @functools.wraps(decorated)
        def inner(*args, **kwargs):
            pool = ThreadPool(1)
            try:
                async_result = pool.apply_async(decorated, args, kwargs)
                return async_result.get(timeout)
            except multiprocessing.TimeoutError:
                return None
            finally:
                # close() does not block; it lets the worker thread exit as soon
                # as the (possibly still running) task is done, instead of
                # leaking one idle pool per call.
                pool.close()

        return inner

    return decorator

In [9]:
# Row caps for the reduced training sets used by 'medi' and 'slow' models.
MEDIHEAD = 500000  # 0
SLOWHEAD = 50000  # 0
label = "prim"

# 1-D label vectors; the reduced variants are the leading rows of the full one,
# matching the leading-row slices taken from the features at fit time.
y_train_fast = traindata[label].values
y_train_medi = y_train_fast[:MEDIHEAD]
y_train_slow = y_train_fast[:SLOWHEAD]
y_test = testdata[label].values


@with_timeout(1200)
def train_model(mname, modelorg, speed, sname, x_train, x_test):
    """Fit a fresh clone of one model on one scaled data set and score it.

    Parameters
    ----------
    mname : str
        Model name; reported back and used to skip known-bad combinations.
    modelorg : estimator
        Unfitted scikit-learn estimator; it is cloned, never fitted directly.
    speed : str
        'fast' trains on all rows, 'medi' on the first MEDIHEAD rows and
        'slow' on the first SLOWHEAD rows.
    sname : str
        Scaler name; reported back and used to skip known-bad combinations.
    x_train, x_test
        Train and test features, already scaled.

    Returns
    -------
    tuple or None
        ``(mname, sname, balanced accuracy, speed, fit seconds, status)``.
        Status is "ok", "Skipped", or the exception that was raised; score and
        time are NaN unless the status is "ok". ``None`` is returned by the
        timeout decorator when the call exceeds 1200 seconds.
    """
    # These get killed without error?
    if mname == "RadiusNeighborsClassifier" and sname != "Unscaled data":
        return (mname, sname, np.nan, speed, np.nan, "Skipped")
    # Too slow
    if mname == "LinearSVC" and sname in ("Unscaled data", "robust scaling"):
        return (mname, sname, np.nan, speed, np.nan, "Skipped")
    try:
        model = sklearn.base.clone(modelorg)

        # Pick the training subset that matches the model's speed class.
        if speed == "slow":
            x_fit, y_fit = x_train[0:SLOWHEAD], y_train_slow
        elif speed == "medi":
            x_fit, y_fit = x_train[0:MEDIHEAD], y_train_medi
        elif speed == "fast":
            x_fit, y_fit = x_train, y_train_fast
        else:
            # Without this the unfitted model would only fail later in predict()
            # with a message that hides the real cause.
            raise ValueError("Unknown speed class: {!r}".format(speed))

        # Only the fit itself is timed.
        start = time.time()
        model.fit(x_fit, y_fit)
        end = time.time()

        y_pred = model.predict(x_test)
        y_true = y_test

        bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
        return (mname, sname, bac, speed, (end - start), "ok")
    except Exception as err:
        # Best effort: report the failure as this combination's status so that
        # one broken model does not abort the whole batch.
        return (mname, sname, np.nan, speed, np.nan, err)


def train_model_wrap(mname, modelorg, speed, sname, x_train, x_test):
    """Run ``train_model`` and turn its timeout signal into a regular result row.

    ``train_model`` returns ``None`` when its timeout expires; in that case a
    row with NaN score/time and status "Timeout" is returned instead, so every
    task yields a tuple of the same layout:
    ``(mname, sname, balanced accuracy, speed, fit seconds, status)``.
    """
    ret = train_model(mname, modelorg, speed, sname, x_train, x_test)
    # Test for None explicitly: that is the timeout marker.
    if ret is None:
        return (mname, sname, np.nan, speed, np.nan, "Timeout")
    return ret

In [10]:
# Train every (scaler, model) combination of models_a1, one joblib task each,
# using all available cores.
try:
    results_a1 = Parallel(n_jobs=-1, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
        for mname, modelorg, speed in models_a1
    )
except Exception as err:
    # Keep the notebook running, but bind the name to an empty list so that the
    # summary cell neither hits a NameError nor silently reuses stale results
    # from an earlier run.
    results_a1 = []
    print(err)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  40 | elapsed:    5.4s remaining:   14.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  7.3min finished


In [None]:
# Train every (scaler, model) combination of models_a2, one joblib task each.
# joblib interprets n_jobs below -1 as (n_cpus + 1 + n_jobs), so -20 leaves
# 19 cores unused.
try:
    results_a2 = Parallel(n_jobs=-20, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
        for mname, modelorg, speed in models_a2
    )
except Exception as err:
    # Keep the notebook running, but bind the name to an empty list so that the
    # summary cell neither hits a NameError nor silently reuses stale results
    # from an earlier run.
    results_a2 = []
    print(err)

In [12]:
# Train every (scaler, model) combination of models_b, one joblib task each,
# using all available cores.
try:
    results_b = Parallel(n_jobs=-1, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
        for mname, modelorg, speed in models_b
    )
except Exception as err:
    # Keep the notebook running, but bind the name to an empty list so that the
    # summary cell neither hits a NameError nor silently reuses stale results
    # from an earlier run.
    results_b = []
    print(err)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  56 | elapsed:    3.2s remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  56 out of  56 | elapsed:   12.4s finished


In [13]:
# Train every (scaler, model) combination of models_s. n_jobs=1 makes joblib run
# the tasks one after another in this process; these models are kept out of the
# process-based backend.
try:
    results_s = Parallel(n_jobs=1, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
        for mname, modelorg, speed in models_s
    )
except Exception as err:
    # Keep the notebook running, but bind the name to an empty list so that the
    # summary cell neither hits a NameError nor silently reuses stale results
    # from an earlier run.
    results_s = []
    print(err)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 26.5min finished


In [14]:
# Train every (scaler, model) combination of models_o, one joblib task each,
# using all available cores.
try:
    results_o = Parallel(n_jobs=-1, verbose=1)(
        delayed(train_model_wrap)(mname, modelorg, speed, sname, x_train, x_test)
        for sname, x_train, x_test in trainscaled
        for mname, modelorg, speed in models_o
    )
except Exception as err:
    # Keep the notebook running, but bind the name to an empty list so that the
    # summary cell neither hits a NameError nor silently reuses stale results
    # from an earlier run.
    results_o = []
    print(err)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 56 concurrent workers.
[Parallel(n_jobs=-1)]: Done  13 out of  24 | elapsed:  2.1min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  8.5min finished


In [15]:
def nullhypo():
    """Score the trivial baseline that predicts class 0 for every test row.

    Returns a result tuple laid out like the rows produced by the model runs:
    (model name, scaler name, balanced accuracy, speed, time, status).
    """
    all_zero = np.zeros(y_test.shape)
    score = sklearn.metrics.balanced_accuracy_score(y_test, all_zero)
    return ("Null Hypothesis", "Unscaled data", score, "fast", 0, "ok")

In [16]:
# Collect every result row (plus the all-zero baseline) into one table.
results = results_a1 + results_a2 + results_b + results_s + results_o + [nullhypo()]
pd.options.display.max_rows = 999

# Best balanced accuracy first; ties are broken by shorter fit time, then by name.
# The frame is built and sorted without in-place mutation so the cell can be
# re-run safely.
resultsdf = pd.DataFrame(results, columns=["Model", "Scaler", "BAC", "Speed", "Time", "Status"])
resultsdf = resultsdf.sort_values(by=["BAC", "Time", "Model"], ascending=[False, True, True])
# Alternative ordering: resultsdf.sort_values(by=["Model", "Scaler"], ascending=[True, True])

# Styler.hide_index() was removed in pandas 2.0 in favour of Styler.hide(axis="index");
# use whichever the installed pandas provides.
styler = resultsdf.style
styler = styler.hide(axis="index") if hasattr(styler, "hide") else styler.hide_index()
styler.format({"BAC": "{:.2%}", "Time": "{:.2f}"}).bar(subset=["BAC"], color="lightgreen").bar(
    subset=["Time"], color="lightblue"
)

Model,Scaler,BAC,Speed,Time,Status
RandomForestClassifier,quantile transformation (uniform pdf),91.24%,fast,67.49,ok
RandomForestClassifier,Unscaled data,91.23%,fast,64.51,ok
RandomForestClassifier,quantile transformation (gaussian pdf),91.22%,fast,70.39,ok
RandomForestClassifier,robust scaling,91.22%,fast,69.7,ok
RandomForestClassifier,sample-wise L2 normalizing,91.20%,fast,85.15,ok
MLPClassifier,robust scaling,91.12%,slow,22.76,ok
ExtraTreesClassifier,sample-wise L2 normalizing,91.12%,fast,28.36,ok
ExtraTreesClassifier,quantile transformation (gaussian pdf),91.11%,fast,23.93,ok
ExtraTreesClassifier,quantile transformation (uniform pdf),91.10%,fast,23.4,ok
ExtraTreesClassifier,Unscaled data,91.07%,fast,25.77,ok
