# Apply Cluster classification with Scikit-learn classifiers

In [1]:
import numpy as np
import pandas as pd
import time

import sklearn
from sklearn import *

Load ALL the classification models from scikit-learn.
Note that some models are very slow to train on large datasets, or crash outright, so they are given a reduced number of (shuffled) rows to learn from.
Note that `n_jobs=1` is used, as parallelism is introduced later.

In [2]:
num_dp = 30
# Fixed seed so the row shuffle and the train/test split (and therefore every
# score printed further down) can be reproduced after a kernel restart.
SEED = 42

# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
# The rows are shuffled once here so that taking the first N rows later yields a random subset.
data = pd.read_pickle(f"data/600AMeV_{num_dp}dp.clusters.pkl").sample(
    frac=1, random_state=SEED
)
print(data.shape)

# Keep only the rows whose "E" column exceeds 5.
data = data[data["E"] > 5]
print(data.shape)

# Random ~80/20 split: each row independently lands in the training set with probability 0.8.
msk = np.random.RandomState(SEED).rand(len(data)) < 0.8
traindata = data[msk]
testdata = data[~msk]

(97266008, 12)
(47689578, 12)


In [3]:
# Input columns handed to every classifier, in the order the models receive them.
features = ["T", "E", "Size", "EToF", "EnergyMoment", "TSpawn", "MaxEHit", "X", "Y", "Z"]
# Target column; kept as a one-element list, so selecting it yields a frame that
# the training functions flatten with `.values.ravel()`.
label = ["prim"]

In [6]:
def train_mpl(train_size=1000000, random_state=None):
    """Fit a multi-layer perceptron on a subset of the training rows and score it.

    Reads the notebook-level ``traindata``, ``testdata``, ``features`` and
    ``label``. Prints the fit duration in seconds, then the balanced accuracy
    on the complete test set.

    Parameters
    ----------
    train_size : int or None, default 1000000
        Number of leading rows of ``traindata`` used for fitting. ``None``
        uses every row. The default keeps the fit time manageable.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``MLPClassifier``. Pass an int to make the fit
        reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    sklearn.neural_network.MLPClassifier
        The fitted classifier.
    """
    model = sklearn.neural_network.MLPClassifier(random_state=random_state)

    # Explicit positional slice; `traindata` is shuffled when it is built, so
    # the leading rows are a random subset.
    train_subset = traindata.iloc[:train_size]
    # NOTE(review): the features are fed in unscaled; scikit-learn documents
    # MLPs as sensitive to feature scaling -- consider standardising first.
    x_train = train_subset[features]
    y_train = train_subset[label].values.ravel()

    x_test = testdata[features]
    y_test = testdata[label].values.ravel()

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments during the (long) fit.
    start = time.perf_counter()
    model.fit(x_train, y_train)
    end = time.perf_counter()
    print(end - start)

    y_pred = model.predict(x_test)

    bac = sklearn.metrics.balanced_accuracy_score(y_test, y_pred)
    print(bac)
    return model


# Train the MLP with the default settings; prints the fit time in seconds and
# the balanced accuracy on the test set.
mpl = train_mpl()

2523.9386110305786
0.8388404770936562


In [7]:
from joblib import dump, load
# Persist the fitted MLP so it can be reloaded later without retraining.
# joblib files are pickle-based: only `load` them from a trusted source.
dump(mpl, "data/mpl.model")

['data/mpl.model']

In [4]:
def train_perceptron():
    """Fit a Perceptron on the full, row-normalised training set and score it.

    Reads the notebook-level ``traindata``, ``testdata``, ``features`` and
    ``label``. Prints the fit duration in seconds followed by the balanced
    accuracy on the test set.

    Returns
    -------
    tuple
        ``(model, scaler)``: the fitted ``Perceptron`` and the ``Normalizer``
        that must be applied to any data passed to it.
    """
    # NOTE(review): per the scikit-learn docs, Normalizer rescales each *row*
    # to unit norm and is stateless; if per-feature scaling was intended, a
    # different transformer is needed -- confirm.
    normalizer = sklearn.preprocessing.Normalizer()
    classifier = sklearn.linear_model.Perceptron(n_jobs=-1)

    train_inputs = normalizer.fit_transform(traindata[features])
    train_targets = traindata[label].values.ravel()
    test_inputs = normalizer.transform(testdata[features])
    test_targets = testdata[label].values.ravel()

    fit_began = time.time()
    classifier.fit(train_inputs, train_targets)
    print(time.time() - fit_began)

    print(
        sklearn.metrics.balanced_accuracy_score(
            test_targets, classifier.predict(test_inputs)
        )
    )
    return classifier, normalizer


# Train the Perceptron; keep the returned transformer alongside the model so
# new data can be preprocessed the same way before prediction.
perc, perc_scaler = train_perceptron()

47.96573615074158
0.8158963200802571


In [5]:
def train_nearcentroid():
    """Fit a nearest-centroid classifier on the row-normalised training set and score it.

    Reads the notebook-level ``traindata``, ``testdata``, ``features`` and
    ``label``. Prints the fit duration in seconds followed by the balanced
    accuracy on the test set.

    Returns
    -------
    tuple
        ``(model, scaler)``: the fitted ``NearestCentroid`` and the
        ``Normalizer`` that must be applied to any data passed to it.
    """
    # NOTE(review): per the scikit-learn docs, Normalizer rescales each *row*
    # to unit norm rather than scaling features; with a distance-based model
    # this may explain the low recorded score -- confirm the intended scaler.
    normalizer = sklearn.preprocessing.Normalizer()
    classifier = sklearn.neighbors.NearestCentroid()

    train_inputs = normalizer.fit_transform(traindata[features])
    train_targets = traindata[label].values.ravel()
    test_inputs = normalizer.transform(testdata[features])
    test_targets = testdata[label].values.ravel()

    fit_began = time.time()
    classifier.fit(train_inputs, train_targets)
    print(time.time() - fit_began)

    print(
        sklearn.metrics.balanced_accuracy_score(
            test_targets, classifier.predict(test_inputs)
        )
    )
    return classifier, normalizer


# Train the nearest-centroid classifier; keep the returned transformer so new
# data can be preprocessed the same way before prediction.
cent, cent_scaler = train_nearcentroid()

6.493687152862549
0.509466228206819
