# Auto-sklearn classifier on cluster features for `prim` classification

In [1]:
# Columns of the cluster-feature table that are fed to the classifier.
features = [
    "T",
    "E",
    "Size",
    "EToF",
    "EnergyMoment",
    "TSpawn",
    "MaxEHit",
    "X",
    "Y",
    "Z",
]
# Target column the classifier is trained to predict.
label = "prim"
# Passed as the second argument to `filename_for` when locating input files.
dp = 12
# One set of input files is loaded for each value in this list.
neutrons = [1, 2, 3, 4]

In [2]:
import sys
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
import sklearn.preprocessing
from sklearn.metrics import (
    balanced_accuracy_score,
    confusion_matrix,
    plot_confusion_matrix,
)

sys.path.append("..")
from helpers import filename_for

Welcome to JupyROOT 6.16/00


In [3]:
# Build the list of input files: 20 files (s = 0..19) for every entry in `neutrons`.
files = [
    filename_for(15, dp, 600, 500, n, "inclxx", s, "clusterfeature.parquet")
    for n in neutrons
    for s in range(20)
]

# Read every file, stack them into one frame and shuffle the rows
# (fixed random_state so the shuffle is reproducible).
dfs = [pd.read_parquet(file) for file in files]
data = pd.concat(dfs, ignore_index=True).sample(frac=1, random_state=1337)

# Keep only rows with |X| < 125, |Y| < 125 and T < 100.
selection = (data["X"].abs() < 125) & (data["Y"].abs() < 125) & (data["T"] < 100)
data = data[selection]

# Class counts of the target column, followed by a preview of the table.
print(data["prim"].value_counts())
display(data)

0.0    4225482
1.0    1412871
Name: prim, dtype: int64


Unnamed: 0,i_event,prim,T,E,Size,EToF,EnergyMoment,TSpawn,MaxEHit,X,Y,Z
1278499,201.0,0.0,72.743546,4.001890,1.0,356.984039,0.000000e+00,0.000000,4.001890,-7.500000,-9.346230,1507.5
123683,2819.0,0.0,70.245926,4.904631,2.0,445.918732,2.805709e+00,0.121427,2.531206,-6.497627,7.500000,1552.5
5226144,5531.0,1.0,62.695213,320.804871,10.0,625.841431,1.043842e+01,1.434420,94.084465,-17.500000,-4.665872,1507.5
2404152,6755.0,1.0,64.116348,337.163696,13.0,576.190308,1.368203e+01,2.412778,84.405426,-26.984325,2.500000,1512.5
5462128,6398.0,1.0,67.126511,264.252136,9.0,590.024841,8.335863e+00,0.875057,45.102974,12.224815,-12.500000,1592.5
...,...,...,...,...,...,...,...,...,...,...,...,...
1428135,5803.0,0.0,82.358917,1.178269,1.0,295.601349,2.273737e-13,0.000000,1.178269,-22.500000,1.662647,1607.5
480729,6068.0,0.0,78.656456,1.329785,1.0,283.639130,2.009718e-14,0.000000,1.329785,-117.500000,-85.086517,1507.5
3361959,7596.0,0.0,67.661819,41.934479,3.0,556.118164,3.580651e+00,0.262080,19.005957,-24.975262,-27.500000,1582.5
2065244,7478.0,0.0,68.507622,3.715144,1.0,515.199280,5.024296e-15,0.000000,3.715144,-22.107210,17.500000,1572.5


In [4]:
# Balance the two classes by undersampling: keep every prim == 1 row and draw
# an equally sized random subset of the prim == 0 rows.
prim1 = data[data["prim"] == 1]
prim0 = data[data["prim"] == 0].sample(n=len(prim1), random_state=1337)

# Merge both classes and shuffle so they are interleaved.
balanced_data = pd.concat([prim0, prim1], ignore_index=True).sample(
    frac=1, random_state=1337
)

print(balanced_data["prim"].value_counts())

1.0    1412871
0.0    1412871
Name: prim, dtype: int64


In [5]:
# Random ~80/20 train/test split of the balanced sample. Every row is assigned
# independently, so the two parts are only approximately 80% / 20% in size.
#
# The generator is seeded (same seed as the sampling steps elsewhere in this
# notebook) so that the split -- and therefore every score computed on
# `testdata` -- is reproducible across kernel restarts.
split_rng = np.random.RandomState(1337)
msk = split_rng.rand(len(balanced_data)) < 0.8
traindata = balanced_data[msk]
testdata = balanced_data[~msk]

print(traindata.shape, testdata.shape)

(2260294, 12) (565448, 12)


In [6]:
# Turn the train / test frames into plain NumPy arrays:
# a 2-D feature matrix (one column per entry in `features`) and a 1-D label vector.
x_train, y_train = traindata[features].values, traindata[label].values
x_val, y_val = testdata[features].values, testdata[label].values

In [None]:
from autosklearn.classification import AutoSklearnClassifier

# Automated model search with auto-sklearn on the training arrays.
# NOTE(review): per the auto-sklearn docs `time_left_for_this_task` is in
# seconds and `memory_limit` in MB -- confirm against the installed version.
cls = AutoSklearnClassifier(
    n_jobs=5,
    memory_limit=50000,
    time_left_for_this_task=36000,
)
cls.fit(x_train, y_train)

  self.re = re.compile(self.reString)


In [11]:
# Text summary of the auto-sklearn search (metric, best score, run counts).
search_summary = cls.sprint_statistics()
print(search_summary)

auto-sklearn results:
  Dataset name: a20e9d3e-865e-11eb-94a6-1866da859056
  Metric: accuracy
  Best validation score: 0.903767
  Number of target algorithm runs: 26
  Number of successful target algorithm runs: 16
  Number of crashed target algorithm runs: 1
  Number of target algorithms that exceeded the time limit: 8
  Number of target algorithms that exceeded the memory limit: 1



In [12]:
# Evaluate the fitted classifier on `testdata`, the part of the balanced
# sample that was not passed to `fit`.
x_test = testdata[features].values
y_true = testdata[label].values

# Predict the labels and score them with the balanced accuracy.
y_pred = cls.predict(x_test)
bac = balanced_accuracy_score(y_true, y_pred)

print(bac)

0.904949020689428


In [13]:
import os

# Persist the fitted classifier to disk with joblib.
# The target directory is created first: without it `joblib.dump` raises and
# the result of the long-running fit above would not be saved.
# NOTE: joblib files are pickles -- only load them from trusted sources.
model_path = "models/autosklearn12.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(cls, model_path)

['models/autosklearn12.pkl']