# Try to improve Cluster classification

In [1]:
import numpy as np
import pandas as pd
import time
import multiprocessing
import functools
from joblib import Parallel, delayed
import sys

import sklearn
from sklearn import *
from sklearn.experimental import enable_hist_gradient_boosting
import skopt

sys.path.append("..")
from helpers import filename_for

Welcome to JupyROOT 6.16/00


## Load, Rebalance (Downsample), Split

In [2]:
files = [filename_for(15, 30, 600, 500, n, "inclxx", s, "clusterfeature.pkl") for n in [1, 2, 3, 4] for s in range(20)]
dfs = [pd.read_pickle(file) for file in files]
data = pd.concat(dfs, ignore_index=True).sample(frac=1, random_state=1337)

print(data["prim"].value_counts())
display(data)

0.0    8670780
1.0    1782525
Name: prim, dtype: int64


Unnamed: 0,i_event,prim,T,E,Size,EToF,EnergyMoment,TSpawn,MaxEHit,X,Y,Z
3499177,1731.0,1.0,63.755749,149.914871,6.0,607.207764,6.798460e+00,1.218181,64.293854,-0.682722,-22.500000,1522.5
8869071,3484.0,0.0,74.362877,15.335159,4.0,580.952515,3.743494e+00,0.394498,9.338829,37.500000,-1.970521,1757.5
2585062,5713.0,0.0,68.605553,37.585819,1.0,600.481689,2.278178e-13,0.000000,37.585819,-76.964607,2.500000,1632.5
2894202,5004.0,0.0,74.553833,34.140911,2.0,625.573181,2.255625e+00,0.020336,23.147533,-6.335686,17.500000,1792.5
3878350,5814.0,0.0,86.484261,7.983827,1.0,264.821777,0.000000e+00,0.000000,7.983827,98.367348,67.500000,1622.5
...,...,...,...,...,...,...,...,...,...,...,...,...
480729,1884.0,0.0,79.410904,2.277636,1.0,296.172211,0.000000e+00,0.000000,2.277636,-72.500000,-76.763123,1547.5
3361959,3033.0,1.0,63.699387,613.164795,38.0,609.370972,3.672803e+01,5.340049,52.407722,3.392983,12.500000,1522.5
7087336,7733.0,0.0,70.274147,246.902069,11.0,568.768555,1.244804e+01,2.275502,52.906334,6.802186,37.500000,1652.5
8315069,6834.0,0.0,78.157318,2.550865,1.0,425.412720,0.000000e+00,0.000000,2.550865,41.877541,82.500000,1702.5


In [3]:
prim1 = data[data["prim"] == 1]
prim0 = data[data["prim"] == 0].sample(n=len(prim1.index), random_state=1337)
balanced_data = pd.concat([prim0, prim1], ignore_index=True).sample(frac=1, random_state=1337)

print(balanced_data["prim"].value_counts())

1.0    1782525
0.0    1782525
Name: prim, dtype: int64


In [4]:
msk = np.random.rand(len(balanced_data)) < 0.8
traindata = balanced_data[msk]
testdata = balanced_data[~msk]

print(traindata.shape, testdata.shape)

(2853700, 12) (711350, 12)


In [5]:
features = ["T", "E", "Size", "EToF", "EnergyMoment", "TSpawn", "MaxEHit", "X", "Y", "Z"]
label = ["prim"]

## RandomForestClassifier

In [8]:
def optimize_rfc():
    defaults = {
        "n_estimators": 100,
        "criterion": "gini",
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "min_weight_fraction_leaf": 0.0,
        "max_features": "auto",
        "max_leaf_nodes": None,
        "min_impurity_decrease": 0.0,
        "min_impurity_split": None,
        "bootstrap": True,
        "oob_score": False,
        "n_jobs": None,
        "random_state": None,
        "verbose": 0,
        "warm_start": False,
        "class_weight": None,
        "ccp_alpha": 0.0,
        "max_samples": None,
    }

    settings = defaults
    settings["n_jobs"] = -1

    model = sklearn.ensemble.RandomForestClassifier(**settings)

    opt = skopt.BayesSearchCV(
        model,
        {
            "n_estimators": skopt.space.Integer(10, 500),  # 0.905 <-> 0.912
            # "criterion": skopt.space.Categorical(["gini", "entropy"]),  # almost no difference?
            "min_samples_split": skopt.space.Integer(2, 5000),
            "min_samples_leaf": skopt.space.Integer(1, 5000),
            # "min_impurity_decrease": skopt.space.Real(0.0, 0.5),  # 0 is best
        },
        n_iter=50,
        cv=2,
        n_jobs=2,
    )

    start = time.time()
    opt.fit(traindata[features], traindata[label].values.ravel())
    end = time.time()

    y_pred = opt.predict(testdata[features])
    y_true = testdata[label].values.ravel()
    bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)

    y_pred = opt.predict(data[features])
    y_true = data[label].values.ravel()
    bacall = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
    
    return opt, ("RandomForestClassifier", end - start, bac, bacall)

In [9]:
rfc_opt, rfc_result = optimize_rfc()
print(rfc_result)
print(rfc_opt.best_params_)
display(pd.DataFrame(rfc_opt.cv_results_))



('RandomForestClassifier', 7985.7673370838165, 0.9132678444597035, 0.9194869097605568)
OrderedDict([('min_samples_leaf', 6), ('min_samples_split', 36), ('n_estimators', 61)])


Unnamed: 0,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params
0,0.907193,0.908061,0.907627,0.000434,38,34.573221,0.11894,1.430541,0.019465,1160,2100,94,"{'min_samples_leaf': 1160, 'min_samples_split'..."
1,0.904101,0.90462,0.90436,0.00026,43,114.782234,2.559237,4.17577,0.828215,3778,4114,386,"{'min_samples_leaf': 3778, 'min_samples_split'..."
2,0.910536,0.910916,0.910726,0.00019,24,52.098647,0.38721,1.778177,0.138666,65,1529,108,"{'min_samples_leaf': 65, 'min_samples_split': ..."
3,0.905147,0.905701,0.905424,0.000277,42,128.420776,0.917267,4.305318,0.237132,2780,3836,343,"{'min_samples_leaf': 2780, 'min_samples_split'..."
4,0.908713,0.909276,0.908995,0.000282,32,56.176945,0.271321,1.885463,0.12205,358,4234,120,"{'min_samples_leaf': 358, 'min_samples_split':..."
5,0.90363,0.904565,0.904097,0.000468,44,16.329762,0.007211,0.954827,0.010155,3668,1275,42,"{'min_samples_leaf': 3668, 'min_samples_split'..."
6,0.906889,0.907332,0.90711,0.000221,39,207.194877,0.324093,7.286741,0.076116,1450,932,491,"{'min_samples_leaf': 1450, 'min_samples_split'..."
7,0.902992,0.903845,0.903418,0.000426,48,128.626661,1.062452,4.310378,0.240939,4986,3602,364,"{'min_samples_leaf': 4986, 'min_samples_split'..."
8,0.905732,0.905856,0.905794,6.2e-05,41,59.032526,0.977486,1.780278,0.140141,2291,1880,146,"{'min_samples_leaf': 2291, 'min_samples_split'..."
9,0.905952,0.906553,0.906252,0.000301,40,130.922559,0.783307,4.215229,0.066467,2039,4348,322,"{'min_samples_leaf': 2039, 'min_samples_split'..."


## MPL

In [4]:
# skopt not doing the best job here? Try manual
def optimize_mpl(layers=(100,)):
    defaults = {
        "hidden_layer_sizes": (100,),
        "activation": "relu",
        "solver": "adam",
        "alpha": 0.0001,
        "batch_size": "auto",
        "learning_rate": "constant",
        "learning_rate_init": 0.001,
        "power_t": 0.5,
        "max_iter": 200,
        "shuffle": True,
        "random_state": None,
        "tol": 0.0001,
        "verbose": False,
        "warm_start": False,
        "momentum": 0.9,
        "nesterovs_momentum": True,
        "early_stopping": False,
        "validation_fraction": 0.1,
        "beta_1": 0.9,
        "beta_2": 0.999,
        "epsilon": 1e-08,
        "n_iter_no_change": 10,
        "max_fun": 15000,
    }

    settings = defaults
    settings["hidden_layer_sizes"] = layers
    settings["learning_rate"] = "adaptive"
    settings["early_stopping"] = True
    settings["validation_fraction"] = 0.2

    model = sklearn.neural_network.MLPClassifier(**settings)

    train_size = 500000
    x_train = traindata[:train_size][features]
    y_train = traindata[:train_size][label].values.ravel()

    x_test = testdata[features]
    y_test = testdata[label].values.ravel()

    start = time.time()
    model.fit(x_train, y_train)
    end = time.time()

    y_pred = model.predict(x_test)
    y_true = y_test

    bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
    return ("MPL-" + str(layers), bac, end - start)

In [None]:
for layer in [
    (50,),
    (100,),
    (200,),
    (500,),
    (50, 25),
    (50, 50),
    (100, 50),
    (200, 25),
    (500, 50),
]:
    result = optimize_mpl(layer)
    print(result)
    results.append(result)

In [None]:
# Note: PAC gives very different results for different random_state -> Disqualify?
#       this might explain difference between scalers
# Note: skopt produces some strange errors here ...
# Optimize:
#   fit_intercept=True/False -> No difference
#   loss="hinge"/"squared_hinge" -> No difference
def optimize_pac():
    x_train = traindata[features]
    y_train = traindata[label].values.ravel()

    x_test = testdata[features]
    y_test = testdata[label].values.ravel()

    model = sklearn.linear_model.PassiveAggressiveClassifier(
        max_iter=10000, tol=1e-6, n_jobs=-1, warm_start=False, class_weight="balanced", random_state=1337,
    )

    start = time.time()
    model.fit(x_train, y_train)
    end = time.time()

    y_pred = model.predict(x_test)
    y_true = y_test

    bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
    return ("PAC", bac, end - start)

In [None]:
result = optimize_pac()
print(result)
results.append(result)

In [20]:
def optimize_perceptron():
    model = sklearn.linear_model.Perceptron(
        # penalty=None,
        # alpha=0.0001,
        # fit_intercept=True,
        max_iter=10000,
        tol=0.0001,
        # shuffle=True,
        # verbose=0,
        # eta0=1.0,
        n_jobs=5,
        # random_state=1337,
        # early_stopping=False,
        # validation_fraction=0.1,
        # n_iter_no_change=5,
        class_weight="balanced",
        # warm_start=False
    )

    scaler = sklearn.preprocessing.Normalizer()

    opt = skopt.BayesSearchCV(
        model,
        {
            "penalty": skopt.space.Categorical(["l2", "l1", "elasticnet"]),
            "alpha": skopt.space.Categorical([0.1, 0.01, 0.001, 0.0001, 0.00001]),
            "fit_intercept": skopt.space.Categorical([True, False]),
            "eta0": skopt.space.Categorical([0.1, 0.25, 0.5, 0.75, 1.0]),
        },
        n_iter=50,
        cv=12,
        n_jobs=10,
    )

    x_train = scaler.fit_transform(traindata[features])
    y_train = traindata[label].values.ravel()

    x_test = scaler.transform(testdata[features])
    y_test = testdata[label].values.ravel()

    start = time.time()
    opt.fit(x_train, y_train)
    end = time.time()

    y_pred = opt.predict(x_test)
    y_true = y_test

    bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
    print(end - start, bac)
    print(opt.best_params_)
    display(pd.DataFrame(opt.cv_results_))


optimize_perceptron()

  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_array(X)
  X = check_a

1658.5892052650452 0.8774374919254401
OrderedDict([('alpha', 1e-05), ('eta0', 0.75), ('fit_intercept', True), ('penalty', 'elasticnet')])


Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,...,rank_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_eta0,param_fit_intercept,param_penalty,params
0,0.856159,0.844364,0.763602,0.687288,0.18977,0.234003,0.197109,0.212809,0.230172,0.683535,...,47,11.959903,0.646242,0.120364,0.013754,0.001,0.25,False,elasticnet,"{'alpha': 0.001, 'eta0': 0.25, 'fit_intercept'..."
1,0.901243,0.881711,0.83872,0.878828,0.454211,0.905782,0.455332,0.78188,0.886389,0.782781,...,34,15.527935,1.557045,0.11755,0.013983,1e-05,0.75,False,l1,"{'alpha': 1e-05, 'eta0': 0.75, 'fit_intercept'..."
2,0.903315,0.900577,0.895822,0.865751,0.88461,0.866291,0.875176,0.812621,0.894092,0.892417,...,28,14.198939,2.010234,0.111264,0.009045,0.0001,0.75,True,l2,"{'alpha': 0.0001, 'eta0': 0.75, 'fit_intercept..."
3,0.829477,0.170521,0.170521,0.17052,0.17052,0.829276,0.170522,0.829135,0.829478,0.829414,...,50,16.012708,2.023319,0.107845,0.008398,0.1,0.5,True,l2,"{'alpha': 0.1, 'eta0': 0.5, 'fit_intercept': T..."
4,0.905102,0.904069,0.82861,0.88181,0.888196,0.89242,0.8731,0.693658,0.90121,0.844484,...,24,15.772808,1.689184,0.106211,0.008074,0.001,1.0,True,l1,"{'alpha': 0.001, 'eta0': 1.0, 'fit_intercept':..."
5,0.629873,0.717537,0.822134,0.864994,0.874124,0.469429,0.508499,0.461719,0.516458,0.540114,...,42,12.006777,0.970923,0.111023,0.006733,0.0001,0.1,True,l1,"{'alpha': 0.0001, 'eta0': 0.1, 'fit_intercept'..."
6,0.831527,0.871016,0.886255,0.181048,0.170523,0.189064,0.170549,0.83007,0.182337,0.837804,...,49,15.774977,2.575142,0.107176,0.009172,0.01,0.5,False,elasticnet,"{'alpha': 0.01, 'eta0': 0.5, 'fit_intercept': ..."
7,0.902616,0.617659,0.814985,0.905607,0.886123,0.31534,0.872992,0.672438,0.896124,0.894976,...,36,16.973424,2.534266,0.107158,0.00775,0.0001,1.0,False,elasticnet,"{'alpha': 0.0001, 'eta0': 1.0, 'fit_intercept'..."
8,0.901922,0.902411,0.505266,0.904858,0.891052,0.753601,0.877707,0.349536,0.899536,0.90379,...,33,13.041293,1.758001,0.10791,0.01005,0.0001,0.75,False,l2,"{'alpha': 0.0001, 'eta0': 0.75, 'fit_intercept..."
9,0.707994,0.692316,0.591745,0.878828,0.411619,0.482024,0.525583,0.47718,0.889934,0.527056,...,41,12.577323,1.453766,0.110008,0.010482,1e-05,0.25,False,l1,"{'alpha': 1e-05, 'eta0': 0.25, 'fit_intercept'..."


In [18]:
def train_nearcentroid():
    model = sklearn.neighbors.NearestCentroid()
    scaler = sklearn.preprocessing.Normalizer()

    x_train = scaler.fit_transform(traindata[features])
    y_train = traindata[label].values.ravel()

    x_test = scaler.transform(testdata[features])
    y_test = testdata[label].values.ravel()

    start = time.time()
    model.fit(x_train, y_train)
    end = time.time()
    print(end - start)

    y_pred = model.predict(x_test)
    y_true = y_test

    bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)
    print(bac)
    return model, scaler


cent, cent_scaler = train_nearcentroid()

1.209331750869751
0.8662147423478739
