# Try to improve LinearDiscriminantAnalysis

TLDR: Settings don't matter. Use defaults.

## Load Stuff

In [1]:
import numpy as np
import pandas as pd
import time
import sys
import joblib

import sklearn
from sklearn import *
from sklearn.experimental import enable_hist_gradient_boosting
import skopt

sys.path.append("..")
from helpers import filename_for
from plotconf import *

Welcome to JupyROOT 6.16/00


In [2]:
files = [filename_for(15, 30, 600, 500, n, "inclxx", s, "trifeature.pkl") for n in [1, 2, 3, 4] for s in range(20)]
dfs = [pd.read_pickle(file) for file in files]
data = pd.concat(dfs, ignore_index=True).sample(frac=1)
data = data[data["nHits"] > 0]
print(data.shape)

(789634, 6)


In [3]:
msk = np.random.rand(len(data)) < 0.8
traindata = data[msk]
testdata = data[~msk]

print(traindata.shape)
print(testdata.shape)

(631401, 6)
(158233, 6)


In [4]:
features = ["nHits", "nClus", "Edep"]
label = ["nPN"]

## Try to improve ...

In [5]:
def optimize_lda1():
    defaults = {
        "solver": "svd",
        "shrinkage": None,
        "priors": None,
        "n_components": None,
        "store_covariance": False,
        "tol": 0.0001,
    }

    model = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(**defaults)

    opt = skopt.BayesSearchCV(
        model,
        {
            # "solver": skopt.space.Categorical(["svd", "lsqr", "eigen"]),
            # "shrinkage": skopt.space.Categorical(["auto", None]), # Works only with ‘lsqr’ and ‘eigen’ solvers.
            "n_components": skopt.space.Categorical([1, 2, 3]),
            "tol": skopt.space.Real(1e-3, 1e0, prior="log-uniform"),  # Only used if solver is ‘svd’.
        },
        n_iter=50,
        cv=2,
        n_jobs=2,
    )

    start = time.time()
    opt.fit(traindata[features], traindata[label].values.ravel())
    end = time.time()

    y_pred = opt.predict(testdata[features])
    y_true = testdata[label].values.ravel()
    bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)

    y_pred = opt.predict(data[features])
    y_true = data[label].values.ravel()
    bacall = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)

    return opt, ("LinearDiscriminantAnalysis", end - start, bac, bacall)

In [6]:
%%capture
lda1_opt, lda1_result = optimize_lda1()

In [7]:
print(lda1_result)
print(lda1_opt.best_params_)
display(pd.DataFrame(lda1_opt.cv_results_))

('LinearDiscriminantAnalysis', 91.94827389717102, 0.7110080400520146, 0.7106556727454868)
OrderedDict([('n_components', 1), ('tol', 0.013203039688571725)])


Unnamed: 0,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_components,param_tol,params
0,0.707454,0.707285,0.70737,8.4e-05,22,0.195408,0.001939,0.045246,0.003066,2,0.001283,"{'n_components': 2, 'tol': 0.001283024879055837}"
1,0.707454,0.707285,0.70737,8.4e-05,22,0.187454,0.003151,0.041637,0.000239,2,0.003098,"{'n_components': 2, 'tol': 0.003097510030759836}"
2,0.707454,0.707285,0.70737,8.4e-05,22,0.182658,0.001983,0.041912,0.000335,1,0.001678,"{'n_components': 1, 'tol': 0.001677704287873831}"
3,0.707638,0.707612,0.707625,1.3e-05,1,0.183942,0.004665,0.041367,8.1e-05,1,0.013203,"{'n_components': 1, 'tol': 0.013203039688571725}"
4,0.707638,0.707612,0.707625,1.3e-05,1,0.181227,0.00023,0.041736,0.000124,2,0.061922,"{'n_components': 2, 'tol': 0.06192219583955215}"
5,0.707454,0.707285,0.70737,8.4e-05,22,0.184072,0.003893,0.041579,0.00027,3,0.001642,"{'n_components': 3, 'tol': 0.001641704571408362}"
6,0.707638,0.707612,0.707625,1.3e-05,1,0.183685,0.003991,0.043573,0.000941,3,0.033053,"{'n_components': 3, 'tol': 0.03305299286226989}"
7,0.707638,0.707612,0.707625,1.3e-05,1,0.180888,3.6e-05,0.04203,0.000413,2,0.290972,"{'n_components': 2, 'tol': 0.29097234189168114}"
8,0.668794,0.668882,0.668838,4.4e-05,49,0.183873,0.002787,0.041901,0.000104,1,0.895909,"{'n_components': 1, 'tol': 0.8959087256706173}"
9,0.707454,0.707285,0.70737,8.4e-05,22,0.184425,0.002251,0.042789,0.001153,2,0.007085,"{'n_components': 2, 'tol': 0.007085471823383633}"


In [8]:
def optimize_lda2():
    defaults = {
        "solver": "svd",
        "shrinkage": None,
        "priors": None,
        "n_components": None,
        "store_covariance": False,
        "tol": 0.0001,
    }

    model = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(**defaults)

    opt = skopt.BayesSearchCV(
        model,
        {
            "solver": skopt.space.Categorical(["lsqr", "eigen"]),
            # "shrinkage": skopt.space.Categorical(["auto"]),#, None]),  # Works only with ‘lsqr’ and ‘eigen’ solvers.
            "n_components": skopt.space.Categorical([1, 2, 3]),
            # "tol": skopt.space.Real(1e-3, 1e0, prior="log-uniform"), # Only used if solver is ‘svd’.
        },
        n_iter=50,
        cv=2,
        n_jobs=2,
    )

    start = time.time()
    opt.fit(traindata[features], traindata[label].values.ravel())
    end = time.time()

    y_pred = opt.predict(testdata[features])
    y_true = testdata[label].values.ravel()
    bac = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)

    y_pred = opt.predict(data[features])
    y_true = data[label].values.ravel()
    bacall = sklearn.metrics.balanced_accuracy_score(y_true, y_pred)

    return opt, ("LinearDiscriminantAnalysis", end - start, bac, bacall)

In [9]:
%%capture
lda2_opt, lda2_result = optimize_lda2()

In [10]:
print(lda2_result)
print(lda2_opt.best_params_)
display(pd.DataFrame(lda2_opt.cv_results_))

('LinearDiscriminantAnalysis', 62.24114012718201, 0.7105857028964911, 0.710394295601435)
OrderedDict([('n_components', 3), ('solver', 'lsqr')])


Unnamed: 0,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_components,param_solver,params
0,0.707451,0.707301,0.707376,7.5e-05,1,0.160956,0.002367,0.041375,0.000518,3,lsqr,"{'n_components': 3, 'solver': 'lsqr'}"
1,0.707451,0.707301,0.707376,7.5e-05,1,0.158069,0.00066,0.040485,0.000491,3,lsqr,"{'n_components': 3, 'solver': 'lsqr'}"
2,0.707451,0.707301,0.707376,7.5e-05,1,0.159168,0.000602,0.040975,0.000668,1,lsqr,"{'n_components': 1, 'solver': 'lsqr'}"
3,0.707451,0.707301,0.707376,7.5e-05,1,0.171832,0.000408,0.042102,0.000114,2,eigen,"{'n_components': 2, 'solver': 'eigen'}"
4,0.707451,0.707301,0.707376,7.5e-05,1,0.158791,1.8e-05,0.041988,0.000989,3,lsqr,"{'n_components': 3, 'solver': 'lsqr'}"
5,0.707451,0.707301,0.707376,7.5e-05,1,0.174658,0.002469,0.042229,0.000728,3,eigen,"{'n_components': 3, 'solver': 'eigen'}"
6,0.707451,0.707301,0.707376,7.5e-05,1,0.158343,0.001902,0.040185,0.001342,2,lsqr,"{'n_components': 2, 'solver': 'lsqr'}"
7,0.707451,0.707301,0.707376,7.5e-05,1,0.173619,0.001833,0.042096,0.001052,1,eigen,"{'n_components': 1, 'solver': 'eigen'}"
8,0.707451,0.707301,0.707376,7.5e-05,1,0.158558,0.001815,0.040008,0.000741,3,lsqr,"{'n_components': 3, 'solver': 'lsqr'}"
9,0.707451,0.707301,0.707376,7.5e-05,1,0.158483,0.003119,0.041225,0.000519,1,lsqr,"{'n_components': 1, 'solver': 'lsqr'}"


## Just train model with defaults settings and save

In [11]:
model = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()
model.fit(traindata[features], traindata[label].values.ravel())

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

In [12]:
y_pred = model.predict(testdata[features])
print(sklearn.metrics.balanced_accuracy_score(testdata[label], y_pred))

0.7105857028964911


In [13]:
joblib.dump(model, "models/mult_30dp_600AMeV_4n_LinearDiscriminantAnalysis.pkl")

['models/mult_30dp_600AMeV_4n_LinearDiscriminantAnalysis.pkl']