# Try to improve LinearDiscriminantAnalysis

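TL;DR: The tuned settings do not improve on the defaults (balanced accuracy stays at about 0.71 either way), so the final model is trained with the default parameters.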

## Load libraries and data

In [1]:
import os
import sys
import time

import joblib
import numpy as np
import pandas as pd

import sklearn
from sklearn import *
from sklearn.experimental import enable_hist_gradient_boosting
import skopt

sys.path.append("..")
from helpers import filename_for

Welcome to JupyROOT 6.16/00


In [2]:
# Collect the input pickles: one file per (n, s) pair for n = 1..4 and s = 0..19,
# i.e. 80 files. The remaining arguments are handed to helpers.filename_for
# unchanged; presumably they select the simulation setup -- confirm in helpers.
files = [filename_for(15, 30, 600, 500, n, "inclxx", s, "trifeature.pkl") for n in [1, 2, 3, 4] for s in range(20)]
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
dfs = [pd.read_pickle(file) for file in files]
# Shuffle with a fixed seed so the row order is the same on every
# "Restart Kernel -> Run All" instead of changing from run to run.
data = pd.concat(dfs, ignore_index=True).sample(frac=1, random_state=0)
# Keep only rows with at least one hit.
data = data[data["nHits"] > 0]
print(data.shape)

(789634, 6)


In [3]:
# Random split: each row goes to the training set with probability 0.8, so the
# split is roughly (not exactly) 80 % / 20 %.
# A dedicated, seeded generator keeps the mask identical between runs; the
# unseeded global np.random state would give a different split every time.
msk = np.random.RandomState(0).rand(len(data)) < 0.8
traindata = data[msk]
testdata = data[~msk]

print(traindata.shape)
print(testdata.shape)

(631384, 6)
(158250, 6)


In [4]:
# Input columns handed to the classifier.
features = ["nHits", "nClus", "Edep"]
# Target column. Kept as a one-element list, so `frame[label]` is a
# single-column DataFrame that the training/evaluation code flattens with
# `.values.ravel()`.
label = ["nPN"]

## Try to improve on the defaults with a Bayesian hyperparameter search

In [5]:
def optimize_lda1(random_state=0):
    """Search the SVD-solver settings of LinearDiscriminantAnalysis.

    Runs ``skopt.BayesSearchCV`` (50 iterations, 2-fold CV, 2 jobs) over
    ``n_components`` and ``tol`` on the notebook-level ``traindata`` and then
    scores ``opt.predict`` with balanced accuracy.

    No ``scoring`` is passed to the search, so candidates are ranked by the
    library's default score, not by the balanced accuracy reported here.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default=0
        Forwarded to ``BayesSearchCV`` so the sequence of sampled candidates
        is repeatable. Pass ``None`` for the previous, unseeded behaviour.

    Returns
    -------
    opt : skopt.BayesSearchCV
        The fitted search object (``best_params_``, ``cv_results_``, ...).
    result : tuple
        ``("LinearDiscriminantAnalysis", fit_seconds, bac, bacall)`` where
        ``bac`` is the balanced accuracy on ``testdata`` and ``bacall`` the
        balanced accuracy on the full ``data`` (training rows included).
    """
    # Spelled-out estimator defaults: the baseline the search starts from.
    defaults = {
        "solver": "svd",
        "shrinkage": None,
        "priors": None,
        "n_components": None,
        "store_covariance": False,
        "tol": 0.0001,
    }

    model = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(**defaults)

    # "solver" and "shrinkage" are deliberately not searched here: shrinkage
    # works only with the 'lsqr' and 'eigen' solvers, which optimize_lda2
    # covers. "tol" is only used if the solver is 'svd'.
    # NOTE(review): n_components is a dimensionality-reduction setting and
    # presumably does not change predictions -- confirm against sklearn docs.
    search_space = {
        "n_components": skopt.space.Categorical([1, 2, 3]),
        "tol": skopt.space.Real(1e-3, 1e0, prior="log-uniform"),
    }

    opt = skopt.BayesSearchCV(
        model,
        search_space,
        n_iter=50,
        cv=2,
        n_jobs=2,
        random_state=random_state,
    )

    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    opt.fit(traindata[features], traindata[label].values.ravel())
    elapsed = time.perf_counter() - start

    def _balanced_accuracy(frame):
        # Balanced accuracy of the fitted search's predictions on `frame`.
        y_pred = opt.predict(frame[features])
        y_true = frame[label].values.ravel()
        return sklearn.metrics.balanced_accuracy_score(y_true, y_pred)

    bac = _balanced_accuracy(testdata)
    bacall = _balanced_accuracy(data)

    return opt, ("LinearDiscriminantAnalysis", elapsed, bac, bacall)

In [6]:
%%capture
# Run the SVD-solver search (about 78 s in the recorded run). The `%%capture`
# magic at the top of this cell suppresses whatever the fit prints.
lda1_opt, lda1_result = optimize_lda1()

In [7]:
# Headline numbers first: the (name, fit seconds, test BAC, all-data BAC)
# tuple returned by optimize_lda1, followed by the winning hyperparameters.
print(lda1_result, lda1_opt.best_params_, sep="\n")
# Per-candidate cross-validation table of the search.
display(pd.DataFrame(data=lda1_opt.cv_results_))

('LinearDiscriminantAnalysis', 78.09813165664673, 0.7119446530820162, 0.7106872878412864)
OrderedDict([('n_components', 2), ('tol', 0.05904546236639551)])


Unnamed: 0,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_components,param_tol,params
0,0.70806,0.70705,0.707555,0.000505,1,0.301058,0.000302,0.047462,0.00048,2,0.059045,"{'n_components': 2, 'tol': 0.05904546236639551}"
1,0.668889,0.668753,0.668821,6.8e-05,48,0.197711,0.001758,0.041291,0.000208,2,0.506539,"{'n_components': 2, 'tol': 0.5065388066280351}"
2,0.70806,0.70705,0.707555,0.000505,1,0.183587,0.002006,0.041227,8.7e-05,2,0.011991,"{'n_components': 2, 'tol': 0.011990885416125256}"
3,0.70806,0.70705,0.707555,0.000505,1,0.183885,0.002446,0.041242,0.000155,3,0.011595,"{'n_components': 3, 'tol': 0.011595490128654762}"
4,0.70806,0.70705,0.707555,0.000505,1,0.184471,0.001905,0.041249,3.5e-05,2,0.030237,"{'n_components': 2, 'tol': 0.030236576981032715}"
5,0.707747,0.706717,0.707232,0.000515,28,0.184018,0.001234,0.056992,0.00406,2,0.002497,"{'n_components': 2, 'tol': 0.0024966122752312202}"
6,0.660824,0.661037,0.660931,0.000106,49,0.184471,0.002408,0.04896,0.004536,2,0.945613,"{'n_components': 2, 'tol': 0.9456128454430527}"
7,0.70806,0.70705,0.707555,0.000505,1,0.185351,0.001251,0.041687,0.000208,2,0.096741,"{'n_components': 2, 'tol': 0.09674129546631158}"
8,0.70806,0.70705,0.707555,0.000505,1,0.184215,0.002457,0.051058,0.001374,2,0.010617,"{'n_components': 2, 'tol': 0.01061712091348387}"
9,0.70806,0.70705,0.707555,0.000505,1,0.187764,0.003366,0.044015,0.003051,2,0.04531,"{'n_components': 2, 'tol': 0.04531010743225246}"


In [8]:
def optimize_lda2(random_state=0):
    """Search the non-SVD solvers of LinearDiscriminantAnalysis.

    Runs ``skopt.BayesSearchCV`` (50 iterations, 2-fold CV, 2 jobs) over
    ``solver`` ('lsqr' / 'eigen') and ``n_components`` on the notebook-level
    ``traindata`` and then scores ``opt.predict`` with balanced accuracy.

    No ``scoring`` is passed to the search, so candidates are ranked by the
    library's default score, not by the balanced accuracy reported here.

    Parameters
    ----------
    random_state : int, RandomState instance or None, default=0
        Forwarded to ``BayesSearchCV`` so the sequence of sampled candidates
        is repeatable. Pass ``None`` for the previous, unseeded behaviour.

    Returns
    -------
    opt : skopt.BayesSearchCV
        The fitted search object (``best_params_``, ``cv_results_``, ...).
    result : tuple
        ``("LinearDiscriminantAnalysis", fit_seconds, bac, bacall)`` where
        ``bac`` is the balanced accuracy on ``testdata`` and ``bacall`` the
        balanced accuracy on the full ``data`` (training rows included).
    """
    # Spelled-out estimator defaults; "solver" is overridden by the search.
    defaults = {
        "solver": "svd",
        "shrinkage": None,
        "priors": None,
        "n_components": None,
        "store_covariance": False,
        "tol": 0.0001,
    }

    model = sklearn.discriminant_analysis.LinearDiscriminantAnalysis(**defaults)

    # "tol" is left out because it is only used if the solver is 'svd'.
    # "shrinkage" (which works only with 'lsqr' and 'eigen') is not searched
    # and stays at None.
    # NOTE(review): this space holds only 2 x 3 = 6 distinct combinations, so
    # most of the 50 iterations re-evaluate points already seen (the repeated
    # rows in the results table show this).
    search_space = {
        "solver": skopt.space.Categorical(["lsqr", "eigen"]),
        "n_components": skopt.space.Categorical([1, 2, 3]),
    }

    opt = skopt.BayesSearchCV(
        model,
        search_space,
        n_iter=50,
        cv=2,
        n_jobs=2,
        random_state=random_state,
    )

    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    opt.fit(traindata[features], traindata[label].values.ravel())
    elapsed = time.perf_counter() - start

    def _balanced_accuracy(frame):
        # Balanced accuracy of the fitted search's predictions on `frame`.
        y_pred = opt.predict(frame[features])
        y_true = frame[label].values.ravel()
        return sklearn.metrics.balanced_accuracy_score(y_true, y_pred)

    bac = _balanced_accuracy(testdata)
    bacall = _balanced_accuracy(data)

    return opt, ("LinearDiscriminantAnalysis", elapsed, bac, bacall)

In [9]:
%%capture
# Run the lsqr/eigen search (about 63 s in the recorded run). The `%%capture`
# magic at the top of this cell suppresses whatever the fit prints.
lda2_opt, lda2_result = optimize_lda2()

In [10]:
# Headline numbers first: the (name, fit seconds, test BAC, all-data BAC)
# tuple returned by optimize_lda2, followed by the winning hyperparameters.
print(lda2_result, lda2_opt.best_params_, sep="\n")
# Per-candidate cross-validation table of the search.
display(pd.DataFrame(data=lda2_opt.cv_results_))

('LinearDiscriminantAnalysis', 63.20529079437256, 0.7116389101523001, 0.7103993936585928)
OrderedDict([('n_components', 1), ('solver', 'eigen')])


Unnamed: 0,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_components,param_solver,params
0,0.707712,0.706762,0.707237,0.000475,1,0.174511,0.000526,0.042199,0.0003991127,1,eigen,"{'n_components': 1, 'solver': 'eigen'}"
1,0.707712,0.706762,0.707237,0.000475,1,0.161963,0.003654,0.041432,0.0002250671,1,lsqr,"{'n_components': 1, 'solver': 'lsqr'}"
2,0.707712,0.706762,0.707237,0.000475,1,0.161571,0.00348,0.041525,0.0001670122,3,lsqr,"{'n_components': 3, 'solver': 'lsqr'}"
3,0.707712,0.706762,0.707237,0.000475,1,0.175476,0.003743,0.042096,0.0004194975,2,eigen,"{'n_components': 2, 'solver': 'eigen'}"
4,0.707712,0.706762,0.707237,0.000475,1,0.175007,0.00347,0.041997,0.0004086494,1,eigen,"{'n_components': 1, 'solver': 'eigen'}"
5,0.707712,0.706762,0.707237,0.000475,1,0.175818,0.002022,0.042518,0.0003553629,2,eigen,"{'n_components': 2, 'solver': 'eigen'}"
6,0.707712,0.706762,0.707237,0.000475,1,0.174182,0.001415,0.041534,4.994869e-05,1,eigen,"{'n_components': 1, 'solver': 'eigen'}"
7,0.707712,0.706762,0.707237,0.000475,1,0.170997,0.002781,0.04489,0.003655553,1,lsqr,"{'n_components': 1, 'solver': 'lsqr'}"
8,0.707712,0.706762,0.707237,0.000475,1,0.161649,0.002878,0.040295,0.0007096529,2,lsqr,"{'n_components': 2, 'solver': 'lsqr'}"
9,0.707712,0.706762,0.707237,0.000475,1,0.162134,0.002849,0.040497,0.0007129908,3,lsqr,"{'n_components': 3, 'solver': 'lsqr'}"


## Train the model with default settings and save it

In [11]:
# Final model: LinearDiscriminantAnalysis with every parameter left at its
# default, trained on the training split only.
model = sklearn.discriminant_analysis.LinearDiscriminantAnalysis()
# The label column is selected with a list, so flatten it to 1-D for fit().
model.fit(X=traindata[features], y=traindata[label].values.ravel())

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

In [12]:
# Balanced accuracy of the default model on the held-out split. The labels are
# flattened to 1-D in the same way as in the search functions.
y_pred = model.predict(testdata[features])
print(
    sklearn.metrics.balanced_accuracy_score(
        y_true=testdata[label].values.ravel(),
        y_pred=y_pred,
    )
)

0.7116389101523001


In [13]:
# Make sure the target directory exists; joblib.dump does not create it, so a
# fresh checkout without "models/" would otherwise fail here.
os.makedirs("models", exist_ok=True)
# Persist the fitted default model; the returned list of written file names is
# shown as the cell output.
joblib.dump(model, "models/mult_30dp_600AMeV_4n_LinearDiscriminantAnalysis.pkl")

['models/mult_30dp_600AMeV_4n_LinearDiscriminantAnalysis.pkl']