# 2. Higgs@L3_Logistic_Regression

## Prerequisites
The BDTs are used as a precut and must therefore already be trained and saved before running this notebook.

## Data preparation
As mentioned above, the BDT response is used for a precut on the data.
We want to ignore the `fmmis` variable.
It is identified as a strong discriminator.
It is nevertheless not used in the Classifier.
Instead, it will later be used, together with the Classifier's response, in a 2D analysis.

## Logistic regression training
The training is executed and evaluated for each of the mass hypotheses.
The coefficients are saved in a plain `.txt` file.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import helpers
from load_data import data, mc_higgs_models, mc_no_higgs_frames

## Data preparation

In [None]:
def getTrainAndTest(higgs_mass):
    """Build the train/test split used for one Higgs mass hypothesis.

    The no-Higgs frames and the Higgs frame for ``higgs_mass`` are stacked,
    reduced to the kinematical variables plus ``class`` and ``weight``, and
    filtered by the pickled BDT stored in ``tmp/BDT_<higgs_mass>.pkl``: only
    rows whose decision-function value exceeds ``helpers.bdt_cut[higgs_mass]``
    are kept. The ``mmis`` column is then removed from the features.

    Args:
        higgs_mass: Key into ``mc_higgs_models`` and ``helpers.bdt_cut``
            (e.g. ``"higgs_85"``); also used in the BDT file name.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, X_train_w, X_test_w)``
        where the ``X_*`` frames hold the features (without ``weight``),
        the ``y_*`` series hold the ``class`` labels and the ``*_w`` series
        hold the per-row ``weight`` values.
    """
    background = pd.concat(mc_no_higgs_frames)
    signal = mc_higgs_models[higgs_mass]
    wanted_columns = helpers.kinematical_vars + ["class", "weight"]
    events = pd.concat([background, signal])[wanted_columns]

    with open(f"tmp/BDT_{higgs_mass}.pkl", "rb") as fid:
        bdt_loaded = pickle.load(fid)

    # Apply the BDT precut via a row mask instead of a temporary "bdt" column.
    bdt_response = bdt_loaded.decision_function(events[helpers.kinematical_vars])
    passes_cut = np.asarray(bdt_response > helpers.bdt_cut[higgs_mass])
    selected = events.loc[passes_cut]

    # "mmis" is excluded from the regression inputs; "class" is the label.
    target = selected["class"]
    features = selected.drop(columns=["mmis", "class"])

    # Stratified 65 % / 35 % split with a fixed seed; "weight" still rides
    # along in the feature frames so it is split consistently with the rows.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        stratify=target,
        random_state=42,
        train_size=0.65,
    )

    X_train_w = X_train["weight"]
    X_test_w = X_test["weight"]
    X_train = X_train.drop(columns=["weight"])
    X_test = X_test.drop(columns=["weight"])

    return X_train, X_test, y_train, y_test, X_train_w, X_test_w

# Run the split once for the "higgs_85" hypothesis and bind the six results
# (features, labels and weights for train and test) at module level.
X_train, X_test, y_train, y_test, X_train_w, X_test_w = getTrainAndTest("higgs_85")

## Train a Logistic Regression

In [None]:
# Fit one sample-weighted logistic regression per Higgs mass hypothesis and
# keep the fitted models in ``log_regs`` keyed by the hypothesis name.
log_regs = {}
for higgs_mass in mc_higgs_models:
    X_train, X_test, y_train, y_test, X_train_w, X_test_w = getTrainAndTest(higgs_mass)

    clf = LogisticRegression(C=100)
    clf.fit(X_train, y_train, sample_weight=X_train_w)
    log_regs[higgs_mass] = clf

    # The reported scores are computed without sample weights.
    train_pct = 100 * clf.score(X_train, y_train)
    print(f"{higgs_mass} training score: {train_pct:.2f} %.")
    test_pct = 100 * clf.score(X_test, y_test)
    print(f"{higgs_mass} test set score: {test_pct:.2f} %.\n")

In [None]:
# Plot the fitted coefficients of every model against the feature index,
# one marker series per mass hypothesis, and save the figure.
for higgs_mass, log_reg in log_regs.items():
    # The plot format string is taken from helpers.symbol (defined outside
    # this file); the previously duplicated local lookup table was never
    # used and has been removed.
    # The backslash is escaped so the label is a valid string literal
    # (a bare "\m" is an invalid escape sequence).
    plt.plot(
        log_reg.coef_.T,
        helpers.symbol[higgs_mass],
        label=f"$m_\\mathrm{{H}}$ = {higgs_mass[-2:]} GeV",
    )
plt.title("Logistic Regression after BDT preselection")
plt.xlabel("feature $f_i$")
plt.ylabel("coefficient $\\alpha_i$")
# Tick labels use the columns of the most recently built X_train.
plt.xticks(np.arange(len(X_train.columns)), X_train.columns, rotation=90)
plt.legend()
plt.savefig("plots/logisitic_regression_coefficients.png")

## Save the model coefficients and the intercepts for future usage

In [None]:
# Write each model to "<higgs_mass>.txt" inside helpers.log_reg_coeffs_dir as
# a flat vector: the first row of coef_ followed by the intercept.
coeffs_dir = helpers.log_reg_coeffs_dir
# Creating the output directory does not depend on the model, so do it once.
coeffs_dir.mkdir(parents=True, exist_ok=True)
for higgs_mass, log_reg in log_regs.items():
    print(higgs_mass)
    coefs = np.concatenate([log_reg.coef_[0], log_reg.intercept_])
    print(coefs, "\n")
    np.savetxt(coeffs_dir / f"{higgs_mass}.txt", coefs)