# 3. Higgs@L3-2D

Here we want to look into two-dimensional distributions.

## Prerequisites

The calculations build on both the BDT variables and the Logistic Regression output.
Those two steps/notebooks have to be executed first.

## Preselection 

The preselection schemes used in this notebook are:

1. **No preselection**.
2. Preselection by 1D and 2D cuts that were chosen **by eye**.
3. Preselection using machine learning **algorithm responses**:
  - The previously trained BDT response.
  - The previously trained Logistic Regression response.
   
## Likelihood analysis

The same kind of likelihood analysis as in `Higgs@L3.ipynb` can be performed in this case as well.
For more documentation, we refer to that notebook.

Since we build 2D instead of 1D histograms here, empty bins are more likely and have to be handled in a meaningful way.
Particular care is needed for bins with zero background expectation but non-zero signal or data.
Such bins indicate insufficient Monte Carlo statistics, so the histograms should be rebinned.

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import helpers
from load_data import data, mc_higgs_models, mc_no_higgs_frames, getPreselectedSBD
from plotting import BkgSigHistos, LogLikRatioPlots
from stats import LogLikRatio, LogLikRatioObserved
import variables

In [None]:
# Merge all individual background (no-Higgs) Monte Carlo frames into a single
# frame. `mc_no_higgs_frames` is a mapping (its `.values()` are used further
# down), which `pd.concat` accepts directly; `ignore_index=True` discards the
# per-frame indices in favour of one fresh running index.
mc_no_higgs = pd.concat(mc_no_higgs_frames, ignore_index=True) 

## Composed variable with MVA methods

Using the coefficients from `Higgs@L3_Logistic_Regression.ipynb`.

In [None]:
# Run the logistic-regression helper on every frame used in this notebook:
# each signal model, each individual background sample, the merged background
# and the recorded data.
# NOTE(review): the helper presumably adds the `composed_<model>` columns that
# the later cells read -- confirm in helpers.addLogisticRegressionResults.
frames_to_update = itertools.chain(
    mc_higgs_models.values(),
    mc_no_higgs_frames.values(),
    (mc_no_higgs, data),
)
for frame in frames_to_update:
    helpers.addLogisticRegressionResults(df=frame)


# Prepare the histograms (Selection Cuts)

In [None]:
variable = "mmis"
binning = variables.info[variable]["binning"]
by_eye_bkgs, by_eye_sigs, by_eye_datas = {}, {}, {}

for m_higgs, higgs_model_df in mc_higgs_models.items():
    # The last two characters of the model key are parsed as the mass that is
    # handed to the by-eye cut.
    mass = int(m_higgs[-2:])

    # Background: sum the weighted histograms of all individual MC samples.
    bkg = np.zeros(len(binning) - 1)
    for bkg_df in mc_no_higgs_frames.values():
        selected = bkg_df[helpers.byEyeSelectionCut(bkg_df, mass)]
        counts, _edges = np.histogram(
            selected[variable], bins=binning, weights=selected["weight"]
        )
        bkg += counts
    by_eye_bkgs[m_higgs] = bkg

    # Signal: the MC sample of this Higgs model after the same cut.
    selected = higgs_model_df[helpers.byEyeSelectionCut(higgs_model_df, mass)]
    counts, _edges = np.histogram(
        selected[variable], bins=binning, weights=selected["weight"]
    )
    by_eye_sigs[m_higgs] = counts

    # Data: the recorded events after the same cut.
    selected = data[helpers.byEyeSelectionCut(data, mass)]
    counts, _edges = np.histogram(
        selected[variable], bins=binning, weights=selected["weight"]
    )
    by_eye_datas[m_higgs] = counts

    print(f"# Data events in model {m_higgs}: {sum(by_eye_datas[m_higgs])}.")

In [None]:
# Hand the per-model background, signal and data histograms built with the
# by-eye cut to the project's plotting helper.
BkgSigHistos(backgrounds=by_eye_bkgs, signals=by_eye_sigs, datas=by_eye_datas,
             var=variable, binning=binning)
# The path has to be an f-string; without the `f` prefix the figure was written
# to a file literally called "{variable}" instead of one named after the
# plotted variable.
plt.savefig(f"plots/VariableDists/{variable}")

## We define the log-likelihood ratio as

$$
    -2 \ln (Q(m_H)) = 2 s_{tot} - 2 \sum_{i=1}^{N_{bins}} N_i \ln \left( 1 + \frac{s_i(m_H)}{b_i} \right).
$$

In [None]:
# Number of toy experiments requested from LogLikRatio for each mass hypothesis.
n_exp = 10000
llrs, llrs_obs = {}, {}
for higgs_mass, bkg_hist in by_eye_bkgs.items():
    sig_hist = by_eye_sigs[higgs_mass]
    # Keep the toy evaluation first and the observed one second for each mass,
    # so any random-number consumption happens in the same order as before.
    llrs[higgs_mass] = LogLikRatio(
        background=bkg_hist, signal=sig_hist, n_experiments=n_exp
    )
    llrs_obs[higgs_mass] = LogLikRatioObserved(
        background=bkg_hist, signal=sig_hist, data=by_eye_datas[higgs_mass]
    )

In [None]:
# Plot the toy and observed log-likelihood ratios for every mass hypothesis.
# `cl_s_and_b` is indexed by Higgs model key further down and yields a
# (1 - CL_b, CL_{s+b}) pair per model; the two quantile results are not used
# again in the cells visible here.
cl_s_and_b, quantiles_b, quantiles_s_plus_b = LogLikRatioPlots(llrs, llrs_obs, n_bins=300)
plt.savefig(f"plots/loglikeliratio/by_eye_{variable}")

$$CL_s = \frac{CL_{s+b}}{CL_b} = \frac{CL_{s+b}}{1-(1-CL_b)}$$

In [None]:
# CL_s = CL_{s+b} / CL_b, where CL_b is recovered from the stored 1 - CL_b.
for higgs_mass in cl_s_and_b:
    one_minus_cl_b, cl_s_plus_b = cl_s_and_b[higgs_mass]
    cl_b = 1 - one_minus_cl_b
    cl_s = cl_s_plus_b / cl_b
    print(f"{higgs_mass}: Confidence level CL >= {cl_s:.3f}")

# 2D 

## Variable Correlations

In [None]:
var1 = "mmis"
bins = (50, 50)


def corrPlot(var2, selector=None):
    """Draw weighted 2D histograms of `var1` versus `var2` for all samples.

    One panel is drawn for the merged background, one for the data and one for
    each signal model in `mc_higgs_models`. Background and data go into the
    top row; the signal panels fill the bottom row from the left, two grid
    columns each, on a 2x6 grid (so the bottom row has room for three panels).

    Parameters
    ----------
    var2 : str
        Column plotted on the y axis; the x axis is the module-level `var1`.
    selector : callable, optional
        If given, it is applied to each frame and its return value is plotted
        instead of the full frame.
    """
    panels = [("bkg", mc_no_higgs), ("data", data)]
    panels.extend(
        (f"{higgs_mass}_signal", signal_df)
        for higgs_mass, signal_df in mc_higgs_models.items()
    )

    plt.figure(figsize=(12, 8))
    for idx, (title, frame) in enumerate(panels):
        # Top row: two centred panels; bottom row: panels packed from the left.
        if idx < 2:
            loc = (0, 1 + 2 * idx)
        else:
            loc = (1, 2 * (idx - 2))
        ax = plt.subplot2grid(shape=(2, 6), loc=loc, colspan=2)

        shown = frame if selector is None else selector(frame)
        ax.hist2d(shown[var1], shown[var2], bins=bins, weights=shown["weight"])
        ax.set_xlabel(var1)
        ax.set_ylabel(var2)
        ax.set_title(title)
    plt.tight_layout()

# One correlation figure per Higgs model, without any preselection: `var1`
# against the `composed_<model>` column of that model.
for higgs_mass in mc_higgs_models:
    var2 = f"composed_{higgs_mass}"
    corrPlot(var2)
    # Save with a white figure background instead of matplotlib's default.
    plt.savefig(f"plots/corr_{var1}_{var2}", facecolor="white")

## By-eye selection cut

### Demonstration

In [None]:
# Demonstrate the by-eye cut on the recorded data, with 85 as mass argument.
# NOTE(review): `out=True` presumably makes the helper report or visualise the
# individual cuts -- confirm in helpers.byEyeSelectionCut.
df = data[helpers.byEyeSelectionCut(data, 85, out=True)]

In [None]:
def by_eye(df, mass=None):
    """Return the rows of `df` that pass the by-eye selection cut.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    mass : int, optional
        Mass argument forwarded to `helpers.byEyeSelectionCut`. When omitted,
        the helper is called without a mass, exactly as the previous
        single-argument `by_eye` did.
    """
    if mass is None:
        return df[helpers.byEyeSelectionCut(df)]
    return df[helpers.byEyeSelectionCut(df, mass)]


for higgs_mass in mc_higgs_models:
    var2 = f"composed_{higgs_mass}"
    # Each figure belongs to one mass hypothesis, so apply the cut for that
    # hypothesis. The mass is parsed from the last two characters of the model
    # key, the same way the 1D by-eye histograms are built. Previously the cut
    # was evaluated without any mass for every figure.
    mass = int(higgs_mass[-2:])
    corrPlot(var2, selector=lambda frame, mass=mass: by_eye(frame, mass))
    plt.savefig(f"plots/corr_by_eye_{var1}_{var2}", facecolor="white")

## Boosted decision trees selection cut


In [None]:
# Signal, background and data frames after the BDT preselection; each is a
# mapping whose values are frames (later cells index them by Higgs model key).
s_bdt, b_bdt, d_bdt = getPreselectedSBD()
# Run the logistic-regression helper on every preselected frame, in the order
# signal -> background -> data.
for frames_by_model in (s_bdt, b_bdt, d_bdt):
    for frame in frames_by_model.values():
        helpers.addLogisticRegressionResults(frame)

In [None]:
    var1 = "mmis"
    var2 = "composed"
    binning = [variables.info[v]["binning"] for v in [var1, var2]]

    s_hist2d, b_hist2d, d_hist2d = {}, {}, {}
    for higgs_mass in s_bdt:
        sig_df = s_bdt[higgs_mass]
        bkg_df = b_bdt[higgs_mass]
        obs_df = d_bdt[higgs_mass]

        # A "composed" axis is model specific: switch to the column of the
        # current model and span 100 equidistant edges over the range of the
        # background sample. The name keeps containing "composed", so it is
        # replaced again in the next iteration.
        if "composed" in var1:
            var1 = "composed_" + higgs_mass
            binning[0] = np.linspace(
                min(bkg_df[var1]), max(bkg_df[var1]), 100, endpoint=True
            )
        if "composed" in var2:
            var2 = "composed_" + higgs_mass
            binning[1] = np.linspace(
                min(bkg_df[var2]), max(bkg_df[var2]), 100, endpoint=True
            )

        # Fill the weighted 2D histograms for signal, background and data with
        # identical binning; only the counts are kept, not the edges.
        for hists, frame in (
            (s_hist2d, sig_df),
            (b_hist2d, bkg_df),
            (d_hist2d, obs_df),
        ):
            hists[higgs_mass] = np.histogram2d(
                frame[var1], frame[var2], bins=binning, weights=frame["weight"]
            )[0]

        print(f"# Data events in model {higgs_mass}: {sum(sum(d_hist2d[higgs_mass]))}.")
   

### Throw MC toys & calculate log likelihood ratios

In [None]:
# Number of toy experiments requested from LogLikRatio for each mass hypothesis.
n_exp = 10000
llrs, llrs_obs = {}, {}
# Loop over the models that have a BDT-preselected 2D histogram. The loop used
# to run over `by_eye_bkgs`, a leftover from the 1D by-eye section, which made
# this cell depend on that unrelated section and would raise a KeyError as soon
# as the two sets of model keys differ.
for higgs_mass in b_hist2d:
    llrs[higgs_mass] = LogLikRatio(
        background=b_hist2d[higgs_mass],
        signal=s_hist2d[higgs_mass],
        n_experiments=n_exp
    )
    llrs_obs[higgs_mass] = LogLikRatioObserved(
        background=b_hist2d[higgs_mass],
        signal=s_hist2d[higgs_mass],
        data=d_hist2d[higgs_mass],
    )

In [None]:
# Plot the toy and observed log-likelihood ratios of the 2D analysis.
# `cl_s_and_b` is indexed by Higgs model key in the next cell and yields a
# (1 - CL_b, CL_{s+b}) pair per model.
cl_s_and_b, quantiles_b, quantiles_s_plus_b = LogLikRatioPlots(llrs, llrs_obs, n_bins=300)
# `var1`/`var2` still hold whatever the histogram-building loop assigned last,
# so a "composed" axis appears in the file name with the last model key.
plt.savefig(f"plots/loglikeliratio/{var1}-{var2}")

## Confidence levels


In [None]:
# CL_s = CL_{s+b} / CL_b, where CL_b is recovered from the stored 1 - CL_b.
for higgs_mass in cl_s_and_b:
    one_minus_cl_b, cl_s_plus_b = cl_s_and_b[higgs_mass]
    cl_b = 1 - one_minus_cl_b
    cl_s = cl_s_plus_b / cl_b
    print(f"{higgs_mass}: Confidence level CL >= {cl_s:.3f}")