# 3. Higgs@L3-2D

Here we want to look into two-dimensional distributions.

## Prerequisites

The calculations build on both the BDT variables and the Logistic Regression output.
Those two steps/notebooks have to be executed first.

## Preselection 

The preselection schemes used in this notebook are:

1. **No preselection**.
2. Preselection by 1D and 2D cuts that were chosen **by eye**.
3. Preselection using machine learning **algorithm responses**:
  - The previously trained BDT response.
  - The previously trained Logistic Regression response.
   
## Likelihood analysis

The same kind of likelihood analysis as in `Higgs@L3.ipynb` can be performed in this case as well.
For more documentation, we refer to that notebook.

Since we build 2D instead of 1D histograms here, empty bins are more likely and have to be handled in a meaningful way.
Special care is needed for bins with zero background expectation but non-zero signal or data, because the term $s_i/b_i$ in the log-likelihood ratio diverges there.
Such bins are a sign of insufficient Monte Carlo statistics, so the histograms should be rebinned.

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import plotting_2D as pl  # Has an additional function for 2D histograms.
import stats as stat
import helpers

from load_data import data, mc_higgs_models, mc_no_higgs_frames
import variables

In [None]:
# Stack all background ("no Higgs") MC frames into a single frame; ignore_index=True
# discards the per-frame indices and numbers the rows afresh.
mc_no_higgs = pd.concat(mc_no_higgs_frames, ignore_index=True) 

## Composed variable with MVA methods

Using the coefficients from Higgs@L3_Logistic_Regression.ipynb

In [None]:
# Run the logistic-regression helper on every frame used below: each signal
# model, each individual background sample, the merged background and the data.
# The return value is not used, so the helper presumably adds its result
# columns (the "composed_<hypothesis>" variables) in place -- confirm in helpers.
for frame in [
    *mc_higgs_models.values(),
    *mc_no_higgs_frames.values(),
    mc_no_higgs,
    data,
]:
    helpers.addLogisticRegressionResults(df=frame)


## Choose a data frame

In [None]:


# Data events selected by the by-eye cut for the mass argument 85.
# NOTE(review): `df` is reassigned inside the loops of the next code cell before
# it is read anywhere, so this cell presumably only matters for whatever
# `out=True` makes the helper report -- confirm, or drop the cell.
df = data[helpers.byEyeSelectionCut(data, 85, out=True)]

# Prepare the histograms (Selection Cuts)

In [None]:
variable = "mmis"
binning = variables.info[variable]["binning"]

# Weighted 1D histograms of `variable` after the by-eye selection, keyed by the
# Higgs hypothesis name used in mc_higgs_models.
by_eye_bkgs, by_eye_sigs, by_eye_datas = {}, {}, {}

for m_higgs, higgs_model_df in mc_higgs_models.items():
    # The last two characters of the hypothesis name are the mass argument of
    # the selection cut (e.g. "higgs_85" -> 85).
    mass = int(m_higgs[-2:])

    # Background: add up the histograms of all individual background samples.
    bkg = np.zeros(len(binning) - 1)
    for bkg_df in mc_no_higgs_frames.values():
        selected = bkg_df[helpers.byEyeSelectionCut(bkg_df, mass)]
        bkg += np.histogram(
            selected[variable], bins=binning, weights=selected["weight"]
        )[0]
    by_eye_bkgs[m_higgs] = bkg

    # Signal model of this hypothesis.
    selected = higgs_model_df[helpers.byEyeSelectionCut(higgs_model_df, mass)]
    by_eye_sigs[m_higgs] = np.histogram(
        selected[variable], bins=binning, weights=selected["weight"]
    )[0]

    # Measured data, selected with the cut belonging to this hypothesis.
    selected = data[helpers.byEyeSelectionCut(data, mass)]
    by_eye_datas[m_higgs] = np.histogram(
        selected[variable], bins=binning, weights=selected["weight"]
    )[0]

    print(f"# Data events in model {m_higgs}: {sum(by_eye_datas[m_higgs])}.")

In [None]:
# Ad-hoc inspection of the "higgs_85" by-eye histograms.
# NOTE(review): 717 is an unexplained magic number, and `b == 717` is an exact
# comparison on weighted (floating-point) bin contents -- confirm the intent or
# remove this cell.
b = by_eye_bkgs["higgs_85"]
# The result of this bare expression is discarded; only the last expression of
# a cell is displayed.
np.where(b == 717)
# Displayed value: shape of the signal entries in the bins where the background
# content equals 717.
by_eye_sigs["higgs_85"][np.where(b == 717)].shape

In [None]:
# Pass the by-eye background, signal and data histograms of `variable` (one
# entry per Higgs hypothesis) to the plotting helper.
# NOTE(review): `save_as` carries no file extension -- presumably the helper
# appends one; confirm in plotting_2D.
pl.BkgSigHistos(backgrounds=by_eye_bkgs, signals=by_eye_sigs, datas=by_eye_datas, 
                var=variable, binning=binning,
                save_as=f"plots/VariableDists/{variable}"
)

## Definition of the log-likelihood ratio

$$
    -2 \ln (Q(m_H)) = 2 s_{tot} - 2 \sum_{i=1}^{N_{bins}} N_i \ln \left( 1 + \frac{s_i(m_H)}{b_i} \right).
$$

In [None]:
n_exp = 10000  # value passed as n_experiments to stat.LogLikRatio

# One expected and one observed log-likelihood-ratio result per Higgs
# hypothesis, in the iteration order of by_eye_bkgs.
llrs, llrs_obs = [], []
for higgs_mass, expected_bkg in by_eye_bkgs.items():
    expected_sig = by_eye_sigs[higgs_mass]
    llrs.append(
        stat.LogLikRatio(
            background=expected_bkg,
            signal=expected_sig,
            n_experiments=n_exp,
        )
    )
    llrs_obs.append(
        stat.LogLikRatioObserved(
            background=expected_bkg,
            signal=expected_sig,
            data=by_eye_datas[higgs_mass],
        )
    )

In [None]:
# Hand the expected and observed log-likelihood ratios of the 1D analysis to
# the plotting helper. The first return value, CLlist, is what the next code
# cell turns into CL_s (one entry per hypothesis); the two quantile lists are
# not used further in this part of the notebook.
CLlist, QuantileList_b, QuantileList_sPlusb = pl.LogLikRatioPlots(
    llrs,
    obs=llrs_obs, Nbins=300,
    savepath=f"plots/loglikeliratio/{variable}",
)

$$CL_s = \frac{CL_{s+b}}{CL_b} = \frac{CL_{s+b}}{1-(1-CL_b)}$$

In [None]:
# CL_s = CL_{s+b} / CL_b. Following the formula above, entry[0] is taken to be
# 1 - CL_b and entry[1] to be CL_{s+b}.
CLs = []
for entry in CLlist:
    cl_b = 1 - entry[0]
    CLs.append(entry[1] / cl_b)
print(CLs)

# Report 1 - CL_s for every hypothesis.
print("Confidence level CL >= ")
for cls_value in CLs:
    print("    ", 1 - cls_value)


# 2D 

## Variable Correlations

In [None]:
def sel(df):
    """Return the part of `df` that enters the correlation plots.

    Currently this is the identity, i.e. no preselection is applied. To plot
    only events passing the by-eye selection, return
    ``df[helpers.byEyeSelectionCut(df)]`` instead.
    """
    return df


# 2D distributions of the two variables for every signal model, the merged
# background and the data, each passed through `sel` first.
pl.TwoDHist(var1='mmis',var2='composed',
            mc_higgs_models={n: sel(df) for n, df in mc_higgs_models.items()},
            mc_no_higgs=sel(mc_no_higgs), 
            data=sel(data), 
            bins=(50,50),
            savepath="plots/Corr_",
)

## By-eye selection cut

In [None]:
var1 = "mmis"
var2 = "composed"

# One 2D histogram per Higgs hypothesis, in the iteration order of
# mc_higgs_models.
bkgModels = []
sigModels = []
data_histModels = []

binning1 = variables.info[var1]["binning"]
binning2 = variables.info[var2]["binning"]
binning = (6, 6)  # bins per axis; other choices tried: (10, 10), [binning1, binning2]

# Shape of one 2D histogram: an integer entry of `binning` is a number of bins,
# a sequence entry is a list of bin edges (one more edge than bins).
hist_shape = tuple(b if np.ndim(b) == 0 else len(b) - 1 for b in binning)

# With integer bin counts np.histogram2d derives the bin edges from the sample
# it is given, so background, signal and data would each get different edges
# and their bins could not be compared. Fix the axis ranges to the outer edges
# of the configured binnings instead. Events outside these ranges are not
# counted. `range` is only used by numpy for axes whose edges are not given
# explicitly.
hist_range = [(binning1[0], binning1[-1]), (binning2[0], binning2[-1])]

for m_higgs_name, signal_df in mc_higgs_models.items():
    # The last two characters of the hypothesis name are the mass argument of
    # the selection cut.
    mass = int(m_higgs_name[-2:])

    # The composed variable has one column per hypothesis. Resolve the column
    # names per iteration instead of modifying var1/var2, which would keep the
    # first hypothesis' column for all later hypotheses.
    col1 = f"{var1}_{m_higgs_name}" if var1 == "composed" else var1
    col2 = f"{var2}_{m_higgs_name}" if var2 == "composed" else var2

    # Background: sum over all background samples, then store the total once.
    bkg = np.zeros(hist_shape)
    for bkg_df in mc_no_higgs_frames.values():
        df = bkg_df[helpers.byEyeSelectionCut(bkg_df, mass)]
        bkg += np.histogram2d(
            df[col1], df[col2],
            bins=binning, range=hist_range, weights=df["weight"])[0]
    bkgModels.append(bkg)

    # Signal model of this hypothesis.
    df = signal_df[helpers.byEyeSelectionCut(signal_df, mass)]
    sigModels.append(np.histogram2d(
        df[col1], df[col2],
        bins=binning, range=hist_range, weights=df["weight"])[0])

    # Measured data, selected with the cut belonging to this hypothesis.
    df = data[helpers.byEyeSelectionCut(data, mass)]
    data_histModels.append(np.histogram2d(
        df[col1], df[col2],
        bins=binning, range=hist_range, weights=df["weight"])[0])

    print(f"# data events in model {m_higgs_name}: {data_histModels[-1].sum()}")


## Boosted decision trees selection cut


In [None]:
# Per-hypothesis signal, background and data tables read from tmp/ (presumably
# written by the BDT notebook listed under the prerequisites). The file names
# end in the last two characters of the hypothesis name.
hypotheses = list(helpers.higgs_hypothesis)
s_bdt = {hyp: pd.read_csv(f"tmp/sig_{hyp[-2:]}") for hyp in hypotheses}
b_bdt = {hyp: pd.read_csv(f"tmp/bkg_{hyp[-2:]}") for hyp in hypotheses}
d_bdt = {hyp: pd.read_csv(f"tmp/data_{hyp[-2:]}") for hyp in hypotheses}

# Run the logistic-regression helper on each loaded table as well.
for frame in [*s_bdt.values(), *b_bdt.values(), *d_bdt.values()]:
    helpers.addLogisticRegressionResults(frame)

### Search for good cuts in the composed variable

In [None]:
n_bins = 100

# One figure per Higgs hypothesis: normalised signal and background
# distributions of that hypothesis' composed variable.
for higgs_mass in s_bdt:
    m_h = higgs_mass[-2:]
    column = "composed_" + higgs_mass
    shared_style = dict(alpha=.8, density=True)

    plt.figure()
    # Histogram the signal first; its bin edges `b` are reused for the
    # background so that both distributions share one binning.
    b = plt.hist(s_bdt[higgs_mass][column],
                 bins=n_bins, label="sig_" + m_h,
                 color="red", **shared_style)[1]
    plt.hist(b_bdt[higgs_mass][column],
             bins=b, label="bkg_" + m_h,
             color='blue', **shared_style)
    # Candidate cut positions can be overlaid here with
    # plt.axvline(p, color="black"), e.g. for p in (-5., -1.4, 0.8, 5.).
    plt.legend()
    plt.xlabel("composed MVA variable")
    plt.ylabel("p.d.f.")
    plt.savefig(f"plots/CutsOnMVA_{higgs_mass}.png")

In [None]:
var1 = "mmis"
var2 = "composed"
binning = [variables.info[v]["binning"] for v in [var1, var2]]

# Weighted 2D histograms of (var1, var2) per Higgs hypothesis for the
# BDT-selected signal, background and data tables.
s_hist2d, b_hist2d, d_hist2d = {}, {}, {}
for higgs_mass in s_bdt:
    bkg_frame = b_bdt[higgs_mass]

    # A composed variable is replaced by the column of the current hypothesis,
    # and its axis gets 100 equidistant edges spanning the background sample.
    if "composed" in var1:
        var1 = "composed_" + higgs_mass
        binning[0] = np.linspace(min(bkg_frame[var1]), max(bkg_frame[var1]),
                                 100, endpoint=True)
    if "composed" in var2:
        var2 = "composed_" + higgs_mass
        binning[1] = np.linspace(min(bkg_frame[var2]), max(bkg_frame[var2]),
                                 100, endpoint=True)

    # Same columns, edges and weight column for all three samples.
    for frames, hists in ((s_bdt, s_hist2d), (b_bdt, b_hist2d), (d_bdt, d_hist2d)):
        frame = frames[higgs_mass]
        hists[higgs_mass] = np.histogram2d(
            frame[var1], frame[var2],
            bins=binning,
            weights=frame["weight"],
        )[0]

    print(f"# Data events in model {higgs_mass}: {sum(sum(d_hist2d[higgs_mass]))}.")

### Throw MC toys & calculate log likelihood ratios

In [None]:
n_exp = 10000  # number of toy experiments passed to stat.LogLikRatio

# One expected and one observed log-likelihood-ratio result per Higgs
# hypothesis of the 2D histograms.
llrs = []
llrs_obs = []
# Iterate over the keys of the 2D histograms themselves. Looping over the keys
# of the 1D by-eye histograms would make this cell depend on an unrelated
# earlier cell and fail if the two sets of hypothesis names ever differ.
for higgs_mass in b_hist2d:
    llrs.append(stat.LogLikRatio(
        background=b_hist2d[higgs_mass],
        signal=s_hist2d[higgs_mass],
        n_experiments=n_exp,
    ))
    llrs_obs.append(stat.LogLikRatioObserved(
        background=b_hist2d[higgs_mass],
        signal=s_hist2d[higgs_mass],
        data=d_hist2d[higgs_mass],
    ))

In [None]:
# Hand the expected and observed log-likelihood ratios of the 2D analysis to
# the plotting helper; the three return values are used in the cells below.
# NOTE(review): var1/var2 were modified in the histogram loop, so a composed
# variable appears in `savepath` with the name of the last hypothesis processed.
CLlist, Quantiles_b, Quantiles_sPlusb = pl.LogLikRatioPlots(
    llrs,
    obs=llrs_obs, Nbins=300,
    savepath=f"plots/loglikeliratio/{var1}-{var2}",
)

## Confidence levels


In [None]:
# CL_s = CL_{s+b} / CL_b, with entry[0] taken to be 1 - CL_b and entry[1] to be
# CL_{s+b} (same convention as in the 1D analysis above).
CLs = []
for entry in CLlist:
    cl_b = 1 - entry[0]
    CLs.append(entry[1] / cl_b)
print(CLs)

#print("binning = ", binning)
# Report 1 - CL_s for every hypothesis.
print("Confidence level CL >= ")
for cls_value in CLs:
    print("    ", 1 - cls_value)

## Print data to use it in other code

In [None]:
# Dump the raw results as text so that they can be copied into other code.
for label, values in (
    ("CLlist: \n", CLlist),
    ("Quantiles_b: \n", Quantiles_b),
    ("Quantiles_sPlusb: \n", Quantiles_sPlusb),
):
    print(label, values, "\n")


# Results:

## winner:

binning = [(np.linspace(50,130,28),np.array([-5.,-1.4,0.8,5.])) , (np.linspace(50,130,28),np.array([-5.,-1.8,-0.8,5.])), (np.linspace(50,130,28),np.array([-5.,-1.8,-1.,5.]))]


    Confidence level CL >= 
     0.961754104926
     0.63456405694
     0.515910192215
     

## out of:
    
binning = [(np.linspace(50,130,14),np.array([-5.,-.1,5.])),(np.linspace(50,130,28),np.array([-5.,-1.,5.])),(np.linspace(50,130,14),np.array([-5.,-1.,5.]))] 
     
     Confidence level CL >= 
     0.627515400411
     0.498518831993
     0.432560077789
     
binning = [(np.linspace(50,130,28),np.array([-5.,-.7,1.,5.])),(np.linspace(50,130,28),np.array([-5.,-.3,5.])),(np.linspace(50,130,28),np.array([-5.,-1.8,-1.,5.]))] 

    Confidence level CL >= 
     0.867198335645
     0.311822510384
     0.52351902392
     
binning = [(np.linspace(50,130,14),np.array([-5.,-1.,5.])),(np.linspace(50,130,14),np.array([-5.,-1,5.])),(np.linspace(50,130,14),np.array([-5.,-1.,5.]))] 

    Confidence level CL >= 
     0.784446138765
     0.466722830666
     0.437692955375


binning = [(np.linspace(50,130,28),np.array([-5.,-1.,5.])),(np.linspace(50,130,28),np.array([-5.,-1,5.])),(np.linspace(50,130,28),np.array([-5.,-1.,5.]))] 

    Confidence level CL >= 
     0.791839859987
     0.495279516283
     0.475018532246
     
binning = [(np.linspace(50,130,14),np.array([-5.,-2,-1,0.1,5.])),(np.linspace(50,130,28),np.array([-5.,-1.8,-0.8,.7,5.])),(np.linspace(50,130,14),np.array([-5.,-1.6,-0.8,0,5.]))] 

    Confidence level CL >= 
     0.635538790906
     0.509347258486
     0.396903225806

binning = [(np.linspace(50,130,14),np.linspace(-4,4,7)),(np.linspace(50,130,28),np.array([-5.,-1.8,-0.4,5.])),(np.linspace(50,130,14),np.array([-5.,-1.6,-0.8,0,1.8,5.]))] 

    Confidence level CL >= 
     0.848068181818
     0.476338729763
     0.400889121339
     
binning = [(np.linspace(50,130,28),np.linspace(-4,4,7)),(np.linspace(50,130,14),np.array([-5.,-1.8,-0.8,5.])),(np.linspace(50,130,28),np.array([-5.,-1.6,-0.8,0,1.8,5.]))] 
     
    Confidence level CL >= 
     0.87096403539
     0.610092252973
     0.408384897083
     
binning = [(np.linspace(50,130,28),np.linspace(-4,4,9)),(np.linspace(50,130,14),np.array([-5.,-1.6,-0.6,5.])),(np.linspace(50,130,28),np.array([-5.,-1.6,-1.,5.]))] 

    Confidence level CL >= 
     0.723699182787
     0.500732984293
     0.523329129887
     
binning = [(np.linspace(50,130,28),np.linspace(-4,4,4)),(np.linspace(50,130,14),np.array([-5.,-2,-1,5.])),(np.linspace(50,130,28),np.array([-5.,-1.4,-1.,5.]))] 

    Confidence level CL >= 
     0.95799887997
     0.486776946581
     0.498662470496
    
     
90 .1: 0.369854597583 (linspace 14)
90 -2: 0.551464897734 (linspace 28)
90 -1.8 & -0.8: 0.63482831898 (linspace 28)


binning = [(7,7),(8,8),(7,4)]

    Confidence level CL >= 
     0.952580028667
     0.026902690269
     0.3249055812
     
binning = [(8,6),(8,6),(7,3)]     

    Confidence level CL >= 
     0.838144867118
     0.016
     0.410908646203

binning =  [(8, 7), (2, 2), (6, 3)]

    Confidence level CL >= 
     0.938706400271
     0.162020730603
     0.127041742287

binning =  [(7, 8), (5, 4), (5, 3)]

    Confidence level CL >= 
     0.929531705131
     0.030703070307
     0.268274383708
     
binning =  [(7, 6), (9, 9), (7, 4)]

    Confidence level CL >= 
     0.844532554257
     0.03971985993
     0.324955492722
     
binning =  [(28,3)]

    Confidence level CL >= 
     0.550010022049
     0.14181198637
     0.0339
     
binning = [(15,4),(15,4),(15,4)]

    Confidence level CL >= 
     0.1064
     0.046704670467
     0.749083063646
     
binning =  [(28, 1), (28, 1), (28, 1)]

    Confidence level CL >= 
     0.862589928058
     0.432515663162
     0.443015521064
     
binning =  [(28, 2), (28, 2), (28, 2)]

    Confidence level CL >= 
     0.95981765997
     0.183398653402
     0.436894285396
     
     
binning =  [(28, 3), (28, 3), (28, 3)]

    Confidence level CL >= 
     0.55825145378
     0.139593145606
     0.0312062412483
     
binning =  [(14, 2), (14, 2), (14, 2)]

    Confidence level CL >= 
     0.963696369637
     0.109874686717
     0.554731224782

linspace 28*2
    0.940685820204
     0.281580016247
     0.720828215476
     
