# Higgs@L3

Here we go back to using a one-dimensional distribution (in `mmis`).
BDTs are used for the preselection.

## Prerequisites

The BDT response for each of the mass hypotheses is needed to run this notebook.

## Some information on the MC simulation and detector data sets

## (1D) Likelihood analysis

## Quantile plots

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

import helpers
import plotting as pl
import stats as stat
import variables

## Some information on the MC simulation and detector data sets

In [None]:
from load_data import data, mc_higgs_models, mc_no_higgs_frames

\begin{array}{|l||c|c|c|c|}
\hline
\textrm{sample name} & \textrm{type of data} & \textrm{real data} & \textrm{No. of events} & \sigma   [\textrm{pb}] \\ \hline
\textrm{higgs_qq} & q \bar{q} & - & 200000 & 102 \\ \hline
\textrm{higgs_ww} & W^+ W^- & - & 294500 & 16.5 \\ \hline
\textrm{higgs_zz} & ZZ & - & 196000 & 0.975 \\ \hline
\textrm{higgs_zee} & Z e^+ e^- & - & 29500 & 3.35 \\ \hline
\textrm{higgs_wen} & q \bar{q} e \nu_e & - & 81786 & 2.90 \\ \hline
\textrm{higgs_eeqq} & \textrm{two photon coll} & - & 5940000 & 15600 \\ \hline
\textrm{higgs_data} & \textrm{data} & x & - & - \\ \hline
\textrm{higgs_higgs_85} & \textrm{Higgs} (m_{H} = 85 \ \textrm{GeV}) & - & 3972 & 0.0940\\ \hline
\textrm{higgs_higgs_90} & \textrm{Higgs} (m_{H} = 90 \ \textrm{GeV}) & - & 3973 & 0.0667\\ \hline
\textrm{higgs_higgs_95} & \textrm{Higgs} (m_{H} = 95 \ \textrm{GeV}) & - & 3971 & 0.0333\\ \hline	
\end{array}

### Introduce weights in order to rescale all the MC samples to the same luminosity

By definition, the weight of each measured event (data) is $1$.
Thus, we want to rescale the MC to the integrated luminosity of the data taking, which is $L = 176.773 \ \mathrm{pb}^{-1}$.

The weight for each MC sample is:
$$ 
        \mathrm{weight}_\mathrm{MC} = L \cdot  \frac{\sigma_\mathrm{MC} }{N_\mathrm{MC}}
$$

In [None]:
# Load one pickled BDT object per Higgs mass hypothesis from tmp/.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
bdt_collection = {}
for higgs_mass in mc_higgs_models:
    with open(f"tmp/BDT_{higgs_mass}.pkl", "rb") as fid:
        bdt_collection[higgs_mass] = pickle.load(fid)

# Every sample (Higgs MC, non-Higgs MC and the data frame) is passed to
# helpers.addLogisticRegressionResults and then receives one BDT response
# column per mass hypothesis, named after the last two characters of the key.
all_frames = itertools.chain(
    mc_higgs_models.values(),
    mc_no_higgs_frames.values(),
    [data],
)
for frame in all_frames:
    helpers.addLogisticRegressionResults(df=frame)
    for higgs_mass, bdt in bdt_collection.items():
        response_column = f"BDT_selCut{higgs_mass[-2:]}"
        frame[response_column] = bdt.decision_function(frame[helpers.kinematical_vars])


### Preselection: Cut on the BDT value

In [None]:
# Preselection: for every mass hypothesis keep only the events whose BDT
# response exceeds helpers.bdt_cut[higgs_mass], restricted to the kinematic
# variables plus the event weight, and write the selections to tmp/*.csv.
selection_columns = helpers.kinematical_vars + ["weight"]  # named so the builtin vars() is not shadowed

# The combined non-Higgs background is the same for every hypothesis,
# so it is concatenated once instead of once per loop iteration.
background_all = pd.concat(mc_no_higgs_frames, ignore_index=True)

sig, bkg, dat = {}, {}, {}
for higgs_mass in mc_higgs_models:
    mass_tag = higgs_mass[-2:]
    bdt_column = f"BDT_selCut{mass_tag}"
    bdt_threshold = helpers.bdt_cut[higgs_mass]
    signal_all = mc_higgs_models[higgs_mass]

    sig[higgs_mass] = signal_all[signal_all[bdt_column] > bdt_threshold][selection_columns]
    bkg[higgs_mass] = background_all[background_all[bdt_column] > bdt_threshold][selection_columns]
    # The data must pass exactly the same cut as the MC samples; the original
    # line referenced an undefined name `cut` instead of helpers.bdt_cut.
    dat[higgs_mass] = data[data[bdt_column] > bdt_threshold][selection_columns]

    sig[higgs_mass].to_csv(f"tmp/sig_{mass_tag}.csv")
    bkg[higgs_mass].to_csv(f"tmp/bkg_{mass_tag}.csv")
    dat[higgs_mass].to_csv(f"tmp/data_{mass_tag}.csv")

### Purity in range $90 \pm 5$ GeV

In [None]:
# Weighted signal and background yields inside the window 85 < mmis < 95
# after the preselection, and their ratio, for each mass hypothesis.
# NOTE(review): the value printed as "Purity" is the ratio S/B; purity is
# often defined as S/(S+B) - confirm which definition is intended.
for higgs_mass in mc_higgs_models:
    window_yields = []
    for events in (sig[higgs_mass], bkg[higgs_mass]):
        in_window = (events["mmis"] < 95) & (events["mmis"] > 85)
        window_yields.append(events[in_window]["weight"].sum())
    sig_in_interval, bkg_in_interval = window_yields

    purity = sig_in_interval / bkg_in_interval
    print(f"{higgs_mass}: Purity = {100*purity:.2f} % with {len(dat[higgs_mass]):2} observed events.")

### Distribution in variable of interest

In [None]:
# Fill weighted histograms of the variable of interest for the preselected
# signal, background and data events of every mass hypothesis.
variable = "mmis"
binning = variables.info[variable]["binning"]

bkg_histos, sig_histos, data_histos = {}, {}, {}
selection_to_histos = (
    (sig, sig_histos),
    (bkg, bkg_histos),
    (dat, data_histos),
)

for higgs_mass in sig:
    for selection, histos in selection_to_histos:
        events = selection[higgs_mass]
        # np.histogram returns (counts, bin_edges); only the counts are kept.
        histos[higgs_mass] = np.histogram(
            events[variable], bins=binning, weights=events["weight"]
        )[0]

In [None]:
# Hand the per-hypothesis background, signal and data histograms to
# pl.BkgSigHistos; the output path is passed via save_as.
histogram_plot_path = f"plots/VariableDists/{variable}.png"
pl.BkgSigHistos(
    backgrounds=bkg_histos,
    signals=sig_histos,
    datas=data_histos,
    var=variable,
    binning=binning,
    # asymm_errors=True,
    save_as=histogram_plot_path,
)

## (1D) Likelihood analysis

The log-likelihood ratio is defined as

$$
    -2 \log (Q(m_H)) = 2 s_{tot} - 2 \sum_{i=1}^{N_{bins}} N_i \log \left( 1 + \frac{s_i(m_H)}{b_i} \right)
$$

In [None]:
# For every mass hypothesis build the log-likelihood-ratio object from the
# background and signal templates (with n_exp experiments) and evaluate the
# observed log-likelihood ratio on the data histogram.
n_exp = 10_000

llrs, llrs_obs = [], []
for higgs_mass in bkg_histos:
    templates = {
        "background": bkg_histos[higgs_mass],
        "signal": sig_histos[higgs_mass],
    }
    # The two calls stay interleaved per hypothesis, as in the original order.
    llrs.append(stat.LogLikRatio(n_experiments=n_exp, **templates))
    llrs_obs.append(stat.LogLikRatioObserved(data=data_histos[higgs_mass], **templates))

In [None]:
# Plot the log-likelihood-ratio distributions together with the observed
# values; the call returns the confidence levels and the quantiles of the
# background-only and signal+background cases.
llr_plot_path = f"plots/loglikeliratio/{variable}.png"
CLlist, Quantiles_b, Quantiles_sPlusb = pl.LogLikRatioPlots(
    llrs, obs=llrs_obs, Nbins=300, savepath=llr_plot_path
)
print(CLlist)

# Quantile plot

$$CL_s = \frac{CL_{s+b}}{CL_b} = \frac{CL_{s+b}}{1-(1-CL_b)}$$

In [None]:
# CL_s = CL_{s+b} / CL_b for every entry of CLlist.
# NOTE(review): this assumes entry[0] is 1 - CL_b and entry[1] is CL_{s+b},
# matching the formula stated above this cell - confirm against
# pl.LogLikRatioPlots.
CLs = []
for entry in CLlist:
    CLs.append(entry[1] / (1 - entry[0]))
print(CLs)

print("Confidence level CL >= ")
for cl_s in CLs:
    print("    ", 1 - cl_s)

In [None]:
# Inspect the first three entries of the background-only quantiles.
for entry_index in range(3):
    print(Quantiles_b[entry_index])

# print(Quantiles_sPlusb)

In [None]:
# First value of the first entry. The full slice `[:]` used originally only
# copies the container and does not select a column, so it is omitted.
print(Quantiles_b[0][0])

In [None]:
# Higgs mass hypotheses, used as x-values of the quantile plot (axis label: GeV).
MHiggs = np.array([85.0, 90.0, 95.0])

In [None]:
# Hard-coded quantiles, one entry per mass hypothesis, laid out as
# [first value, [pair], [pair]]; the plotting cell draws the first value as a
# line and the two pairs as the inner and outer bands.
# NOTE(review): running this cell overwrites the Quantiles_b / Quantiles_sPlusb
# returned by pl.LogLikRatioPlots, so the quantile plot shows these fixed
# numbers rather than the freshly computed ones - confirm this is intended.
Quantiles_b = [
    [
        4.17450166378066,
        [0.36682136305160284, 7.6648752727822966],
        [-3.9168189752685834, 10.837942190056509],
    ],
    [
        2.5979840592736423,
        [-0.46927828916865977, 5.3244394801112449],
        [-3.9909498744172254, 7.7100879733441445],
    ],
    [
        0.70195337492436138,
        [-0.90642270726311835, 2.0884844802583942],
        [-2.7921050105174032, 3.2531706087389818],
    ],
]
Quantiles_sPlusb = [
    [
        -5.3446990880419776,
        [-10.738912847408141, -0.42644536626695029],
        [-16.609086644365437, 4.015848317916948],
    ],
    [
        -3.0821314008046912,
        [-6.9446099136579615, 0.5531424936454421],
        [-11.261497663317497, 3.8476094604908759],
    ],
    [
        -0.68457773040967318,
        [-2.6257212778773198, 0.9792595959911683],
        [-4.8441710464117724, 2.4212519455385628],
    ],
]

In [None]:
# Quantile plot of -2 ln(Q) versus the Higgs mass hypothesis: background-only
# expectation with its inner (green) and outer (yellow) bands, the
# signal+background expectation and the observed values.
plt.figure(figsize=(9,7))

mass_idx = range(3)  # the three entries that are plotted against MHiggs

# Each quantile entry is [first value, [lower, upper], [lower, upper]].
y_b = np.array([Quantiles_b[k][0] for k in mass_idx])
y_b_oneS_lower = np.array([Quantiles_b[k][1][0] for k in mass_idx])
y_b_oneS_upper = np.array([Quantiles_b[k][1][1] for k in mass_idx])
y_b_twoS_lower = np.array([Quantiles_b[k][2][0] for k in mass_idx])
y_b_twoS_upper = np.array([Quantiles_b[k][2][1] for k in mass_idx])
y_s_plus_b = [Quantiles_sPlusb[k][0] for k in mass_idx]

plt.plot(MHiggs, y_b, 'b--', label='expected bkg', linewidth=3)

# Inner band between the first pair of quantiles.
plt.fill_between(MHiggs, y_b_oneS_lower, y_b_oneS_upper, facecolor='lawngreen', alpha=0.99)

# Outer band: only the parts outside the inner band are filled.
plt.fill_between(MHiggs, y_b_twoS_lower, y_b_oneS_lower, facecolor='yellow', alpha=0.9)
plt.fill_between(MHiggs, y_b_oneS_upper, y_b_twoS_upper, facecolor='yellow', alpha=0.9)

plt.plot(MHiggs, y_s_plus_b, '-.', label='expected sig+bkg',
         color='brown', linewidth=3)
plt.plot(MHiggs, llrs_obs, 'r-', label='observed', linewidth=3)

# Reference line at -2 ln(Q) = 0.
plt.hlines(0, 85, 95, linewidth=1)

plt.xlim(85., 95.)
plt.ylim(-6, 13)
plt.xlabel(r'$m_{H}$ [GeV]', fontsize=14)
plt.ylabel(r'$-2 \ln (Q)$', fontsize=14)
plt.legend(fontsize=14)
# NOTE(review): the file name says "2D" although this notebook describes a
# one-dimensional analysis - confirm it does not overwrite another plot.
plt.savefig("plots/quantile_plot_2D.png")

### 1 - $CL_b$ plots

In [None]:
# Pseudo-experiments under the signal+background hypothesis: per experiment
# and mass hypothesis, draw Poisson-fluctuated bin contents around s + b and
# evaluate the log-likelihood ratio for that pseudo-data.
# NOTE(review): no random seed is set in this cell, so results vary per run.
n_pseudo_experiments = 10_000
llr_pseudo_observed = {
    higgs_mass: np.zeros(n_pseudo_experiments) for higgs_mass in bkg_histos
}

# The expected s + b bin contents do not change between experiments.
expected_s_plus_b = {
    mass: sig_histos[mass] + bkg_histos[mass] for mass in bkg_histos
}

for i in range(n_pseudo_experiments):
    for higgs_mass, expectation in expected_s_plus_b.items():
        n_s_plus_b_pseudo = np.random.poisson(lam=expectation)
        llr_pseudo_observed[higgs_mass][i] = stat.LogLikRatioObserved(
            background=bkg_histos[higgs_mass],
            signal=sig_histos[higgs_mass],
            data=n_s_plus_b_pseudo,
        )

In [None]:
# Evaluate stat.GetCLOneMinusb per mass hypothesis, passing the first element
# of the matching llrs entry as `array` and the pseudo-observed
# log-likelihood ratios as `cut`.
CL_one_minus_CLb_s_plus_b = {
    mass: stat.GetCLOneMinusb(array=llrs[position][0], cut=llr_pseudo_observed[mass])
    for position, mass in enumerate(bkg_histos)
}

In [None]:
# 1 - CL_b versus the Higgs mass hypothesis on a logarithmic y-axis: mean of
# the signal+background pseudo-experiments, the constant 0.5 line labelled as
# the background expectation, and the observed values from CLlist.
plt.figure(figsize=(9,6))
higgs_masses = [85, 90, 95]

expected_s_plus_b_curve = [
    np.mean(CL_one_minus_CLb_s_plus_b[f"higgs_{h}"]) for h in higgs_masses
]
observed_curve = [CLlist[k][0] for k in range(3)]

plt.plot(higgs_masses, expected_s_plus_b_curve, '-.', color='brown',
         label='expected sig+bkg', linewidth=3)
plt.plot(higgs_masses, [0.5] * 3, '--', color='blue', label='expected bkg', linewidth=3)
plt.plot(higgs_masses, observed_curve, 'r-', label='observed', linewidth=3)

# Horizontal reference lines with their sigma labels at the right edge.
sigma_levels = (
    (0.16, r'$1 \sigma$'),
    (0.023, r'$2 \sigma$'),
    (1.35*10**(-3), r'$3 \sigma$'),
)
for level, sigma_label in sigma_levels:
    plt.hlines(level, 85, 95, color='k', linewidth=1)
    plt.text(95, level, sigma_label, fontsize=14)

plt.semilogy()
plt.xlim(85, 95)
plt.legend(fontsize=14, loc='best')
plt.xlabel(r'$m_{H}$ [GeV]', fontsize=14)
plt.ylabel(r'1 - CL${}_\mathrm{b}$', fontsize=14)

plt.savefig("plots/OneMinusCLb_plot.png")