# What happens in this notebook :-)

* We read in Monte Carlo (MC) samples for the background and for different signal models, as well as the actual data recorded by the L3 detector.

* We train a linear model called Logistic Regression.

* The coefficients found by the algorithm are plotted and used in Higgs@L3-2D.ipynb.


### import packages

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

import itertools
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

import plotting as pl
import helpers

import pickle

from load_data import data, mc_higgs_models, mc_no_higgs_frames

### Load the BDT from the signal-background (sig-bkg) classification

In [None]:
# Restore the previously pickled classifier from disk.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
model_path = 'tmp/BDT_higgs_85.pkl'
with open(model_path, mode='rb') as model_file:
    gnb_loaded = pickle.load(model_file)

# Last expression of the cell, so the notebook displays the loaded
# object's parameters.
gnb_loaded.get_params()

In [None]:
# Store the loaded classifier's decision-function output as a new column
# "BDT_selCut" on every frame: each Higgs-model MC frame, each no-Higgs MC
# frame, and the data frame itself.
all_frames = [*mc_higgs_models.values(), *mc_no_higgs_frames.values(), data]
for events in all_frames:
    kinematics = events[helpers.kinematical_vars]
    events["BDT_selCut"] = gnb_loaded.decision_function(kinematics)


In [None]:
# Number of rows in the data frame before any selection.
n_data_events = len(data['weight'])
print('There are ', n_data_events, 'data events recorded')

# Threshold on the BDT output; events must score strictly above it.
# (The original comment also listed -3.4 and 1.41379310345 as alternatives.)
cut = -3.15

# Rows of the data frame that survive the BDT cut.
selected_weights = data.loc[data['BDT_selCut'] > cut, 'weight']
print('There are ', len(selected_weights), 'candidates selected after the cut')

In [None]:
# Stack all no-Higgs MC frames into a single frame with a fresh index.
mc_no_higgs = pd.concat(mc_no_higgs_frames, ignore_index=True)

# Combine the no-Higgs sample with the "higgs_85" model sample.
combined_samples = [mc_no_higgs, mc_higgs_models["higgs_85"]]
df_mH = pd.concat(combined_samples, ignore_index=True)

# Keep only events above the BDT cut and only the columns needed afterwards:
# the kinematic variables plus the class label and the event weight.
# "BDT_selCut" is used for the selection and is not carried along.
kept_columns = helpers.kinematical_vars + ["class", "weight"]
passes_cut = df_mH["BDT_selCut"] > cut
df_mH = df_mH.loc[passes_cut, kept_columns]

# Delete mmis because it already serves as the one golden discriminating variable

In [None]:
# Pull the class label out of the feature frame; it becomes the fit target.
target = df_mH.pop('class')

# Remove mmis from the features: per the original note it is already
# "the one golden disc variable".
df_mH.pop('mmis')

In [None]:
# Split into 65 % training events and the remainder for testing.
# Stratifying on the target keeps the class proportions alike in both parts;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_mH,
    target,
    train_size=0.65,
    stratify=target,
    random_state=42,
)

# Print the size, the number of signal (non-zero label) events and the
# background fraction for each part of the split.
split_summaries = (
    ('No of train events', y_train, 'fraction of bkg in training set'),
    ('No of test events', y_test, 'fraction of bkg in test set'),
)
for position, (count_label, labels, fraction_label) in enumerate(split_summaries):
    if position:
        print('-----------------------------')
    n_events = len(labels)
    n_signal = np.count_nonzero(labels)
    print(count_label, n_events)
    print('No of signal events', n_signal)
    print(fraction_label, 1 - float(n_signal) / n_events)

# Move the event weights out of the feature frames so that they are not
# used as an input feature; they are kept separately for the fit.
X_train_w = X_train.pop('weight')
X_test_w = X_test.pop('weight')


In [None]:
# Logistic regression with C=100, fitted on the training features using the
# per-event weights as sample weights.
logreg = LogisticRegression(C=100)
logreg.fit(X_train, y_train, sample_weight=X_train_w)

In [None]:
# Unweighted scores: every event counts equally.
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))

# The model was fitted with per-event weights, so also report the scores
# with the same weights applied (X_test_w was otherwise never used).
print("Weighted training set score: {:.3f}".format(
    logreg.score(X_train, y_train, sample_weight=X_train_w)))
print("Weighted test set score: {:.3f}".format(
    logreg.score(X_test, y_test, sample_weight=X_test_w)))

In [None]:
# Plot one red triangle per fitted coefficient, in feature (column) order.
plt.plot(logreg.coef_.T, 'r^')
# Raw string: "\m" is not a valid escape sequence in a normal string literal
# and triggers a warning on recent Python versions; the text is unchanged.
plt.title(r'Logistic Regression ($m_\mathrm{H}$ = 85 GeV)')
plt.ylabel(r'coefficient $\alpha_i$', fontsize=14)
plt.xlabel(r'feature $f_i$', fontsize=14)

# Label each x position with the name of the corresponding feature column.
plt.xticks(np.arange(X_train.shape[1]), X_train.columns, rotation=90)

# Make sure the output directory exists so that savefig does not fail on a
# fresh checkout.
os.makedirs("plots", exist_ok=True)
plt.savefig("plots/LogRegCoef.png")

In [None]:
# Print the fitted coefficient array (one value per feature column).
# NOTE(review): presumably the source of the hard-coded coef_* arrays that
# follow in this notebook - confirm.
print(logreg.coef_)

In [None]:
# Hard-coded coefficient vectors with 32 entries each, given to three decimals.
# NOTE(review): the names suggest Higgs-mass hypotheses of 85, 90 and 95 GeV,
# and the values look like rounded logreg.coef_ printouts - confirm provenance.
coef_85 = np.array([
    0.313, 0.195, 0.284, -0.002, -0.031, 0.069, -0.019, -0.053,
    -0.173, -0.232, -0.295, 0.011, 0.004, -0.019, 0.009, -0.052,
    -0.033, -0.055, -0.003, -0.036, 0.106, -0.059, -0.006, 0.034,
    0.011, -0.027, -0.032, 0.057, 0.001, -0.003, -0.011, 0.024,
])

coef_90 = np.array([
    0.127, 0.184, 0.217, -0.043, -0.029, -0.016, 0.044, -0.046,
    -0.084, -0.149, -0.297, 0.002, 0.044, 0.051, 0.011, -0.049,
    0.039, 0.029, 0.012, -0.030, 0.072, -0.036, 0.030, -0.001,
    0.053, -0.030, -0.026, 0.028, -0.018, -0.004, -0.032, -0.046,
])

coef_95 = np.array([
    0.106, 0.048, 0.134, -0.091, -0.146, 0.059, 0.116, 0.008,
    -0.073, -0.094, -0.192, -0.001, 0.059, 0.021, 0.062, -0.023,
    0.039, 0.059, 0.018, -0.012, 0.035, -0.043, 0.069, 0.016,
    -0.002, -0.022, 0.001, 0.057, 0.006, 0.005, 0.013, 0.027,
])

In [None]:
# A second hard-coded 32-entry coefficient vector carrying the "90" label.
# NOTE(review): how it was obtained and how it relates to coef_90 is not
# visible here - confirm. The two "-0.000" entries keep the negative zeros
# that appear in the original printout.
coef_90_2 = np.array([
    0.158, 0.244, 0.335, -0.027, -0.078, -0.033, 0.115, -0.090,
    -0.047, -0.152, -0.198, -0.002, -0.000, -0.017, -0.010, -0.023,
    0.035, -0.069, -0.048, -0.042, 0.110, -0.004, 0.012, 0.022,
    0.009, -0.032, -0.026, -0.002, -0.024, -0.000, -0.020, -0.140,
])