# MSSMHbb-SPECT stage 1: Training

## Description
This is a standalone plugin to evaluate classification techniques for the MSSM Hbb analysis of the DESY CMS Higgs -> bb group. 
In this stage, the trainings are performed.

This code is independent of CMSSW; however, running it in the CMSSW area of the main analysis framework is recommended.


## Installation
Download the repository. It is recommended to clone it into the analysis-test area.


## Inputs
This code reads the output of the MSSM Hbb analysis: the variables relevant for the training are saved in a flat ntuple, and their corresponding weights are saved in a branch on a per-event basis.


In [1]:
import ROOT
from ROOT import TMVA, TFile, TTree, TCut, TCanvas, TString


Welcome to JupyROOT 6.24/08


In [2]:

# Load the input files and create the output file.
def _open_root_file(path, mode=""):
    """Open a ROOT file and fail immediately if it cannot be used.

    Args:
        path: Location of the ROOT file, passed unchanged to ``TFile.Open``.
        mode: Option string for ``TFile.Open``. The empty default is the
            same value ``TFile.Open`` uses when no option is given.

    Returns:
        The opened ``TFile``.

    Raises:
        OSError: If ``TFile.Open`` hands back a null or zombie file.
    """
    root_file = TFile.Open(path, mode)
    # Without this check a missing or corrupt file only surfaces later,
    # as a confusing null-pointer error when a tree is requested from it.
    if not root_file or root_file.IsZombie():
        shown_mode = mode or "READ"
        raise OSError(f"Could not open ROOT file '{path}' (mode {shown_mode})")
    return root_file


# Signal sample
signalFile = _open_root_file("rootfiles_signal/mssmHbb_2018_FH_600_sr.root")

# QCD "bEnriched" background samples, one file per HT range named in the path
backgroundFileQCDbEnriched1 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_bEnriched_HT100to200_sr.root")
backgroundFileQCDbEnriched2 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_bEnriched_HT200to300_sr.root")
backgroundFileQCDbEnriched3 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_bEnriched_HT300to500_sr.root")
backgroundFileQCDbEnriched4 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_bEnriched_HT500to700_sr.root")
backgroundFileQCDbEnriched5 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_bEnriched_HT700to1000_sr.root")
backgroundFileQCDbEnriched6 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_bEnriched_HT1000to1500_sr.root")
backgroundFileQCDbEnriched7 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_bEnriched_HT1500to2000_sr.root")
backgroundFileQCDbEnriched8 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_bEnriched_HT2000toInf_sr.root")

# QCD "BGenFilter" background samples, one file per HT range named in the path
backgroundFileQCDbGenFilter1 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_HT100to200_BGenFilter_sr.root")
backgroundFileQCDbGenFilter2 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_HT200to300_BGenFilter_sr.root")
backgroundFileQCDbGenFilter3 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_HT300to500_BGenFilter_sr.root")
backgroundFileQCDbGenFilter4 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_HT500to700_BGenFilter_sr.root")
backgroundFileQCDbGenFilter5 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_HT700to1000_BGenFilter_sr.root")
backgroundFileQCDbGenFilter6 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_HT1000to1500_BGenFilter_sr.root")
backgroundFileQCDbGenFilter7 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_HT1500to2000_BGenFilter_sr.root")
backgroundFileQCDbGenFilter8 = _open_root_file("rootfiles_QCD/mssmHbb_2018_FH_QCD_HT2000toInf_BGenFilter_sr.root")

# Output file; "RECREATE" replaces any existing TMVAOutput.root
outputFile = _open_root_file("TMVAOutput.root", "RECREATE")

In [3]:
# TMVA factory: configured for classification and writing into outputFile
job_name = "TMVA_Classification"
factory_options = "AnalysisType=Classification"
factory = TMVA.Factory(job_name, outputFile, factory_options)

# DataLoader that will hold the variable definitions and the input trees
dataset_name = "dataset"
loader = TMVA.DataLoader(dataset_name)

# Get the signal and background TTrees.
# Every input file stores its training ntuple under the same key.
TREE_NAME = "mssmhbb_MVA"


def _get_tree(root_file, tree_name=TREE_NAME):
    """Fetch a tree from an open ROOT file and fail if it is absent.

    Args:
        root_file: Open ROOT file to read from.
        tree_name: Key of the tree inside the file.

    Returns:
        The object stored under ``tree_name``.

    Raises:
        LookupError: If ``Get`` returns a null object for ``tree_name``.
    """
    tree = root_file.Get(tree_name)
    # A null result would otherwise only fail later, when the tree is
    # handed to the DataLoader, with a far less specific error.
    if not tree:
        raise LookupError(f"Tree '{tree_name}' not found in '{root_file.GetName()}'")
    return tree


signalTree = _get_tree(signalFile)

backgroundTreeQCDbEnriched1 = _get_tree(backgroundFileQCDbEnriched1)
backgroundTreeQCDbEnriched2 = _get_tree(backgroundFileQCDbEnriched2)
backgroundTreeQCDbEnriched3 = _get_tree(backgroundFileQCDbEnriched3)
backgroundTreeQCDbEnriched4 = _get_tree(backgroundFileQCDbEnriched4)
backgroundTreeQCDbEnriched5 = _get_tree(backgroundFileQCDbEnriched5)
backgroundTreeQCDbEnriched6 = _get_tree(backgroundFileQCDbEnriched6)
backgroundTreeQCDbEnriched7 = _get_tree(backgroundFileQCDbEnriched7)
backgroundTreeQCDbEnriched8 = _get_tree(backgroundFileQCDbEnriched8)

backgroundTreeQCDbGenFilter1 = _get_tree(backgroundFileQCDbGenFilter1)
backgroundTreeQCDbGenFilter2 = _get_tree(backgroundFileQCDbGenFilter2)
backgroundTreeQCDbGenFilter3 = _get_tree(backgroundFileQCDbGenFilter3)
backgroundTreeQCDbGenFilter4 = _get_tree(backgroundFileQCDbGenFilter4)
backgroundTreeQCDbGenFilter5 = _get_tree(backgroundFileQCDbGenFilter5)
backgroundTreeQCDbGenFilter6 = _get_tree(backgroundFileQCDbGenFilter6)
backgroundTreeQCDbGenFilter7 = _get_tree(backgroundFileQCDbGenFilter7)
backgroundTreeQCDbGenFilter8 = _get_tree(backgroundFileQCDbGenFilter8)


# Training features, in the order they are handed to the DataLoader.
# All of them are declared with the TMVA type code "F".
_float_feature_names = (
    # angular separation between jet pairs
    "dr_jets12", "dr_jets13", "dr_jets23",
    # pseudorapidity of each jet and pairwise differences
    "eta_jet1", "eta_jet2", "eta_jet3",
    "deta_jet12", "deta_jet13", "deta_jet23",
    # azimuthal angle of each jet and pairwise differences
    "phi_jet1", "phi_jet2", "phi_jet3",
    "dphi_jet12", "dphi_jet13", "dphi_jet23",
    # transverse momentum of each jet and pairwise combinations
    "pT_jet1", "pT_jet2", "pT_jet3",
    "dpT_jet12", "dpT_jet13", "dpT_jet23",
    "pTratio_jet12", "pTratio_jet13", "pTratio_jet23",
    "pTimbalance_jet12", "pTimbalance_jet13", "pTimbalance_jet23",
    # currently disabled:
    # "qglikelyhood_jet1", "qglikelyhood_jet2", "qglikelyhood_jet3",
)

# Mapping of feature name -> TMVA type code, used by the cells below
input_variables = dict.fromkeys(_float_feature_names, "F")

# Register every feature with the DataLoader, preserving the dict order
for feature_name, type_code in input_variables.items():
    loader.AddVariable(feature_name, type_code)

In [4]:
# Function to check variable names and types
def check_variable_names_and_types(loader, expected_variables):
    """Print how each expected variable compares with the loader's registration.

    For every entry of ``expected_variables`` exactly one line is printed:
    a "not found" message when the loader reports index -1 for the name,
    an "unexpected type" message when the registered type differs from the
    expected one, and an "Expected/Actual" line when both agree.

    Args:
        loader: Object exposing ``GetDataSetInfo()``, whose result provides
            ``FindVarIndex(name)`` and ``GetVariableInfo(index).GetVarType()``.
        expected_variables: Mapping of variable name to expected type code.

    Returns:
        None. The result is reported on standard output only.
    """
    # Look the dataset info up once instead of on every access.
    dataset_info = loader.GetDataSetInfo()
    for var_name, var_type in expected_variables.items():
        var_index = dataset_info.FindVarIndex(var_name)
        if var_index == -1:
            print(f"Variable '{var_name}' not found in the dataset.")
            continue
        # Query the registered type a single time and reuse it for both the
        # comparison and the message.
        actual_type = dataset_info.GetVariableInfo(var_index).GetVarType()
        if actual_type != var_type:
            print(f"Variable '{var_name}' has an unexpected type. Expected: {var_type}, Actual: {actual_type}")
        else:
            print(f"Expected: {var_type}, Actual: {actual_type}")

# Verify that every entry of input_variables is registered in the loader
check_variable_names_and_types(loader=loader, expected_variables=input_variables)

Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F
Expected: F, Actual: F


In [5]:


# Hand the samples to the DataLoader. Every tree is added with weight 1.0.
# NOTE(review): the description at the top of this file mentions a per-event
# weight branch, but no weight expression is set in the cells shown here -
# confirm whether that is intended.
unit_weight = 1.0
loader.AddSignalTree(signalTree, unit_weight)

# Background trees currently used for the training, in registration order.
# Entries that are commented out are deliberately left out.
active_background_trees = (
    # backgroundTreeQCDbEnriched1,
    # backgroundTreeQCDbEnriched2,
    # backgroundTreeQCDbEnriched3,
    backgroundTreeQCDbEnriched4,
    backgroundTreeQCDbEnriched5,
    backgroundTreeQCDbEnriched6,
    backgroundTreeQCDbEnriched7,
    backgroundTreeQCDbEnriched8,
    # backgroundTreeQCDbGenFilter1,
    # backgroundTreeQCDbGenFilter2,
    # backgroundTreeQCDbGenFilter3,
    backgroundTreeQCDbGenFilter4,
    backgroundTreeQCDbGenFilter5,
    backgroundTreeQCDbGenFilter6,
    backgroundTreeQCDbGenFilter7,
    # backgroundTreeQCDbGenFilter8,
)
for background_tree in active_background_trees:
    loader.AddBackgroundTree(background_tree, unit_weight)




<HEADER> DataSetInfo              : [dataset] : Added class "Signal"
                         : Add Tree mssmhbb_MVA of type Signal with 144900 events
<HEADER> DataSetInfo              : [dataset] : Added class "Background"
                         : Add Tree mssmhbb_MVA of type Background with 10324 events
                         : Add Tree mssmhbb_MVA of type Background with 3361 events
                         : Add Tree mssmhbb_MVA of type Background with 2701 events
                         : Add Tree mssmhbb_MVA of type Background with 2368 events
                         : Add Tree mssmhbb_MVA of type Background with 1359 events
                         : Add Tree mssmhbb_MVA of type Background with 207 events
                         : Add Tree mssmhbb_MVA of type Background with 279 events
                         : Add Tree mssmhbb_MVA of type Background with 133 events
                         : Add Tree mssmhbb_MVA of type Background with 135 events


In [6]:
# Selection cuts for signal and background; empty strings apply no selection
signalCut = ""
backgroundCut = ""

# Split configuration passed to TMVA: requested training sizes for both
# classes, random splitting, NormMode=NumEvents, verbose output off.
# Alternative option strings kept for reference (currently disabled):
#   "nTrain_Signal=:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:!V"
#   "nTrain_Signal=0.3:nTrain_Background=0.3:SplitMode=Random:NormMode=NumEvents:RandomSeed=1234:!V"
split_options = "nTrain_Signal=1000:nTrain_Background=200:SplitMode=Random:NormMode=NumEvents:!V"
loader.PrepareTrainingAndTestTree(signalCut, backgroundCut, split_options)

In [7]:

# Book the classifier: factory.BookMethod(loader, methodType, methodTitle, options).
# The empty option string means no BDT setting is overridden here.
method_type = TString("BDT")
method_title = TString("BDT")
factory.BookMethod(loader, method_type, method_title, "")

<cppyy.gbl.TMVA.MethodBDT object at 0x7b19400>

<HEADER> Factory                  : Booking method: BDT
                         : 
                         : Building event vectors for type 2 Signal
                         : Dataset[dataset] :  create input formulas for tree mssmhbb_MVA
                         : Building event vectors for type 2 Background
                         : Dataset[dataset] :  create input formulas for tree mssmhbb_MVA
                         : Building event vectors for type 2 Background
                         : Dataset[dataset] :  create input formulas for tree mssmhbb_MVA
                         : Building event vectors for type 2 Background
                         : Dataset[dataset] :  create input formulas for tree mssmhbb_MVA
                         : Building event vectors for type 2 Background
                         : Dataset[dataset] :  create input formulas for tree mssmhbb_MVA
                         : Building event vectors for type 2 Background
                         : Dataset[dat

In [8]:
# Run the booked method(s) through the three factory stages.
# The calls are kept in this order: training first, then testing,
# then evaluation of the results.
factory.TrainAllMethods()
factory.TestAllMethods()
factory.EvaluateAllMethods()

<HEADER> Factory                  : Train all methods
<HEADER> Factory                  : [dataset] : Create Transformation "I" with events from all classes.
                         : 
<HEADER>                          : Transformation, Variable selection : 
                         : Input : variable 'dr_jets12' <---> Output : variable 'dr_jets12'
                         : Input : variable 'dr_jets13' <---> Output : variable 'dr_jets13'
                         : Input : variable 'dr_jets23' <---> Output : variable 'dr_jets23'
                         : Input : variable 'eta_jet1' <---> Output : variable 'eta_jet1'
                         : Input : variable 'eta_jet2' <---> Output : variable 'eta_jet2'
                         : Input : variable 'eta_jet3' <---> Output : variable 'eta_jet3'
                         : Input : variable 'deta_jet12' <---> Output : variable 'deta_jet12'
                         : Input : variable 'deta_jet13' <---> Output : variable 'deta_jet13'
      

0%, time left: unknown
6%, time left: 1 sec
12%, time left: 1 sec
18%, time left: 1 sec
25%, time left: 0 sec
31%, time left: 0 sec
37%, time left: 0 sec
43%, time left: 0 sec
50%, time left: 0 sec
56%, time left: 0 sec
62%, time left: 0 sec
68%, time left: 0 sec
75%, time left: 0 sec
81%, time left: 0 sec
87%, time left: 0 sec
93%, time left: 0 sec
0%, time left: unknown
7%, time left: 0 sec
13%, time left: 0 sec
19%, time left: 0 sec
25%, time left: 0 sec
32%, time left: 0 sec
38%, time left: 0 sec
44%, time left: 0 sec
50%, time left: 0 sec
57%, time left: 0 sec
63%, time left: 0 sec
69%, time left: 0 sec
75%, time left: 0 sec
82%, time left: 0 sec
88%, time left: 0 sec
94%, time left: 0 sec
0%, time left: unknown
6%, time left: 5 sec
12%, time left: 5 sec
18%, time left: 5 sec
25%, time left: 4 sec
31%, time left: 4 sec
37%, time left: 3 sec
43%, time left: 3 sec
50%, time left: 3 sec
56%, time left: 2 sec
62%, time left: 2 sec
68%, time left: 1 sec
75%, time left: 1 sec
81%, time 

In [9]:
# Release the factory, then close the output file followed by every input file.
# NOTE(review): nothing is written explicitly here; presumably the results are
# flushed when outputFile is closed - confirm.
factory.Delete()

files_to_close = (
    outputFile,
    signalFile,
    backgroundFileQCDbEnriched1,
    backgroundFileQCDbEnriched2,
    backgroundFileQCDbEnriched3,
    backgroundFileQCDbEnriched4,
    backgroundFileQCDbEnriched5,
    backgroundFileQCDbEnriched6,
    backgroundFileQCDbEnriched7,
    backgroundFileQCDbEnriched8,
    backgroundFileQCDbGenFilter1,
    backgroundFileQCDbGenFilter2,
    backgroundFileQCDbGenFilter3,
    backgroundFileQCDbGenFilter4,
    backgroundFileQCDbGenFilter5,
    backgroundFileQCDbGenFilter6,
    backgroundFileQCDbGenFilter7,
    backgroundFileQCDbGenFilter8,
)
for open_file in files_to_close:
    open_file.Close()