In [None]:
from IPython.display import display, HTML

# Inject a CSS rule that forces elements of class "container" to 100% width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [None]:
import sys
import numpy as np
import os
import re
import glob
import h5py

import matplotlib.pyplot as plt
%matplotlib inline
#from keras import backend as K

import pylab
import random
import math
import sklearn
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.model_selection import train_test_split

import copy
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

sys.path.insert(0,'../python/')
from NN_helpers_keras import *
from tensorflow import keras
from tensorflow.keras import layers,callbacks,models,optimizers,layers,initializers,regularizers,constraints


In [None]:
import tensorflow as tf

# Report how many GPU devices TensorFlow lists as physically available.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Seed the global NumPy and standard-library random generators.
# `np.random.RandomState(seed=42)` only constructs a new generator object that is
# immediately discarded; it never touches the global state used by np.random.*.
# Keras weight initialisation and dropout draw from TensorFlow's own generator,
# so call tf.random.set_seed as well if bit-for-bit repeatable trainings are needed.
np.random.seed(42)
random.seed(42)

## W vs QCD

In [None]:
# ---- Shared settings for the W vs QCD study ----
inpDir = '../data/'

# Sizes of the four hidden Dense layers (first to last) used by the DNNs below.
hidden_size1, hidden_size2, hidden_size3, hidden_size4 = 500, 250, 100, 50
batch_size = 5000
epochs = 400
# Callable used as cmap(i) to colour the loss curves; get_cmap is presumably
# provided by the NN_helpers_keras star import -- TODO confirm.
cmap = get_cmap(30)

# Branch ordering (column index -> branch name):
#    0 selGenJets_nom_pt           1 selGenJets_nom_msoftdrop
#    2 selGenJets_nom_tau_0p5_1    3 selGenJets_nom_tau_1_1    4 selGenJets_nom_tau_2_1
#    5 selGenJets_nom_tau_0p5_2    6 selGenJets_nom_tau_1_2    7 selGenJets_nom_tau_2_2
#    8 selGenJets_nom_tau_0p5_3    9 selGenJets_nom_tau_1_3   10 selGenJets_nom_tau_2_3
#   11 selGenJets_nom_tau_0p5_4   12 selGenJets_nom_tau_1_4   13 selGenJets_nom_tau_2_4
#   14 selGenJets_nom_tau_0p5_5   15 selGenJets_nom_tau_1_5   16 selGenJets_nom_tau_2_5
#   17 selGenJets_nom_tau21       18 selGenJets_nom_tau32
#   19 selGenJets_nom_tau21_WTA   20 selGenJets_nom_tau32_WTA
#   21 selGenJets_nom_tau21_exkT  22 selGenJets_nom_tau32_exkT

# Column selections: the "<M>-body" entries feed the DNNs, the tau* entries are
# single-column observables.
cols = {
    '2-body': [3,4],
    '3-body': [2,3,4,6,7],
    '4-body': [2,3,4,5,6,7,9,10],
    '5-body': [2,3,4,5,6,7,8,9,10,12,13],
    '6-body': [2,3,4,5,6,7,8,9,10,11,12,13,15,16],
    'tau21': [17],
    'tau32': [18],
    'tau21_WTA': [19],
    'tau32_WTA': [20],
    'tau21_exkT': [21],
    'tau32_exkT': [22],
}

# M values for which an M-body DNN is trained.
Ms = [2,3,4,5,6]

In [None]:
import gc
from collections import OrderedDict

# Four independent, initially empty result containers:
# DNN scores and single-observable scores, each at gen and reco level.
genDictScoreNN, genDictScoreObs, recoDictScoreNN, recoDictScoreObs = (
    OrderedDict() for _ in range(4)
)


#### Training

In [None]:
#load data
# Read the W-like HDF5 samples for each level and split them into
# train/validate/test sets with split_data (helper defined outside this notebook).
# Every level's split is kept in datasets_by_level[<level>]; the flat
# inputs_*/target_* names are still assigned for the cells below, but note that
# they end up holding only the LAST level processed ('reco').
datasets_by_level = {}
for sampleType in ['Wlike']:#,'toplike']:
    for genORreco in ['gen','reco']:#,'reco']
        print(sampleType,genORreco)

        # Sorted so that, if several files match, the one that wins (the last)
        # is the same on every run instead of depending on directory order.
        matches = sorted(name for name in os.listdir(inpDir)
                         if sampleType in name and genORreco in name)
        if not matches:
            # Without this check the code would either hit a NameError or
            # silently reuse the previous level's dataset_dict.
            raise FileNotFoundError(
                f"No file containing '{sampleType}' and '{genORreco}' found in {inpDir}")
        if len(matches) > 1:
            print(f"WARNING: {len(matches)} files match {sampleType}/{genORreco}; only the last one is kept")

        for f in matches:
            print (f,genORreco)
            # Context manager closes the file even if reading or reshaping fails.
            with h5py.File(inpDir+f,'r') as file:
                # Drop the leading axis of the stored 'inputs' array and flatten the targets.
                inputs = np.array(file['inputs']).reshape(file['inputs'].shape[1],file['inputs'].shape[2])
                targets = np.array(file['target']).flatten()#reshape(targets.shape[1])
            #print(inputs.shape[0],targets[0:100])
            dataset_dict = split_data(inputs,targets)

        datasets_by_level[genORreco] = dataset_dict

        print(f"Train set shape: Inputs {dataset_dict['inputs']['train'].shape}, Target {dataset_dict['targets']['train'].shape}")
        print(f"Validate set shape: Inputs {dataset_dict['inputs']['validate'].shape}, Target {dataset_dict['targets']['validate'].shape}")
        print(f"Test set shape: Inputs {dataset_dict['inputs']['test'].shape}, Target {dataset_dict['targets']['test'].shape}")

        ### M-body DNN training for W/top vs. QCD ###
        inputs_train = dataset_dict['inputs']['train']
        inputs_val = dataset_dict['inputs']['validate']
        inputs_test = dataset_dict['inputs']['test']
        target_train = dataset_dict['targets']['train']
        target_val = dataset_dict['targets']['validate']
        target_test = dataset_dict['targets']['test']

In [None]:
# Train nine separate DNNs for every M-body basis and draw their loss curves,
# one figure per basis and level label.
# NOTE(review): inputs_*/target_* are the arrays left behind by the data-loading
# cell, whose loop overwrites them for each level. Both the 'gen' and the 'reco'
# pass below therefore fit on the same arrays (the last level loaded) -- confirm
# this is intended before comparing gen and reco models.
for sampleType in ['Wlike']:#,'toplike']:
    for genORreco in ['gen','reco']:#,'reco']
        print(sampleType,genORreco)

        # [best test AUC so far, training index that produced it]. It has to exist
        # before the first comparison below; without it a fresh kernel raises NameError.
        score=[0,0]

        for M in Ms:
            plt.figure(figsize=(12, 9))

            feature_cols = cols[f'{M}-body']
            input_shape=(len(feature_cols),)
            for i in range(1,10):
                DNN_WvsQCD_Mbody = keras.Sequential([
                        layers.Dense(hidden_size1, activation='relu', input_shape=input_shape),
                        layers.Dropout(0.2),
                        layers.Dense(hidden_size2, activation='relu'),
                        layers.Dropout(0.2),
                        layers.Dense(hidden_size3, activation='relu'),
                        layers.Dropout(0.1),
                        layers.Dense(hidden_size4, activation='relu'),
                        layers.Dropout(0.1),
                        layers.Dense(1, activation='sigmoid')
                    ])
                DNN_WvsQCD_Mbody.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
                #DNN_WvsQCD_Mbody.summary()
                modelname = f"{genORreco}_DNN_WvsQCD_{M}body_{i}"
                #print (f"Training {modelname}")
                check = keras.callbacks.ModelCheckpoint(filepath=f"../saved_weightsAndES/{modelname}_check.h5", verbose=0)
                early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto')
                history = DNN_WvsQCD_Mbody.fit(
                                                inputs_train[:,feature_cols], target_train,
                                                batch_size=batch_size, epochs=epochs,
                                                steps_per_epoch=5,
                                                validation_data=(inputs_val[:,feature_cols], target_val),
                                                validation_batch_size=int(batch_size/2), validation_steps=4,
                                                verbose=0, callbacks=[check, early]
                                               )
                # savelosses/savemodel are helpers defined outside this notebook.
                savelosses(history,outDir='../saved_weightsAndES/',name=modelname)
                savemodel(DNN_WvsQCD_Mbody,outDir='../models/',name=modelname)

                # Solid line: training loss, dashed line: validation loss.
                plt.plot(history.history['loss'], color=cmap(i), label=(f"{i}_loss"))
                plt.plot(history.history['val_loss'], linestyle="--", color=cmap(i))

                pred = DNN_WvsQCD_Mbody.predict(inputs_test[:,feature_cols])
                fpr,tpr,_ = roc_curve(target_test,pred)
                area = auc(fpr, tpr)
                if area>score[0]:
                    score[0]=area
                    score[1]=i
                # Poorly performing trainings (AUC <= 0.6) are not reported ...
                if area>0.6:
                    print('Training %d, auc=%f'%(i,area),modelname,score)
                # ... but every model is released, so memory does not build up
                # over the many trainings run in this cell.
                del(DNN_WvsQCD_Mbody)
                keras.backend.clear_session()
                gc.collect()

            #plt.yscale('log')
            plt.xlabel('Epoch', fontsize = 20)
            plt.ylabel('Loss', fontsize = 20)
            plt.title('Training Error by Epoch', fontsize = 20)
            plt.legend(loc='best', fontsize = 15, fancybox=True, framealpha=0.0)
            plt.rc('xtick', labelsize = 16)
            plt.rc('ytick', labelsize = 16)
            plt.savefig(f"../Plots/{genORreco}_DNN_WvsQCD_{M}body_losses.pdf")
            plt.savefig(f"../Plots/{genORreco}_DNN_WvsQCD_{M}body_losses.png")
            plt.show()
            


#### Testing

In [None]:
# Evaluate the saved DNNs and the single-observable taggers on the test set and
# store, per level, the ROC curve of the best DNN for each basis and of each observable.
# NOTE(review): inputs_test/target_test are whatever the data-loading cell assigned
# last, so the 'gen' and 'reco' passes below score against the same arrays -- confirm.
genDictScoreNN=dict()
recoDictScoreNN=dict()
genDictScoreObs=dict()
recoDictScoreObs=dict()
gc.collect()

problem = ['WvsQCD']#,'topvsQCD']

# (DNN results, observable results) for each level label.
score_dicts = {'gen':  (genDictScoreNN, genDictScoreObs),
               'reco': (recoDictScoreNN, recoDictScoreObs)}

# Create one empty entry per basis / observable in both the gen and reco dicts.
for p in problem:#,'toplike']:
    for obs in cols.keys():
        if not('tau' in obs):
            genDictScoreNN[f'{obs}_DNN_{p}']=dict()
            recoDictScoreNN[f'{obs}_DNN_{p}']=dict()
        else:
            # tau32 is only used for the top problem, tau21 only for the W problem.
            if('32' in obs and 'W' in p): continue
            if('21' in obs and 'top' in p): continue
            genDictScoreObs[f'{obs}_{p}']=dict()
            recoDictScoreObs[f'{obs}_{p}']=dict()

print(genDictScoreNN,recoDictScoreNN,genDictScoreObs,recoDictScoreObs)

for p in problem:#,'toplike']:
    for genORreco in ['gen','reco']:
        nn_scores, obs_scores = score_dicts[genORreco]
        for obs in cols.keys():
            print(obs,genORreco,p)
            #print(sampleType,genORreco)

            if not('tau' in obs):

                # [best AUC so far, index of the training that achieved it]
                best_score=[0,0]
                for i in range(1,10):

                    modelname=f'{genORreco}_DNN_{p}_{obs.split("-")[0]}body_{i}'
                    model = loadmodel(modelname,inDir='../models/')
                    prediction = np.array(model.predict(np.array(inputs_test[:,cols[obs]]),verbose=0))
                    fpr, tpr, _ = roc_curve(np.array(target_test),prediction)
                    area = auc(fpr,tpr)

                    # Keep only the ROC of the best of the nine trainings.
                    if area>best_score[0]:
                        best_score[0]=area
                        best_score[1]=i
                        nn_scores[f'{obs}_DNN_{p}'].update(
                            fpr=fpr, tpr=tpr, AUC=np.round(area,3), model=modelname)
                        #print(obs,modelname,genDictScoreNN,recoDictScoreNN)
            else:
                if('32' in obs and 'W' in p): continue
                if('21' in obs and 'top' in p): continue

                inds_0 = np.where(target_test==0)
                inds_1 = np.where(target_test==1)
                # np.where returns a tuple of index arrays, so len(inds_0) is the
                # number of axes (always 1 here); report the number of jets instead.
                print (inds_0[0].size,inds_1[0].size)
                fpr,tpr,area = makeROC_obs(sig=inputs_test[:,cols[obs]][inds_1],
                                           bkg=inputs_test[:,cols[obs]][inds_0])
                obs_scores[f'{obs}_{p}'].update(fpr=fpr, tpr=tpr, AUC=np.round(area,3))
                    
            

In [None]:
# Dump the gen-level results: DNN entries first, then the single-observable entries.
for score_table in (genDictScoreNN, genDictScoreObs):
    print(score_table)

### Make saturation ROC curves

In [None]:
import mplhep as hep
import matplotlib.pyplot as plt
# Switch matplotlib styling to mplhep's "CMS" preset for the ROC figures.
hep.style.use("CMS")
%matplotlib inline

In [None]:
# Hadron-level saturation plot: signal efficiency (TPR) against QCD rejection
# (1/FPR) for the best DNN of each basis and for the single tau21 observables.
fig, ax = plt.subplots(figsize=(12,9))

# An FPR of exactly 0 (sklearn's roc_curve starts there) makes 1/FPR infinite and
# numpy emits a divide-by-zero RuntimeWarning for every curve; silence only that
# warning while drawing.
with np.errstate(divide='ignore'):
    #gen level
    for i in genDictScoreNN.keys():
        ax.plot(genDictScoreNN[i]['tpr'],1./genDictScoreNN[i]['fpr'], label=i.split('_DNN')[0]+f' DNN (AUC={genDictScoreNN[i]["AUC"]})',
                ls='-', lw=2)
    for i in genDictScoreObs.keys():
        print(i)
        if 'ex' in i: lab=r'$\%s_{21}^{%s}$'%(i.split('21')[0],r'excl.\ k_{T}')
        elif 'WTA' in i: lab=r'$\%s_{21}^{%s}$'%(i.split('21')[0],r'WTA\ k_{T}')
        elif i=='tau21_WvsQCD': lab=r'$\%s_{21}^{%s}$'%(i.split('21')[0],r'OP\ k_{T}' )
        else: lab=i  # unexpected key: use it verbatim rather than an unbound or stale label
        ax.plot(genDictScoreObs[i]['tpr'],1./genDictScoreObs[i]['fpr'], label=lab+f' (AUC={genDictScoreObs[i]["AUC"]})',
                ls='--', lw=2)

hep.cms.label(rlabel=r"Boosted $W$ vs. QCD, 13 TeV")
ax.legend(title='Hadron-level',ncols=1,fontsize=16,title_fontsize=18)
ax.set_xlabel('Signal efficiency', loc='center')
ax.set_ylabel('QCD rejection rate', loc='center')
ax.set_yscale('log')

ax.set_ylim(0.8,700)
ax.set_xlim(0,1.0)
fig.savefig('../Plots/SaturationROC_gen_WvsQCD.pdf', dpi=1200)
fig.savefig('../Plots/SaturationROC_gen_WvsQCD.png', dpi=1200)



In [None]:
# Detector-level saturation plot: signal efficiency (TPR) against QCD rejection
# (1/FPR) for the best DNN of each basis and for the single tau21 observables.
fig, ax = plt.subplots(figsize=(12,9))

# An FPR of exactly 0 (sklearn's roc_curve starts there) makes 1/FPR infinite and
# numpy emits a divide-by-zero RuntimeWarning for every curve; silence only that
# warning while drawing.
with np.errstate(divide='ignore'):
    #reco level
    for i in recoDictScoreNN.keys():
        ax.plot(recoDictScoreNN[i]['tpr'],1./recoDictScoreNN[i]['fpr'], label=i.split('_DNN')[0]+f' DNN (AUC={recoDictScoreNN[i]["AUC"]})',
                ls='-', lw=2)
    for i in recoDictScoreObs.keys():
        print(i)
        if 'ex' in i: lab=r'$\%s_{21}^{%s}$'%(i.split('21')[0],r'excl.\ k_{T}')
        elif 'WTA' in i: lab=r'$\%s_{21}^{%s}$'%(i.split('21')[0],r'WTA\ k_{T}')
        elif i=='tau21_WvsQCD': lab=r'$\%s_{21}^{%s}$'%(i.split('21')[0],r'OP\ k_{T}' )
        else: lab=i  # unexpected key: use it verbatim rather than an unbound or stale label
        ax.plot(recoDictScoreObs[i]['tpr'],1./recoDictScoreObs[i]['fpr'], label=lab+f' (AUC={recoDictScoreObs[i]["AUC"]})',
                ls='--', lw=2)

hep.cms.label(rlabel=r"Boosted $W$ vs. QCD, 13 TeV")
ax.legend(title='Detector-level',ncols=1,fontsize=16,title_fontsize=18)
ax.set_xlabel('Signal efficiency', loc='center')
ax.set_ylabel('QCD rejection rate', loc='center')
ax.set_yscale('log')

ax.set_ylim(0.8,700)
ax.set_xlim(0,1.0)
fig.savefig('../Plots/SaturationROC_reco_WvsQCD.pdf', dpi=1200)
fig.savefig('../Plots/SaturationROC_reco_WvsQCD.png', dpi=1200)



In [None]:
# Force a garbage-collection pass before the top vs QCD section starts.
gc.collect()

## top vs QCD

#### Configuration

In [None]:
# ---- Shared settings for the top vs QCD study ----
inpDir = '../data/'

# Sizes of the four hidden Dense layers (first to last) used by the DNNs below.
hidden_size1, hidden_size2, hidden_size3, hidden_size4 = 500, 250, 100, 50
batch_size = 5000
epochs = 400
# Callable used as cmap(i) to colour the loss curves; get_cmap is presumably
# provided by the NN_helpers_keras star import -- TODO confirm.
cmap = get_cmap(30)

# Branch ordering (column index -> branch name):
#    0 selGenJets_nom_pt           1 selGenJets_nom_msoftdrop
#    2 selGenJets_nom_tau_0p5_1    3 selGenJets_nom_tau_1_1    4 selGenJets_nom_tau_2_1
#    5 selGenJets_nom_tau_0p5_2    6 selGenJets_nom_tau_1_2    7 selGenJets_nom_tau_2_2
#    8 selGenJets_nom_tau_0p5_3    9 selGenJets_nom_tau_1_3   10 selGenJets_nom_tau_2_3
#   11 selGenJets_nom_tau_0p5_4   12 selGenJets_nom_tau_1_4   13 selGenJets_nom_tau_2_4
#   14 selGenJets_nom_tau_0p5_5   15 selGenJets_nom_tau_1_5   16 selGenJets_nom_tau_2_5
#   17 selGenJets_nom_tau21       18 selGenJets_nom_tau32
#   19 selGenJets_nom_tau21_WTA   20 selGenJets_nom_tau32_WTA
#   21 selGenJets_nom_tau21_exkT  22 selGenJets_nom_tau32_exkT

# Column selections: the "<M>-body" entries feed the DNNs, the tau* entries are
# single-column observables.
cols = {
    '2-body': [3,4],
    '3-body': [2,3,4,6,7],
    '4-body': [2,3,4,5,6,7,9,10],
    '5-body': [2,3,4,5,6,7,8,9,10,12,13],
    '6-body': [2,3,4,5,6,7,8,9,10,11,12,13,15,16],
    'tau21': [17],
    'tau32': [18],
    'tau21_WTA': [19],
    'tau32_WTA': [20],
    'tau21_exkT': [21],
    'tau32_exkT': [22],
}

# M values for which an M-body DNN is trained.
Ms = [2,3,4,5,6]

In [None]:
import gc
from collections import OrderedDict

# Four independent, initially empty result containers:
# DNN scores and single-observable scores, each at gen and reco level.
genDictScoreNN, genDictScoreObs, recoDictScoreNN, recoDictScoreObs = (
    OrderedDict() for _ in range(4)
)


#### Training

In [None]:
#load data
# Read the top-like HDF5 samples for each level and split them into
# train/validate/test sets with split_data (helper defined outside this notebook).
# Every level's split is kept in datasets_by_level[<level>]; the flat
# inputs_*/target_* names are still assigned for the cells below, but note that
# they end up holding only the LAST level processed ('reco').
datasets_by_level = {}
for sampleType in ['toplike']:
    for genORreco in ['gen','reco']:#,'reco']
        print(sampleType,genORreco)

        # Sorted so that, if several files match, the one that wins (the last)
        # is the same on every run instead of depending on directory order.
        matches = sorted(name for name in os.listdir(inpDir)
                         if sampleType in name and genORreco in name)
        if not matches:
            # Without this check the code would either hit a NameError or
            # silently reuse a previously loaded dataset_dict.
            raise FileNotFoundError(
                f"No file containing '{sampleType}' and '{genORreco}' found in {inpDir}")
        if len(matches) > 1:
            print(f"WARNING: {len(matches)} files match {sampleType}/{genORreco}; only the last one is kept")

        for f in matches:
            print (f,genORreco)
            # Context manager closes the file even if reading or reshaping fails.
            with h5py.File(inpDir+f,'r') as file:
                # Drop the leading axis of the stored 'inputs' array and flatten the targets.
                inputs = np.array(file['inputs']).reshape(file['inputs'].shape[1],file['inputs'].shape[2])
                targets = np.array(file['target']).flatten()#reshape(targets.shape[1])
            #print(inputs.shape[0],targets[0:100])
            dataset_dict = split_data(inputs,targets)

        datasets_by_level[genORreco] = dataset_dict

        print(f"Train set shape: Inputs {dataset_dict['inputs']['train'].shape}, Target {dataset_dict['targets']['train'].shape}")
        print(f"Validate set shape: Inputs {dataset_dict['inputs']['validate'].shape}, Target {dataset_dict['targets']['validate'].shape}")
        print(f"Test set shape: Inputs {dataset_dict['inputs']['test'].shape}, Target {dataset_dict['targets']['test'].shape}")

        ### M-body DNN training for W/top vs. QCD ###
        inputs_train = dataset_dict['inputs']['train']
        inputs_val = dataset_dict['inputs']['validate']
        inputs_test = dataset_dict['inputs']['test']
        target_train = dataset_dict['targets']['train']
        target_val = dataset_dict['targets']['validate']
        target_test = dataset_dict['targets']['test']

In [None]:
# Train nine separate DNNs for every M-body basis and draw their loss curves,
# one figure per basis and level label.
# NOTE(review): inputs_*/target_* are the arrays left behind by the data-loading
# cell, whose loop overwrites them for each level. Both the 'gen' and the 'reco'
# pass below therefore fit on the same arrays (the last level loaded) -- confirm
# this is intended before comparing gen and reco models.
for sampleType in ['toplike']:#,'toplike']:
    for genORreco in ['gen','reco']:#,'reco']
        print(sampleType,genORreco)

        # [best test AUC so far, training index that produced it]
        score=[0,0]

        for M in Ms:
            plt.figure(figsize=(12, 9))

            feature_cols = cols[f'{M}-body']
            input_shape=(len(feature_cols),)
            for i in range(1,10):
                DNN_topvsQCD_Mbody = keras.Sequential([
                        layers.Dense(hidden_size1, activation='relu', input_shape=input_shape),
                        layers.Dropout(0.2),
                        layers.Dense(hidden_size2, activation='relu'),
                        layers.Dropout(0.2),
                        layers.Dense(hidden_size3, activation='relu'),
                        layers.Dropout(0.1),
                        layers.Dense(hidden_size4, activation='relu'),
                        layers.Dropout(0.1),
                        layers.Dense(1, activation='sigmoid')
                    ])
                DNN_topvsQCD_Mbody.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
                #DNN_topvsQCD_Mbody.summary()
                modelname = f"{genORreco}_DNN_topvsQCD_{M}body_{i}"
                #print (f"Training {modelname}")
                check = keras.callbacks.ModelCheckpoint(filepath=f"../saved_weightsAndES/{modelname}_check.h5", verbose=0)
                early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='auto')
                history = DNN_topvsQCD_Mbody.fit(
                                                inputs_train[:,feature_cols], target_train,
                                                batch_size=batch_size, epochs=epochs,
                                                steps_per_epoch=5,
                                                validation_data=(inputs_val[:,feature_cols], target_val),
                                                validation_batch_size=int(batch_size/2), validation_steps=4,
                                                verbose=0, callbacks=[check, early]
                                               )
                # savelosses/savemodel are helpers defined outside this notebook.
                savelosses(history,outDir='../saved_weightsAndES/',name=modelname)
                savemodel(DNN_topvsQCD_Mbody,outDir='../models/',name=modelname)

                # Solid line: training loss, dashed line: validation loss.
                plt.plot(history.history['loss'], color=cmap(i), label=(f"{i}_loss"))
                plt.plot(history.history['val_loss'], linestyle="--", color=cmap(i))

                pred = DNN_topvsQCD_Mbody.predict(inputs_test[:,feature_cols])
                fpr,tpr,_ = roc_curve(target_test,pred)
                area = auc(fpr, tpr)
                if area>score[0]:
                    score[0]=area
                    score[1]=i
                # Poorly performing trainings (AUC <= 0.6) are not reported ...
                if area>0.6:
                    print('Training %d, auc=%f'%(i,area),modelname,score)
                # ... but every model is released, so memory does not build up
                # over the many trainings run in this cell.
                del(DNN_topvsQCD_Mbody)
                keras.backend.clear_session()
                gc.collect()

            #plt.yscale('log')
            plt.xlabel('Epoch', fontsize = 20)
            plt.ylabel('Loss', fontsize = 20)
            plt.title('Training Error by Epoch', fontsize = 20)
            plt.legend(loc='best', fontsize = 15, fancybox=True, framealpha=0.0)
            plt.rc('xtick', labelsize = 16)
            plt.rc('ytick', labelsize = 16)
            plt.savefig(f"../Plots/{genORreco}_DNN_topvsQCD_{M}body_losses.pdf")
            plt.savefig(f"../Plots/{genORreco}_DNN_topvsQCD_{M}body_losses.png")
            plt.show()

#### Testing

In [None]:
# Evaluate the saved DNNs and the single-observable taggers on the test set and
# store, per level, the ROC curve of the best DNN for each basis and of each observable.
# NOTE(review): inputs_test/target_test are whatever the data-loading cell assigned
# last, so the 'gen' and 'reco' passes below score against the same arrays -- confirm.
genDictScoreNN=dict()
recoDictScoreNN=dict()
genDictScoreObs=dict()
recoDictScoreObs=dict()
gc.collect()

problem = ['topvsQCD']

# (DNN results, observable results) for each level label.
score_dicts = {'gen':  (genDictScoreNN, genDictScoreObs),
               'reco': (recoDictScoreNN, recoDictScoreObs)}

# Create one empty entry per basis / observable in both the gen and reco dicts.
for p in problem:#,'toplike']:
    for obs in cols.keys():
        if not('tau' in obs):
            genDictScoreNN[f'{obs}_DNN_{p}']=dict()
            recoDictScoreNN[f'{obs}_DNN_{p}']=dict()
        else:
            # tau32 is only used for the top problem, tau21 only for the W problem.
            if('32' in obs and 'W' in p): continue
            if('21' in obs and 'top' in p): continue
            genDictScoreObs[f'{obs}_{p}']=dict()
            recoDictScoreObs[f'{obs}_{p}']=dict()

print(genDictScoreNN,recoDictScoreNN,genDictScoreObs,recoDictScoreObs)

for p in problem:#,'toplike']:
    for genORreco in ['gen','reco']:
        nn_scores, obs_scores = score_dicts[genORreco]
        for obs in cols.keys():
            print(obs,genORreco,p)
            #print(sampleType,genORreco)

            if not('tau' in obs):

                # [best AUC so far, index of the training that achieved it]
                best_score=[0,0]
                for i in range(1,10):

                    modelname=f'{genORreco}_DNN_{p}_{obs.split("-")[0]}body_{i}'
                    model = loadmodel(modelname,inDir='../models/')
                    prediction = np.array(model.predict(np.array(inputs_test[:,cols[obs]]),verbose=0))
                    fpr, tpr, _ = roc_curve(np.array(target_test),prediction)
                    area = auc(fpr,tpr)

                    # Keep only the ROC of the best of the nine trainings.
                    if area>best_score[0]:
                        best_score[0]=area
                        best_score[1]=i
                        nn_scores[f'{obs}_DNN_{p}'].update(
                            fpr=fpr, tpr=tpr, AUC=np.round(area,3), model=modelname)
                        #print(obs,modelname,genDictScoreNN,recoDictScoreNN)
            else:
                if('32' in obs and 'W' in p): continue
                if('21' in obs and 'top' in p): continue

                inds_0 = np.where(target_test==0)
                inds_1 = np.where(target_test==1)
                # np.where returns a tuple of index arrays, so len(inds_0) is the
                # number of axes (always 1 here); report the number of jets instead.
                print (inds_0[0].size,inds_1[0].size)
                fpr,tpr,area = makeROC_obs(sig=inputs_test[:,cols[obs]][inds_1],
                                           bkg=inputs_test[:,cols[obs]][inds_0])
                obs_scores[f'{obs}_{p}'].update(fpr=fpr, tpr=tpr, AUC=np.round(area,3))
                    
            

In [None]:
# Dump the gen-level results: DNN entries first, then the single-observable entries.
for score_table in (genDictScoreNN, genDictScoreObs):
    print(score_table)

### Make saturation ROC curves

In [None]:
import mplhep as hep
import matplotlib.pyplot as plt
# Switch matplotlib styling to mplhep's "CMS" preset for the ROC figures.
hep.style.use("CMS")
%matplotlib inline

In [None]:
# Hadron-level saturation plot: signal efficiency (TPR) against QCD rejection
# (1/FPR) for the best DNN of each basis and for the single tau32 observables.
fig, ax = plt.subplots(figsize=(12,9))

# An FPR of exactly 0 (sklearn's roc_curve starts there) makes 1/FPR infinite and
# numpy emits a divide-by-zero RuntimeWarning for every curve; silence only that
# warning while drawing.
with np.errstate(divide='ignore'):
    #gen level
    for i in genDictScoreNN.keys():
        ax.plot(genDictScoreNN[i]['tpr'],1./genDictScoreNN[i]['fpr'], label=i.split('_DNN')[0]+f' DNN (AUC={genDictScoreNN[i]["AUC"]})',
                ls='-', lw=2)
    for i in genDictScoreObs.keys():
        print(i)
        if 'ex' in i: lab=r'$\%s_{32}^{%s}$'%(i.split('32')[0],r'excl.\ k_{T}')
        elif 'WTA' in i: lab=r'$\%s_{32}^{%s}$'%(i.split('32')[0],r'WTA\ k_{T}')
        elif i=='tau32_topvsQCD': lab=r'$\%s_{32}^{%s}$'%(i.split('32')[0],r'OP\ k_{T}' )
        else: lab=i  # unexpected key: use it verbatim rather than an unbound or stale label
        ax.plot(genDictScoreObs[i]['tpr'],1./genDictScoreObs[i]['fpr'], label=lab+f' (AUC={genDictScoreObs[i]["AUC"]})',
                ls='--', lw=2)

hep.cms.label(rlabel=r"Boosted $top$ vs. QCD, 13 TeV")
ax.legend(title='Hadron-level',ncols=1,fontsize=16,title_fontsize=18)
ax.set_xlabel('Signal efficiency', loc='center')
ax.set_ylabel('QCD rejection rate', loc='center')
ax.set_yscale('log')

ax.set_ylim(0.8,700)
ax.set_xlim(0,1.0)
fig.savefig('../Plots/SaturationROC_gen_topvsQCD.pdf', dpi=1200)
fig.savefig('../Plots/SaturationROC_gen_topvsQCD.png', dpi=1200)



In [None]:
# Detector-level saturation plot: signal efficiency (TPR) against QCD rejection
# (1/FPR) for the best DNN of each basis and for the single tau32 observables.
fig, ax = plt.subplots(figsize=(12,9))

# An FPR of exactly 0 (sklearn's roc_curve starts there) makes 1/FPR infinite and
# numpy emits a divide-by-zero RuntimeWarning for every curve; silence only that
# warning while drawing.
with np.errstate(divide='ignore'):
    #reco level
    for i in recoDictScoreNN.keys():
        ax.plot(recoDictScoreNN[i]['tpr'],1./recoDictScoreNN[i]['fpr'], label=i.split('_DNN')[0]+f' DNN (AUC={recoDictScoreNN[i]["AUC"]})',
                ls='-', lw=2)
    for i in recoDictScoreObs.keys():
        print(i)
        if 'ex' in i: lab=r'$\%s_{32}^{%s}$'%(i.split('32')[0],r'excl.\ k_{T}')
        elif 'WTA' in i: lab=r'$\%s_{32}^{%s}$'%(i.split('32')[0],r'WTA\ k_{T}')
        elif i=='tau32_topvsQCD': lab=r'$\%s_{32}^{%s}$'%(i.split('32')[0],r'OP\ k_{T}' )
        else: lab=i  # unexpected key: use it verbatim rather than an unbound or stale label
        ax.plot(recoDictScoreObs[i]['tpr'],1./recoDictScoreObs[i]['fpr'], label=lab+f' (AUC={recoDictScoreObs[i]["AUC"]})',
                ls='--', lw=2)

hep.cms.label(rlabel=r"Boosted $top$ vs. QCD, 13 TeV")
ax.legend(title='Detector-level',ncols=1,fontsize=16,title_fontsize=18)
ax.set_xlabel('Signal efficiency', loc='center')
ax.set_ylabel('QCD rejection rate', loc='center')
ax.set_yscale('log')

ax.set_ylim(0.8,700)
ax.set_xlim(0,1.0)
fig.savefig('../Plots/SaturationROC_reco_topvsQCD.pdf', dpi=1200)
fig.savefig('../Plots/SaturationROC_reco_topvsQCD.png', dpi=1200)



In [None]:
# Force a garbage-collection pass now that the top vs QCD plots are done.
gc.collect()

In [None]:
from IPython.core.display import HTML

# Emit a script tag that asks the front end to restart the kernel.
# NOTE(review): the `Jupyter.notebook` JavaScript object is presumably only
# available in the classic Notebook front end -- confirm for JupyterLab/VS Code.
restart_snippet = "<script>Jupyter.notebook.kernel.restart()</script>"
HTML(restart_snippet)