## Setup


In [None]:
# Notebook to develop the BDT response & selection performance 
# work done by Kaushal Gumpula 
# see more at: https://github.com/kgumpula2/searchingfornues_BDT

In [None]:
import sys
import uproot
import matplotlib.pylab as pylab
import numpy as np
import math
from sklearn.model_selection import train_test_split
import pickle
import xgboost as xgb
import awkward
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import gridspec
from selection_functions_kaushal import *

In [None]:
# Run-period switch: True selects the Run 3 (RHC) inputs, False the Run 1 (FHC) inputs.
# Later `if ISRUN3` branches (flux maps, POT factors, ntuple files, cut values) key off this flag.

ISRUN3 = False

In [None]:
# Open the offline flux-reweight maps (FHC or RHC file, depending on ISRUN3)
# and choose the output directory for plots of the same run period.

import ROOT

flux_maps = ROOT.TFile.Open(
    "/uboone/data/users/kmiller/numi-ppfx/uboone/CV/no-threshold/feb2020/rhc/ppfx_weight_maps_rhc.root"
    if ISRUN3 else
    "/uboone/data/users/kmiller/numi-ppfx/uboone/CV/no-threshold/feb2020/fhc/ppfx_weight_maps.root"
)
plots_path = (
    "/uboone/data/users/kmiller/searchingfornues_v33/v08_00_00_33/plots/rhc/"
    if ISRUN3 else
    "/uboone/app/users/kgumpula/work/searching_fornues_kaushal/plots/fhc/"
)

# One histogram per neutrino flavour, fetched from the file by name.
numu_map, numubar_map, nue_map, nuebar_map = (
    flux_maps.Get(hist_name)
    for hist_name in ("hratio_numu", "hratio_numubar", "hratio_nue", "hratio_nuebar")
)


In [None]:
# POT normalization factors and trigger counts for the selected run period.
# *_pot: protons-on-target of each sample; *_ntrig: number of triggers.

if ISRUN3:  # RHC POT
    overlay_pot, dirt_pot, beamon_pot = 1.57761E21, 4.65831e+20, 3.991E19
    beamon_ntrig, beamoff_ntrig = 815581.0, 1550231.025000
else:  # FHC (Run 1) POT
    overlay_pot, dirt_pot, beamon_pot = 2.31956E21, 1.42143E21, 8.793E19
    beamon_ntrig, beamoff_ntrig = 2323130.0, 4015961.99


In [None]:
# Location of the selection tree inside every ntuple file.
fold = "nuselection"
tree = "NeutrinoSelectionFilter"

path = '/uboone/data/users/kmistry/work/MCC9/searchingfornues/'

# File stems relative to `path`, without the ".root" suffix:
# OVRLY = overlay MC, EXT = beam-off, DRT = dirt overlay.
if ISRUN3:
    # Run 3b RHC
    OVRLY, EXT, DRT = (
        'ntuple_files_v5/neutrinoselection_filt_run3b_overlay',
        'ntuple_files_v2/neutrinoselection_filt_run3b_beamoff',
        'ntuple_files_v3/neutrinoselection_filt_run3b_dirt_overlay',
    )
else:
    # Run 1 FHC
    OVRLY, EXT, DRT = (
        'ntuple_files_v5/neutrinoselection_filt_run1_overlay',
        'ntuple_files_v3/neutrinoselection_filt_run1_beamoff',
        'ntuple_files_v3/neutrinoselection_filt_run1_dirt_overlay',
    )

# Open the three trees in the order overlay, beam-off, dirt.
overlay, ext, dirt = (
    uproot.open(path + stem + ".root")[fold][tree] for stem in (OVRLY, EXT, DRT)
)

# Keep handles to the raw trees: `overlay`, `ext` and `dirt` are rebound to
# DataFrames further down, but per-track branches are still read from these.
uproot_v = [overlay, ext, dirt]

# Branches read from every sample (overlay, dirt and beam-off).
variables = [
    "shr_dedx_Y", "selected", "nu_pdg", "shr_theta",
    "trk_score_v",
    "category", "shr_tkfit_dedx_Y", "shr_tkfit_dedx_U", "shr_tkfit_dedx_V",
    "shr_tkfit_nhits_Y", "shr_tkfit_nhits_U", "shr_tkfit_nhits_V",
    "shr_hits_tot", "shr_hits_max", "ccnc",
    "trk_bkt_pdg", "hits_ratio", "n_tracks_contained",
    "crtveto", "crthitpe", "_closestNuCosmicDist",
    "NeutrinoEnergy2",
    "run", "sub", "evt",
    "CosmicIP",
    "trk_llr_pid_score_v",  # trk-PID score
    "reco_nu_vtx_sce_x", "reco_nu_vtx_sce_y", "reco_nu_vtx_sce_z",
    "trkshrhitdist0", "trkshrhitdist1", "trkshrhitdist2",  # distance between track and shower in 2D
    "shrsubclusters0", "shrsubclusters1", "shrsubclusters2",  # number of sub-clusters in shower
    "shr_tkfit_npointsvalid", "shr_tkfit_npoints",  # fitted vs. all hits for shower
    "nproton", "nu_e", "n_showers_contained", "nu_purity_from_pfp",
    "shr_distance", "trk_distance", "isVtxInFiducial",
    "hits_y", "shr_pz", "shr_energy", "shr_dedx_U", "shr_dedx_V",
    "shr_phi", "trk_phi", "trk_theta",
    "trk_pid_chipr_v",
    "trk_len", "mc_pdg", "slnunhits", "slnhits", "shr_score", "trk_score", "trk_hits_tot",
    "true_e_visible", "matched_E", "shr_bkt_E", "trk_bkt_E",
    "trk_energy", "tksh_distance", "tksh_angle",
    "npi0", "topological_score",
    "shr_energy_tot_cali", "shr_dedx_Y_cali", "nslice", "interaction",
    "reco_nu_vtx_x", "reco_nu_vtx_y", "reco_nu_vtx_z", "contained_fraction",
    "true_nu_vtx_x", "true_nu_vtx_y", "true_nu_vtx_z",
    "npion", "shr_energy_cali", "all_trk_energies",
    "all_trk_hits", "all_shr_energies", "all_shr_hits",
    "shrmoliereavg", "CosmicDirAll3D", "CosmicIPAll3D",
    "elec_e", "proton_e", "nelec", "nmuon", "theta",
    "elec_px", "elec_py", "elec_pz", "nu_pt",
]

if not ISRUN3:  # i.e. if using Run1
    # These three branches are not requested for Run 1
    # (presumably they do not exist in the Run 1 ntuples -- TODO confirm).
    variables = [
        branch for branch in variables
        if branch not in ("_closestNuCosmicDist", "crtveto", "crthitpe")
    ]
    

# Convert each tree into a pandas DataFrame (jagged branches stay unflattened).
# The simulated samples additionally carry the GENIE weight, the flux weight
# and the software-trigger flag; beam-off (ext) only gets the common branches.
# Beam-on data is not loaded in this notebook.
mc_branches = variables + ["weightSplineTimesTune", "ppfx_cv", "swtrig_pre"]

overlay = overlay.pandas.df(mc_branches, flatten=False)
dirt = dirt.pandas.df(mc_branches, flatten=False)
ext = ext.pandas.df(variables, flatten=False)

# Pick, for every event, the LLR-PID score (presumably "log-likelihood-ratio
# particle ID") of the "track candidate" -- proton for the nue selection,
# muon for numu -- out of the per-track score vector.
# The same recipe works for any per-track branch. Code from Giuseppe.

df_v = [overlay,ext,dirt]

# uproot_v and df_v are both ordered overlay, ext, dirt.
for up, df in zip(uproot_v, df_v):
    pid_vectors = up.array('trk_llr_pid_score_v')
    # Shift trk_id by one to index into the vector (the original author notes
    # "I think we need this -1 to get the right result").
    candidate_idx = up.array('trk_id') - 1

    candidate_pid = []
    for pidv, tid in zip(pid_vectors, candidate_idx):
        # 9999. marks events whose candidate index lies past the end of the vector.
        # NOTE(review): a negative index (trk_id == 0) is not guarded here -- confirm it cannot occur.
        candidate_pid.append(pidv[tid] if tid < len(pidv) else 9999.)
    df['trkpid'] = awkward.fromiter(candidate_pid)

    # Total number of shower sub-clusters, summed over the three indexed branches.
    df['subcluster'] = df['shrsubclusters0'] + df['shrsubclusters1'] + df['shrsubclusters2']
    # Fraction of shower track-fit points that are valid.
    df['trkfit'] = df['shr_tkfit_npointsvalid'] / df['shr_tkfit_npoints']


In [None]:
# shr_tkfit_dedx_most: shower track-fit dE/dx taken from the plane with the most hits.
# Start from plane Y, switch to V where V has strictly more hits than Y, then
# switch to U where U has strictly more hits than both V and Y (ties keep the earlier choice).

for frame in (overlay, dirt, ext):
    nhits_y = frame['shr_tkfit_nhits_Y']
    nhits_u = frame['shr_tkfit_nhits_U']
    nhits_v = frame['shr_tkfit_nhits_V']

    v_beats_y = nhits_y < nhits_v
    u_beats_both = (nhits_v < nhits_u) & (nhits_y < nhits_u)

    frame['shr_tkfit_dedx_most'] = (
        frame['shr_tkfit_dedx_Y']
        .mask(v_beats_y, frame['shr_tkfit_dedx_V'])
        .mask(u_beats_both, frame['shr_tkfit_dedx_U'])
    )


## Weights

In [None]:
# Cuts applied for bad GENIE weights: any weight that is non-positive,
# infinite, larger than 100 or NaN is reset to 1.

for sample in (overlay, dirt):
    genie_w = sample['weightSplineTimesTune']
    is_bad = (genie_w <= 0) | (genie_w == np.inf) | (genie_w > 100) | np.isnan(genie_w)
    sample.loc[is_bad, 'weightSplineTimesTune'] = 1.

In [None]:
# Tuned POT normalization: extra multiplicative factors applied on top of the
# raw POT / trigger ratios for the dirt and beam-off samples.
dirt_tune = 0.35
ext_tune = .94 if ISRUN3 else .98

# Beam-on to beam-off trigger ratio, shared by both beam-off scale factors.
trigger_ratio = beamon_ntrig/beamoff_ntrig

# POT normalization weights (scale to overlay)
dirt_scale2 = dirt_tune*(overlay_pot/dirt_pot)
beamoff_scale2 = ext_tune*((overlay_pot/beamon_pot)*trigger_ratio)

dirt['pot_scale_overlay'] = dirt_scale2
ext['pot_scale_overlay'] = beamoff_scale2
overlay['pot_scale_overlay'] = [1] * len(overlay)  # overlay is the reference sample

data_pot = 11.95E20 if ISRUN3 else 9.23E20

# POT normalization weights (scaled to data)
dirt_scale3 = dirt_tune*(data_pot/dirt_pot)
beamoff_scale3 = ext_tune*((data_pot/beamon_pot)*trigger_ratio)

dirt['pot_scale_data'] = dirt_scale3
ext['pot_scale_data'] = beamoff_scale3
overlay['pot_scale_data'] = data_pot/overlay_pot

In [None]:
# Combined per-event weight = POT scale * GENIE weight * flux weight.
# No totweight_* column is written for ext in this cell.
# Author's note: flux weights will change values for RHC by at most 10%.

# Normalised to the OVERLAY POT (overlay itself needs no POT factor).
overlay_genie, overlay_flux = overlay['weightSplineTimesTune'], overlay['ppfx_cv']
overlay['totweight_overlay'] = overlay_genie * overlay_flux

dirt_genie, dirt_flux = dirt['weightSplineTimesTune'], dirt['ppfx_cv']
dirt['totweight_overlay'] = dirt['pot_scale_overlay'] * dirt_genie * dirt_flux

# Normalised to the data POT.
for mc_sample in (overlay, dirt):
    mc_sample['totweight_data'] = (
        mc_sample['pot_scale_data']
        * mc_sample['weightSplineTimesTune']
        * mc_sample['ppfx_cv']
    )

In [None]:
# For the FHC dataset, keep only runs up to 10000 (later runs are RHC).
# The beam-off sample is not filtered here.
if not ISRUN3:
    overlay = overlay[overlay['run'] <= 10000]
    dirt = dirt[dirt['run'] <= 10000]

## BDT

In [None]:
import selection_functions_BDT
import importlib
importlib.reload(selection_functions_BDT)
from selection_functions_BDT import *

In [None]:
# Preselection, written as one query clause per entry and joined with "and".
PRE_QUERY = ' and '.join([
    'nslice==1',
    '10<=reco_nu_vtx_sce_x<=246',
    '-106<=reco_nu_vtx_sce_y<=106',
    '10<=reco_nu_vtx_sce_z<=1026',
    'contained_fraction>0.9',
    'n_showers_contained>0',
    'n_tracks_contained>0',
    'shr_energy_tot_cali>0.07',
    'n_showers_contained==1',
])

# Loose cuts = preselection plus five additional requirements.
LOOSE_CUTS = ' and '.join([
    PRE_QUERY,
    'shr_score<0.3',
    'trkpid<0.35',
    'shrmoliereavg<15',
    'shr_tkfit_dedx_Y<7',
    'tksh_distance<12',
])

In [None]:
# Flag forwarded to the BDT helpers (main_BDT / bdt_raw_results). When True, the
# cross-validation section also builds an `is_nuebar_signal` column that accepts
# nu_pdg of 12 or -12, and Run 3 uses different boosting-round counts.
ISNUEBAR = False

In [None]:
# Linear box-cut selection used as the reference for the BDT performance.
# Only two clauses depend on the run period: the tksh_distance upper bound
# and the lower edge of the tksh_angle window.
# NOTE(review): the non-Run-3 tksh_angle window is asymmetric (-0.9, 0.8) -- confirm intended.
BOX_CUTS = ' and '.join([
    'nslice==1',
    '10<=reco_nu_vtx_sce_x<=246',
    '-106<=reco_nu_vtx_sce_y<=106',
    '10<=reco_nu_vtx_sce_z<=1026',
    'contained_fraction>0.9',
    'n_showers_contained==1',
    'n_tracks_contained>0',
    'shr_energy_tot_cali>0.07',
    'shrmoliereavg < 8.0',
    'tksh_distance < 4.0' if ISRUN3 else 'tksh_distance < 5.0',
    'shr_tkfit_dedx_Y < 4.0',
    '-0.8<tksh_angle<0.8' if ISRUN3 else '-0.9<tksh_angle<0.8',
    'trkpid < 0',
    'shr_score < 0.125',
])

## BDT Analysis

In [None]:
# ISDATA picks the MC weight column used below:
# 'totweight_data' when True, 'totweight_overlay' when False.
ISDATA = True

# Simulated sample = overlay + dirt events with swtrig_pre == 1, re-indexed from 0.
mc = pd.concat(
    [sample.query('swtrig_pre==1') for sample in (overlay, dirt)],
    ignore_index=True,
)

In [None]:
# pre-determined boosting round numbers
# p_rounds is passed to the PRE_QUERY BDT, lc_rounds to the LOOSE_CUTS BDT.
# Only the Run 3 nuebar configuration differs from the default.

if ISRUN3 and ISNUEBAR:
    p_rounds, lc_rounds = 200, 300
else:
    p_rounds, lc_rounds = 300, 200

In [None]:
# Run main_BDT (presumably star-imported from selection_functions_BDT) twice on the
# same MC and beam-off inputs: once with the preselection query (bdt_p) and once with
# the loose cuts, i.e. preselection plus extra requirements (bdt_lc).
# Both calls use test_size=0.5. The results are indexed positionally in later cells.
bdt_p = main_BDT(mc, ext, PRE_QUERY, p_rounds, test_size=0.5, ISDATA=ISDATA, ISNUEBAR=ISNUEBAR)
bdt_lc = main_BDT(mc, ext, LOOSE_CUTS, lc_rounds, test_size=0.5, ISDATA=ISDATA, ISNUEBAR=ISNUEBAR)

In [None]:
# Element 0 of each main_BDT result is what the plotting and purity/efficiency
# helpers below consume (presumably the scored test sample -- confirm in main_BDT).
lc_test_results = bdt_lc[0]
p_test_results = bdt_p[0]

In [None]:
# Plot the BDT response for the loose-cuts and the preselection BDT
# (presumably signal vs. background, going by the helper's name -- TODO confirm);
# the preselection plot is requested with is_log=True.
bdt_svb_plot(lc_test_results)
bdt_svb_plot(p_test_results, is_log=True)

In [None]:
# loose cuts bdt results split into infv, outfv, cosmic, and ext events
datasets_bdt = split_events(lc_test_results)

# BDT score distribution (30 bins on [0, 1]) and true neutrino energy `nu_e`
# (20 bins on [0, 5]) for events with BDT_score > 0.5, each drawn twice with the
# first positional boolean toggled.
# NOTE(review): the meaning of the two positional booleans is defined in plot_mc,
# which is not visible here -- consider passing them by keyword.
plot_mc('BDT_score', 30, 0, 1, 'tuple()', datasets_bdt, 'dist', True, False, ISRUN3, 'totweight_overlay')
plot_mc('BDT_score', 30, 0, 1, 'tuple()', datasets_bdt, 'dist', False, False, ISRUN3, 'totweight_overlay')
plot_mc('nu_e', 20, 0, 5, 'BDT_score > 0.5', datasets_bdt, 'BDT', True, False, ISRUN3, 'totweight_overlay', ylim=60)
plot_mc('nu_e', 20, 0, 5, 'BDT_score > 0.5', datasets_bdt, 'BDT', False, False, ISRUN3, 'totweight_overlay', ylim=140)

In [None]:
# Full (un-queried) test sample, taken from element 3 of the main_BDT result.
# Note: the test sample is the same regardless of the query, because the query is
# applied after the initial train/test split -- so bdt_lc[3] serves for both BDTs.
full_test_df = bdt_lc[3]

In [None]:
# Sanity check: sizes of main_BDT outputs 3 (the full test sample used above),
# 4 (not used elsewhere in this notebook) and 0 (the loose-cuts test results).
print(len(bdt_lc[3]))
print(len(bdt_lc[4]))
print(len(bdt_lc[0]))

In [None]:
# Scan BDT-score thresholds from 0 to 0.675 in steps of 0.025 for both BDTs
# (presumably plotting purity and efficiency vs. threshold -- confirm in bdt_pe_plot).
# The full test sample is passed alongside the scored results.
bdt_pe_plot(lc_test_results, np.arange(0, 0.7, 0.025), full_test_df, ISDATA)
bdt_pe_plot(p_test_results, np.arange(0, 0.7, 0.025), full_test_df, ISDATA)

In [None]:
# BDT-score thresholds used for the purity/efficiency scan.
xvals = np.arange(0, 0.7, 0.025)

mc_weight = 'totweight_data' if ISDATA else 'totweight_overlay'

# Reference performance of the linear box-cut selection on the full test sample.
# Denominator of the efficiency: all generated signal, including the
# cosmic-contaminated part (is_cont_signal).
gen_num = sum(full_test_df.query('is_signal==1 or is_cont_signal==1')[mc_weight])

# Weighted signal passing the box cuts (numerator of both quantities).
box_signal = sum(full_test_df.query(BOX_CUTS+' and is_signal==1')[mc_weight])
# NOTE(review): the purity denominator reads a 'weight' column that is not created in
# this notebook -- presumably added by main_BDT; confirm its normalisation matches mc_weight.
box_selected = sum(full_test_df.query(BOX_CUTS)['weight'])

eff_box = box_signal/gen_num * 100
pur_box = box_signal / box_selected * 100

results_box = [pur_box, eff_box]

In [None]:
# Purity/efficiency scan for both BDTs at the thresholds in xvals.
# bdt_pe returns four values; the cross-validation loop further down unpacks
# them as (purity, purity error, efficiency, efficiency error).
lc_bdt_pe = bdt_pe(bdt_lc[0], xvals, full_test_df, ISDATA)
p_bdt_pe = bdt_pe(bdt_p[0], xvals, full_test_df, ISDATA)

In [None]:
# Compare the BDT scan with the box-cut reference: first the loose-cuts BDT alone,
# then with the preselection BDT passed as second_results_bdt.
bdt_box_plot(lc_bdt_pe, results_box, xvals)
bdt_box_plot(lc_bdt_pe, results_box, xvals, second_results_bdt=p_bdt_pe)

## Cross-Validation

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold

In [None]:
# Configuration flags re-declared for the cross-validation section
# (same values as in the BDT sections above; changing them here only affects the cells below).
ISNUEBAR = False
ISDATA = True

In [None]:
# Weight column used for MC sums: normalised to data POT or to overlay POT.
mc_weight = 'totweight_data' if ISDATA else 'totweight_overlay'

In [None]:
# Repeated stratified k-fold: 2 folds repeated 20 times, i.e. 40 train/test
# iterations, each with a 50/50 split. The fixed random_state makes the
# splits reproducible.
splits = 2
repeats = 20
cv = RepeatedStratifiedKFold(n_splits=splits, n_repeats=repeats, random_state=36851234)

# BDT-score thresholds at which purity and efficiency are evaluated in every fold.
bdt_score_arr = np.arange(0, 0.7, 0.025)

In [None]:
# Pool MC (overlay + dirt) and beam-off events into one frame for the cross-validation splits.
# Columns that exist only on the MC frames (e.g. totweight_*) are NaN for the ext rows.
df_cv = pd.concat([mc, ext], ignore_index=True)

In [None]:
# Truth-level building blocks shared by all signal definitions below.

# Charged-current event with at least one proton and no charged or neutral pions.
cc_np_0pi = (df_cv.ccnc == 0) & (df_cv.nproton > 0) & (df_cv.npion == 0) & (df_cv.npi0 == 0)

# True neutrino vertex inside the fiducial volume.
true_vtx_in_fv = (
    (df_cv.true_nu_vtx_x >= 10) & (df_cv.true_nu_vtx_x <= 246)
    & (df_cv.true_nu_vtx_y >= -106) & (df_cv.true_nu_vtx_y <= 106)
    & (df_cv.true_nu_vtx_z >= 10) & (df_cv.true_nu_vtx_z <= 1026)
)

# The slice purity separates "true" signal (> 0.5) from cosmic-contaminated signal (<= 0.5).
high_purity = df_cv.nu_purity_from_pfp > 0.5
low_purity = df_cv.nu_purity_from_pfp <= 0.5

is_nue = df_cv.nu_pdg == 12

#cosmic cont. in FV signal definition for convenience in efficiency calculations later
df_cv['is_cont_signal'] = np.where(is_nue & cc_np_0pi & low_purity & true_vtx_in_fv, 1, 0)

#true signal definition
df_cv['is_signal'] = np.where(is_nue & cc_np_0pi & high_purity & true_vtx_in_fv, 1, 0)

if ISNUEBAR:
    #bdt signal definition (doesn't distinguish between nue and nuebar)
    is_nue_or_nuebar = is_nue | (df_cv.nu_pdg == -12)
    df_cv['is_nuebar_signal'] = np.where(
        is_nue_or_nuebar & cc_np_0pi & high_purity & true_vtx_in_fv, 1, 0
    )

In [None]:
# Input features handed to the cross-validation BDTs (bdt_raw_results).
# 'trkpid' and 'subcluster' are derived columns built in the loading cell.
varlist = [
    "shr_score", "shrmoliereavg", "trkpid",
    "n_showers_contained", "shr_tkfit_dedx_Y", "tksh_distance",
    "tksh_angle", "subcluster", "trkshrhitdist2"]
    
#model params
# Booster configuration passed to bdt_raw_results (presumably forwarded to xgboost -- confirm).
# NOTE(review): 'silent' is deprecated in recent xgboost releases in favour of
# 'verbosity' -- check against the installed version.
params = {
    'eta': 0.02,
    'tree_method': 'exact',
    'max_depth': 3,
    'subsample': 0.8,
    'colsample_bytree': 1,
    'silent': 1,
    'min_child_weight': 1,
    'seed': 2002,
    'gamma': 1,
    'max_delta_step': 0,
    #'scale_pos_weight': 4.7,
    'eval_metric': ['error', 'auc', 'aucpr']
}

In [None]:
# Per-fold accumulators. Each list receives exactly one entry per train/test split.

# Loose-cuts BDT: purity / efficiency curves and their errors.
final_purity = []
final_efficiency = []
fp_err = []
fe_err = []

# Preselection BDT: purity / efficiency curves and their errors.
fp_2 = []
fe_2 = []
fp_err2 = []
fe_err2 = []

# Linear box-cut selection: one purity / efficiency value (and error) per fold.
box_pur = []
box_eff = []
boxp_err = []
boxe_err = []

# Pre-determined boosting rounds; they do not depend on the fold, so they are
# chosen once here. Only the Run 3 nuebar configuration differs from the default.
if ISRUN3 and ISNUEBAR:
    p_rounds, lc_rounds = 200, 300
else:
    p_rounds, lc_rounds = 300, 200

for train_index, test_index in cv.split(df_cv, df_cv['is_signal']):
    train, test = df_cv.iloc[train_index], df_cv.iloc[test_index]

    bdt_cv_p = bdt_raw_results(train, test, PRE_QUERY, varlist, params, p_rounds, ISNUEBAR)
    bdt_cv_lc = bdt_raw_results(train, test, LOOSE_CUTS, varlist, params, lc_rounds, ISNUEBAR)

    # saves purity, efficiency and respective errors on current test sample for loose cuts BDT
    pur, pur_err, eff, eff_err = bdt_pe(bdt_cv_lc[0], bdt_score_arr, test, ISDATA)
    final_purity.append(pur)
    final_efficiency.append(eff)
    # Fix: these errors belong to the loose-cuts lists. They were previously appended
    # to fp_err2 / fe_err2, mixing them with the preselection errors.
    fp_err.append(pur_err)
    fe_err.append(eff_err)

    # saves purity, efficiency and respective errors on current test sample for preselection BDT
    pur2, pur_err2, eff2, eff_err2 = bdt_pe(bdt_cv_p[0], bdt_score_arr, test, ISDATA)
    fp_2.append(pur2)
    fe_2.append(eff2)
    fp_err2.append(pur_err2)
    fe_err2.append(eff_err2)

    # saves purity and efficiency and respective errors for linear box cut selection performance on current test sample
    sig_sel = sum(test.query(BOX_CUTS+' and is_signal==1')[mc_weight])
    # NOTE(review): 'weight' is not created anywhere in this notebook -- presumably it is
    # added to `test` by the BDT helpers; confirm it exists and matches mc_weight's normalisation.
    tot_sel = sum(test.query(BOX_CUTS)['weight'])
    # Generated signal includes the cosmic-contaminated part (is_cont_signal).
    tot_sig = sum(test.query('is_signal==1 or is_cont_signal==1')[mc_weight])
    p = sig_sel / tot_sel
    e = sig_sel / tot_sig
    box_pur.append(p * 100)
    box_eff.append(e * 100)
    # Purity error: sqrt(selected signal) / total selected; efficiency error: binomial.
    boxp_err.append(math.sqrt(sig_sel) / tot_sel * 100)
    boxe_err.append(math.sqrt((e * (1-e)) / tot_sig) * 100)

In [None]:
# averages results column-wise which is equivalent to averaging results over the same BDT_score cut
# The error slots are filled with zeros, one per BDT-score threshold. The length is
# taken from bdt_score_arr instead of a hard-coded 28 so it follows the scan grid.
n_cuts = len(bdt_score_arr)
results_bdt = [np.mean(final_purity, axis=0), np.full(n_cuts, 0), np.mean(final_efficiency, axis=0), np.full(n_cuts, 0)]
second_results_bdt = [np.mean(fp_2, axis=0), np.full(n_cuts, 0), np.mean(fe_2, axis=0), np.full(n_cuts, 0)]

# linear box cut selection is a normal average over each distinct test sample
results_box = [np.mean(box_pur), np.mean(box_eff)]

In [None]:
# Final comparison: fold-averaged loose-cuts BDT (results_bdt) and preselection BDT
# (second_results_bdt) against the fold-averaged box-cut selection.
bdt_box_plot(results_bdt, results_box, bdt_score_arr, second_results_bdt=second_results_bdt)