# Prep Work

## Import libraries

In [1]:
import os

import numpy as np
from numpy.random import RandomState
import numpy.ma as ma

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64

In [2]:
# Signal samples: each row pairs the short alias used as a dictionary key with
# the HDF5 file name it is read from. The rows are unzipped into two parallel
# lists that share the same ordering.
sigAliasList, sigFilenameList = map(list, zip(
    ('sig_A',   'Ato4l_lepFilter_13TeV_filtered.h5'),
    ('sig_h0',  'hToTauTau_13TeV_PU20_filtered.h5'),
    ('sig_hch', 'hChToTauNu_13TeV_PU20_filtered.h5'),
    ('sig_LQ',  'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5'),
))

In [3]:
#-- Set base directory and data directory path --#
# The project is located relative to the current user's home directory, so '~'
# is expanded instead of hard-coding one account's '/Users/<name>' prefix.
# The trailing '/' is kept because the paths below are built by plain string
# concatenation.
basePath   = os.path.join(
    os.path.expanduser('~'),
    'Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/',
)
dataPath   = 'data/ADC2021/'

# Full path of the background sample.
bkgPath    = basePath + dataPath + 'background_for_training.h5'

# Full path of every signal sample, in the same order as sigFilenameList.
sigPathList = [basePath + dataPath + fileName for fileName in sigFilenameList]

## Functions

In [4]:
# Move into the project's `functions` folder and execute the shared helper
# notebook so that the names it defines become available in this notebook.
# NOTE(review): presumably this is where randomDataSample, calcOTDistance and
# kNN_with_distance_matrix (used further down) come from — confirm.
# NOTE: %cd changes the working directory for all cells that run afterwards.
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions
%run centralFunctions.ipynb

/Users/bobli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions


## Loading Data

In [5]:
# Open every input file read-only and keep the handles in one dictionary:
# 'bkg' for the background sample, plus one entry per signal alias.
# The handles are not closed in this cell.
dataDict = {'bkg': h5py.File(bkgPath, 'r')}

# sigAliasList and sigPathList are parallel lists, so walk them together.
for alias, sigPath in zip(sigAliasList, sigPathList):
  dataDict[alias] = h5py.File(sigPath, 'r')

In [6]:
# Read the 'Particles' dataset of each open file, keeping only the first three
# entries along the last axis.
# NOTE(review): presumably these three entries are the kinematic features used
# as the 3D ground space below — confirm against the dataset description.
bkg_data = dataDict['bkg']['Particles'][:, :, 0:3]

# Same slice for every signal sample, keyed by its alias.
sig_data = {
  alias: dataDict[alias]['Particles'][:, :, 0:3]
  for alias in sigAliasList
}

# kNN with distance matrix on 3D ground space, model-agnostic training

In [7]:
# Basic parameters for this study.
nEvents = 1000                       # per-class event count used in the sampling cell below

# Two independent sources of randomness are seeded here: an explicit Generator
# that is handed to randomDataSample, and NumPy's legacy global state that is
# used by np.random.permutation.
random_state = Generator(PCG64(1))
np.random.seed(100)

# Option flags forwarded to calcOTDistance.
OTSCHEME = {
    'normPT': False,
    'balanced': True,
    'noZeroPad': False,
    'individualOT': False,
}

In [8]:
# Build the train/test samples, compute the pairwise OT distance matrix over
# all of them, and scan the number of neighbours k for the kNN classifier.

nHeldOut = nEvents // 5          # events per class reserved for the test set
nSample  = nEvents + nHeldOut    # events drawn per class (training + held out)

train_events = []
test_events = []
events = {}

# Background: the first nEvents sampled events are used for training, the
# remainder goes to the test set.
events['bkg'] = randomDataSample(bkg_data, nSample, random_state)
train_events.extend(events['bkg'][:nEvents])
test_events.extend(events['bkg'][nEvents:])

# Signals: every signal contributes nEvents training events, but only the
# 'sig_LQ' sample contributes to the test set.
for alias in sigAliasList:
    events[alias] = randomDataSample(sig_data[alias], nSample, random_state)
    train_events.extend(events[alias][:nEvents])
    if alias == 'sig_LQ':
        test_events.extend(events[alias][nEvents:])

# Drop the references to the full datasets. Because of this, the data-loading
# cell has to be re-run before this cell can be executed a second time.
del bkg_data, sig_data

# Shuffle order for the training sample (uses NumPy's global seeded state).
permutation = np.random.permutation(nEvents * 5)

test_events = np.asarray(test_events)
train_events = np.asarray(train_events)

# Labels follow the order in which events were appended above:
# background (0) first, then the four signal samples (1).
train_labels = np.array([0] * nEvents + [1] * 4 * nEvents)
test_labels = np.array([0] * nHeldOut + [1] * nHeldOut)

# The label counts are written by hand, so make sure they match what was sampled.
assert len(train_events) == len(train_labels), "training events and labels differ in length"
assert len(test_events) == len(test_labels), "test events and labels differ in length"

train_events = train_events[permutation]
train_labels = train_labels[permutation]
print(test_labels.shape)

# Shuffled training events first, test events last; labels are stacked the same way.
total_events = np.concatenate((train_events, test_events))
labels = np.concatenate((train_labels, test_labels))
print(total_events.shape)
print(labels.shape)

# Pairwise distances between all events (this is the expensive step).
distance_matrix = calcOTDistance(total_events, total_events, OTSCHEME, '3D', Matrix=True)

# Candidate k values: from 5 up to sqrt(number of training events), in steps of 6.
neighbor_list = list(range(5, int(np.sqrt(nEvents * 5)), 6))

# NOTE(review): the three counts passed here sum to len(total_events); they
# presumably describe how the matrix is partitioned (train / validation / test)
# — confirm against kNN_with_distance_matrix.
best_auc, best_k, best_model, auc_list = kNN_with_distance_matrix(distance_matrix, labels, nEvents * 4, nEvents, nHeldOut * 2, neighbor_list, AUC_list=True)

(400,)
(5400, 19, 3)
(5400,)


  2%|▏         | 639099/29160000 [01:08<51:18, 9264.02it/s]


KeyboardInterrupt: 

In [None]:

# Show the three results returned by kNN_with_distance_matrix in the cell
# above; that cell must have run to completion for these names to exist.
print(best_auc, best_k, auc_list)



0.87125 29 [0.8100598461100028, 0.8387004844685096, 0.8461100028498149, 0.8443644913080651, 0.8493872898261613, 0.8478911370760901]


In [None]:
0.8315277777777778 29 [0.8100598461100028, 0.8387004844685096, 0.8461100028498149, 0.8443644913080651, 0.8493872898261613, 0.8478911370760901]
0.7904166666666668 29 [0.8100598461100028, 0.8387004844685096, 0.8461100028498149, 0.8443644913080651, 0.8493872898261613, 0.8478911370760901]
0.8905555555555554 29 [0.8100598461100028, 0.8387004844685096, 0.8461100028498149, 0.8443644913080651, 0.8493872898261613, 0.8478911370760901]
0.87125 29 [0.8100598461100028, 0.8387004844685096, 0.8461100028498149, 0.8443644913080651, 0.8493872898261613, 0.8478911370760901]