# Prep Work

## Import libraries

In [11]:
import numpy as np
from numpy.random import RandomState
import numpy.ma as ma

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64
from sklearn import svm
from sklearn.metrics import roc_auc_score

In [12]:
sigAliasList    = ['sig_A', 'sig_h0', 'sig_hch', 'sig_LQ']
sigFilenameList = ['Ato4l_lepFilter_13TeV_filtered.h5', 'hToTauTau_13TeV_PU20_filtered.h5', 'hChToTauNu_13TeV_PU20_filtered.h5', 'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5']

In [13]:
#-- Set base directory and data directory path --#
basePath   = '/Users/hanchengli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
# basePath   = '/Users/bobli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
dataPath   = 'Data/'

bkgPath    = basePath+dataPath+'background_for_training.h5'
augPath    = '/Users/hanchengli/Desktop/Research/anomalyAugmented_background_for_training.h5'
# augPath    = '/Users/bobli/Desktop/Research/anomalyAugmented_background_for_training.h5'
sigPathList = []
for x in sigFilenameList:
  sigPathList.append(basePath+dataPath+x)

## Functions

In [14]:
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions
%run centralFunctions.ipynb

/Users/hanchengli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions


## Loading Data

In [15]:
dataDict = {}
dataDict['bkg'] = h5py.File(bkgPath, 'r')
dataDict['aug'] = h5py.File(augPath, 'r')

for i in range(len(sigAliasList)):
  alias   = sigAliasList[i]
  sigPath = sigPathList[i]
  dataDict[alias] = h5py.File(sigPath, 'r')

In [16]:
bkg_data = dataDict['bkg']['Particles'][:, :, 0:3]
aug_data = dataDict['aug']['augBkg'][:, :, 0:3]
sig_data = {}

for alias in sigAliasList:
  sig_data[alias] = dataDict[alias]['Particles'][:, :, 0:3]

# kNN

In [17]:
nEvents = 500
random_state = Generator(PCG64(1))
np.random.seed(100)
OTSCHEME = {}
OTSCHEME['normPT'] = False
OTSCHEME['balanced'] = True
OTSCHEME['noZeroPad'] = False
OTSCHEME['individualOT'] = False

In [18]:
events = {}
events['bkg'] = randomDataSample(bkg_data, nEvents + nEvents // 5, random_state)
random_state = Generator(PCG64(2))
events['aug'] = randomDataSample(aug_data, nEvents, random_state)

for alias in sigAliasList:
    events[alias] = randomDataSample(sig_data[alias], nEvents // 5, random_state)

del bkg_data, sig_data, aug_data

In [19]:
train_events = np.concatenate((events['bkg'][:nEvents], events['aug']))
train_labels = np.concatenate((np.zeros(nEvents), np.ones(nEvents)))

permutation = np.random.permutation(len(train_labels))
train_events = train_events[permutation]
train_labels = train_labels[permutation]

train_matrix = calcOTDistance(train_events, train_events, OTSCHEME, '3D', Matrix=True)

neighbor_list = list(range(5, 500,10))

best_auc, best_k, best_model = kNN_with_distance_matrix(train_matrix, train_labels, nEvents * 3 // 2, nEvents * 2 // 5, nEvents // 10, neighbor_list)

100%|██████████| 1000000/1000000 [01:40<00:00, 9957.28it/s]
Fitting Models: 100%|██████████| 50/50 [00:00<00:00, 90.10it/s]


In [20]:
for alias in sigAliasList:
    test_events = np.concatenate((events['bkg'][nEvents:], events[alias]))
    test_labels = np.concatenate((np.zeros(nEvents // 5), np.ones(nEvents // 5)))
    test_matrix = calcOTDistance_non_square(test_events, train_events, OTSCHEME, '3D', Matrix=True)
    
    model = KNeighborsClassifier(n_neighbors=best_k, metric='precomputed')
    model.fit(train_matrix, train_labels)
    
    pred = model.predict(test_matrix)
    auc = roc_auc_score(test_labels, pred)
    
    print(auc)

100%|██████████| 200000/200000 [00:35<00:00, 5627.49it/s]


0.77


100%|██████████| 200000/200000 [00:34<00:00, 5805.76it/s]


0.6399999999999999


100%|██████████| 200000/200000 [00:36<00:00, 5522.59it/s]


0.7349999999999999


100%|██████████| 200000/200000 [00:37<00:00, 5335.81it/s]

0.6849999999999999



