# Prep Work

## Import libraries

In [1]:
import numpy as np
from numpy.random import RandomState
import numpy.ma as ma

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64
from sklearn import svm
from sklearn.metrics import roc_auc_score
import json

In [2]:
# Signal samples, written as (alias, HDF5 file name) pairs and then split into
# two index-aligned lists: sigAliasList[k] is the alias of sigFilenameList[k].
sigAliasList, sigFilenameList = map(list, zip(
    ('sig_A',   'Ato4l_lepFilter_13TeV_filtered.h5'),
    ('sig_h0',  'hToTauTau_13TeV_PU20_filtered.h5'),
    ('sig_hch', 'hChToTauNu_13TeV_PU20_filtered.h5'),
    ('sig_LQ',  'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5'),
))

In [3]:
#-- Locations of the input files --#
# NOTE: basePath is an absolute, machine-specific path; edit it when running elsewhere.
# Both basePath and dataPath end with '/', so plain concatenation yields valid paths.
basePath   = '/Users/bobli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/'
dataPath   = 'data/ADC2021/'

# Background sample and anomaly-augmented background sample
bkgPath    = basePath+dataPath+'background_for_training.h5'
augPath    = basePath+'data/AnomalyAugmentedBackground/anomalyAugmented_background_for_training.h5'

# One full path per signal file, in the same order as sigFilenameList
sigPathList = [basePath+dataPath+sigFilename for sigFilename in sigFilenameList]

## Functions

In [4]:
# Change the working directory to the shared `functions` folder and run the
# centralFunctions notebook from there.
# NOTE(review): randomDataSample, calcOTDistance, calcOTDistance_non_square,
# kNN_with_distance_matrix, kNN_ROC_metrics and KNeighborsClassifier are used in
# later cells but are neither defined nor imported in this notebook -- presumably
# they are provided by centralFunctions.ipynb; confirm.
# NOTE: the %cd stays in effect for the following cells until the next %cd.
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions
%run centralFunctions.ipynb

/Users/bobli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions


## Loading Data

In [5]:
# Open every input file read-only and keep the handles in a single dictionary:
# 'bkg' and 'aug' first, then one entry per signal alias.
dataDict = {}
for key, path in zip(['bkg', 'aug', *sigAliasList], [bkgPath, augPath, *sigPathList]):
  dataDict[key] = h5py.File(path, 'r')

In [6]:
# Read the datasets into memory as NumPy arrays, keeping only indices 0-2 of
# the last axis. Slicing an h5py dataset copies the selection into a NumPy
# array, so the arrays below do not depend on the files staying open.
bkg_data = dataDict['bkg']['Particles'][:, :, 0:3]
aug_data = dataDict['aug']['augBkg'][:, :, 0:3]
sig_data = {
  alias: dataDict[alias]['Particles'][:, :, 0:3]
  for alias in sigAliasList
}

# Everything needed is now in memory: close the HDF5 handles instead of
# leaving them open for the lifetime of the kernel.
# NOTE: to re-run this cell, re-run the cell that opens the files first.
for h5file in dataDict.values():
  h5file.close()

# kNN

In [7]:
# Basic parameters: per-class sample size and the two seeded random sources
nEvents = 1000
random_state = Generator(PCG64(1))  # explicit generator passed to randomDataSample
np.random.seed(100)                 # legacy global RNG, used by np.random.permutation

# Option flags handed to the OT distance helpers
OTSCHEME = {
    'normPT': False,
    'balanced': True,
    'noZeroPad': False,
    'individualOT': False,
}

In [8]:
# Randomly sampled events, keyed by 'bkg', 'aug' and each signal alias
events = {}

# Background: nEvents for the training set plus 5 * nEvents for the test repeats
events['bkg'] = randomDataSample(bkg_data, 6 * nEvents, random_state)

# Augmented background and signals are drawn from a second, freshly seeded generator
random_state = Generator(PCG64(2))
events['aug'] = randomDataSample(aug_data, nEvents, random_state)
events.update(
    (alias, randomDataSample(sig_data[alias], 5 * nEvents, random_state))
    for alias in sigAliasList
)

# The full arrays are no longer needed once the samples are drawn
del bkg_data, sig_data, aug_data

In [9]:
# Build the training dataset: the first nEvents background events (label 0)
# followed by the augmented-background events (label 1)
train_events = np.concatenate([events['bkg'][:nEvents], events['aug']])
train_labels = np.repeat([0.0, 1.0], nEvents)

# Shuffle events and labels together with one permutation from the global NumPy RNG
permutation = np.random.permutation(train_labels.size)
train_events, train_labels = train_events[permutation], train_labels[permutation]

# Distance matrix of the training events against themselves
train_matrix = calcOTDistance(train_events, train_events, OTSCHEME, '3D', Matrix=True)

# Candidate neighbor counts to scan: 5, 15, ..., 495
neighbor_list = list(range(5, 500, 10))

100%|██████████| 4000000/4000000 [03:18<00:00, 20195.19it/s]


In [10]:
# Scan neighbor_list and keep the best-performing number of neighbors.
# NOTE(review): the three integer arguments evaluate to 1500, 400 and 100 for
# nEvents = 1000 (summing to the 2 * nEvents training events) -- presumably
# split sizes; confirm against kNN_with_distance_matrix.
best_auc, best_k, best_model, _ = kNN_with_distance_matrix(
    train_matrix,
    train_labels,
    nEvents * 3 // 2,
    nEvents * 2 // 5,
    nEvents // 10,
    neighbor_list,
)

Fitting Models: 100%|██████████| 50/50 [00:00<00:00, 56.06it/s]


In [11]:
# Evaluate the tuned kNN on every signal, repeating with disjoint test samples.
# aucs[j, i] is the AUC for signal sigAliasList[j] in repeat i;
# scoreDict[alias]['repeat<i>'] holds the matching kNN_ROC_metrics result.
nRepeats = 5
aucs = np.zeros((len(sigAliasList), nRepeats))
scoreDict = {}

# The classifier depends only on best_k and the training data, so fit it once
# instead of rebuilding an identical model inside every loop iteration.
# NOTE(review): KNeighborsClassifier is not imported in this notebook --
# presumably it is provided by centralFunctions.ipynb; confirm.
model = KNeighborsClassifier(n_neighbors=best_k, metric='precomputed')
model.fit(train_matrix, train_labels)

for j, alias in enumerate(sigAliasList):
    scoreDict[alias] = {}
    for i in range(nRepeats):
        # Background slice i+1 (slice 0 went into the training set) and signal slice i
        bkg_slice = events['bkg'][nEvents*(i+1):nEvents*(i+2)]
        sig_slice = events[alias][nEvents*i:nEvents*(i+1)]
        test_events = np.concatenate((bkg_slice, sig_slice))
        test_labels = np.concatenate((np.zeros(nEvents), np.ones(nEvents)))

        # With metric='precomputed' the classifier is queried with the
        # test-vs-train distance matrix rather than the raw events
        test_matrix = calcOTDistance_non_square(test_events, train_events, OTSCHEME, '3D', Matrix=True)
        pred = model.predict_proba(test_matrix)

        # Column 1 of predict_proba is the score for label 1
        kNN_metrics = kNN_ROC_metrics(test_labels, pred[:, 1], Interpolate=True)
        auc = roc_auc_score(test_labels, pred[:, 1])

        scoreDict[alias]['repeat'+str(i)] = kNN_metrics
        aucs[j, i] = auc

In [12]:
# Per-signal mean and standard deviation of the AUC across the repeats.
# Each reduction runs once over the whole table (rows follow sigAliasList)
# rather than once per signal; np.std uses its default ddof=0.
auc_means = np.mean(aucs, axis=1)
auc_stds = np.std(aucs, axis=1)

mean_aucs = dict(zip(sigAliasList, auc_means))
std_aucs = dict(zip(sigAliasList, auc_stds))

print(mean_aucs)

print(std_aucs)

{'sig_A': 0.8324266999999999, 'sig_h0': 0.6864171, 'sig_hch': 0.8134343000000002, 'sig_LQ': 0.7569604}
{'sig_A': 0.005851705653909785, 'sig_h0': 0.008349107349890724, 'sig_hch': 0.006917888020198079, 'sig_LQ': 0.007814717143953435}


In [13]:
# Full AUC table: one row per signal (in sigAliasList order), one column per repeat
print(aucs)

[[0.83191   0.8421575 0.8237975 0.833035  0.8312335]
 [0.6968705 0.693912  0.6832435 0.684681  0.6733785]
 [0.8202505 0.8143665 0.809467  0.8023495 0.820738 ]
 [0.7497665 0.7652115 0.7600545 0.745738  0.7640315]]


In [14]:
transposed_dict = {
    col: {row: scoreDict[row][col] for row in scoreDict}
    for col in scoreDict[next(iter(scoreDict))]
}
# Convert NumPy arrays in the lists to regular lists
converted_dict = {
    row: {
        col: [transposed_dict[row][col][0]] + [arr.tolist() for arr in transposed_dict[row][col][1:]]
        for col in transposed_dict[row]
    }
    for row in transposed_dict
}
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/experiments/OT_ML/OT_anomaly_kNN/
with open('kNN_3D_anomalyaug.json', 'w') as json_file:
    json.dump(converted_dict, json_file, indent=4)

/Users/bobli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/experiments/OT_ML/OT_anomaly_kNN
