# Prep Work

## Import libraries

In [1]:
import numpy as np
from numpy.random import RandomState
import numpy.ma as ma

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64
from sklearn import svm
from sklearn.metrics import roc_auc_score

In [2]:
sigAliasList    = ['sig_A', 'sig_h0', 'sig_hch', 'sig_LQ']
sigFilenameList = ['Ato4l_lepFilter_13TeV_filtered.h5', 'hToTauTau_13TeV_PU20_filtered.h5', 'hChToTauNu_13TeV_PU20_filtered.h5', 'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5']

In [3]:
#-- Set base directory and data directory path --#
basePath   = '/Users/hanchengli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
# basePath   = '/Users/bobli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
dataPath   = 'Data/'

bkgPath    = basePath+dataPath+'background_for_training.h5'
sigPathList = []
for x in sigFilenameList:
  sigPathList.append(basePath+dataPath+x)

## Functions

In [4]:
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions
%run centralFunctions.ipynb

/Users/hanchengli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions


## Loading Data

In [5]:
dataDict = {}
dataDict['bkg'] = h5py.File(bkgPath, 'r')

for i in range(len(sigAliasList)):
  alias   = sigAliasList[i]
  sigPath = sigPathList[i]
  dataDict[alias] = h5py.File(sigPath, 'r')

In [6]:
bkg_data = dataDict['bkg']['Particles'][:, :, 0:3]
sig_data = {}

for alias in sigAliasList:
  sig_data[alias] = dataDict[alias]['Particles'][:, :, 0:3]

# One Class SVM on 3D Ground Space

In [7]:
test_size = 200
train_size = 2000
random_state = Generator(PCG64(1))
np.random.seed(100)
OTSCHEME = {}
OTSCHEME['normPT'] = False
OTSCHEME['balanced'] = True
OTSCHEME['noZeroPad'] = False
OTSCHEME['individualOT'] = False

In [8]:
bkg_events = randomDataSample(bkg_data, train_size + test_size * 5, random_state)
sig_events = {}
for alias in sigAliasList:
    sig_events[alias] = randomDataSample(sig_data[alias], test_size * 5, random_state)

train_events = bkg_events[:train_size]
test_events = []

for i in range(0, 5):
    test_events.append({})
    test_events[i]['bkg'] = bkg_events[train_size+test_size*i:train_size+test_size*(i+1)]
    for alias in sigAliasList:
        test_events[i][alias] = sig_events[alias][test_size*i:test_size*(i+1)]
    
del bkg_data, sig_data


In [9]:
train_matrix = calcOTDistance(train_events, train_events, OTSCHEME, '3D', Matrix=True)

100%|██████████| 4000000/4000000 [06:52<00:00, 9693.62it/s] 


In [10]:
test_labels = np.asarray([1] * test_size + [-1] * test_size * 4)
nu_list = [0.001, 0.01, 0.1, 0.15, 0.20, 0.25, 0.3]
gamma_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]

best_aucs = {}
best_gammas = {}
best_nus = {}

aucs = np.zeros((5,4,len(nu_list), len(gamma_list)))
best_aucs = np.zeros((5,4))
best_gammas = np.zeros((5,4))
best_nus = np.zeros((5,4))

for j in range(0, 5):
    test_set = []
    test_set.extend(test_events[j]['bkg'])
    for alias in sigAliasList:
        test_set.extend(test_events[j][alias])
    test_set = np.asarray(test_set)
    # print(test_set.shape)
    test_matrix = calcOTDistance_non_square(test_set, train_events, OTSCHEME, '3D', Matrix=True)
    for i in range(0, len(sigAliasList)):
        for k, nu in enumerate(nu_list):
            for l, gamma in enumerate(gamma_list):
                train_matrix_gamma = np.exp(-gamma*train_matrix)
                test_matrix_gamma = np.exp(-gamma*np.concatenate((test_matrix[0:test_size, :], test_matrix[test_size * (i+1):test_size * (i+2), :])))
                model = svm.OneClassSVM(nu = nu, kernel='precomputed')
                model.fit(train_matrix_gamma)
                train_pred = model.predict(train_matrix_gamma)
                # auc = roc_auc_score(np.asarray([-1] * train_pred.shape[0]), train_pred)
                # print(auc)
                # print(np.sum(train_pred == 1))
                pred = model.predict(test_matrix_gamma)
                # print(np.sum(pred == test_labels))
                auc = roc_auc_score(np.concatenate((test_labels[0:test_size], test_labels[test_size * (i+1):test_size * (i+2)])), pred)
                aucs[j][i][k][l] = auc;
        best_aucs[j][i] = np.amax(aucs[j][i], axis=None)
        best_gamma_index = np.unravel_index(np.argmax(aucs[j][i]), aucs[j][i].shape)[1]
        best_gammas[j][i] = gamma_list[best_gamma_index]
        best_nu_index = np.unravel_index(np.argmax(aucs[j][i]), aucs[j][i].shape)[0]
        best_nus[j][i] = nu_list[best_nu_index]

100%|██████████| 2000000/2000000 [05:41<00:00, 5850.30it/s]
100%|██████████| 2000000/2000000 [05:53<00:00, 5655.63it/s]
100%|██████████| 2000000/2000000 [05:18<00:00, 6281.49it/s]
100%|██████████| 2000000/2000000 [05:19<00:00, 6256.73it/s]
100%|██████████| 2000000/2000000 [05:18<00:00, 6272.73it/s]


{'sig_A': array([0.730625, 0.75125 , 0.7225  , 0.721875, 0.733125]), 'sig_h0': array([0.730625, 0.75125 , 0.7225  , 0.721875, 0.733125]), 'sig_hch': array([0.730625, 0.75125 , 0.7225  , 0.721875, 0.733125]), 'sig_LQ': array([0.730625, 0.75125 , 0.7225  , 0.721875, 0.733125])}
{'sig_A': array([0.0775, 0.1   , 0.1   , 0.0505, 0.1   ]), 'sig_h0': array([0.0775, 0.1   , 0.1   , 0.0505, 0.1   ]), 'sig_hch': array([0.0775, 0.1   , 0.1   , 0.0505, 0.1   ]), 'sig_LQ': array([0.0775, 0.1   , 0.1   , 0.0505, 0.1   ])}
{'sig_A': array([0.1175, 0.1875, 0.225 , 0.2125, 0.275 ]), 'sig_h0': array([0.1175, 0.1875, 0.225 , 0.2125, 0.275 ]), 'sig_hch': array([0.1175, 0.1875, 0.225 , 0.2125, 0.275 ]), 'sig_LQ': array([0.1175, 0.1875, 0.225 , 0.2125, 0.275 ])}


In [18]:
mean_aucs = {}
std_aucs = {}
mean_nus = {}
std_nus = {}
mean_gammas = {}
std_gammas = {}

for i, alias in enumerate(sigAliasList):
    mean_aucs[alias] = np.mean(best_aucs, axis=0)[i]
    std_aucs[alias] = np.std(best_aucs, axis=0)[i]
    mean_nus[alias] = np.mean(best_nus, axis=0)[i]
    std_nus[alias] = np.std(best_nus, axis=0)[i]
    mean_gammas[alias] = np.mean(best_gammas, axis=0)[i]
    std_gammas[alias] = np.std(best_gammas, axis=0)[i]

print(mean_aucs)
print(mean_nus)
print(mean_gammas)

print(std_aucs)
print(std_nus)
print(std_gammas)

{'sig_A': 0.7270000000000001, 'sig_h0': 0.6595, 'sig_hch': 0.8130000000000001, 'sig_LQ': 0.7280000000000001}
{'sig_A': 0.17200000000000001, 'sig_h0': 0.182, 'sig_hch': 0.2, 'sig_LQ': 0.26}
{'sig_A': 0.1, 'sig_h0': 0.08020000000000001, 'sig_hch': 0.06220000000000001, 'sig_LQ': 0.1}
{'sig_A': 0.0128840987267251, 'sig_h0': 0.015604486534327191, 'sig_hch': 0.01668831926827863, 'sig_LQ': 0.01399999999999999}
{'sig_A': 0.09453041838477179, 'sig_h0': 0.09947864092356709, 'sig_hch': 0.0316227766016838, 'sig_LQ': 0.0374165738677394}
{'sig_A': 0.0, 'sig_h0': 0.0396, 'sig_hch': 0.04638275541621045, 'sig_LQ': 0.0}


In [16]:
print(best_aucs)
print(np.mean(best_aucs, axis=0))

[[0.7125 0.66   0.8225 0.7275]
 [0.74   0.6725 0.8375 0.755 ]
 [0.7275 0.63   0.815  0.7175]
 [0.7125 0.6625 0.79   0.7225]
 [0.7425 0.6725 0.8    0.7175]]
[0.727  0.6595 0.813  0.728 ]


In [None]:
# train_matrix_gamma = train_matrix
# test_matrix_gamma = np.concatenate((test_matrix[0:test_size, :], test_matrix[test_size * (i+1):test_size * (i+2), :]))
# model = svm.OneClassSVM(nu = 0.1, kernel='precomputed')
# model.fit(train_matrix_gamma)
# train_pred = model.predict(train_matrix_gamma)
# # auc = roc_auc_score(np.asarray([-1] * train_pred.shape[0]), train_pred)
# # print(auc)
# # print(np.sum(train_pred == 1))
# pred = model.predict(test_matrix_gamma)
# # print(np.sum(pred == test_labels))
# auc = roc_auc_score(np.concatenate((test_labels[0:test_size], test_labels[test_size * (i+1):test_size * (i+2)])), pred)
# print(auc)
# aucs.append(auc)