# Prep Work

## Import libraries

In [1]:
import numpy as np
from numpy.random import RandomState
import numpy.ma as ma

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64
from sklearn import svm
from sklearn.metrics import roc_auc_score

In [2]:
sigAliasList    = ['sig_A', 'sig_h0', 'sig_hch', 'sig_LQ']
sigFilenameList = ['Ato4l_lepFilter_13TeV_filtered.h5', 'hToTauTau_13TeV_PU20_filtered.h5', 'hChToTauNu_13TeV_PU20_filtered.h5', 'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5']

In [3]:
#-- Set base directory and data directory path --#
basePath   = '/Users/bobli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
dataPath   = 'Data/'

bkgPath    = basePath+dataPath+'background_for_training.h5'
sigPathList = []
for x in sigFilenameList:
  sigPathList.append(basePath+dataPath+x)

## Functions

In [4]:
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions
%run centralFunctions.ipynb

/Users/bobli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions


## Loading Data

In [5]:
dataDict = {}
dataDict['bkg'] = h5py.File(bkgPath, 'r')

for i in range(len(sigAliasList)):
  alias   = sigAliasList[i]
  sigPath = sigPathList[i]
  dataDict[alias] = h5py.File(sigPath, 'r')

In [6]:
bkg_data = dataDict['bkg']['Particles'][:, :, 0:3]
sig_data = {}

for alias in sigAliasList:
  sig_data[alias] = dataDict[alias]['Particles'][:, :, 0:3]

# One Class SVM on 3D Ground Space

In [7]:
test_size = 1000
train_size = 10000
random_state = Generator(PCG64(1))
np.random.seed(100)
OTSCHEME = {}
OTSCHEME['normPT'] = False
OTSCHEME['balanced'] = True
OTSCHEME['noZeroPad'] = False
OTSCHEME['individualOT'] = False

In [8]:
events = randomDataSample(bkg_data, train_size + test_size, random_state)

train_events = events[:train_size]
test_events = events[train_size:]

events = {}
for alias in sigAliasList:
    events[alias] = randomDataSample(sig_data[alias], test_size, random_state)
    test_events = np.concatenate((test_events, events[alias]))
    
del bkg_data, sig_data
    
train_matrix = calcOTDistance(train_events, train_events, OTSCHEME, '3D', Matrix=True)

print(np.max(train_matrix), np.min(train_matrix))


100%|██████████| 100000000/100000000 [1:02:18<00:00, 26751.86it/s]


32937.94819405766 0.0


In [9]:


test_matrix = calcOTDistance_non_square(test_events, train_events, OTSCHEME, '3D', Matrix=True)

print(test_matrix.shape)

100%|██████████| 50000000/50000000 [56:22<00:00, 14784.06it/s]  


(5000, 10000)


In [25]:
test_labels = np.asarray([1] * test_size + [-1] * test_size * 4)
nu_list = [0.001, 0.01, 0.1, 0.15, 0.20, 0.25, 0.3]
gamma_list = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
aucs = []
for gamma in gamma_list:
    train_matrix_gamma = np.exp(-gamma*train_matrix)
    test_matrix_gamma = np.exp(-gamma*test_matrix)
    model = svm.OneClassSVM(nu = 0.25, kernel='precomputed')
    model.fit(train_matrix_gamma)
    train_pred = model.predict(train_matrix_gamma)
    # auc = roc_auc_score(np.asarray([-1] * train_pred.shape[0]), train_pred)
    # print(auc)
    # print(np.sum(train_pred == 1))
    pred = model.predict(test_matrix_gamma)
    # print(np.sum(pred == test_labels))
    auc = roc_auc_score(test_labels, pred)
    aucs.append(auc)
    
    


In [26]:
print(max(aucs), gamma_list[aucs.index(max(aucs))])

0.723375 0.1


In [12]:
print(train_matrix.shape)

(10000, 10000)
