# Prep Work

## Import libraries

In [1]:
import numpy as np
from numpy.random import RandomState
import numpy.ma as ma

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64
from sklearn import metrics
import itertools

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [2]:
# Signal samples: each short alias is paired with the HDF5 file that holds it.
# The two lists below are derived from this table, so they stay matched by position.
_SIGNAL_SAMPLES = (
    ('sig_A',   'Ato4l_lepFilter_13TeV_filtered.h5'),
    ('sig_h0',  'hToTauTau_13TeV_PU20_filtered.h5'),
    ('sig_hch', 'hChToTauNu_13TeV_PU20_filtered.h5'),
    ('sig_LQ',  'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5'),
)

sigAliasList    = [sample_alias for sample_alias, sample_file in _SIGNAL_SAMPLES]
sigFilenameList = [sample_file for sample_alias, sample_file in _SIGNAL_SAMPLES]

In [3]:
#-- Base directory, data sub-directory, and the input-file paths derived from them --#
# Paths are assembled by plain string concatenation, so both pieces keep their trailing '/'.
basePath = '/Users/hanchengli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
dataPath = 'Data/'

bkgPath = basePath + dataPath + 'background_for_training.h5'

# One full path per signal file, in the same order as sigFilenameList.
sigPathList = [basePath + dataPath + filename for filename in sigFilenameList]

## Functions

In [4]:
# Change the kernel's working directory to the shared functions folder, then execute
# centralFunctions.ipynb from there.
# NOTE(review): randomDataSample, calcOTDistance and SVM_cross_validation are called later in
# this notebook but are not defined in it -- presumably they come from centralFunctions.ipynb; confirm.
# NOTE(review): no later cell visible here changes directory back, so relative paths used
# afterwards resolve against this folder.
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions
%run centralFunctions.ipynb

/Users/hanchengli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions


## Loading Data

In [5]:
# Open every input file read-only and keep the handles in dataDict:
# the background under 'bkg', each signal under its alias.
# NOTE(review): nothing in the cells visible here closes these handles.
dataDict = {'bkg': h5py.File(bkgPath, 'r')}

# sigAliasList and sigPathList are matched by position.
for i, alias in enumerate(sigAliasList):
  dataDict[alias] = h5py.File(sigPathList[i], 'r')

In [6]:
# Index applied to every 'Particles' dataset: all of the first two axes,
# entries 0..2 of the third.
# NOTE(review): presumably these three entries are the kinematic features used as the
# "3D" ground space below -- confirm against the dataset description.
particleSlice = np.s_[:, :, 0:3]

bkg_data = dataDict['bkg']['Particles'][particleSlice]

# Same selection for each signal sample, keyed by alias.
sig_data = {
  alias: dataDict[alias]['Particles'][particleSlice]
  for alias in sigAliasList
}

# SVM Classification with 3D Ground Space

In [7]:
# Number of events requested from randomDataSample for the background and for each signal.
nEvents = 500

# Two separate random sources are seeded here:
#   - random_state: a PCG64-backed Generator (seed 1), passed to randomDataSample;
#   - NumPy's legacy global state (seed 100), used by np.random.permutation.
random_state = Generator(PCG64(1))
np.random.seed(100)

# Option flags passed to calcOTDistance.
OTSCHEME = {
    'normPT': False,
    'balanced': True,
    'noZeroPad': False,
    'individualOT': False,
}

In [8]:
# Empty result accumulators (four independent lists).
# NOTE(review): nothing in the cells visible here appends to them.
avg_aucs, std_aucs = [], []
avg_ks, std_ks = [], []

# Hyper-parameter grids passed to SVM_cross_validation.
# NOTE(review): C_list jumps from 0.05 straight to 1 (no 0.1 / 0.5), unlike gamma_list -- confirm this is intended.
gamma_list = [
    0.001, 0.005,
    0.01, 0.05,
    0.1, 0.5,
    1, 5,
]
C_list = [
    0.001, 0.005,
    0.01, 0.05,
    1, 5,
    10, 50,
    100,
]

In [9]:
# Draw nEvents events from the background and from every signal sample.
# The background is sampled first, then the signals in sigAliasList order, so the
# sequence of draws from random_state is fixed.
events = {'bkg': randomDataSample(bkg_data, nEvents, random_state)}
events.update(
    (alias, randomDataSample(sig_data[alias], nEvents, random_state))
    for alias in sigAliasList
)

# The full arrays are dropped once the samples have been drawn.
# NOTE(review): after this `del`, re-running the cell raises NameError until the
# cell that defines sig_data / bkg_data has been run again.
del sig_data, bkg_data

# Labels before shuffling: nEvents background entries (0) followed by nEvents signal entries (1).
ordered_labels = np.repeat([0, 1], nEvents)

for alias in sigAliasList:
    # One permutation from NumPy's global RNG shuffles events and labels consistently.
    permutation = np.random.permutation(2 * nEvents)
    event_list = np.concatenate((events['bkg'], events[alias]))[permutation]
    event_labels = ordered_labels[permutation]

    # The shuffled events are passed as both arguments of calcOTDistance.
    distance_matrix = calcOTDistance(event_list, event_list, OTSCHEME, '3D', Matrix = True)

    auc_list, best_gamma_list, best_C_list = SVM_cross_validation(distance_matrix, event_labels, gamma_list, C_list)

    # Three output lines per signal: "mean std" of the AUCs, the selected gammas, the selected Cs.
    for cv_values in (auc_list, best_gamma_list, best_C_list):
        print(np.mean(cv_values), np.std(cv_values))

100%|██████████| 1000000/1000000 [01:30<00:00, 11032.78it/s]


0.8134419326034952 0.02167092387662736
0.044000000000000004 0.0332264954516723
4.4 3.32264954516723


100%|██████████| 1000000/1000000 [01:31<00:00, 10935.79it/s]


0.6576255043088532 0.03136348551614414
0.124 0.1914523439396865
13.4 18.596773913773326


100%|██████████| 1000000/1000000 [01:32<00:00, 10811.86it/s]


0.8497758309872318 0.011160574753527205
0.0162 0.017255723688098395
1.6019999999999999 1.7417278777122447


100%|██████████| 1000000/1000000 [01:31<00:00, 10903.19it/s]


0.7817128997985493 0.01556181048367067
0.016000000000000004 0.01714642819948225
10.221999999999998 19.892504518033924
