# Import libraries

In [1]:
import numpy as np
from numpy.random import RandomState
import numpy.ma as ma


import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64
from sklearn import metrics
import itertools

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [2]:
# Signal samples: short alias paired with the HDF5 file that holds it.
# Keeping each pair on one row guarantees the two lists below stay aligned.
signalSamples = [
    ('sig_A',   'Ato4l_lepFilter_13TeV_filtered.h5'),
    ('sig_h0',  'hToTauTau_13TeV_PU20_filtered.h5'),
    ('sig_hch', 'hChToTauNu_13TeV_PU20_filtered.h5'),
    ('sig_LQ',  'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5'),
]
sigAliasList    = [sampleAlias for sampleAlias, _ in signalSamples]
sigFilenameList = [sampleFile for _, sampleFile in signalSamples]

In [3]:
#-- Base directory and data sub-directory --#
# NOTE: basePath is an absolute path on the author's machine; edit it before
# running this notebook anywhere else.
basePath   = '/Users/hanchengli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
dataPath   = 'Data/'

# Full path of the background file, then one full path per signal file,
# in the same order as sigFilenameList.
bkgPath     = basePath+dataPath+'background_for_training.h5'
sigPathList = [basePath+dataPath+sigFilename for sigFilename in sigFilenameList]

# Functions

In [4]:
# Move into the shared "functions" directory and execute the helper notebook.
# NOTE: %cd changes the kernel's working directory for every later cell, and
# the path is specific to the author's home directory layout.
# NOTE(review): later cells call randomDataSample, calcOTDistance and
# SVM_cross_validation without defining them; presumably they are provided
# by centralFunctions.ipynb -- confirm.
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions
%run centralFunctions.ipynb

/Users/hanchengli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions


# Loading Data

In [5]:
# Open every input file read-only and index the handles by sample name:
# 'bkg' for the background file, then one entry per signal alias.
# The handles are left open on purpose; the next cell reads from them.
dataDict = {'bkg': h5py.File(bkgPath, 'r')}

for alias, sigPath in zip(sigAliasList, sigPathList):
  dataDict[alias] = h5py.File(sigPath, 'r')

In [6]:
# Read the 'Particles' dataset of each sample into memory, keeping only the
# first three entries of the last axis.
# NOTE(review): entry 0 is later summed as the event pT; entries 1 and 2 are
# presumably eta and phi -- confirm against the dataset description.
bkg_data = dataDict['bkg']['Particles'][:,:,0:3]
sig_data = {
  alias: dataDict[alias]['Particles'][:,:,0:3]
  for alias in sigAliasList
}

# Low $p_T$ range

In [7]:
# Number of events requested per sample and pT bin.
nEvents = 500

# Seeded generator handed to randomDataSample so the sub-sampling is repeatable.
random_state = Generator(PCG64(123))

# Option flags passed to calcOTDistance.
# NOTE(review): the meaning of each flag is defined by calcOTDistance, which
# is not visible in this notebook -- see its definition for details.
OTSCHEME = dict(
    normPT=True,
    balanced=True,
    noZeroPad=False,
    individualOT=False,
)

In [8]:
def _pT_bin_mask(event_pT, lower_bound, upper_bound, include_upper):
    """Boolean mask of the events whose total pT lies in one bin.

    The bin is half-open, [lower_bound, upper_bound), unless include_upper
    is True, in which case it is closed, [lower_bound, upper_bound].
    Closing only the last bin means every value in the full range belongs
    to exactly one bin (the convention used by numpy.histogram).
    """
    above_lower = event_pT >= lower_bound
    if include_upper:
        below_upper = event_pT <= upper_bound
    else:
        below_upper = event_pT < upper_bound
    return above_lower & below_upper


# Per-event total of feature 0 summed over axis 1.
# NOTE(review): presumably the scalar sum of particle pT per event -- confirm.
total_event_pT = {}

total_event_pT['bkg'] = np.sum(bkg_data[:, :, 0], axis=1)

for alias in sigAliasList:
    total_event_pT[alias] = np.sum(sig_data[alias][:,:,0], axis=1)

# Bin edges in total event pT; bin i spans pTrange[i] .. pTrange[i+1].
pTrange = [0,50,100,150,200,500,1000]
avg_aucs = []
std_aucs = []
avg_ks = []
std_ks = []

# filtered_events[sample][str(upper edge)] -> sub-sample of events in that bin.
filtered_events = {}
filtered_events['bkg'] = {}
for alias in sigAliasList:
    filtered_events[alias] = {}

for i in range(0, len(pTrange)-1):
    lower_bound = pTrange[i]
    upper_bound = pTrange[i+1]
    # Only the final bin includes its upper edge. Previously both edges were
    # inclusive for every bin, so an event sitting exactly on an interior
    # edge was eligible for two neighbouring bins.
    is_last_bin = (i == len(pTrange)-2)

    mask = _pT_bin_mask(total_event_pT['bkg'], lower_bound, upper_bound, is_last_bin)

    # Background is sampled first, then each signal in sigAliasList order;
    # the order matters because all draws share the same random_state.
    filtered_events['bkg'][str(pTrange[i+1])] = randomDataSample(bkg_data[mask],nEvents,random_state)

    for alias in sigAliasList:
        mask = _pT_bin_mask(total_event_pT[alias], lower_bound, upper_bound, is_last_bin)
        filtered_events[alias][str(pTrange[i+1])] = randomDataSample(sig_data[alias][mask],nEvents,random_state)

# Drop the full in-memory arrays now that only the sub-samples are needed.
# NOTE: this makes the cell non-repeatable on its own; re-run the data
# loading cell before running this cell a second time.
del sig_data
del bkg_data

In [9]:
# Hyper-parameter grids handed to SVM_cross_validation.
gamma_list = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 15, 20]
C_list = [0.001, 0.005, 0.01, 0.05, 1, 5, 10, 50, 100]

# Author's note: these results were produced using SVC() with
# probability=False and predict() rather than predict_proba().
#
# For each pT bin and each signal, the printed output is three lines:
# mean/std of the AUCs, of the selected gammas, and of the selected Cs.
for i in range(len(pTrange)-1):
    bin_key = str(pTrange[i+1])

    # One shuffle per bin (seeded by the bin index), reused for every signal.
    np.random.seed(i)
    permutation = np.random.permutation(nEvents*2)

    for alias in sigAliasList:
        # Background events first (label 0), then signal events (label 1),
        # shuffled with the same permutation so labels stay aligned.
        event_list = np.concatenate(
            (filtered_events['bkg'][bin_key], filtered_events[alias][bin_key])
        )[permutation]
        event_labels = np.repeat([0, 1], nEvents)[permutation]

        # Pairwise distances between all events in the shuffled sample.
        distance_matrix = calcOTDistance(event_list, event_list, OTSCHEME, '2D', Matrix = True)

        auc_list, best_gamma_list, best_C_list = SVM_cross_validation(distance_matrix, event_labels, gamma_list, C_list)
        for cv_values in (auc_list, best_gamma_list, best_C_list):
            print(np.mean(cv_values), np.std(cv_values))

100%|██████████| 1000000/1000000 [02:50<00:00, 5872.12it/s]


0.6303405149081824 0.01936360204743351
10.2 6.794115100585212
2.6 1.9595917942265426


100%|██████████| 1000000/1000000 [02:51<00:00, 5837.64it/s]


0.5444127984285997 0.06622861144491755
0.7 0.2449489742783178
22.6 38.856659660861226


100%|██████████| 1000000/1000000 [02:52<00:00, 5805.41it/s]


0.6157855276197932 0.015616934656263569
7.0 2.449489742783178
4.2 1.6000000000000003


100%|██████████| 1000000/1000000 [02:51<00:00, 5821.43it/s]


0.7750932857568176 0.024140940819309398
17.0 4.0
1.0 0.0


100%|██████████| 1000000/1000000 [02:52<00:00, 5796.31it/s]


0.6964910086152898 0.012843045984173497
1.6009999999999998 1.742642820545851
2.6 1.9595917942265426


100%|██████████| 1000000/1000000 [02:52<00:00, 5781.88it/s]


0.6153292029603513 0.06408431310884657
0.8001999999999999 0.3996
20.8 39.6


100%|██████████| 1000000/1000000 [02:52<00:00, 5806.92it/s]


0.5944919096722272 0.05212170640404553
1.8 1.6000000000000003
2.6002 3.720075289560683


100%|██████████| 1000000/1000000 [02:52<00:00, 5794.86it/s]


0.7115222133710833 0.040458109444738155
1.0 0.0
1.0 0.0


100%|██████████| 1000000/1000000 [02:52<00:00, 5784.16it/s]


0.6302056114519298 0.045675514833859586
0.8099999999999999 0.38
1.8 1.6000000000000003


100%|██████████| 1000000/1000000 [02:52<00:00, 5806.47it/s]


0.5397642824836262 0.02868639060740983
1.231 1.920193740225189
13.2 18.465102220134064


100%|██████████| 1000000/1000000 [02:51<00:00, 5818.82it/s]


0.5413249915593005 0.029942327641509028
1.3004000000000002 1.8865206704406925
41.4 47.86898787315228


100%|██████████| 1000000/1000000 [02:52<00:00, 5811.37it/s]


0.6651737604279623 0.046440870268945555
1.8 1.6000000000000003
1.0 0.0


100%|██████████| 1000000/1000000 [02:53<00:00, 5765.62it/s]


0.5982509435618326 0.019377380550728705
1.0 0.0
3.6 3.5552777669262356


100%|██████████| 1000000/1000000 [02:53<00:00, 5765.27it/s]


0.5372287329997312 0.0341793542763526
0.61 0.35832945734337834
22.6 38.85665966086123


100%|██████████| 1000000/1000000 [02:53<00:00, 5779.62it/s]


0.6115124436548282 0.055130815707967636
0.6 0.2
2.8 3.6000000000000005


100%|██████████| 1000000/1000000 [02:54<00:00, 5744.84it/s]


0.636469730343084 0.03313239884638092
0.62 0.34292856398964494
12.6 19.022092419079453


100%|██████████| 1000000/1000000 [02:54<00:00, 5740.09it/s]


0.5822591712503499 0.03318717737023812
2.3 2.2045407685048604
1.0 0.0


100%|██████████| 1000000/1000000 [02:53<00:00, 5754.69it/s]


0.5901046969701162 0.033122303664527716
0.52 0.28565713714171403
2.8 3.6


100%|██████████| 1000000/1000000 [02:54<00:00, 5739.24it/s]


0.6700494587175196 0.03193073439978575
0.502 0.3130750708695919
1.8 1.6


100%|██████████| 1000000/1000000 [02:54<00:00, 5746.52it/s]


0.6106804868201502 0.03914501436455363
0.64 0.440908153700972
1.8 1.6


100%|██████████| 1000000/1000000 [02:54<00:00, 5730.99it/s]


0.7152123109207675 0.04007276061305116
0.7001999999999999 0.3996500469160488
10.8 19.6


100%|██████████| 1000000/1000000 [02:54<00:00, 5724.64it/s]


0.5323381725416881 0.03160765183078647
1.3102 1.8798606756884937
21.6 39.23060030129542


100%|██████████| 1000000/1000000 [02:54<00:00, 5735.31it/s]


0.6125320918765759 0.01999592837451562
0.82 0.36
1.8 1.6000000000000003


100%|██████████| 1000000/1000000 [02:56<00:00, 5673.17it/s]


0.5534120659596762 0.03589229688079254
0.5022 0.4447643870635328
23.4 38.44268460968874
