# Google Drive Preliminaries

# Import libraries

In [5]:
import numpy as np
from numpy.random import RandomState
import numpy.ma as ma

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64
from sklearn import metrics
import itertools

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [6]:
# Signal samples as (alias, HDF5 filename) pairs; the two parallel lists
# below are derived from this single table so they cannot drift out of sync.
_signalSamples = [
    ('sig_A',   'Ato4l_lepFilter_13TeV_filtered.h5'),
    ('sig_h0',  'hToTauTau_13TeV_PU20_filtered.h5'),
    ('sig_hch', 'hChToTauNu_13TeV_PU20_filtered.h5'),
    ('sig_LQ',  'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5'),
]
sigAliasList    = [sampleAlias for sampleAlias, _ in _signalSamples]
sigFilenameList = [sampleFile for _, sampleFile in _signalSamples]

In [19]:
#-- Locations of the input HDF5 files --#
# NOTE(review): basePath is an absolute path on one user's machine; anyone
# else running this notebook has to edit it.
basePath   = '/Users/hanchengli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
dataPath   = 'Data/'

# Full path of the background sample.
bkgPath = basePath + dataPath + 'background_for_training.h5'

# Full path of every signal file, in the same order as sigFilenameList.
sigPathList = [basePath + dataPath + sigFilename for sigFilename in sigFilenameList]

# Functions

In [20]:
# Change the kernel's working directory to the folder that holds the shared
# helper notebook, so the relative %run below can find it. The new working
# directory stays in effect for the rest of the session.
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions
# Execute the helper notebook. Presumably this is where randomDataSample,
# calcOTDistance and kNN_cross_validation (used in later cells) are
# defined -- TODO confirm.
%run centralFunctions.ipynb

/Users/hanchengli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions


# Loading Data

In [21]:
# Open every input file read-only, keyed by sample name ('bkg' plus each
# signal alias). The handles are left open: the next cell reads datasets
# directly from them.
dataDict = {'bkg': h5py.File(bkgPath, 'r')}

for sigAlias, sigFilePath in zip(sigAliasList, sigPathList):
  dataDict[sigAlias] = h5py.File(sigFilePath, 'r')

FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'Users/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/Data/background_for_training.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [6]:
# Read the 'Particles' dataset of each sample, keeping only indices 0..2 of
# the last axis (presumably the three kinematic features per particle --
# TODO confirm against the dataset description).
_featureSlice = np.s_[:, :, 0:3]

bkg_data = dataDict['bkg']['Particles'][_featureSlice]
sig_data = {
  sigAlias: dataDict[sigAlias]['Particles'][_featureSlice]
  for sigAlias in sigAliasList
}

# kNN using distance matrix with 3D ground space

In [7]:
# Number of events drawn per sample (background and each signal).
nEvents = 500

# Seeded generator handed to randomDataSample so the draws are repeatable.
random_state = Generator(PCG64(1))

# Options forwarded to calcOTDistance.
OTSCHEME = {
    'normPT': True,
    'balanced': True,
    'noZeroPad': False,
    'individualOT': False,
}

In [8]:
# Draw the background sample once; it is reused against every signal.
events = {'bkg': randomDataSample(bkg_data, nEvents, random_state)}

# Candidate neighbour counts for the kNN scan: 5, 15, ..., 395.
neighbor_list = list(range(5, 400, 10))

# Cross-validation summaries, one entry per signal in sigAliasList order.
avg_aucs, std_aucs = [], []
avg_ks, std_ks = [], []

# Labels before shuffling: nEvents background (0) followed by nEvents signal (1).
ordered_labels = np.repeat([0, 1], nEvents)

# Each signal gets its own shuffle seed: 3 for the first, 4 for the second, ...
for shuffle_seed, alias in enumerate(sigAliasList, start=3):
    events[alias] = randomDataSample(sig_data[alias], nEvents, random_state)

    # The shuffle uses NumPy's legacy global RNG, not random_state.
    np.random.seed(shuffle_seed)
    shuffle_idx = np.random.permutation(nEvents * 2)

    # Stack background + signal, then apply the same shuffle to events and labels.
    event_list = np.concatenate((events['bkg'], events[alias]))[shuffle_idx]
    event_labels = ordered_labels[shuffle_idx]

    # Pairwise distances between all shuffled events.
    # NOTE(review): the section heading mentions a three-dimensional ground
    # space while '2D' is passed here -- confirm which one is intended.
    distance_matrix = calcOTDistance(event_list, event_list, OTSCHEME, '2D', Matrix = True)

    avg_auc, std_auc, avg_k, std_k = kNN_cross_validation(
        distance_matrix, event_labels, neighbor_list, k_fold=5
    )
    print(avg_auc, std_auc, avg_k, std_k)

    for collected, value in zip(
        (avg_aucs, std_aucs, avg_ks, std_ks),
        (avg_auc, std_auc, avg_k, std_k),
    ):
        collected.append(value)

100%|██████████| 4000000/4000000 [02:31<00:00, 26471.55it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 78.74it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 81.22it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 78.57it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 78.24it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 77.92it/s] 


0.9004593599544226 0.007165193436908108 33.0 11.661903789690601


100%|██████████| 4000000/4000000 [02:30<00:00, 26643.74it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 79.94it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 78.56it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 79.80it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 79.67it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 70.77it/s]


0.7685846173333186 0.01419185335688538 53.0 20.396078054371138


100%|██████████| 4000000/4000000 [02:36<00:00, 25588.77it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 68.98it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 79.32it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 82.60it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 78.93it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 80.32it/s] 


0.9341840740483727 0.006266176281982134 185.0 109.17875251164945


100%|██████████| 4000000/4000000 [02:33<00:00, 26028.59it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 79.02it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 76.77it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 78.13it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 75.28it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 73.84it/s]


0.8755415378400864 0.005315485084715778 115.0 33.46640106136302
