# Google Drive Preliminaries

# Import libraries

In [14]:
import numpy as np
from numpy.random import RandomState
import numpy.ma as ma

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64
from sklearn import metrics
import itertools

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
import json

In [15]:
sigAliasList    = ['sig_A', 'sig_h0', 'sig_hch', 'sig_LQ']
sigFilenameList = ['Ato4l_lepFilter_13TeV_filtered.h5', 'hToTauTau_13TeV_PU20_filtered.h5', 'hChToTauNu_13TeV_PU20_filtered.h5', 'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5']

In [16]:
#-- Set base directory and data directory path --#
# basePath   = '/Users/hanchengli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
basePath   = '/Users/bobli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/'
dataPath   = 'Data/'

bkgPath    = basePath+dataPath+'background_for_training.h5'
sigPathList = []
for x in sigFilenameList:
  sigPathList.append(basePath+dataPath+x)

# Functions

In [17]:
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions
%run centralFunctions.ipynb

/Users/bobli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions


# Loading Data

In [18]:
# Create a dictionary to store the data
dataDict = {}
dataDict['bkg'] = h5py.File(bkgPath, 'r')

for i in range(len(sigAliasList)):
  alias   = sigAliasList[i]
  sigPath = sigPathList[i]
  dataDict[alias] = h5py.File(sigPath, 'r')

In [19]:
# store data in dictionary as numpy arrays
bkg_data = dataDict['bkg']['Particles'][:, :, 0:3]
sig_data = {}

for alias in sigAliasList:
  sig_data[alias] = dataDict[alias]['Particles'][:, :, 0:3]

KeyboardInterrupt: 

# kNN using distance matrix with 3d ground space

In [None]:
# set basic parameters
nEvents = 1000
random_state = Generator(PCG64(1))
np.random.seed(100)
OTSCHEME = {}
OTSCHEME['normPT'] = False
OTSCHEME['balanced'] = True
OTSCHEME['noZeroPad'] = False
OTSCHEME['individualOT'] = False

In [None]:
events = {}

events['bkg'] = randomDataSample(bkg_data, nEvents, random_state)

neighbor_list = list(range(5, 400,10))
avg_aucs = []
std_aucs = []
avg_ks = []
std_ks = []

# Loop over the signals and sample the data
for alias in sigAliasList:
    events[alias] = randomDataSample(sig_data[alias], nEvents, random_state)

del sig_data, bkg_data

scoreDict = {}

# Loop over the signals and use kNN to classify the data and collect the results
for alias in sigAliasList:
    permutation = np.random.permutation(nEvents * 2)
    
    event_list = np.concatenate((events['bkg'], events[alias]))
    event_labels = np.asarray([0] * nEvents + [1] * nEvents)
    event_list = event_list[permutation]
    event_labels = event_labels[permutation]

    distance_matrix = calcOTDistance(event_list, event_list, OTSCHEME, '3D', Matrix = True)
    
    avg_auc, std_auc, avg_k, std_k, metrics_list = kNN_cross_validation(distance_matrix, event_labels, neighbor_list, k_fold=5)
    print(avg_auc, std_auc, avg_k, std_k)
    avg_aucs.append(avg_auc)
    std_aucs.append(std_auc)
    avg_ks.append(avg_k)
    std_ks.append(std_k)
    scoreDict[alias] = metrics_list

100%|██████████| 4000000/4000000 [02:38<00:00, 25310.53it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 73.59it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 72.48it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 71.32it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 73.44it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 69.34it/s]


0.9020570647076511 0.014260414200471186 25.0 8.94427190999916


100%|██████████| 4000000/4000000 [02:43<00:00, 24426.34it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 72.94it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 75.02it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 73.05it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 73.58it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 71.16it/s]


0.7712851497591762 0.018137049369822508 89.0 26.5329983228432


100%|██████████| 4000000/4000000 [02:56<00:00, 22674.14it/s]
Fitting Models: 100%|██████████| 40/40 [00:01<00:00, 22.01it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 67.08it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 72.08it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 74.24it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 71.39it/s]


0.9197953147848965 0.006547174999609766 193.0 136.14697940094007


100%|██████████| 4000000/4000000 [02:41<00:00, 24697.35it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 72.45it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 71.65it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 72.12it/s] 
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 71.22it/s]
Fitting Models: 100%|██████████| 40/40 [00:00<00:00, 69.75it/s]


0.8766399182331396 0.014149130221591219 93.0 48.744230427815765


In [None]:
print(scoreDict)

{'sig_A': {'repeat0': (0.9075297060662915, array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 2.72596844e-04, 8.32137733e-04, 1.39167862e-03,
       1.95121951e-03, 2.51076040e-03, 3.07030129e-03, 3.62984218e-03,
       4.18938307e-03, 4.74892396e-03, 5.30846485e-03, 5.86800574e-03,
       6.42754663e-03, 6.98708752e-03, 7.54662841e-03, 8.10616930e-03,
       8.66571019e-03, 9.22525108e-03, 9.75609756e-03, 9.75609756e-03,
       9.75609756e-03, 9.75609756e-03, 9.75609756e-03, 9.75609756e-03,
       9.75609756e-03, 9.75609756e-03, 9.75609756e-03, 9.75609756e-03,
       9.75609756e-03, 9.75609756e-03, 9.75609756e-03, 9.75609756e-03,
       9.75609756e-03, 9.75609756e

In [21]:
transposed_dict = {
    col: {row: scoreDict[row][col] for row in scoreDict}
    for col in scoreDict[next(iter(scoreDict))]
}
# Convert NumPy arrays in the lists to regular lists
converted_dict = {
    row: {
        col: [transposed_dict[row][col][0]] + [arr.tolist() for arr in transposed_dict[row][col][1:]]
        for col in transposed_dict[row]
    }
    for row in transposed_dict
}
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/distance_matrix_classification/kNN
with open('kNN_distance_matrix_3D.json', 'w') as json_file:
    json.dump(converted_dict, json_file, indent=4)

/Users/bobli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/distance_matrix_classification/kNN


/Users/bobli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/distance_matrix_classification/kNN
