# Prep Work

## Import libraries

In [1]:
import json
import numpy as np
import h5py
from numpy.random import Generator, PCG64
import sys
sys.path.insert(0, '/Users/bobli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/functions/')
from centralFunctions import *

In [2]:
# Each signal sample is described once as (alias, HDF5 file name); the two
# parallel lists used by the rest of the notebook are derived from these pairs
# so an alias and its file cannot drift out of step.
_signalSamples = [
    ('sig_A',   'Ato4l_lepFilter_13TeV_filtered.h5'),
    ('sig_h0',  'hToTauTau_13TeV_PU20_filtered.h5'),
    ('sig_hch', 'hChToTauNu_13TeV_PU20_filtered.h5'),
    ('sig_LQ',  'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5'),
]
sigAliasList    = [alias for alias, _fileName in _signalSamples]
sigFilenameList = [fileName for _alias, fileName in _signalSamples]

In [3]:
#-- Project root and the data folder inside it --#
# NOTE(review): basePath is machine-specific; edit it when running elsewhere.
basePath   = '/Users/bobli/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/'
dataPath   = 'data/ADC2021/'

# Both fragments end in '/', so plain concatenation yields valid paths.
dataDir = basePath + dataPath

# Full path of the background file and, in sigFilenameList order, of every
# signal file.
bkgPath     = dataDir + 'background_for_training.h5'
sigPathList = [dataDir + fileName for fileName in sigFilenameList]

## Loading Data

In [4]:
# Open every input file read-only and index the handles by sample name:
# 'bkg' for the background file, then one entry per signal alias.
# sigAliasList and sigPathList are parallel lists, so they are walked together.
dataDict = {'bkg': h5py.File(bkgPath, 'r')}

for alias, sigPath in zip(sigAliasList, sigPathList):
    dataDict[alias] = h5py.File(sigPath, 'r')

In [5]:
# Read the 'Particles' dataset of every sample into memory, keeping only
# indices 0..2 of the last axis (the "3D" inputs used below).
# Slicing an h5py dataset returns a NumPy array, so the results no longer
# depend on the open files.
bkg_data = dataDict['bkg']['Particles'][:, :, 0:3]
sig_data = {
    alias: dataDict[alias]['Particles'][:, :, 0:3]
    for alias in sigAliasList
}

# Release the HDF5 handles now that everything needed has been copied out;
# previously they stayed open for the lifetime of the kernel.
# To re-run this cell, re-run the cell that opens the files first.
for h5_file in dataDict.values():
    h5_file.close()

# One Class SVM on 3D Ground Space

In [6]:
# Number of events in the training set and in each evaluation chunk.
train_size = 10000
test_size = 1000

# Two separate random sources are seeded: NumPy's legacy global RNG, and a
# dedicated PCG64-backed Generator that is handed explicitly to the sampling
# helper further down.
np.random.seed(100)
random_state = Generator(PCG64(1))

# Option flags forwarded to the optimal-transport helpers.
# NOTE(review): the meaning of each flag is defined in centralFunctions.
OTSCHEME = {
    'normPT': False,
    'balanced': True,
    'noZeroPad': False,
    'individualOT': False,
}

In [7]:
# Draw the event samples. Background supplies the training set plus six
# test-sized chunks; every signal supplies six test-sized chunks.
# The calls are made background first, then signals in sigAliasList order,
# so random_state is consumed in a fixed sequence.
bkg_events = randomDataSample(bkg_data, train_size + 6 * test_size, random_state)
sig_events = {
    alias: randomDataSample(sig_data[alias], 6 * test_size, random_state)
    for alias in sigAliasList
}

# Training uses background only.
train_events = bkg_events[:train_size]

# Validation set: the sixth (last) chunk of every class, background first,
# flattened into a single array of events.
val_start = 5 * test_size
val_chunks = [bkg_events[train_size + val_start:]]
val_chunks += [sig_events[alias][val_start:] for alias in sigAliasList]
val_events = np.asarray([event for chunk in val_chunks for event in chunk])

# Test sets: chunks 0..4, kept as one dict per repeat keyed by 'bkg' and by
# each signal alias.
test_events = []
for i in range(5):
    lo, hi = test_size * i, test_size * (i + 1)
    fold = {'bkg': bkg_events[train_size + lo:train_size + hi]}
    for alias in sigAliasList:
        fold[alias] = sig_events[alias][lo:hi]
    test_events.append(fold)

# The full datasets are not needed once the samples are drawn.
del bkg_data, sig_data


In [8]:
# Train-vs-train matrix consumed by the One-Class SVM cells below. It is
# cached on disk, and the path is derived from basePath so the project
# location is configured in one place only.
filePath = basePath + 'data/train_matrix_10k.npy'
try:
    train_matrix = np.load(filePath)
except FileNotFoundError:
    # Cache miss: rebuild with the call that was kept (commented out) next to
    # this load, then store the result for the next run.
    # NOTE(review): presumably this is how the cached file was produced, and
    # it is likely slow for 10k events -- confirm before relying on it.
    print('Cached training matrix not found at ' + filePath + '; recomputing.')
    train_matrix = calcOTDistance(train_events, train_events, OTSCHEME, '3D', Matrix=True)
    np.save(filePath, train_matrix)

In [9]:
# Hyper-parameters: nu is fixed, gamma is picked from a small grid using the
# validation set.
nu = 0.2
gamma_list = [0.05, 0.15, 0.25, 0.35, 0.45, 0.55]

# Validation labels follow the layout of val_events: one background chunk
# (+1) followed by four signal chunks (-1).
val_labels = np.asarray([1] * test_size + [-1] * (test_size * 4))

# Validation-vs-train matrix; computed once and reused for every gamma.
val_matrix = parallel_OT_non_square(val_events, train_events, OTSCHEME, '3D')

# Record the validation AUC of each candidate; only the AUC is used here.
auc_list = []
for gamma_candidate in gamma_list:
    val_auc, _val_f1, _val_roc = OneClassSVM_with_distance_matrix(
        train_matrix, val_matrix, val_labels, gamma_candidate, nu
    )
    auc_list.append(val_auc)

# Keep the gamma with the highest AUC (the first one on ties).
max_index = auc_list.index(max(auc_list))
gamma = gamma_list[max_index]

In [10]:
# Labels for one full test set: a background chunk (+1) followed by the four
# signal chunks (-1), in sigAliasList order.
test_labels = np.asarray([1] * test_size + [-1] * (test_size * 4))

best_aucs = {}

# Result tables: one row per repeat, one column per signal.
aucs = np.zeros((5, 4))
f1_scores = np.zeros((5, 4))

# ROC_metrics['repeat<j>'][alias] holds the third value returned by the SVM
# helper for that repeat and signal.
ROC_metrics = {}

bkg_rows = slice(0, test_size)

for j in range(5):
    repeat_key = 'repeat' + str(j)
    ROC_metrics[repeat_key] = {}

    # Stack background and all signals of this repeat, then compute the
    # test-vs-train matrix once for the whole stack.
    test_set = np.asarray([
        event
        for sample_key in ['bkg'] + sigAliasList
        for event in test_events[j][sample_key]
    ])
    test_matrix = parallel_OT_non_square(test_set, train_events, OTSCHEME, '3D')

    for i, alias in enumerate(sigAliasList):
        # Evaluate background against one signal at a time: take the
        # background rows plus the rows of the i-th signal.
        sig_rows = slice(test_size * (i + 1), test_size * (i + 2))
        specific_test_labels = np.concatenate((test_labels[bkg_rows], test_labels[sig_rows]))
        specific_test_matrix = np.concatenate((test_matrix[bkg_rows, :], test_matrix[sig_rows, :]))

        auc, f1_score, roc_entry = OneClassSVM_with_distance_matrix(
            train_matrix, specific_test_matrix, specific_test_labels, gamma, nu
        )
        ROC_metrics[repeat_key][alias] = roc_entry
        aucs[j, i] = auc
        f1_scores[j, i] = f1_score

In [11]:
# Summarise the AUCs per signal: mean and (population) standard deviation
# over the repeats, i.e. over the rows of `aucs`.
auc_mean_per_signal = aucs.mean(axis=0)
auc_std_per_signal = aucs.std(axis=0)

mean_aucs = dict(zip(sigAliasList, auc_mean_per_signal))
std_aucs = dict(zip(sigAliasList, auc_std_per_signal))

In [12]:
# Same per-signal summary for the F1 scores: the column-wise statistics are
# computed once and then keyed by signal alias.
f1_column_means = np.mean(f1_scores, axis=0)
f1_column_stds = np.std(f1_scores, axis=0)

mean_f1_scores = {alias: f1_column_means[i] for i, alias in enumerate(sigAliasList)}
std_f1_scores = {alias: f1_column_stds[i] for i, alias in enumerate(sigAliasList)}

In [13]:
transposed_dict = {
    col: {row: ROC_metrics[row][col] for row in ROC_metrics}
    for col in ROC_metrics[next(iter(ROC_metrics))]
}
# Convert NumPy arrays in the lists to regular lists
converted_dict = {
    row: {
        col: [transposed_dict[row][col][0]] + [arr.tolist() for arr in transposed_dict[row][col][1:]]
        for col in transposed_dict[row]
    }
    for row in transposed_dict
}
%cd ~/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/experiments/OT_ML/OT_oneClassSVM
with open('OneClassSVM.json', 'w') as json_file:
    json.dump(converted_dict, json_file, indent=4)

/Users/bobli/Library/CloudStorage/Dropbox/AnomalyDetection/OnML4Jets2021DataChallenge/anomaly_detection_code/experiments/OT_ML/OT_oneClassSVM


In [14]:
# Export the F1 table as nested dicts: signal alias -> 'repeat<j>' -> score.
# .tolist() turns each NumPy scalar into a plain Python float for json.
f1_scores_dict = {
    alias: {'repeat' + str(j): f1_scores[j, i].tolist() for j in range(5)}
    for i, alias in enumerate(sigAliasList)
}

# Written relative to the current working directory.
with open('OneClassSVM_f1_scores.json', 'w') as json_file:
    json.dump(f1_scores_dict, json_file, indent=4)