In [1]:
import numpy as np
from numpy.random import RandomState
import numpy.ma as ma

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
%matplotlib inline

import h5py
import ot
from numpy.random import Generator, PCG64
from sklearn import metrics
import itertools

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [2]:
# Signal samples: short alias -> HDF5 file name. Dict insertion order is
# preserved, so the two derived lists stay index-aligned with each other.
_signalFiles = {
  'sig_A':   'Ato4l_lepFilter_13TeV_filtered.h5',
  'sig_h0':  'hToTauTau_13TeV_PU20_filtered.h5',
  'sig_hch': 'hChToTauNu_13TeV_PU20_filtered.h5',
  'sig_LQ':  'leptoquark_LOWMASS_lepFilter_13TeV_filtered.h5',
}
sigAliasList    = list(_signalFiles.keys())
sigFilenameList = list(_signalFiles.values())

In [3]:
#-- Base directory and data directory; both must end with '/' because the
#-- file paths below are built by plain string concatenation --#
basePath = './'
dataPath = 'Data/'

# Full path of the background file, then one path per signal file
# (same order as sigFilenameList).
bkgPath     = f'{basePath}{dataPath}background_for_training.h5'
sigPathList = [f'{basePath}{dataPath}{fileName}' for fileName in sigFilenameList]

In [37]:
# Execute the helper notebook so that its definitions become available here.
# randomDataSample, calcOTDistance and kNN_cross_validation are used by the
# cells in this notebook but are not defined in it — presumably they come from
# centralFunctions.ipynb (TODO confirm). This must run before any of those cells.
%run centralFunctions.ipynb

In [5]:
# Open every input file read-only and keep the handles keyed by sample alias
# ('bkg' plus each entry of sigAliasList). The handles are left open on
# purpose: the datasets are read from them afterwards.
dataDict = {'bkg': h5py.File(bkgPath, 'r')}

for i, alias in enumerate(sigAliasList):
  # sigPathList is index-aligned with sigAliasList.
  sigPath = sigPathList[i]
  dataDict[alias] = h5py.File(sigPath, 'r')

In [6]:
# Pull the 'Particles' dataset of each sample into memory, keeping only the
# first three entries along the last axis (index 0 is the one later summed as
# pT; the meaning of the other two is not shown here — TODO confirm).
bkg_data = dataDict['bkg']['Particles'][:, :, :3]
sig_data = {
  alias: dataDict[alias]['Particles'][:, :, :3]
  for alias in sigAliasList
}

In [18]:
# Number of events drawn per sample.
nEvents = 500

# Single seeded generator shared by all randomDataSample calls, so the order
# in which the sampling cells run determines which events are drawn.
random_state = Generator(PCG64(123))

# Option flags handed to calcOTDistance (semantics are defined by that helper).
OTSCHEME = {
  'normPT': True,
  'balanced': True,
  'noZeroPad': False,
  'individualOT': False,
}

In [19]:
# Background: per-event sum of component 0 (pT, judging by the variable name)
# over axis 1, then keep events whose sum is at most 50 and draw nEvents of them.
total_event_pT_bkg = bkg_data[:, :, 0].sum(axis=1)
mask = np.less_equal(total_event_pT_bkg, 50)
low_pT_events_bkg = randomDataSample(
  bkg_data[mask],
  nEvents,
  random_state,
)

In [20]:
# sig_A: per-event sum of component 0 (pT, judging by the variable name)
# over axis 1, then keep events whose sum is at most 50 and draw nEvents of them.
sig_A_events = sig_data['sig_A']
total_event_pT_A = sig_A_events[:, :, 0].sum(axis=1)
mask = np.less_equal(total_event_pT_A, 50)
low_pT_events_A = randomDataSample(
  sig_A_events[mask],
  nEvents,
  random_state,
)

In [21]:
# sig_h0: per-event sum of component 0 (pT, judging by the variable name)
# over axis 1, then keep events whose sum is at most 50 and draw nEvents of them.
sig_h0_events = sig_data['sig_h0']
total_event_pT_h0 = sig_h0_events[:, :, 0].sum(axis=1)
mask = np.less_equal(total_event_pT_h0, 50)
low_pT_events_h0 = randomDataSample(
  sig_h0_events[mask],
  nEvents,
  random_state,
)

In [22]:
# sig_hch: per-event sum of component 0 (pT, judging by the variable name)
# over axis 1, then keep events whose sum is at most 50 and draw nEvents of them.
sig_hch_events = sig_data['sig_hch']
total_event_pT_hch = sig_hch_events[:, :, 0].sum(axis=1)
mask = np.less_equal(total_event_pT_hch, 50)
low_pT_events_hch = randomDataSample(
  sig_hch_events[mask],
  nEvents,
  random_state,
)

In [23]:
# sig_LQ: per-event sum of component 0 (pT, judging by the variable name)
# over axis 1, then keep events whose sum is at most 50 and draw nEvents of them.
sig_LQ_events = sig_data['sig_LQ']
total_event_pT_LQ = sig_LQ_events[:, :, 0].sum(axis=1)
mask = np.less_equal(total_event_pT_LQ, 50)
low_pT_events_LQ = randomDataSample(
  sig_LQ_events[mask],
  nEvents,
  random_state,
)

In [24]:
# Build the labelled background-vs-sig_A sample: label 0 = background, 1 = signal.
# Sizes are taken from the sampled arrays themselves rather than from nEvents,
# so labels and permutation always match the events actually present (the
# result is unchanged when both samples contain exactly nEvents rows).
n_bkg, n_sig = len(low_pT_events_bkg), len(low_pT_events_A)
bkg_A_event_list = np.concatenate((low_pT_events_bkg, low_pT_events_A))
bkg_A_labels = np.repeat([0, 1], [n_bkg, n_sig])

# Shuffle events and labels with one shared permutation so they stay aligned.
# The legacy global seed is kept deliberately: it makes the shuffle
# reproducible and leaves the global NumPy RNG in the same state as before for
# any helper that may draw from it afterwards.
np.random.seed(1)
permutation = np.random.permutation(n_bkg + n_sig)
bkg_A_event_list = bkg_A_event_list[permutation]
bkg_A_labels = bkg_A_labels[permutation]

# Pairwise distances of the shuffled sample against itself (Matrix=True);
# expensive — the recorded run took about a minute for 1000 x 1000 pairs.
bkg_A_dm = calcOTDistance(bkg_A_event_list, bkg_A_event_list, OTSCHEME, '2D', Matrix=True)

100%|██████████| 1000000/1000000 [01:09<00:00, 14458.29it/s]


In [25]:
# Sanity check: size of the distance matrix along its first axis. The recorded
# output is 1000, i.e. 2 * nEvents (background + sig_A events).
bkg_A_dm.shape[0]

1000

In [38]:
# Candidate neighbour counts for the kNN scan: 5, 10, then steps of 2 from 12
# to 20, then 25, 30, then steps of 10 from 40 to 90.
neighbor_list = [5, 10, *range(12, 21, 2), 25, 30, *range(40, 91, 10)]

# 5-fold cross-validation on the precomputed distance matrix. The four return
# values are presumably mean/std of the AUC and of the selected k across
# folds — TODO confirm against kNN_cross_validation.
avg_auc, std_auc, avg_k, std_k = kNN_cross_validation(
  bkg_A_dm,
  bkg_A_labels,
  neighbor_list,
  k_fold=5,
  AUC_list=True,
)
print(avg_auc, std_auc, avg_k, std_k)

Fitting Models: 100%|██████████| 15/15 [00:00<00:00, 272.26it/s]
Fitting Models: 100%|██████████| 15/15 [00:00<00:00, 453.42it/s]
Fitting Models: 100%|██████████| 15/15 [00:00<00:00, 432.90it/s]
Fitting Models: 100%|██████████| 15/15 [00:00<00:00, 473.75it/s]
Fitting Models: 100%|██████████| 15/15 [00:00<00:00, 435.33it/s]

0.5233987845918623 0.05933125774638761 41.0 21.071307505705477





In [31]:
# Scratch check: prints the range object's repr ("range(0, 5)"), not its elements.
print(range(5))

range(0, 5)
