In [1]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer

import matplotlib.pyplot as plt
tf.get_logger().setLevel('ERROR')

import pprint
pp = pprint.PrettyPrinter(indent=4)
import math
import numpy as np
import sklearn.metrics as sk 
import pickle

from scipy import stats
from scipy.spatial import distance

from tensorflow import keras

from sklearn.model_selection import train_test_split

from scipy.stats import entropy

from tqdm import tqdm

In [2]:
if os.path.exists('20news-18828'):
  shutil.rmtree('20news-18828')

In [3]:
url = 'http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz'

dataset = tf.keras.utils.get_file('20news-18828.tar.gz', url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), '20news')
train_dir = os.path.join(dataset_dir, 'train')

In [4]:
rootPath = '20news-18828'
files = os.listdir(rootPath)

all_classes = {}
for ind in range(len(files)):
  all_classes[ind] = files[ind]

all_classes

{0: 'comp.sys.mac.hardware',
 1: 'comp.os.ms-windows.misc',
 2: 'rec.motorcycles',
 3: 'talk.politics.misc',
 4: 'comp.graphics',
 5: 'talk.politics.mideast',
 6: 'sci.med',
 7: 'rec.sport.baseball',
 8: 'talk.religion.misc',
 9: 'sci.electronics',
 10: 'comp.windows.x',
 11: 'sci.crypt',
 12: 'talk.politics.guns',
 13: 'rec.sport.hockey',
 14: 'rec.autos',
 15: 'sci.space',
 16: 'comp.sys.ibm.pc.hardware',
 17: 'soc.religion.christian',
 18: 'misc.forsale',
 19: 'alt.atheism'}

In [5]:
# models_path = "outputs/"
# pickle_file_name = models_path + 'all_classes'
# out_file = open(pickle_file_name,"wb")
# pickle.dump(all_classes, out_file)
# out_file.close()

# Set the experiment

In [6]:
models_path = "outputs/"
pickle_file_name = models_path + 'all_classes'
out_file = open(pickle_file_name,"rb")
all_classes = pickle.load(out_file)
out_file.close()

In [7]:
all_classes

{0: 'comp.sys.mac.hardware',
 1: 'comp.os.ms-windows.misc',
 2: 'rec.motorcycles',
 3: 'talk.politics.misc',
 4: 'comp.graphics',
 5: 'talk.politics.mideast',
 6: 'sci.med',
 7: 'rec.sport.baseball',
 8: 'talk.religion.misc',
 9: 'sci.electronics',
 10: 'comp.windows.x',
 11: 'sci.crypt',
 12: 'talk.politics.guns',
 13: 'rec.sport.hockey',
 14: 'rec.autos',
 15: 'sci.space',
 16: 'comp.sys.ibm.pc.hardware',
 17: 'soc.religion.christian',
 18: 'misc.forsale',
 19: 'alt.atheism'}

In [8]:
first_10_classes_dict = {}
for ind in all_classes:
  if ind <= 9:
    first_10_classes_dict[ind] = all_classes[ind]

id_classes_ind = [0, 1, 2, 3, 4, 5, 6, 7, 8]
anom_classes_ind = [9]

id_classes_dict = {}
for ind in all_classes:
  if ind in id_classes_ind:
    id_classes_dict[ind] = first_10_classes_dict[ind]

anom_classes_dict = {}
for ind in all_classes:
  if ind in anom_classes_ind:
    anom_classes_dict[ind] = first_10_classes_dict[ind]

print("first_10_classes_dict")
pp.pprint(first_10_classes_dict)
print("id_classes_dict")
pp.pprint(id_classes_dict)
print("anom_classes_dict")
pp.pprint(anom_classes_dict)

id_classes_whole = list(id_classes_dict.values())
anom_classes = list(anom_classes_dict.values())

print("\nID Classes\n------------")
pp.pprint(id_classes_whole)
print("Anomalous Classes\n------------")
pp.pprint(anom_classes)

anom_class_name_for_path = "anom_%d" % (anom_classes_ind[0])
anom_class_name_for_path

first_10_classes_dict
{   0: 'comp.sys.mac.hardware',
    1: 'comp.os.ms-windows.misc',
    2: 'rec.motorcycles',
    3: 'talk.politics.misc',
    4: 'comp.graphics',
    5: 'talk.politics.mideast',
    6: 'sci.med',
    7: 'rec.sport.baseball',
    8: 'talk.religion.misc',
    9: 'sci.electronics'}
id_classes_dict
{   0: 'comp.sys.mac.hardware',
    1: 'comp.os.ms-windows.misc',
    2: 'rec.motorcycles',
    3: 'talk.politics.misc',
    4: 'comp.graphics',
    5: 'talk.politics.mideast',
    6: 'sci.med',
    7: 'rec.sport.baseball',
    8: 'talk.religion.misc'}
anom_classes_dict
{9: 'sci.electronics'}

ID Classes
------------
[   'comp.sys.mac.hardware',
    'comp.os.ms-windows.misc',
    'rec.motorcycles',
    'talk.politics.misc',
    'comp.graphics',
    'talk.politics.mideast',
    'sci.med',
    'rec.sport.baseball',
    'talk.religion.misc']
Anomalous Classes
------------
['sci.electronics']


'anom_9'

In [9]:
# Test has all class labels 0 - 9. Further separated by Test_OOD
testPath = rootPath + "/test"
if os.path.exists(testPath):
  shutil.rmtree(testPath)
os.mkdir(testPath)

for className in files:
  if className in first_10_classes_dict.values() and className not in anom_classes:
    path = os.path.join(rootPath, className)
    classTestPath = testPath+"/"+className
    os.mkdir(classTestPath)
    idFiles = os.listdir(path)
    lenIdFiles = len(idFiles)
    testLen = math.floor(0.2 * lenIdFiles)
    print(className, lenIdFiles, testLen)
    for ind, idFile in enumerate(idFiles):
      if ind <= testLen:
        shutil.move(os.path.join(rootPath, className, idFile), classTestPath)

comp.sys.mac.hardware 961 192
comp.os.ms-windows.misc 985 197
rec.motorcycles 994 198
talk.politics.misc 775 155
comp.graphics 973 194
talk.politics.mideast 940 188
sci.med 990 198
rec.sport.baseball 994 198
talk.religion.misc 628 125


In [10]:
# Validation has all class labels except for anomalous class 
# Here, 0 - 8
testPath = rootPath + "/val"
if os.path.exists(testPath):
  shutil.rmtree(testPath)
os.mkdir(testPath)

for className in files:
  if className in id_classes_whole:
    path = os.path.join(rootPath, className)
    classTestPath = testPath+"/"+className
    os.mkdir(classTestPath)
    idFiles = os.listdir(path)
    lenIdFiles = len(idFiles)
    testLen = math.floor(0.2 * lenIdFiles)
    print(className, lenIdFiles, testLen)
    for ind, idFile in enumerate(idFiles):
      if ind <= testLen:
        shutil.move(os.path.join(rootPath, className, idFile), classTestPath)

comp.sys.mac.hardware 768 153
comp.os.ms-windows.misc 787 157
rec.motorcycles 795 159
talk.politics.misc 619 123
comp.graphics 778 155
talk.politics.mideast 751 150
sci.med 791 158
rec.sport.baseball 795 159
talk.religion.misc 502 100


In [11]:
# Train has all class labels except for anomalous class 
# Here, 0 - 8
testPath = rootPath + "/train"
if os.path.exists(testPath):
  shutil.rmtree(testPath)
os.mkdir(testPath)

for className in files:
  if className in id_classes_whole:
    path = os.path.join(rootPath, className)
    classTestPath = testPath+"/"+className
    os.mkdir(classTestPath)
    idFiles = os.listdir(path)
    lenIdFiles = len(idFiles)
    # testLen = math.floor(0.2 * lenIdFiles)
    print(className, lenIdFiles)
    for idFile in idFiles:
      shutil.move(os.path.join(rootPath, className, idFile), classTestPath)

comp.sys.mac.hardware 614
comp.os.ms-windows.misc 629
rec.motorcycles 635
talk.politics.misc 495
comp.graphics 622
talk.politics.mideast 600
sci.med 632
rec.sport.baseball 635
talk.religion.misc 401


In [12]:
# Test_OOD separates anomalous classes in the test.
# Test contains 0 - 8
# Test_OOD contains 9
# files = os.listdir(rootPath + "/test")

testPath = rootPath + "/test_ood"
if os.path.exists(testPath):
  shutil.rmtree(testPath)
os.mkdir(testPath)

for className in files:
  if className in anom_classes:
    path = os.path.join(rootPath, className)
    classTestPath = testPath+"/"+className
    os.mkdir(classTestPath)
    idFiles = os.listdir(path)
    lenIdFiles = len(idFiles)
    # testLen = math.floor(0.2 * lenIdFiles)
    # print(className, lenIdFiles, testLen)
    for idFile in idFiles:
      shutil.move(os.path.join(rootPath, className, idFile), classTestPath)

In [13]:
def update_files_to_txt(id_ood_path):
  print("Files in folder '%s' are renamed as .txt" % id_ood_path)
  files = os.listdir(id_ood_path)
  
  for i in files:
    path = os.path.join(id_ood_path, i)
    files = os.listdir(path)  
    for index, file in enumerate(files):
      os.rename(os.path.join(path, file), os.path.join(path, ''.join([str(index), '.txt'])))

In [14]:
update_files_to_txt('20news-18828/test')
update_files_to_txt('20news-18828/test_ood')
update_files_to_txt('20news-18828/val')
update_files_to_txt('20news-18828/train')

Files in folder '20news-18828/test' are renamed as .txt
Files in folder '20news-18828/test_ood' are renamed as .txt
Files in folder '20news-18828/val' are renamed as .txt
Files in folder '20news-18828/train' are renamed as .txt


In [15]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '20news-18828/test',
    batch_size=batch_size)

test_class_names = test_ds.class_names
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 1654 files belonging to 9 classes.


In [16]:
test_class_names

['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.med',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [17]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

test_ood_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '20news-18828/test_ood',
    batch_size=batch_size)

test_ood_class_names = test_ood_ds.class_names
test_ood_ds = test_ood_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 981 files belonging to 1 classes.


In [18]:
test_ood_class_names

['sci.electronics']

In [19]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '20news-18828/train',
    batch_size=batch_size)

train_class_names = train_ds.class_names
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 5263 files belonging to 9 classes.


In [20]:
train_class_names

['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.med',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [21]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '20news-18828/val',
    batch_size=batch_size)

val_class_names = val_ds.class_names
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 1323 files belonging to 9 classes.


In [22]:
val_class_names

['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.med',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [23]:
# Checking for class labels
def get_set_labels(ds):
  labels = []
  for _, label in ds:
    labels.extend(label.numpy())
  return set(labels)

print("Train labels", get_set_labels(train_ds))
print("Val labels", get_set_labels(val_ds))
print("Test ID labels", get_set_labels(test_ds))
print("Test OOD labels", get_set_labels(test_ood_ds))

Train labels {0, 1, 2, 3, 4, 5, 6, 7, 8}
Val labels {0, 1, 2, 3, 4, 5, 6, 7, 8}
Test ID labels {0, 1, 2, 3, 4, 5, 6, 7, 8}
Test OOD labels {0}


In [24]:
# Class labels and class names
def get_class_names(class_names):
  for ind, name in enumerate(class_names):
    print(ind, name)

print("Train class names")
get_class_names(train_class_names)
print("\nVal class names")
get_class_names(val_class_names)
print("\nTest ID class names")
get_class_names(test_class_names)
print("\nTest OOD class names")
get_class_names(test_ood_class_names)

Train class names
0 comp.graphics
1 comp.os.ms-windows.misc
2 comp.sys.mac.hardware
3 rec.motorcycles
4 rec.sport.baseball
5 sci.med
6 talk.politics.mideast
7 talk.politics.misc
8 talk.religion.misc

Val class names
0 comp.graphics
1 comp.os.ms-windows.misc
2 comp.sys.mac.hardware
3 rec.motorcycles
4 rec.sport.baseball
5 sci.med
6 talk.politics.mideast
7 talk.politics.misc
8 talk.religion.misc

Test ID class names
0 comp.graphics
1 comp.os.ms-windows.misc
2 comp.sys.mac.hardware
3 rec.motorcycles
4 rec.sport.baseball
5 sci.med
6 talk.politics.mideast
7 talk.politics.misc
8 talk.religion.misc

Test OOD class names
0 sci.electronics


In [25]:
all_test_class_names = test_class_names + test_ood_class_names

models_path = "outputs/" + anom_class_name_for_path + "/"

try:
  os.mkdir(models_path)
except:
  print(models_path, "is already present.")
  pass

pickle_file_name = models_path + 'all_test_class_names'
out_file = open(pickle_file_name,"wb")
pickle.dump(all_test_class_names, out_file)
out_file.close()

In [26]:
class InOutData:
  def __init__(self, leave_out_ind, id_classes_whole): 
    self.leave_out_class = id_classes_whole[leave_out_ind]
    self.id_classes_whole = id_classes_whole
    self.id_classes, self.ood_classes = self.get_id_ood_classes()

    # do the steps for train ds
    self.root_path = '20news-18828/train'
    self.reset_id_ood_folders()
    self.divide_id_ood_folders()
    self.id_path = self.root_path + '/id_classes'
    # self.ood_path = self.root_path + '/ood_classes'
    self.update_files_to_txt(self.id_path)
    # self.update_files_to_txt(self.ood_path)
    self.train_ds, self.train_class_names = self.get_ds(self.id_path)

    # do the steps for val ds
    self.root_path = '20news-18828/val'
    self.reset_id_ood_folders()
    self.divide_id_ood_folders()
    self.id_path = self.root_path + '/id_classes'
    self.ood_path = self.root_path + '/ood_classes'
    self.update_files_to_txt(self.id_path)
    self.update_files_to_txt(self.ood_path)
    self.val_ds, self.val_class_names = self.get_ds(self.id_path)
    self.ood_ds, self.ood_class_names = self.get_ds(self.ood_path)

  def get_id_ood_classes(self):
    print("Fetching ID and OOD classes")
    id_classes, ood_classes = [], []
    for i in self.id_classes_whole:
      if i not in self.leave_out_class:
        id_classes.append(i)
      else:
        ood_classes.append(i)
    print("ID Classes")
    for name in id_classes:
      ind = id_classes_whole.index(name)
      print(ind, name)
    print("OOD Classes")
    for name in ood_classes:
      ind = id_classes_whole.index(name)
      print(ind, name)
    return id_classes, ood_classes

  def reset_id_ood_folders(self):
    print("Resetting ID and OOD folders")
    if os.path.exists(self.root_path + "/id_classes"):
      shutil.rmtree(self.root_path + "/id_classes")

    if os.path.exists(self.root_path + "/ood_classes"):
      shutil.rmtree(self.root_path + "/ood_classes")
      
    os.mkdir(self.root_path + "/id_classes")
    os.mkdir(self.root_path + "/ood_classes")
  
  def divide_id_ood_folders(self):
    print("Dividing dataset as ID and OOD")
    files = os.listdir(self.root_path)

    id_ood_str = ""

    for i in files:
      if i not in ["id_classes", "ood_classes"]:
        if i in self.id_classes:
          id_ood_str = "id_classes"
        elif i in self.ood_classes:
          id_ood_str = "ood_classes"
        else:
          id_ood_str = ""
        path = os.path.join(self.root_path, id_ood_str, i)

      if id_ood_str != "":
        try:
          os.mkdir(path)
        except:
          pass

      orPath = os.path.join(self.root_path, i)

      files_ = os.listdir(orPath)
      for i in files_:
        try:
          shutil.copy(os.path.join(orPath, i), path)
        except:
          pass

  def update_files_to_txt(self, id_ood_path):
      print("Files in folder '%s' are renamed as .txt" % id_ood_path)
      files = os.listdir(id_ood_path)

      for i in files:
        path = os.path.join(id_ood_path, i)
        files = os.listdir(path)  
        for index, file in enumerate(files):
          os.rename(os.path.join(path, file), os.path.join(path, ''.join([str(index), '.txt'])))

  def get_ds(self, path):
    AUTOTUNE = tf.data.AUTOTUNE
    batch_size = 32
    seed = 42

    ds = tf.keras.preprocessing.text_dataset_from_directory(
        path,
        batch_size=batch_size)

    class_names = ds.class_names
    ds = ds.cache().prefetch(buffer_size=AUTOTUNE)

    return ds, class_names

In [27]:
class BERTModel:
  def __init__(self, train_ds):
    self.tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
    self.tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
    self.bert_preprocess_model = hub.KerasLayer(self.tfhub_handle_preprocess)
    self.bert_model = hub.KerasLayer(self.tfhub_handle_encoder)
    self.train_ds = train_ds
    
    self.num_of_classes = 8 # after leaving out one class, only 8 classes left
    self.classifier_model = self.build_classifier_model()

    self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    self.metrics = tf.metrics.SparseCategoricalAccuracy()
    self.optimizer = self.build_optimizer()

    self.classifier_model.compile(
        optimizer=self.optimizer,
        loss=self.loss,
        metrics=self.metrics,
      )


  def build_classifier_model(self):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(self.tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(self.tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(self.num_of_classes, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

  def build_optimizer(self):
    epochs = 10
    steps_per_epoch = tf.data.experimental.cardinality(self.train_ds).numpy()
    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = int(0.1*num_train_steps)
    # print(num_train_steps)
    init_lr = 3e-5
    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                              num_train_steps=num_train_steps,
                                              num_warmup_steps=num_warmup_steps,
                                              optimizer_type='adamw')
    return optimizer

In [28]:
def getScores(class_ind, model, classifier_name):
  def get_labels_and_logits(ds):
    y = []
    logits = []
    for text_batch, label_batch in ds:
      label = label_batch.numpy()
      for lbl in label:
        y.append(lbl)
      text_test = text_batch
      bert_raw_result = model(text_test)
      for batch in bert_raw_result:
        logits.append(batch)
    return y, logits


  y_val, logits_val = get_labels_and_logits(class_ind.val_ds)
  y_ood, logits_ood = get_labels_and_logits(class_ind.ood_ds)
  y_test, logits_test = get_labels_and_logits(test_ds)
  y_anom, logits_anom = get_labels_and_logits(test_ood_ds)

  out = {
      'y_val': y_val,
      'y_ood': y_ood,
      'y_test': y_test,
      'y_anom': y_anom,
      'logits_val': logits_val,
      'logits_ood': logits_ood,
      'logits_test': logits_test,
      'logits_anom': logits_anom,
      'train_class_names': class_ind.train_class_names,
      'val_class_names': class_ind.val_class_names,
      'ood_class_names': class_ind.ood_class_names
  }
  
  models_path = "outputs/" + anom_class_name_for_path + "/"
  #if os.path.exists(models_path):
  #  shutil.rmtree(models_path)
  #os.mkdir(models_path)
  
  pickle_file_name = models_path + '%s_out' % classifier_name
  out_file = open(pickle_file_name,"wb")
  pickle.dump(out, out_file)
  out_file.close()

In [29]:
def train_model(classifier_name, leave_out_ind):
  # Fetch dataset
  class_ind = InOutData(leave_out_ind=leave_out_ind, id_classes_whole=id_classes_whole)
  # Build and plot model
  classifier_ind = BERTModel(class_ind.train_ds)

  # Train model
  classifier_ind.classifier_model.fit(x=class_ind.train_ds,
                                      batch_size=32,
                                      validation_data=class_ind.val_ds,
                                      epochs=10, 
                                      verbose=1)

  # Save model
  # models_path = '/content/drive/MyDrive/Uni Projects/Anomaly Detection using Ensembles/Experiments/Text Dataset/models/'
  # dataset_name = '%s_model' % classifier_name
  # saved_model_path = models_path+'{}'.format(dataset_name.replace('/', '_'))
  # classifier_ind.classifier_model.save(saved_model_path, include_optimizer=True)
  getScores(class_ind, classifier_ind.classifier_model, classifier_name)

  return classifier_ind.classifier_model

# Models

In [30]:
model0 = train_model(classifier_name="class_0", leave_out_ind=0)

Fetching ID and OOD classes
ID Classes
1 comp.os.ms-windows.misc
2 rec.motorcycles
3 talk.politics.misc
4 comp.graphics
5 talk.politics.mideast
6 sci.med
7 rec.sport.baseball
8 talk.religion.misc
OOD Classes
0 comp.sys.mac.hardware
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/train/id_classes' are renamed as .txt
Found 2324 files belonging to 8 classes.
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/val/id_classes' are renamed as .txt
Files in folder '20news-18828/val/ood_classes' are renamed as .txt
Found 607 files belonging to 8 classes.
Found 79 files belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
model1 = train_model(classifier_name="class_1", leave_out_ind=1)

Fetching ID and OOD classes
ID Classes
0 comp.sys.mac.hardware
2 rec.motorcycles
3 talk.politics.misc
4 comp.graphics
5 talk.politics.mideast
6 sci.med
7 rec.sport.baseball
8 talk.religion.misc
OOD Classes
1 comp.os.ms-windows.misc
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/train/id_classes' are renamed as .txt
Found 2313 files belonging to 8 classes.
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/val/id_classes' are renamed as .txt
Files in folder '20news-18828/val/ood_classes' are renamed as .txt
Found 605 files belonging to 8 classes.
Found 81 files belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
model2 = train_model(classifier_name="class_2", leave_out_ind=2)

Fetching ID and OOD classes
ID Classes
0 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
3 talk.politics.misc
4 comp.graphics
5 talk.politics.mideast
6 sci.med
7 rec.sport.baseball
8 talk.religion.misc
OOD Classes
2 rec.motorcycles
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/train/id_classes' are renamed as .txt
Found 2310 files belonging to 8 classes.
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/val/id_classes' are renamed as .txt
Files in folder '20news-18828/val/ood_classes' are renamed as .txt
Found 604 files belonging to 8 classes.
Found 82 files belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
model3 = train_model(classifier_name="class_3", leave_out_ind=3)

Fetching ID and OOD classes
ID Classes
0 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
2 rec.motorcycles
4 comp.graphics
5 talk.politics.mideast
6 sci.med
7 rec.sport.baseball
8 talk.religion.misc
OOD Classes
3 talk.politics.misc
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/train/id_classes' are renamed as .txt
Found 2372 files belonging to 8 classes.
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/val/id_classes' are renamed as .txt
Files in folder '20news-18828/val/ood_classes' are renamed as .txt
Found 617 files belonging to 8 classes.
Found 69 files belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
model4 = train_model(classifier_name="class_4", leave_out_ind=4)

Fetching ID and OOD classes
ID Classes
0 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
2 rec.motorcycles
3 talk.politics.misc
5 talk.politics.mideast
6 sci.med
7 rec.sport.baseball
8 talk.religion.misc
OOD Classes
4 comp.graphics
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/train/id_classes' are renamed as .txt
Found 2318 files belonging to 8 classes.
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/val/id_classes' are renamed as .txt
Files in folder '20news-18828/val/ood_classes' are renamed as .txt
Found 607 files belonging to 8 classes.
Found 79 files belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [35]:
model5 = train_model(classifier_name="class_5", leave_out_ind=5)

Fetching ID and OOD classes
ID Classes
0 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
2 rec.motorcycles
3 talk.politics.misc
4 comp.graphics
6 sci.med
7 rec.sport.baseball
8 talk.religion.misc
OOD Classes
5 talk.politics.mideast
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/train/id_classes' are renamed as .txt
Found 2331 files belonging to 8 classes.
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/val/id_classes' are renamed as .txt
Files in folder '20news-18828/val/ood_classes' are renamed as .txt
Found 606 files belonging to 8 classes.
Found 80 files belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [36]:
model6 = train_model(classifier_name="class_6", leave_out_ind=6)

Fetching ID and OOD classes
ID Classes
0 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
2 rec.motorcycles
3 talk.politics.misc
4 comp.graphics
5 talk.politics.mideast
7 rec.sport.baseball
8 talk.religion.misc
OOD Classes
6 sci.med
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/train/id_classes' are renamed as .txt
Found 2310 files belonging to 8 classes.
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/val/id_classes' are renamed as .txt
Files in folder '20news-18828/val/ood_classes' are renamed as .txt
Found 604 files belonging to 8 classes.
Found 82 files belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
model7 = train_model(classifier_name="class_7", leave_out_ind=7)

Fetching ID and OOD classes
ID Classes
0 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
2 rec.motorcycles
3 talk.politics.misc
4 comp.graphics
5 talk.politics.mideast
6 sci.med
8 talk.religion.misc
OOD Classes
7 rec.sport.baseball
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/train/id_classes' are renamed as .txt
Found 2310 files belonging to 8 classes.
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/val/id_classes' are renamed as .txt
Files in folder '20news-18828/val/ood_classes' are renamed as .txt
Found 604 files belonging to 8 classes.
Found 82 files belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
model8 = train_model(classifier_name="class_8", leave_out_ind=8)

Fetching ID and OOD classes
ID Classes
0 comp.sys.mac.hardware
1 comp.os.ms-windows.misc
2 rec.motorcycles
3 talk.politics.misc
4 comp.graphics
5 talk.politics.mideast
6 sci.med
7 rec.sport.baseball
OOD Classes
8 talk.religion.misc
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/train/id_classes' are renamed as .txt
Found 2428 files belonging to 8 classes.
Resetting ID and OOD folders
Dividing dataset as ID and OOD
Files in folder '20news-18828/val/id_classes' are renamed as .txt
Files in folder '20news-18828/val/ood_classes' are renamed as .txt
Found 634 files belonging to 8 classes.
Found 52 files belonging to 1 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Reload logits from pickle file

In [39]:
anom_class_name_for_path = 'anom_9'

In [40]:
models_path = "outputs/" + anom_class_name_for_path + "/"
pickle_file_name = models_path + 'all_test_class_names'
out_file = open(pickle_file_name,"rb")
all_test_class_names = pickle.load(out_file)
out_file.close()

In [41]:
def getLogits(classifier_name):
  pickle_file_name = models_path + '%s_out' % classifier_name
  out_file = open(pickle_file_name,"rb")
  model_out = pickle.load(out_file)
  out_file.close()

  return model_out

In [42]:
output0 = getLogits("class_0")
output1 = getLogits("class_1")
output2 = getLogits("class_2")
output3 = getLogits("class_3")
output4 = getLogits("class_4")
output5 = getLogits("class_5")
output6 = getLogits("class_6")
output7 = getLogits("class_7")
output8 = getLogits("class_8")

In [43]:
outputs = [
          output0,
          output1,
          output2,
          output3,
          output4,
          output5,
          output6,
          output7,
          output8
          ]
print(len(outputs))
print(outputs[0].keys())

9
dict_keys(['y_val', 'y_ood', 'y_test', 'y_anom', 'logits_val', 'logits_ood', 'logits_test', 'logits_anom', 'train_class_names', 'val_class_names', 'ood_class_names'])


# Temperature scaling

In [44]:
def temp_cal(y_pred, y):

  temp = tf.Variable(initial_value=1.0, trainable=True, dtype=tf.float32) 

  def compute_loss():
      y_pred_model_w_temp = tf.math.divide(y_pred, temp)
      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\
                                  tf.convert_to_tensor(keras.utils.to_categorical(np.asarray(y))), y_pred_model_w_temp))
      return loss

  optimizer = tf.optimizers.Adam(learning_rate=0.01)

  print('Temperature Initial value: {}'.format(temp.numpy()))

  for i in range(300):
      opts = optimizer.minimize(compute_loss, var_list=[temp])


  print('Temperature Final value: {}'.format(temp.numpy()))

  return temp

In [45]:
temp_list = []

for output in outputs:
  temp = temp_cal(output['logits_val'], #+output['logits_ood'], 
                  output['y_val']) #+output['y_ood'])
  temp_list.append(temp)

Temperature Initial value: 1.0
Temperature Final value: 1.695942759513855
Temperature Initial value: 1.0
Temperature Final value: 1.6615626811981201
Temperature Initial value: 1.0
Temperature Final value: 1.7107561826705933
Temperature Initial value: 1.0
Temperature Final value: 1.7225301265716553
Temperature Initial value: 1.0
Temperature Final value: 1.7363266944885254
Temperature Initial value: 1.0
Temperature Final value: 1.7249693870544434
Temperature Initial value: 1.0
Temperature Final value: 1.6090620756149292
Temperature Initial value: 1.0
Temperature Final value: 1.7317672967910767
Temperature Initial value: 1.0
Temperature Final value: 1.7360734939575195


In [46]:
def temp_scaling(y_pred, temp):
  return tf.math.divide(y_pred, temp)

In [47]:
softmax_list = []

for output in outputs:
    softmax = tf.nn.softmax(output['logits_val'])
    softmax_list.append(softmax)

In [48]:
ood_softmax_list = []

for output in outputs:
    ood_softmax = tf.nn.softmax(output['logits_ood'])
    ood_softmax_list.append(ood_softmax)

In [49]:
new_logits_list = []
new_softmax_list = []

for temp, output in zip(temp_list, outputs):
  new_logits = temp_scaling(output['logits_val'], temp)
  new_logits_list.append(new_logits)

  new_softmax = tf.nn.softmax(new_logits)
  new_softmax_list.append(new_softmax)

In [50]:
new_logits_ood_list = []
new_ood_softmax_list = []

for temp, output in zip(temp_list, outputs):
  new_logits_ood = temp_scaling(output['logits_ood'], temp)
  new_logits_ood_list.append(new_logits_ood)

  new_ood_softmax = tf.nn.softmax(new_logits_ood)
  new_ood_softmax_list.append(new_ood_softmax)

In [51]:
test_softmax_list = []
for temp, output in zip(temp_list, outputs):
  test_softmax = tf.nn.softmax(temp_scaling(output['logits_test'], temp))
  test_softmax_list.append(test_softmax)

anom_softmax_list = []
for temp, output in zip(temp_list, outputs):
  anom_softmax = tf.nn.softmax(temp_scaling(output['logits_anom'], temp))
  anom_softmax_list.append(anom_softmax)

# Compute reference vectors

In [52]:
# Finding ref vector for ID - with temp scaling
ref_vector_ID_with_tmp = []

for new_softmax in new_softmax_list:
  max_softmax = 0
  for s in new_softmax:
    max_softmax += np.max(s)
  new_max = max_softmax / len(new_softmax)
  # new_max = np.max(new_softmax)
  ref_vector_ID_with_tmp.append(new_max)

ref_vector_ID_with_tmp

[0.947583729220183,
 0.9578432826956441,
 0.9542997828777263,
 0.9565142840630039,
 0.9615651853198471,
 0.9516753924168376,
 0.9489715546369553,
 0.9494689795749867,
 0.9532428966711748]

In [53]:
# Finding ref vector for OOD - with temp scaling
ref_vector_OOD_with_tmp = []

for new_ood_softmax in new_ood_softmax_list:
  max_softmax = 0
  for s in new_ood_softmax:
    max_softmax += np.max(s)
  max = max_softmax / len(new_ood_softmax)
  # new_max = np.max(new_softmax)
  ref_vector_OOD_with_tmp.append(max)

ref_vector_OOD_with_tmp

[0.8569068440908119,
 0.8970223466555277,
 0.7936741019167551,
 0.710397071596505,
 0.8461606570436985,
 0.8106797691434622,
 0.6374774582502318,
 0.6833199269887877,
 0.7488633279617016]

# Compute entropy values

In [54]:
# Finding entropy of ID - after temp scaling
id_entropy = 0
for new_softmax in new_softmax_list:
  ent_classifier = 0
  for sm in new_softmax:
    ent = entropy(sm, base=len(sm))
    ent_classifier += ent
  ent_classifier = ent_classifier / len(new_softmax)
  id_entropy += ent_classifier
id_entropy = id_entropy / len(new_softmax_list)
id_entropy

0.09597307926155983

In [55]:
# Finding entropy of OOD - after temp scaling
ood_entropy = 0
for new_ood_softmax in new_ood_softmax_list:
  ent_classifier = 0
  for sm in new_ood_softmax:
    ent = entropy(sm, base=len(sm))
    ent_classifier += ent
  ent_classifier = ent_classifier / len(new_ood_softmax)
  ood_entropy += ent_classifier
ood_entropy = ood_entropy / len(new_ood_softmax_list)
ood_entropy

0.34136378063117623

# Reference vector

In [56]:
test_softmax_list = []
for temp, output in zip(temp_list, outputs):
  test_softmax = tf.nn.softmax(temp_scaling(output['logits_test'], temp))
  test_softmax_list.append(test_softmax)

anom_softmax_list = []
for temp, output in zip(temp_list, outputs):
  anom_softmax = tf.nn.softmax(temp_scaling(output['logits_anom'], temp))
  anom_softmax_list.append(anom_softmax)

In [57]:
y_true = []
y_true_ = []
y_pred_softmax = []
y_pred = []
sim_ID = []
sim_OOD = []
for ind in tqdm(range(len(test_softmax_list[0]))):
  pred_vector_max = []
  # pred_vector_min = []
  for test_softmax in test_softmax_list:
    pred_vector_max.append(np.max(test_softmax[ind]))
    # pred_vector_min.append(np.min(test_softmax[ind]))

  dist_from_ID = distance.euclidean(pred_vector_max, ref_vector_ID_with_tmp)
  dist_from_OOD = distance.euclidean(pred_vector_max, ref_vector_OOD_with_tmp)
  # dist_from_ID_min = distance.euclidean(pred_vector_min, ref_vector_ID_with_tmp_min)
  # dist_from_OOD_min = distance.euclidean(pred_vector_min, ref_vector_OOD_with_tmp_min)
  sim_with_ID = (1 / (1 + dist_from_ID)) # + (1 / (1 + dist_from_ID_min))
  sim_with_OOD = (1 / (1 + dist_from_OOD)) # + (1 / (1 + dist_from_OOD_min))
  sim_ID.append(sim_with_ID)
  sim_OOD.append(sim_with_OOD)
  
  if sim_with_ID >= sim_with_OOD:
    pred = 1 # ID
  else:
    pred = 0 # OOD
  y_pred.append(pred)
  y_pred_softmax.append(np.max(pred_vector_max))
  y_true.append(1) # because ID data
  y_true_.append(0) # because ID data


for ind in tqdm(range(len(anom_softmax_list[0]))):
  pred_vector_max = []
  # pred_vector_min = []
  for anom_softmax in anom_softmax_list:
    pred_vector_max.append(np.max(anom_softmax[ind]))
    # pred_vector_min.append(np.min(anom_softmax[ind]))

  dist_from_ID = distance.euclidean(pred_vector_max, ref_vector_ID_with_tmp)
  dist_from_OOD = distance.euclidean(pred_vector_max, ref_vector_OOD_with_tmp)
  # dist_from_ID_min = distance.euclidean(pred_vector_min, ref_vector_ID_with_tmp_min)
  # dist_from_OOD_min = distance.euclidean(pred_vector_min, ref_vector_OOD_with_tmp_min)
  sim_with_ID = (1 / (1 + dist_from_ID)) # + (1 / (1 + dist_from_ID_min))
  sim_with_OOD = (1 / (1 + dist_from_OOD)) # + (1 / (1 + dist_from_OOD_min))
  sim_ID.append(sim_with_ID)
  sim_OOD.append(sim_with_OOD)
  
  if sim_with_ID >= sim_with_OOD:
    pred = 1 # ID
  else:
    pred = 0 # OOD
  y_pred.append(pred)
  y_pred_softmax.append(np.max(pred_vector_max))
  y_true.append(0) # because OOD data
  y_true_.append(1) # because ID data

100%|██████████| 1654/1654 [00:02<00:00, 665.81it/s]
100%|██████████| 981/981 [00:01<00:00, 672.97it/s]


In [58]:
print("Scores with temperature scaling - considering only softmax values, no similarity checked")

auroc_ensemble = sk.roc_auc_score(y_true, y_pred_softmax)
print('AUROC (%):', round(100*auroc_ensemble, 2))

print("Scores with temperature scaling \
- giving predictions (1 for ID and 0 for OOD) \
 comparing similarity with both ref vectors")
auroc_ref_detection = sk.roc_auc_score(y_true, y_pred)
print('AUROC (%):', round(100*auroc_ref_detection, 2))


print("Scores with temperature scaling \
- giving sim_ID scores")
auroc_ref_sim = sk.roc_auc_score(y_true, sim_ID)
print('AUROC (%):', round(100*auroc_ref_sim, 2))

print("Scores with temperature scaling \
- giving sim_OOD scores")
# auroc_ref_sim_ood = sk.roc_auc_score(y_true, sim_OOD)
# print('AUROC (%):', round(100*auroc_ref_sim_ood, 2))
auroc_ref_sim_ood = sk.roc_auc_score(y_true_, sim_OOD)
print('AUROC (%):', round(100*auroc_ref_sim_ood, 2))

Scores with temperature scaling - considering only softmax values, no similarity checked
AUROC (%): 90.76
Scores with temperature scaling - giving predictions (1 for ID and 0 for OOD)  comparing similarity with both ref vectors
AUROC (%): 74.56
Scores with temperature scaling - giving sim_ID scores
AUROC (%): 75.36
Scores with temperature scaling - giving sim_OOD scores
AUROC (%): 68.5


# Reference vector + Decision Rule

In [59]:
all_test_class_names

['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.med',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc',
 'sci.electronics']

In [60]:
mappings = []
for output in outputs:
  # map_train_to_test_indices
  mapping = {}
  for test_ind, test_class_name in enumerate(all_test_class_names):
    for train_ind, class_name in enumerate(output['train_class_names']):
      if test_class_name == class_name:
        mapping[train_ind] = test_ind
  mappings.append(mapping)

mappings

[{0: 0, 1: 1, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8},
 {0: 0, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8},
 {0: 0, 1: 1, 2: 2, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8},
 {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 8},
 {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8},
 {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 7, 7: 8},
 {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 6, 6: 7, 7: 8},
 {0: 0, 1: 1, 2: 2, 3: 3, 4: 5, 5: 6, 6: 7, 7: 8},
 {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7}]

In [61]:
# Decision rule
def get_auroc_with_threshold_count(threshold_count):
  y_pred_decision_ref = []
  y_pred_only_decision = []
  y_true = []

  for ind in tqdm(range(len(test_softmax_list[0]))):
    preds = []
    pred_vector = []
    for test_softmax, mapping in zip(test_softmax_list, mappings):
      pred_vector.append(np.max(test_softmax[ind]))
      pred_classifier = np.argmax(test_softmax[ind])
      mapped_pred = mapping[pred_classifier]
      preds.append(mapped_pred)
    counts = np.bincount(preds)
    max_count = np.max(counts)

    dist_from_ID = distance.euclidean(pred_vector, ref_vector_ID_with_tmp)
    dist_from_OOD = distance.euclidean(pred_vector, ref_vector_OOD_with_tmp)
    sim_with_ID = 1 / (1 + dist_from_ID)
    sim_with_OOD = 1 / (1 + dist_from_OOD)

    if max_count >= threshold_count:
      y_pred_only_decision.append(1)
    else:
      y_pred_only_decision.append(0)

    if sim_with_ID >= sim_with_OOD and max_count >= threshold_count:
      y_pred_decision_ref.append(1)
    else:
      y_pred_decision_ref.append(0)
    y_true.append(1) # because ID data


  for ind in tqdm(range(len(anom_softmax_list[0]))):
    preds = []
    pred_vector = []
    for anom_softmax, mapping in zip(anom_softmax_list, mappings):
      pred_vector.append(np.max(test_softmax[ind]))
      pred_classifier = np.argmax(anom_softmax[ind])
      mapped_pred = mapping[pred_classifier]
      preds.append(mapped_pred)
    counts = np.bincount(preds)
    max_count = np.max(counts)

    dist_from_ID = distance.euclidean(pred_vector, ref_vector_ID_with_tmp)
    dist_from_OOD = distance.euclidean(pred_vector, ref_vector_OOD_with_tmp)
    sim_with_ID = 1 / (1 + dist_from_ID)
    sim_with_OOD = 1 / (1 + dist_from_OOD)

    if max_count >= threshold_count:
      y_pred_only_decision.append(1)
    else:
      y_pred_only_decision.append(0)

    if sim_with_ID >= sim_with_OOD and max_count >= threshold_count:
      y_pred_decision_ref.append(1)
    else:
      y_pred_decision_ref.append(0)
    y_true.append(0) # because OOD data

  print("\n\nScores with Decision rule with threshold count =", threshold_count)
  auroc_only_dec = sk.roc_auc_score(y_true, y_pred_only_decision)
  print('AUROC (%):', round(100*auroc_only_dec, 2))

  print("\n\nScores with Decision rule with threshold count =", threshold_count, 
        "\n and reference vector")
  auroc_dec_ref = sk.roc_auc_score(y_true, y_pred_decision_ref)
  print('AUROC (%):', round(100*auroc_dec_ref, 2))
  return auroc_only_dec, auroc_dec_ref

In [62]:
auroc_only_dec_7, auroc_dec_ref_7 = get_auroc_with_threshold_count(7)

100%|██████████| 1654/1654 [00:04<00:00, 344.20it/s]
100%|██████████| 981/981 [00:02<00:00, 347.65it/s]



Scores with Decision rule with threshold count = 7
AUROC (%): 59.85


Scores with Decision rule with threshold count = 7 
 and reference vector
AUROC (%): 61.98





In [63]:
auroc_only_dec_8, auroc_dec_ref_8 = get_auroc_with_threshold_count(8)

100%|██████████| 1654/1654 [00:04<00:00, 335.66it/s]
100%|██████████| 981/981 [00:02<00:00, 328.09it/s]



Scores with Decision rule with threshold count = 8
AUROC (%): 65.64


Scores with Decision rule with threshold count = 8 
 and reference vector
AUROC (%): 68.08





# Entropy values

In [64]:
  y_pred = []
  y_true = []
  y_true_ = []
  sim_ID = []
  sim_OOD = []

  for ind in tqdm(range(len(test_softmax_list[0]))):
    pred_entropy = 0
    for test_softmax in test_softmax_list:
      pred = test_softmax[ind]
      ent = entropy(pred, base=len(pred))
      pred_entropy += ent

    pred_entropy = pred_entropy/len(test_softmax_list)
    dist_from_ID = distance.euclidean(pred_entropy, id_entropy)
    dist_from_OOD = distance.euclidean(pred_entropy, ood_entropy)
    sim_with_ID = 1 / (1 + dist_from_ID)
    sim_with_OOD = 1 / (1 + dist_from_OOD)
    sim_ID.append(sim_with_ID)
    sim_OOD.append(sim_with_OOD)

    if sim_with_ID >= sim_with_OOD:
      y_pred.append(1)
    else:
      y_pred.append(0)
    y_true.append(1) # because ID data
    y_true_.append(0)


  for ind in tqdm(range(len(anom_softmax_list[0]))):
    pred_entropy = 0
    for test_softmax in anom_softmax_list:
      pred = test_softmax[ind]
      ent = entropy(pred, base=len(pred))
      pred_entropy += ent
    
    pred_entropy = pred_entropy/len(anom_softmax_list)
    dist_from_ID = distance.euclidean(pred_entropy, id_entropy)
    dist_from_OOD = distance.euclidean(pred_entropy, ood_entropy)
    sim_with_ID = 1 / (1 + dist_from_ID)
    sim_with_OOD = 1 / (1 + dist_from_OOD)
    sim_ID.append(sim_with_ID)
    sim_OOD.append(sim_with_OOD)

    if sim_with_ID >= sim_with_OOD:
      y_pred.append(1)
    else:
      y_pred.append(0)
    y_true.append(0) # because OOD data
    y_true_.append(1)

  print("\n\nScores with Entropy ref value - detecting ID and OOD and giving these to AUROC")
  auroc_ent_detection = sk.roc_auc_score(y_true, y_pred)
  print('AUROC (%):', round(100*auroc_ent_detection, 2))

  print("\n\nScores with Entropy ref value - sim ID to AUROC")
  auroc_ent_sim = sk.roc_auc_score(y_true, sim_ID)
  print('AUROC (%):', round(100*auroc_ent_sim, 2))

  print("\n\nScores with Entropy ref value - sim OOD to AUROC")
  auroc = sk.roc_auc_score(y_true, sim_OOD)
  print('AUROC (%):', round(100*auroc, 2))

  auroc_ent_sim_ood = sk.roc_auc_score(y_true_, sim_OOD)
  print('AUROC (%):', round(100*auroc, 2))

100%|██████████| 1654/1654 [00:02<00:00, 565.14it/s]
100%|██████████| 981/981 [00:01<00:00, 578.67it/s]



Scores with Entropy ref value - detecting ID and OOD and giving these to AUROC
AUROC (%): 78.13


Scores with Entropy ref value - sim ID to AUROC
AUROC (%): 82.78


Scores with Entropy ref value - sim OOD to AUROC
AUROC (%): 20.55
AUROC (%): 20.55





# Entropy value + Decision Rule

In [65]:
def get_auroc_with_threshold_count(threshold_count):
  y_pred_decision_ref = []
  y_true = []

  for ind in tqdm(range(len(test_softmax_list[0]))):
    preds = []
    pred_entropy = 0
    for test_softmax, mapping in zip(test_softmax_list, mappings):
      # to get entropy
      pred = test_softmax[ind]  # vector of length 9
      ent = entropy(pred, base=len(pred))
      pred_entropy += ent
      # to get max count
      pred_classifier = np.argmax(test_softmax[ind])  # argmax of vector
      mapped_pred = mapping[pred_classifier]
      preds.append(mapped_pred)
    counts = np.bincount(preds)
    max_count = np.max(counts)

    pred_entropy = pred_entropy/len(test_softmax_list)
    dist_from_ID = distance.euclidean(pred_entropy, id_entropy)
    dist_from_OOD = distance.euclidean(pred_entropy, ood_entropy)
    sim_with_ID = 1 / (1 + dist_from_ID)
    sim_with_OOD = 1 / (1 + dist_from_OOD)

    if sim_with_ID >= sim_with_OOD and max_count >= threshold_count:
      y_pred_decision_ref.append(1)
    else:
      y_pred_decision_ref.append(0)
    y_true.append(1) # because ID data


  for ind in tqdm(range(len(anom_softmax_list[0]))):
    preds = []
    pred_entropy = 0
    for test_softmax, mapping in zip(anom_softmax_list, mappings):
      # to get entropy
      pred = test_softmax[ind]  # vector of length 9
      ent = entropy(pred, base=len(pred))
      pred_entropy += ent
      # to get max count
      pred_classifier = np.argmax(test_softmax[ind])  # argmax of vector
      mapped_pred = mapping[pred_classifier]
      preds.append(mapped_pred)
    counts = np.bincount(preds)
    max_count = np.max(counts)

    pred_entropy = pred_entropy/len(anom_softmax_list)
    dist_from_ID = distance.euclidean(pred_entropy, id_entropy)
    dist_from_OOD = distance.euclidean(pred_entropy, ood_entropy)
    sim_with_ID = 1 / (1 + dist_from_ID)
    sim_with_OOD = 1 / (1 + dist_from_OOD)

    if sim_with_ID >= sim_with_OOD and max_count >= threshold_count:
      y_pred_decision_ref.append(1)
    else:
      y_pred_decision_ref.append(0)
    y_true.append(0) # because OOD data

  print("\n\nScores with Decision rule with threshold count =", threshold_count, 
        "\n and entropy reference")
  auroc_dec_ent = sk.roc_auc_score(y_true, y_pred_decision_ref)
  print('AUROC (%):', round(100*auroc_dec_ent, 2))
  return auroc_dec_ent

In [66]:
auroc_dec_ent_7 = get_auroc_with_threshold_count(7)

100%|██████████| 1654/1654 [00:05<00:00, 315.97it/s]
100%|██████████| 981/981 [00:03<00:00, 312.58it/s]



Scores with Decision rule with threshold count = 7 
 and entropy reference
AUROC (%): 78.1





In [67]:
auroc_dec_ent_8 = get_auroc_with_threshold_count(8)

100%|██████████| 1654/1654 [00:05<00:00, 315.55it/s]
100%|██████████| 981/981 [00:03<00:00, 316.03it/s]



Scores with Decision rule with threshold count = 8 
 and entropy reference
AUROC (%): 78.29





# Method 2

In [68]:
x_train = []
y_train = []

for new_softmax in new_softmax_list:
  for sm in new_softmax:
    x_train.append(sm)
    y_train.append(1)

for new_softmax in new_ood_softmax_list:
  for sm in new_softmax:
    x_train.append(sm)
    y_train.append(0)

len(x_train), len(y_train)

(6174, 6174)

In [69]:
x_test = []
y_test = []

for new_softmax in test_softmax_list:
  for sm in new_softmax:
    x_test.append(sm)
    y_test.append(1)

for new_softmax in anom_softmax_list:
  for sm in new_softmax:
    x_test.append(sm)
    y_test.append(0)

len(x_test), len(y_test)

(23715, 23715)

In [70]:
svm_train_ip, svm_test_ip, svm_train_lb, svm_test_lb = train_test_split(x_train, y_train, test_size=0.20, stratify=y_train)
print(len(svm_train_ip),len(svm_test_ip),len(svm_train_lb))

4939 1235 4939


In [71]:
from sklearn.utils.class_weight import compute_class_weight
class_wts = compute_class_weight('balanced', np.unique(svm_train_lb), svm_train_lb)
class_wts ={0:class_wts[0], 1:class_wts[1]}
class_wts



{0: 4.498178506375227, 1: 0.5625284738041002}

In [72]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto',class_weight=class_wts))
clf.fit(svm_train_ip, svm_train_lb)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc',
                 SVC(class_weight={0: 4.498178506375227, 1: 0.5625284738041002},
                     gamma='auto'))])

In [73]:
pred = clf.predict(svm_test_ip)
len(pred) ### svm fitted on whole data with weights

1235

In [74]:
clf.score(svm_test_ip,svm_test_lb) 

0.8178137651821862

In [75]:
y_pred = clf.predict(x_test)

In [76]:
print("Scores for predictions from trained SVM (Method 2)")
auroc_method_2 = sk.roc_auc_score(y_test, y_pred)
print('AUROC (%):', round(100*auroc_method_2, 2))

Scores for predictions from trained SVM (Method 2)
AUROC (%): 61.24


# Results

In [77]:
metrics = \
[
'Euclidean Distance Between Reference Vector',	
'Similarity score with just IN Reference Vector',
'Similarity score with just OOD Reference Vector',
'Euclidean Distance between Entropy Reference Value',
'Similarity score with just IN Entropy Reference Value',
'Similarity score with just OOD Entropy Reference Value',
'Decision Rule (>=7)',
'Decision Rule (>=8)',
'Decision Rule(>=7) + Reference Vector',
'Decision Rule(>=8) + Reference Vector',
'Decision Rule(>=7) + Entropy Reference Value',
'Decision Rule(>=8) + Entropy Reference Value',
'Average Max softmax value of each classifier',
'Method 2 - Learning Softmax Pattern'
]

In [78]:
values = [
          auroc_ref_detection,
          auroc_ref_sim,
          auroc_ref_sim_ood,
          auroc_ent_detection,
          auroc_ent_sim,
          auroc_ent_sim_ood,
          auroc_only_dec_7,
          auroc_only_dec_8,
          auroc_dec_ref_7,
          auroc_dec_ref_8,
          auroc_dec_ent_7,
          auroc_dec_ent_8,
          auroc_ensemble,
          auroc_method_2
]

In [79]:
values_rounded = []
for value in values:
  values_rounded.append(round(100*value, 2))

In [80]:
for metric, value in zip(metrics, values_rounded):
  print(metric, "\t\t", value)

print(values_rounded)

Euclidean Distance Between Reference Vector 		 74.56
Similarity score with just IN Reference Vector 		 75.36
Similarity score with just OOD Reference Vector 		 68.5
Euclidean Distance between Entropy Reference Value 		 78.13
Similarity score with just IN Entropy Reference Value 		 82.78
Similarity score with just OOD Entropy Reference Value 		 79.45
Decision Rule (>=7) 		 59.85
Decision Rule (>=8) 		 65.64
Decision Rule(>=7) + Reference Vector 		 61.98
Decision Rule(>=8) + Reference Vector 		 68.08
Decision Rule(>=7) + Entropy Reference Value 		 78.1
Decision Rule(>=8) + Entropy Reference Value 		 78.29
Average Max softmax value of each classifier 		 90.76
Method 2 - Learning Softmax Pattern 		 61.24
[74.56, 75.36, 68.5, 78.13, 82.78, 79.45, 59.85, 65.64, 61.98, 68.08, 78.1, 78.29, 90.76, 61.24]
