In [1]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optmizer

import matplotlib.pyplot as plt
tf.get_logger().setLevel('ERROR')

import pprint
pp = pprint.PrettyPrinter(indent=4)
import math
import numpy as np
import sklearn.metrics as sk 
import pickle

from scipy import stats
from scipy.spatial import distance

from tensorflow import keras

from sklearn.model_selection import train_test_split

from scipy.stats import entropy

from tqdm import tqdm

## Train and save classifiers

In [2]:
if os.path.exists('20news-18828'):
  shutil.rmtree('20news-18828')

In [3]:
url = 'http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz'

dataset = tf.keras.utils.get_file('20news-18828.tar.gz', url,
                                  untar=True, cache_dir='.',
                                  cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), '20news')
train_dir = os.path.join(dataset_dir, 'train')

In [4]:
rootPath = '20news-18828'
files = os.listdir(rootPath)

all_classes = {}
for ind in range(len(files)):
  all_classes[ind] = files[ind]

all_classes

{0: 'comp.sys.mac.hardware',
 1: 'comp.os.ms-windows.misc',
 2: 'rec.motorcycles',
 3: 'talk.politics.misc',
 4: 'comp.graphics',
 5: 'talk.politics.mideast',
 6: 'sci.med',
 7: 'rec.sport.baseball',
 8: 'talk.religion.misc',
 9: 'sci.electronics',
 10: 'comp.windows.x',
 11: 'sci.crypt',
 12: 'talk.politics.guns',
 13: 'rec.sport.hockey',
 14: 'rec.autos',
 15: 'sci.space',
 16: 'comp.sys.ibm.pc.hardware',
 17: 'soc.religion.christian',
 18: 'misc.forsale',
 19: 'alt.atheism'}

In [5]:
# models_path = "/content/drive/MyDrive/Uni Projects/Anomaly Detection using Ensembles/Experiments/Text Dataset/models/outputs/"
# pickle_file_name = models_path + 'all_classes'
# out_file = open(pickle_file_name,"wb")
# pickle.dump(all_classes, out_file)
# out_file.close()

## Set the experiment

In [6]:
models_path = "outputs/"
pickle_file_name = models_path + 'all_classes'
out_file = open(pickle_file_name,"rb")
all_classes = pickle.load(out_file)
out_file.close()

In [7]:
all_classes

{0: 'comp.sys.mac.hardware',
 1: 'comp.os.ms-windows.misc',
 2: 'rec.motorcycles',
 3: 'talk.politics.misc',
 4: 'comp.graphics',
 5: 'talk.politics.mideast',
 6: 'sci.med',
 7: 'rec.sport.baseball',
 8: 'talk.religion.misc',
 9: 'sci.electronics',
 10: 'comp.windows.x',
 11: 'sci.crypt',
 12: 'talk.politics.guns',
 13: 'rec.sport.hockey',
 14: 'rec.autos',
 15: 'sci.space',
 16: 'comp.sys.ibm.pc.hardware',
 17: 'soc.religion.christian',
 18: 'misc.forsale',
 19: 'alt.atheism'}

In [8]:
first_10_classes_dict = {}
for ind in all_classes:
  if ind <= 9:
    first_10_classes_dict[ind] = all_classes[ind]

id_classes_ind = [0, 1, 2, 3, 4, 5, 6, 7, 8]
anom_classes_ind = [9]

id_classes_dict = {}
for ind in all_classes:
  if ind in id_classes_ind:
    id_classes_dict[ind] = first_10_classes_dict[ind]

anom_classes_dict = {}
for ind in all_classes:
  if ind in anom_classes_ind:
    anom_classes_dict[ind] = first_10_classes_dict[ind]

print("first_10_classes_dict")
pp.pprint(first_10_classes_dict)
print("id_classes_dict")
pp.pprint(id_classes_dict)
print("anom_classes_dict")
pp.pprint(anom_classes_dict)

id_classes_whole = list(id_classes_dict.values())
anom_classes = list(anom_classes_dict.values())

print("\nID Classes\n------------")
pp.pprint(id_classes_whole)
print("Anomalous Classes\n------------")
pp.pprint(anom_classes)

anom_class_name_for_path = "anom_%d" % (anom_classes_ind[0])
anom_class_name_for_path

first_10_classes_dict
{   0: 'comp.sys.mac.hardware',
    1: 'comp.os.ms-windows.misc',
    2: 'rec.motorcycles',
    3: 'talk.politics.misc',
    4: 'comp.graphics',
    5: 'talk.politics.mideast',
    6: 'sci.med',
    7: 'rec.sport.baseball',
    8: 'talk.religion.misc',
    9: 'sci.electronics'}
id_classes_dict
{   0: 'comp.sys.mac.hardware',
    1: 'comp.os.ms-windows.misc',
    2: 'rec.motorcycles',
    3: 'talk.politics.misc',
    4: 'comp.graphics',
    5: 'talk.politics.mideast',
    6: 'sci.med',
    7: 'rec.sport.baseball',
    8: 'talk.religion.misc'}
anom_classes_dict
{9: 'sci.electronics'}

ID Classes
------------
[   'comp.sys.mac.hardware',
    'comp.os.ms-windows.misc',
    'rec.motorcycles',
    'talk.politics.misc',
    'comp.graphics',
    'talk.politics.mideast',
    'sci.med',
    'rec.sport.baseball',
    'talk.religion.misc']
Anomalous Classes
------------
['sci.electronics']


'anom_9'

In [9]:
# Test has all class labels 0 - 9. Further separated by Test_OOD
testPath = rootPath + "/test"
if os.path.exists(testPath):
  shutil.rmtree(testPath)
os.mkdir(testPath)

for className in files:
  if className in first_10_classes_dict.values() and className not in anom_classes:
    path = os.path.join(rootPath, className)
    classTestPath = testPath+"/"+className
    os.mkdir(classTestPath)
    idFiles = os.listdir(path)
    lenIdFiles = len(idFiles)
    testLen = math.floor(0.2 * lenIdFiles)
    print(className, lenIdFiles, testLen)
    for ind, idFile in enumerate(idFiles):
      if ind <= testLen:
        shutil.move(os.path.join(rootPath, className, idFile), classTestPath)

comp.sys.mac.hardware 961 192
comp.os.ms-windows.misc 985 197
rec.motorcycles 994 198
talk.politics.misc 775 155
comp.graphics 973 194
talk.politics.mideast 940 188
sci.med 990 198
rec.sport.baseball 994 198
talk.religion.misc 628 125


In [10]:
# Validation has all class labels except for anomalous class 
# Here, 0 - 8
testPath = rootPath + "/val"
if os.path.exists(testPath):
  shutil.rmtree(testPath)
os.mkdir(testPath)

for className in files:
  if className in id_classes_whole:
    path = os.path.join(rootPath, className)
    classTestPath = testPath+"/"+className
    os.mkdir(classTestPath)
    idFiles = os.listdir(path)
    lenIdFiles = len(idFiles)
    testLen = math.floor(0.2 * lenIdFiles)
    print(className, lenIdFiles, testLen)
    for ind, idFile in enumerate(idFiles):
      if ind <= testLen:
        shutil.move(os.path.join(rootPath, className, idFile), classTestPath)

comp.sys.mac.hardware 768 153
comp.os.ms-windows.misc 787 157
rec.motorcycles 795 159
talk.politics.misc 619 123
comp.graphics 778 155
talk.politics.mideast 751 150
sci.med 791 158
rec.sport.baseball 795 159
talk.religion.misc 502 100


In [11]:
# Train has all class labels except for anomalous class 
# Here, 0 - 8
testPath = rootPath + "/train"
if os.path.exists(testPath):
  shutil.rmtree(testPath)
os.mkdir(testPath)

for className in files:
  if className in id_classes_whole:
    path = os.path.join(rootPath, className)
    classTestPath = testPath+"/"+className
    os.mkdir(classTestPath)
    idFiles = os.listdir(path)
    lenIdFiles = len(idFiles)
    # testLen = math.floor(0.2 * lenIdFiles)
    print(className, lenIdFiles)
    for idFile in idFiles:
      shutil.move(os.path.join(rootPath, className, idFile), classTestPath)

comp.sys.mac.hardware 614
comp.os.ms-windows.misc 629
rec.motorcycles 635
talk.politics.misc 495
comp.graphics 622
talk.politics.mideast 600
sci.med 632
rec.sport.baseball 635
talk.religion.misc 401


In [12]:
# Test_OOD separates anomalous classes in the test.
# Test contains 0 - 8
# Test_OOD contains 9
# files = os.listdir(rootPath + "/test")

testPath = rootPath + "/test_ood"
if os.path.exists(testPath):
  shutil.rmtree(testPath)
os.mkdir(testPath)

for className in files:
  if className in anom_classes:
    path = os.path.join(rootPath, className)
    classTestPath = testPath+"/"+className
    os.mkdir(classTestPath)
    idFiles = os.listdir(path)
    lenIdFiles = len(idFiles)
    # testLen = math.floor(0.2 * lenIdFiles)
    # print(className, lenIdFiles, testLen)
    for idFile in idFiles:
      shutil.move(os.path.join(rootPath, className, idFile), classTestPath)

In [13]:
def update_files_to_txt(id_ood_path):
  print("Files in folder '%s' are renamed as .txt" % id_ood_path)
  files = os.listdir(id_ood_path)
  
  for i in files:
    path = os.path.join(id_ood_path, i)
    files = os.listdir(path)  
    for index, file in enumerate(files):
      os.rename(os.path.join(path, file), os.path.join(path, ''.join([str(index), '.txt'])))

In [14]:
update_files_to_txt('20news-18828/test')
update_files_to_txt('20news-18828/test_ood')
update_files_to_txt('20news-18828/val')
update_files_to_txt('20news-18828/train')

Files in folder '20news-18828/test' are renamed as .txt
Files in folder '20news-18828/test_ood' are renamed as .txt
Files in folder '20news-18828/val' are renamed as .txt
Files in folder '20news-18828/train' are renamed as .txt


In [15]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '20news-18828/test',
    batch_size=batch_size)

test_class_names = test_ds.class_names
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 1654 files belonging to 9 classes.


In [16]:
test_class_names

['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.med',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [17]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

test_ood_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '20news-18828/test_ood',
    batch_size=batch_size)

test_ood_class_names = test_ood_ds.class_names
test_ood_ds = test_ood_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 981 files belonging to 1 classes.


In [18]:
test_ood_class_names

['sci.electronics']

In [19]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '20news-18828/train',
    batch_size=batch_size)

train_class_names = train_ds.class_names
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 5263 files belonging to 9 classes.


In [20]:
train_class_names

['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.med',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [21]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    '20news-18828/val',
    batch_size=batch_size)

val_class_names = val_ds.class_names
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 1323 files belonging to 9 classes.


In [22]:
val_class_names

['comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'rec.sport.baseball',
 'sci.med',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [23]:
# Checking for class labels
def get_set_labels(ds):
  labels = []
  for _, label in ds:
    labels.extend(label.numpy())
  return set(labels)

print("Train labels", get_set_labels(train_ds))
print("Val labels", get_set_labels(val_ds))
print("Test ID labels", get_set_labels(test_ds))
print("Test OOD labels", get_set_labels(test_ood_ds))

Train labels {0, 1, 2, 3, 4, 5, 6, 7, 8}
Val labels {0, 1, 2, 3, 4, 5, 6, 7, 8}
Test ID labels {0, 1, 2, 3, 4, 5, 6, 7, 8}
Test OOD labels {0}


In [24]:
# Class labels and class names
def get_class_names(class_names):
  for ind, name in enumerate(class_names):
    print(ind, name)

print("Train class names")
get_class_names(train_class_names)
print("\nVal class names")
get_class_names(val_class_names)
print("\nTest ID class names")
get_class_names(test_class_names)
print("\nTest OOD class names")
get_class_names(test_ood_class_names)

Train class names
0 comp.graphics
1 comp.os.ms-windows.misc
2 comp.sys.mac.hardware
3 rec.motorcycles
4 rec.sport.baseball
5 sci.med
6 talk.politics.mideast
7 talk.politics.misc
8 talk.religion.misc

Val class names
0 comp.graphics
1 comp.os.ms-windows.misc
2 comp.sys.mac.hardware
3 rec.motorcycles
4 rec.sport.baseball
5 sci.med
6 talk.politics.mideast
7 talk.politics.misc
8 talk.religion.misc

Test ID class names
0 comp.graphics
1 comp.os.ms-windows.misc
2 comp.sys.mac.hardware
3 rec.motorcycles
4 rec.sport.baseball
5 sci.med
6 talk.politics.mideast
7 talk.politics.misc
8 talk.religion.misc

Test OOD class names
0 sci.electronics


In [25]:
all_test_class_names = test_class_names + test_ood_class_names

models_path = "outputs/" + anom_class_name_for_path + "/"

try:
  os.mkdir(models_path)
except:
  print(models_path, "is already present.")
  pass

pickle_file_name = models_path + 'all_test_class_names'
out_file = open(pickle_file_name,"wb")
pickle.dump(all_test_class_names, out_file)
out_file.close()

outputs/anom_9/ is already present.


In [26]:
class BERTModel:
  def __init__(self, train_ds):
    self.tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
    self.tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
    self.bert_preprocess_model = hub.KerasLayer(self.tfhub_handle_preprocess)
    self.bert_model = hub.KerasLayer(self.tfhub_handle_encoder)
    self.train_ds = train_ds
    
    self.num_of_classes = 9 # not leaving out one class, so 9 classes
    self.classifier_model = self.build_classifier_model()

    self.loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    self.metrics = tf.metrics.SparseCategoricalAccuracy()
    self.optimizer = self.build_optimizer()

    self.classifier_model.compile(
        optimizer=self.optimizer,
        loss=self.loss,
        metrics=self.metrics,
      )


  def build_classifier_model(self):
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(self.tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(self.tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(self.num_of_classes, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

  def build_optimizer(self):
    epochs = 10
    steps_per_epoch = tf.data.experimental.cardinality(self.train_ds).numpy()
    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = int(0.1*num_train_steps)
    # print(num_train_steps)
    init_lr = 3e-5
    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                              num_train_steps=num_train_steps,
                                              num_warmup_steps=num_warmup_steps,
                                              optimizer_type='adamw')
    return optimizer

In [27]:
def getScores(model):
  def get_labels_and_logits(ds):
    y = []
    logits = []
    for text_batch, label_batch in ds:
      label = label_batch.numpy()
      for lbl in label:
        y.append(lbl)
      text_test = text_batch
      bert_raw_result = model(text_test)
      for batch in bert_raw_result:
        logits.append(batch)
    return y, logits

  y_val, logits_val = get_labels_and_logits(val_ds)
  y_test, logits_test = get_labels_and_logits(test_ds)
  y_anom, logits_anom = get_labels_and_logits(test_ood_ds)

  out = {
      'y_val': y_val,
      'y_test': y_test,
      'y_anom': y_anom,
      'logits_val': logits_val,
      'logits_test': logits_test,
      'logits_anom': logits_anom
  }
  
  models_path = "outputs/" + anom_class_name_for_path + "/"
  #if os.path.exists(models_path):
  #  shutil.rmtree(models_path)
  
  pickle_file_name = models_path + 'baseline'
  out_file = open(pickle_file_name,"wb")
  pickle.dump(out, out_file)
  out_file.close()

In [28]:
def train_model():
  # Fetch dataset
  # class_ind = InOutData(leave_out_ind=leave_out_ind, id_classes_whole=id_classes_whole)
  # Build and plot model
  classifier_ind = BERTModel(train_ds)

  # Train model
  classifier_ind.classifier_model.fit(x=train_ds,
                                      batch_size=32,
                                      validation_data=val_ds,
                                      epochs=10, 
                                      verbose=1)

  # Save model
  # models_path = '/content/drive/MyDrive/Uni Projects/Anomaly Detection using Ensembles/Experiments/Text Dataset/models/'
  # dataset_name = '%s_model' % classifier_name
  # saved_model_path = models_path+'{}'.format(dataset_name.replace('/', '_'))
  # classifier_ind.classifier_model.save(saved_model_path, include_optimizer=True)
  getScores(classifier_ind.classifier_model)

  return classifier_ind.classifier_model

In [29]:
model = train_model()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Reload logits from pickle files

In [30]:
models_path = "outputs/" + anom_class_name_for_path + "/"
pickle_file_name = models_path + 'all_test_class_names'
out_file = open(pickle_file_name,"rb")
all_test_class_names = pickle.load(out_file)
out_file.close()

In [31]:
models_path = "outputs/" + anom_class_name_for_path + "/"
pickle_file_name = models_path + 'baseline'
out_file = open(pickle_file_name,"rb")
output = pickle.load(out_file)
out_file.close()

In [32]:
outputs = [
           output
]

## Temperature scaling

In [33]:
def temp_cal(y_pred, y):

  temp = tf.Variable(initial_value=1.0, trainable=True, dtype=tf.float32) 

  def compute_loss():
      y_pred_model_w_temp = tf.math.divide(y_pred, temp)
      loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(\
                                  tf.convert_to_tensor(keras.utils.to_categorical(np.asarray(y))), y_pred_model_w_temp))
      return loss

  optimizer = tf.optimizers.Adam(learning_rate=0.01)

  print('Temperature Initial value: {}'.format(temp.numpy()))

  for i in range(300):
      opts = optimizer.minimize(compute_loss, var_list=[temp])


  print('Temperature Final value: {}'.format(temp.numpy()))

  return temp

In [34]:
temp_list = []

for output in outputs:
  temp = temp_cal(output['logits_val'], output['y_val'])
  temp_list.append(temp)

Temperature Initial value: 1.0
Temperature Final value: 1.866462230682373


In [35]:
def temp_scaling(y_pred, temp):
  return tf.math.divide(y_pred, temp)

In [36]:
softmax_list = []

for output in outputs:
    softmax = tf.nn.softmax(output['logits_val'])
    softmax_list.append(softmax)

In [37]:
new_logits_list = []
new_softmax_list = []

for temp, output in zip(temp_list, outputs):
  new_logits = temp_scaling(output['logits_val'], temp)
  new_logits_list.append(new_logits)

  new_softmax = tf.nn.softmax(new_logits)
  new_softmax_list.append(new_softmax)

## Reference Vector

In [38]:
new_logits_list = []
test_softmax_list = []

for temp, output in zip(temp_list, outputs):
  new_logits = temp_scaling(output['logits_test'], temp)
  new_logits_list.append(new_logits)

  new_softmax = tf.nn.softmax(new_logits)
  test_softmax_list.append(new_softmax)

new_logits_list = []
anom_softmax_list = []

for temp, output in zip(temp_list, outputs):
  new_logits = temp_scaling(output['logits_anom'], temp)
  new_logits_list.append(new_logits)

  new_softmax = tf.nn.softmax(new_logits)
  anom_softmax_list.append(new_softmax)


In [39]:
y_true = []
y_pred_softmax = []

for ind in tqdm(range(len(test_softmax_list[0]))):
  pred_vector = []
  for test_softmax in test_softmax_list:
    pred_vector.append(np.max(test_softmax[ind]))

  y_pred_softmax.append(np.max(pred_vector))
  y_true.append(1) # because ID data

for ind in tqdm(range(len(anom_softmax_list[0]))):
  pred_vector = []
  for anom_softmax in anom_softmax_list:
    pred_vector.append(np.max(anom_softmax[ind]))

  y_pred_softmax.append(np.max(pred_vector))
  y_true.append(0) # because OOD data

100%|██████████| 1654/1654 [00:00<00:00, 5806.00it/s]
100%|██████████| 981/981 [00:00<00:00, 5948.75it/s]


In [40]:
print("Scores with temperature scaling - considering only softmax values, no similarity checked")

auroc = sk.roc_auc_score(y_true, y_pred_softmax)
print('AUROC (%):', round(100*auroc, 2))

Scores with temperature scaling - considering only softmax values, no similarity checked
AUROC (%): 90.66
