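In this notebook, we implement the out-of-distribution (OOD) detection baseline described in the article ["A baseline for detecting misclassified and out-of-distribution examples in neural networks"](https://arxiv.org/pdf/1610.02136.pdf) (Hendrycks & Gimpel): the maximum softmax probability of a trained classifier is used as a confidence score, and its negative as an anomaly score. We then experiment with a decoder-based alternative.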

## Imports

In [2]:
! pip install datasets
! pip install transformers
! pip install evaluate
! pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200

In [3]:
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
import torch
import numpy as np
import evaluate
import copy

from datasets import load_dataset, load_metric
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler, EncoderDecoderModel
from tqdm.auto import tqdm

## Baseline

In [None]:
# Corpus treated as in-distribution (the classifier is trained on it).
in_dataset = load_dataset(path="yelp_review_full")
# Corpus treated as out-of-distribution: the "labeled_final" configuration of PAWS.
out_dataset = load_dataset(path="paws", name="labeled_final")

Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.55k [00:00<?, ?B/s]

Downloading and preparing dataset yelp_review_full/yelp_review_full to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf...


Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset yelp_review_full downloaded and prepared to /root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/8.43k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.52k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.33k [00:00<?, ?B/s]

Downloading and preparing dataset paws/labeled_final to /root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338...


Downloading data:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Dataset paws downloaded and prepared to /root/.cache/huggingface/datasets/paws/labeled_final/1.1.0/8d567c6472623f42bd2cc635cad06932d0f0cd2f897db56013c1180f4317d338. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# The models used below are built from "prajjwal1/bert-tiny", an *uncased* BERT
# checkpoint. Its pretrained embeddings are indexed by the uncased WordPiece
# vocabulary, so the token ids must come from the matching uncased tokenizer
# (a cased tokenizer would map words to unrelated embedding rows).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)


def _tokenize_column(examples, column):
    """Tokenize the text stored under `column`, padded/truncated to the tokenizer's max length."""
    return tokenizer(examples[column], padding="max_length", truncation=True)


def tokenize_function_text(examples):
    """Tokenize a batch of in-distribution examples (raw text in the 'text' column)."""
    return _tokenize_column(examples, "text")


def tokenize_function_sentence1(examples):
    """Tokenize a batch of out-of-distribution examples (raw text in the 'sentence1' column)."""
    return _tokenize_column(examples, "sentence1")


tokenized_in_dataset = in_dataset.map(tokenize_function_text, batched=True)
tokenized_out_dataset = out_dataset.map(tokenize_function_sentence1, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/49401 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [None]:
# Drop the raw-text columns, expose the target under the name "labels",
# and have both datasets return torch tensors.
tokenized_in_dataset = (
    tokenized_in_dataset
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_in_dataset.set_format("torch")

tokenized_out_dataset = (
    tokenized_out_dataset
    .remove_columns(["id", "sentence1", "sentence2"])
    .rename_column("label", "labels")
)
tokenized_out_dataset.set_format("torch")

In [None]:
# Fixed-seed random subsets: 10000 training examples, and 1000 evaluation
# examples from the test split of each corpus.
small_train_dataset = (
    tokenized_in_dataset["train"]
    .shuffle(seed=42)
    .select(range(10000))
)
small_in_eval_dataset = (
    tokenized_in_dataset["test"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_out_eval_dataset = (
    tokenized_out_dataset["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

In [None]:
# Batches of 8; only the training loader reshuffles its examples.
train_dataloader = DataLoader(dataset=small_train_dataset, batch_size=8, shuffle=True)
in_eval_dataloader = DataLoader(dataset=small_in_eval_dataset, batch_size=8)
out_eval_dataloader = DataLoader(dataset=small_out_eval_dataset, batch_size=8)

In [None]:
# BERT encoder followed by a linear classification layer, configured for 5 labels.
# (Our custom model instead used FastText followed by a hidden layer and a
# linear classification layer.)
model = AutoModelForSequenceClassification.from_pretrained(
    "prajjwal1/bert-tiny",
    num_labels=5,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

In [None]:
# Optimise every parameter of the classifier.
optimizer = AdamW(params=model.parameters(), lr=5e-5)

In [None]:
num_epochs = 3
# The scheduler is stepped once per batch in the training loop, so the total
# number of steps is (batches per epoch) x (epochs).
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
device = 'cpu'
progress_bar = tqdm(range(num_training_steps))

# Fine-tune the classifier: one optimiser step and one scheduler step per batch.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {key: tensor.to(device) for key, tensor in batch.items()}
        # The batch contains "labels", so the model output carries the loss.
        loss = model(**batch).loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()  # clear gradients before the next batch
        progress_bar.update(1)

  0%|          | 0/3750 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
# Accuracy of the classifier on the in-distribution evaluation subset.
# `preds` / `trues` collect the predicted and reference class ids for the
# classification report and confusion matrix computed in the next cells.
metric = evaluate.load("accuracy")
model.eval()
preds, trues = [], []
for batch in tqdm(in_eval_dataloader, desc="evaluating", total=len(in_eval_dataloader)):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit; computed once and reused
    # for both the metric and the collected lists.
    predictions = torch.argmax(outputs.logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

    preds.extend(predictions.cpu().tolist())
    trues.extend(batch["labels"].cpu().tolist())

metric.compute()

In [None]:
# Per-class precision / recall / F1 from the labels collected during evaluation.
report = classification_report(np.asarray(trues).ravel(), np.asarray(preds).ravel())
print(report)

In [None]:
# Confusion matrix (rows: reference class, columns: predicted class) drawn as an annotated heatmap.
sn.set(font_scale=1)  # plot size / font configuration
cm = confusion_matrix(np.asarray(trues).ravel(), np.asarray(preds).ravel())
df_cm = pd.DataFrame(cm)
heatmap_options = dict(
    annot=True,
    annot_kws={"size": 8},
    cmap='coolwarm',
    linewidth=0.5,
    fmt="",
)
sn.heatmap(df_cm, **heatmap_options)
plt.show()

In [None]:
def _max_softmax_confidences(dataloader, desc="evaluating"):
    """Return the classifier's maximum softmax probability for every example in `dataloader`.

    Args:
        dataloader: iterable of batches (dicts of tensors) accepted by `model`.
        desc: label shown on the progress bar.

    Returns:
        A flat list of floats with exactly one confidence value per example.
    """
    confidences = []
    for batch in tqdm(dataloader, desc=desc, total=len(dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            logits = model(**batch).logits
        # Softmax over the class dimension, then the max over classes for each
        # row. Reducing without `dim` would collapse the whole batch into a
        # single number and yield one score per batch instead of per example.
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        confidences.extend(probabilities.max(dim=-1).values.cpu().tolist())
    return confidences


model.eval()
confidence_in = _max_softmax_confidences(in_eval_dataloader)
confidence_out = _max_softmax_confidences(out_eval_dataloader)

In [None]:
# Anomaly score = negative confidence, so that a higher score means "more likely OOD".
anomaly_in = [-x for x in confidence_in]
anomaly_out = [-x for x in confidence_out]

plt.hist(anomaly_in,bins=40,label='in-distribution anomaly score',edgecolor='none')
plt.hist(anomaly_out,bins=40,label='out-of-distribution anomaly score',edgecolor='none', alpha=0.6,color='red')
plt.legend()
plt.show()

# ROC curve with OOD examples as the positive class (label 1).
# `roc_curve` sweeps every distinct score as a threshold, so the curve spans
# (0, 0) to (1, 1) and ties are handled; `auc` integrates it with the
# trapezoidal rule. A fixed 0.01 grid would miss the top of the score range,
# drop scores equal to a threshold, and could divide by zero.
ood_labels = np.concatenate([np.zeros(len(anomaly_in)), np.ones(len(anomaly_out))])
anomaly_scores = np.concatenate([anomaly_in, anomaly_out])
fpr, tpr, thresholds = roc_curve(ood_labels, anomaly_scores)
auroc = auc(fpr, tpr)

plt.plot(fpr,tpr,label="Roc curve")
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
plt.show()
print("AUROC (ie ability to distinguish OOD examples): ",auroc)

## Better method with decoder

In [None]:
# print("\n##########\n".join([str(x) for x in model.named_modules()]))

In [None]:
def new_label(example):
    """Replace `example["labels"]` with reconstruction targets built from its own token ids.

    The targets are an integer (int64) copy of `example["input_ids"]`: token-id
    targets must be integer class indices, float labels are rejected by the
    sequence-to-sequence loss. When the example carries an "attention_mask",
    positions where the mask is 0 (padding) are set to -100 so that they are
    ignored by the loss.

    Args:
        example: mapping holding a torch tensor under "input_ids" and
            optionally one under "attention_mask".

    Returns:
        The same mapping, with "labels" set. "input_ids" is left unmodified.
    """
    # clone() first so that masking below never touches input_ids itself.
    labels = example["input_ids"].clone().long()
    if "attention_mask" in example:
        labels[example["attention_mask"] == 0] = -100
    example["labels"] = labels
    return example

# Apply `new_label` to every example of both corpora (in-distribution first).
tokenized_in_dataset_for_decoder, tokenized_out_dataset_for_decoder = (
    corpus.map(new_label)
    for corpus in (tokenized_in_dataset, tokenized_out_dataset)
)

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/49401 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [None]:
# Fixed-seed subsets for the decoder experiment: 2000 training examples and
# 1000 evaluation examples from the test split of each corpus.
small_train_dataset_for_decoder = (
    tokenized_in_dataset_for_decoder["train"]
    .shuffle(seed=42)
    .select(range(2000))
)
small_in_eval_dataset_for_decoder = (
    tokenized_in_dataset_for_decoder["test"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_out_eval_dataset_for_decoder = (
    tokenized_out_dataset_for_decoder["test"]
    .shuffle(seed=42)
    .select(range(1000))
)

# All three loaders shuffle and use batches of 8.
train_dataloader_for_decoder = DataLoader(dataset=small_train_dataset_for_decoder, batch_size=8, shuffle=True)
in_eval_dataloader_for_decoder = DataLoader(dataset=small_in_eval_dataset_for_decoder, batch_size=8, shuffle=True)
out_eval_dataloader_for_decoder = DataLoader(dataset=small_out_eval_dataset_for_decoder, batch_size=8, shuffle=True)

In [None]:
# Work on an independent copy so that `model` keeps its classification head.
model4 = copy.deepcopy(model)
# Replace the head with a pass-through layer.
# NOTE(review): presumably the copy then outputs the encoder features that
# used to feed the classifier - TODO confirm.
setattr(model4, "classifier", torch.nn.Identity())

In [None]:
# Write the headless copy to the local directory 'pretrained_save'.
model4.save_pretrained(save_directory='pretrained_save')

In [4]:
# Encoder and decoder are both initialised from the public bert-tiny checkpoint.
# Alternative kept for reference: pass "pretrained_save" as the encoder to start
# from the copy saved in the previous cell instead.
encoder_decoder_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path="prajjwal1/bert-tiny",
    decoder_pretrained_model_name_or_path="prajjwal1/bert-tiny",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/17.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertLMHeadModel: ['cls.seq_re

In [12]:
# Train only the decoder: parameters whose name starts with "decoder" stay
# trainable and are collected for the optimiser; parameters whose name starts
# with "encoder" are frozen. Any other parameter is left as it is.
params_to_update = []
for name, param in encoder_decoder_model.named_parameters():
    if name.startswith("decoder"):
        param.requires_grad = True
        params_to_update.append(param)
    elif name.startswith("encoder"):
        param.requires_grad = False

In [None]:
# Special-token ids for the encoder-decoder model, taken from the tokenizer:
# the decoder start token is [CLS] and the padding token is [PAD].
encoder_decoder_model.config.decoder_start_token_id = tokenizer.cls_token_id
encoder_decoder_model.config.pad_token_id = tokenizer.pad_token_id

num_epochs = 3
num_training_steps = len(train_dataloader_for_decoder) * num_epochs
# Only the parameters collected in `params_to_update` are optimised.
optimizer = AdamW(params=params_to_update, lr=5e-5)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))
encoder_decoder_model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader_for_decoder:
        batch = {key: tensor.to(device) for key, tensor in batch.items()}
        # The batch contains "labels", so the model output carries the loss.
        loss = encoder_decoder_model(**batch).loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()  # clear gradients before the next batch
        progress_bar.update(1)

  0%|          | 0/750 [00:00<?, ?it/s]

In [None]:
rouge = evaluate.load("rouge")


def generate_sentence(batch):
    """Generate a sentence for each text of `batch` with the encoder-decoder model.

    Args:
        batch: batch of raw examples with the input strings under "text".

    Returns:
        The same batch with the decoded generations stored under "pred_sentence".
    """
    inputs = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    outputs = encoder_decoder_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask)
    batch["pred_sentence"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch


# The training loop leaves the model in train mode. Switch to eval mode so
# that dropout is disabled and generation is deterministic.
encoder_decoder_model.eval()

in_test_dataset = in_dataset['test'].shuffle(seed=42).select(range(200))
results_in = in_test_dataset.map(generate_sentence, batched=True, batch_size=8)

# ROUGE-2 between each generated sentence and the text it was generated from.
rouge.compute(predictions=results_in["pred_sentence"], references=results_in["text"], rouge_types=["rouge2"])["rouge2"]

In [None]:
def generate_sentence2(batch):
    """Generate a sentence for each 'sentence1' entry of `batch` with the encoder-decoder model.

    Args:
        batch: batch of raw examples with the input strings under "sentence1".

    Returns:
        The same batch with the decoded generations stored under "pred_sentence".
    """
    inputs = tokenizer(batch["sentence1"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    outputs = encoder_decoder_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask)
    batch["pred_sentence"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return batch


# Make sure dropout is disabled before generating (the training loop leaves
# the model in train mode), so that the scores are deterministic.
encoder_decoder_model.eval()

out_test_dataset = out_dataset['test'].shuffle(seed=42).select(range(200))
results_out = out_test_dataset.map(generate_sentence2, batched=True, batch_size=8)

# ROUGE-2 between each generated sentence and the sentence it was generated from.
rouge.compute(predictions=results_out["pred_sentence"], references=results_out["sentence1"], rouge_types=["rouge2"])["rouge2"]


In [None]:
# Print the module tree of the headless copy, e.g. to check that its
# `classifier` attribute is now the Identity layer assigned earlier.
print(model4)