In [1]:

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from toddbenchmark.generation_datasets import prep_dataset, prep_model
from datasets import load_dataset


In [2]:
% load_ext autoreload
% autoreload 2


In [3]:
# Checkpoint used throughout the notebook (de -> en translation model).
MODEL_NAME = "Helsinki-NLP/opus-mt-de-en"

# prep_model hands back the model together with its tokenizer.
model, tokenizer = prep_model(MODEL_NAME)




## Load and prep dataset using ToddBenchmark

In [4]:

# Positions of the splits we keep in what prep_dataset returns
# (presumably ordered train / validation / test -- TODO confirm).
_VALIDATION_SPLIT, _TEST_SPLIT = 1, 2

# Reference ("in") corpus: German -> English, matching the loaded checkpoint.
in_dataset = prep_dataset(
    "wmt16",
    "de-en",
    tokenizer=tokenizer,
    train_max_size=0,
    validation_max_size=1000,
    test_max_size=100,
)
# Shifted ("out") corpus: Romanian -> English sources fed to the same model.
# NOTE(review): only the test split of this corpus is used below; the
# train/validation sizes could probably be reduced -- confirm before changing.
out_dataset = prep_dataset(
    "wmt16",
    "ro-en",
    tokenizer=tokenizer,
    train_max_size=1000,
    validation_max_size=1000,
    test_max_size=100,
)

# For the sake of this example we only use 100 test samples to keep things quick!
in_val, in_test = in_dataset[_VALIDATION_SPLIT], in_dataset[_TEST_SPLIT]
out_test = out_dataset[_TEST_SPLIT]

# The split containers are no longer needed once the splits are extracted.
del in_dataset, out_dataset


Found cached dataset wmt16 (/home/mdarrin/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset wmt16 (/home/mdarrin/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Build the dataloaders: original sample order (no shuffling), 4 samples per batch.
_loader_options = {"shuffle": False, "batch_size": 4}

in_val_loader = DataLoader(in_val, **_loader_options)
in_test_loader = DataLoader(in_test, **_loader_options)
out_test_loader = DataLoader(out_test, **_loader_options)


## Feature based filters


In [6]:

from Todd import extract_embeddings

# Classes do not matter in this setting, so we skip retrieving them and the
# only class key will be 0. In a classification problem with enough data we
# would instead keep one set of reference embeddings per class.

# Reference embeddings are extracted from the in-distribution validation
# loader for layer 6 only; the second returned value is discarded.
ref_embeddings, _ = extract_embeddings(model, tokenizer, in_val_loader, layers=[6])

### Mahalanobis

In [8]:
from Todd import MahalanobisScorer

# Fit the Mahalanobis scorer on the reference embeddings, using the same
# layer (6) that was requested when those embeddings were extracted.
maha_detector = MahalanobisScorer(layers=[6])
maha_detector.fit(ref_embeddings)

In [9]:

def eval_loader(loader):
    """Print the ``maha_detector`` scores for the first batch of ``loader``.

    Only the first batch is processed to keep the example quick. Each batch
    must expose its raw source sentences under the ``"source"`` key. Nothing
    is printed when ``loader`` yields no batch.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``maha_detector``.
    """
    generation_options = {
        "return_dict_in_generate": True,
        "output_hidden_states": True,
        "output_scores": True,
    }

    with torch.no_grad():
        first_batch = next(iter(loader), None)
        if first_batch is None:
            return

        encoded = tokenizer(
            first_batch["source"],
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        generated = model.generate(**encoded, **generation_options)

        print(maha_detector(generated))


# Score one in-distribution batch, then one out-of-distribution batch.
for loader in (in_test_loader, out_test_loader):
    eval_loader(loader)

tensor([4652.0269,  745.2454, 1136.8346, 1710.2563])
tensor([10499.9854,  6349.9004, 10801.1152,  5237.2954])


### Cosine



In [10]:
from Todd.featuresscorers import CosineProjectionScorer

# Fit the cosine-projection scorer on the same layer-6 reference embeddings
# that were extracted from the in-distribution validation loader.
cosine_detector = CosineProjectionScorer(layers=[6])
cosine_detector.fit(ref_embeddings)


def eval_loader(loader):
    """Print the ``cosine_detector`` scores for the first batch of ``loader``.

    Only the first batch is processed to keep the example quick. Each batch
    must expose its raw source sentences under the ``"source"`` key. Nothing
    is printed when ``loader`` yields no batch.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``cosine_detector``.
    """
    with torch.no_grad():
        first_batch = next(iter(loader), None)
        if first_batch is None:
            return

        encoded = tokenizer(
            first_batch["source"],
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        generated = model.generate(
            **encoded,
            output_scores=True,
            output_hidden_states=True,
            return_dict_in_generate=True,
        )

        # Alternative left disabled in the original notebook:
        # cosine_detector.compute_scores(generated)
        print(cosine_detector(generated))


# Score one batch of each distribution, labelling the printed scores.
for label, loader in (("IN", in_test_loader), ("OUT", out_test_loader)):
    print(label)
    eval_loader(loader)


IN
tensor([-0.5659, -0.8544, -0.7983, -0.7156])
OUT
tensor([-0.3937, -0.4819, -0.3789, -0.5035])


## Decoder based filters

In [11]:
from Todd.itscorers import SequenceRenyiNegScorer

### Output mode

It outputs a score (or a filter decision) for each sequence returned for each sample in the batch.

In [14]:

# "output" mode scorer; the sequence/beam/batch sizes match the generate()
# call and dataloaders used in this notebook.
# NOTE(review): the keyword is spelled `num_beam` here but `num_beams` for
# BeamRenyiInformationProjection further down -- confirm against the scorer
# signatures.
renyi_entropy_scorer = SequenceRenyiNegScorer(
    pad_token_id=tokenizer.pad_token_id,
    mode="output",
    num_return_sequences=2,
    num_beam=2,
    batch_size=4,
)


def eval_loader(loader):
    """Print the ``renyi_entropy_scorer`` results for the first batch of ``loader``.

    The batch is decoded with beam search (2 beams, 2 returned sequences, no
    sampling). Two lines are printed: the result of calling the scorer, then
    the result of its ``compute_scores`` method. Only the first batch is
    processed; nothing is printed when ``loader`` yields no batch.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``renyi_entropy_scorer``.
    """
    beam_search_options = {
        "num_return_sequences": 2,
        "num_beams": 2,
        "do_sample": False,
    }

    with torch.no_grad():
        first_batch = next(iter(loader), None)
        if first_batch is None:
            return

        encoded = tokenizer(
            first_batch["source"],
            return_tensors="pt",
            truncation=True,
            padding=True,
        )
        generated = model.generate(
            **encoded,
            return_dict_in_generate=True,
            output_hidden_states=True,
            output_scores=True,
            **beam_search_options,
        )

        print(renyi_entropy_scorer(generated))
        print(renyi_entropy_scorer.compute_scores(generated))


# Score one batch of each distribution, labelling the printed scores.
for label, loader in (("IN DATA", in_test_loader), ("OUT DATA", out_test_loader)):
    print(label)
    eval_loader(loader)


IN DATA
tensor([[-10.3531, -10.4222],
        [-10.4674, -10.4793],
        [-10.4999, -10.5295],
        [-10.5211, -10.5511]])
tensor([[-10.3531, -10.4222],
        [-10.4674, -10.4793],
        [-10.4999, -10.5295],
        [-10.5211, -10.5511]])
OUT DATA
tensor([[-10.2332, -10.2277],
        [-10.3162, -10.3185],
        [-10.2982, -10.2804],
        [-10.2965, -10.2874]])
tensor([[-10.2332, -10.2277],
        [-10.3162, -10.3185],
        [-10.2982, -10.2804],
        [-10.2965, -10.2874]])


### Input Mode
It only returns a score for each sample in the batch by aggregating the scores of the generated sequences.

In [15]:
# "input" mode scorer; the sequence/beam/batch sizes match the generate()
# call and dataloaders used in this notebook.
# NOTE(review): the keyword is spelled `num_beam` here but `num_beams` for
# BeamRenyiInformationProjection further down -- confirm against the scorer
# signatures.
_renyi_input_options = {
    "pad_token_id": tokenizer.pad_token_id,
    "mode": "input",
    "batch_size": 4,
    "num_return_sequences": 2,
    "num_beam": 2,
}
renyi_entropy_scorer = SequenceRenyiNegScorer(**_renyi_input_options)


def eval_loader(loader):
    """Print the ``renyi_entropy_scorer`` results for the first batch of ``loader``.

    The batch is decoded with beam search (2 beams, 2 returned sequences, no
    sampling). Two lines are printed: the result of calling the scorer, then
    the result of its ``compute_scores`` method. Only the first batch is
    processed; nothing is printed when ``loader`` yields no batch.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``renyi_entropy_scorer``.
    """
    with torch.no_grad():
        batches = iter(loader)
        first_batch = next(batches, None)
        if first_batch is None:
            return

        sources = first_batch["source"]
        encoded = tokenizer(sources, padding=True, truncation=True, return_tensors="pt")

        generation_options = dict(
            return_dict_in_generate=True,
            output_hidden_states=True,
            output_scores=True,
            num_return_sequences=2,
            num_beams=2,
            do_sample=False,
        )
        generated = model.generate(**encoded, **generation_options)

        called_scores = renyi_entropy_scorer(generated)
        print(called_scores)

        computed_scores = renyi_entropy_scorer.compute_scores(generated)
        print(computed_scores)


# Score one batch of each distribution, labelling the printed scores.
for label, loader in (("IN DATA", in_test_loader), ("OUT DATA", out_test_loader)):
    print(label)
    eval_loader(loader)


IN DATA
tensor([-10.3876, -10.4734, -10.5147, -10.5361])
tensor([-10.3876, -10.4734, -10.5147, -10.5361])
OUT DATA
tensor([-10.2304, -10.3173, -10.2893, -10.2919])
tensor([-10.2304, -10.3173, -10.2893, -10.2919])


### Beam ranking using Info-projection

In [17]:
from Todd.itscorers import BeamRenyiInformationProjection

In [18]:
# Rebuild the dataloaders with 6 samples per batch for the beam-ranking example.
# NOTE(review): the original comment described this as a "smaller batch so it
# runs on CPU and fits in memory", yet 6 is larger than the batch size of 4
# used earlier in the notebook -- confirm the intended value.
_beam_loader_options = {"shuffle": False, "batch_size": 6}

in_val_loader = DataLoader(in_val, **_beam_loader_options)
in_test_loader = DataLoader(in_test, **_beam_loader_options)
out_test_loader = DataLoader(out_test, **_beam_loader_options)


In [20]:
# Projector used to rank the candidates generated for each sample.
# NOTE(review): `num_beams=2` here differs from the `num_beams=4` passed to
# model.generate() in this cell's eval_loader -- confirm whether the projector
# should be configured with 4 as well.
batch_self_projector = BeamRenyiInformationProjection(
    pad_token_id=tokenizer.pad_token_id,
    mode="input",
    use_soft_projection=True,
    n_neighbors=2,
    num_return_sequences=4,
    num_beams=2,
)


def _rank_candidates(sequences, candidate_scores):
    """Reorder each sample's generated candidates by ascending score.

    Args:
        sequences: generated token ids as returned by ``generate``, flattened
            to ``(n_samples * n_candidates, seq_len)`` with the candidates of
            one sample stored contiguously.
        candidate_scores: tensor of shape ``(n_samples, n_candidates)``.

    Returns:
        ``(sorted_scores, order, ranked_sequences)`` where ``order[i]`` holds
        the candidate indices of sample ``i`` sorted by ascending score and
        ``ranked_sequences`` has shape ``(n_samples, n_candidates, seq_len)``.
    """
    sorted_scores, order = torch.sort(candidate_scores, dim=-1, descending=False)

    # ``order`` contains per-sample ranks (0 .. n_candidates - 1). Indexing the
    # flat ``sequences`` tensor with it directly would always pick rows of the
    # first sample, so regroup per sample and gather along the candidate axis.
    n_samples, n_candidates = candidate_scores.shape
    grouped = sequences.reshape(n_samples, n_candidates, -1)
    gather_index = order.unsqueeze(-1).expand(-1, -1, grouped.size(-1))
    ranked_sequences = torch.gather(grouped, 1, gather_index)

    return sorted_scores, order, ranked_sequences


def eval_loader(loader):
    """Rank the beam candidates of the first batch of ``loader`` and print the result.

    The batch is decoded with beam search (4 beams, 4 returned sequences, no
    sampling), each candidate is scored with
    ``batch_self_projector.per_output_scores`` and the candidates of every
    sample are sorted by ascending score. Three things are printed: the
    sorting indices, the sorted scores and the shape of the re-ordered
    sequences. Only the first batch is processed; nothing is printed when
    ``loader`` yields no batch.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``batch_self_projector``.
    """
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
                num_return_sequences=4,
                num_beams=4,
                do_sample=False,
            )

            candidate_scores = batch_self_projector.per_output_scores(output)

            candidate_scores, indices, ranked_sequences = _rank_candidates(
                output.sequences, candidate_scores
            )

            print(indices)
            print(candidate_scores)

            print(ranked_sequences.shape)

            del output
            # Only the first batch is needed for this demonstration.
            break


# Rank the candidates of one batch of each distribution, labelling the output.
for label, loader in (("IN DATA", in_test_loader), ("OUT DATA", out_test_loader)):
    print(label)
    eval_loader(loader)


IN DATA
tensor([[3, 0, 2, 1],
        [1, 2, 0, 3],
        [1, 2, 3, 0],
        [1, 2, 3, 0],
        [2, 1, 0, 3],
        [3, 0, 2, 1]])
tensor([[0.0531, 0.0609, 0.0947, 0.1003],
        [0.0334, 0.0384, 0.0498, 0.0870],
        [0.0269, 0.0274, 0.0285, 0.0438],
        [0.0240, 0.0259, 0.0389, 0.0427],
        [0.0358, 0.0441, 0.0488, 0.0559],
        [0.0130, 0.0188, 0.0264, 0.0348]])
torch.Size([6, 4, 30])
OUT DATA
tensor([[0, 1, 2, 3],
        [0, 2, 3, 1],
        [0, 2, 1, 3],
        [2, 0, 3, 1],
        [2, 0, 1, 3],
        [1, 0, 3, 2]])
tensor([[0.0026, 0.0040, 0.0044, 0.0520],
        [0.0139, 0.0145, 0.0171, 0.0179],
        [0.0054, 0.0070, 0.0078, 0.0081],
        [0.0104, 0.0113, 0.0114, 0.0152],
        [0.0145, 0.0164, 0.0271, 0.0297],
        [0.0034, 0.0044, 0.0045, 0.0057]])
torch.Size([6, 4, 125])
