In [1]:

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from toddbenchmark.generation_datasets import prep_dataset, prep_model
from datasets import load_dataset


In [2]:
%load_ext autoreload
%autoreload 2


In [3]:
model, tokenizer = prep_model("Helsinki-NLP/opus-mt-de-en")




## Load and prep dataset using ToddBenchmark

In [4]:

in_dataset = prep_dataset("wmt16", "de-en", tokenizer=tokenizer, train_max_size=0, validation_max_size=1000, test_max_size=100)
out_dataset = prep_dataset("wmt16", "ro-en", tokenizer=tokenizer, train_max_size=1000, validation_max_size=1000, test_max_size=100)

# For the sake of this example we only use 100 samples to keep things quick!
in_val = in_dataset[1]
in_test = in_dataset[2]
out_test = out_dataset[2]


del in_dataset
del out_dataset


Found cached dataset wmt16 (/home/mdarrin/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset wmt16 (/home/mdarrin/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Make dataloader
in_val_loader = DataLoader(in_val, shuffle=False, batch_size=4)
in_test_loader = DataLoader(in_test, shuffle=False, batch_size=4)
out_test_loader = DataLoader(out_test, shuffle=False, batch_size=4)


## Feature based filters


In [6]:

from Todd import extract_embeddings

# We work here in a case where the classes do not matter
# So we can skip retrieving them and the only class key will be 0
# It would be different in a classification problem with enough data we would have a reference per class

ref_embeddings, _ = extract_embeddings(model, tokenizer, in_val_loader, layers=[6])

### Mahalanobis

In [8]:
from Todd import MahalanobisScorer

maha_detector = MahalanobisScorer(layers=[6])
maha_detector.fit(ref_embeddings)

In [9]:

def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
            )

            print(maha_detector(output))
            break


eval_loader(in_test_loader)
eval_loader(out_test_loader)

tensor([4652.0269,  745.2454, 1136.8346, 1710.2563])
tensor([10499.9854,  6349.9004, 10801.1152,  5237.2954])


### Cosine



In [10]:
from Todd.featuresscorers import CosineProjectionScorer
cosine_detector = CosineProjectionScorer(layers=[6])
cosine_detector.fit(ref_embeddings)

def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
            )

            # print(cosine_detector.compute_scores(output))
            print(cosine_detector(output))
            break

print("IN")
eval_loader(in_test_loader)
print("OUT")
eval_loader(out_test_loader)


IN
tensor([-0.5659, -0.8544, -0.7983, -0.7156])
OUT
tensor([-0.3937, -0.4819, -0.3789, -0.5035])


## Decoder based filters

In [11]:
from Todd.itscorers import SequenceRenyiNegScorer

### Output mode

It output a score / a filter on each sequence returned for each sample in the batch

In [12]:

renyi_entropy_scorer = SequenceRenyiNegScorer(pad_token_id=tokenizer.pad_token_id, mode="output")

def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
                num_return_sequences=2,
                num_beams=2,
                do_sample=False,
            )

            print(renyi_entropy_scorer(
                output,
                batch_size=4,
                num_return_sequences=2,
                num_beam=2,
            ))


            print(renyi_entropy_scorer.compute_scores(
                output,
                batch_size=4,
                num_return_sequences=2,
                num_beam=2,
            ))
            del output
            break

print("IN DATA")
eval_loader(in_test_loader)
print("OUT DATA")
eval_loader(out_test_loader)


IN DATA


TypeError: SequenceRenyiNegScorer.per_output_scores() got an unexpected keyword argument 'batch_size'

### Input Mode
It only returns a score for each sample in the bach by aggregating the scores of the generated sequences.

In [None]:
renyi_entropy_scorer = SequenceRenyiNegScorer(pad_token_id=tokenizer.pad_token_id, mode="input")

def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
                num_return_sequences=2,
                num_beams=2,
                do_sample=False,
            )

            print(renyi_entropy_scorer(
                output,
                batch_size=4,
                num_return_sequences=2,
                num_beam=2,
            ))


            print(renyi_entropy_scorer.compute_scores(
                output,
                batch_size=4,
                num_return_sequences=2,
                num_beam=2,
            ))
            del output
            break

print("IN DATA")
eval_loader(in_test_loader)

print("OUT DATA")
eval_loader(out_test_loader)


### Beam ranking using Info-projection

In [None]:
from Todd.itscorers import BeamRenyiInformationProjection

In [None]:
# Make dataloader
# Smaller batch so it runs on CPU and fits in memory
in_val_loader = DataLoader(in_val, shuffle=False, batch_size=6)
in_test_loader = DataLoader(in_test, shuffle=False, batch_size=6)
out_test_loader = DataLoader(out_test, shuffle=False, batch_size=6)


In [None]:
batch_self_projector = BeamRenyiInformationProjection(-10.35, pad_token_id=tokenizer.pad_token_id, mode="input", use_soft_projection=True, n_neighbors=2)

def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
                num_return_sequences=4,
                num_beams=4,
                do_sample=False,
            )

            candidate_scores = batch_self_projector.per_output_scores(
                output,
                batch_size=6,
                num_return_sequences=4,
                num_beams=2,
            )

            candidate_scores, indices = torch.sort(candidate_scores, dim=-1, descending=False)


            print(indices)
            print(candidate_scores)

            print(output.sequences[indices].shape)

            del output
            break

print("IN DATA")
eval_loader(in_test_loader)

print("OUT DATA")
eval_loader(out_test_loader)
