In [23]:

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from toddbenchmark.generation_datasets import prep_dataset, prep_model
from datasets import load_dataset


In [24]:
% load_ext autoreload
% autoreload 2


UsageError: Line magic function `%` not found.


In [None]:
model, tokenizer = prep_model("Helsinki-NLP/opus-mt-de-en")


## Load and prep dataset using ToddBenchmark

In [25]:

in_dataset = prep_dataset("wmt16", "de-en", tokenizer=tokenizer, train_max_size=0, validation_max_size=1000,
                          test_max_size=100)
out_dataset = prep_dataset("wmt16", "ro-en", tokenizer=tokenizer, train_max_size=1000, validation_max_size=1000,
                           test_max_size=100)

# For the sake of this example we only use 100 samples to keep things quick!
in_val = in_dataset[1]
in_test = in_dataset[2]
out_test = out_dataset[2]

del in_dataset
del out_dataset


Found cached dataset wmt16 (/home/mdarrin/.cache/huggingface/datasets/wmt16/de-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset wmt16 (/home/mdarrin/.cache/huggingface/datasets/wmt16/ro-en/1.0.0/746749a11d25c02058042da7502d973ff410e73457f3d305fc1177dc0e8c4227)


  0%|          | 0/3 [00:00<?, ?it/s]

In [26]:
# Make dataloader
in_val_loader = DataLoader(in_val, shuffle=False, batch_size=4)
in_test_loader = DataLoader(in_test, shuffle=False, batch_size=4)
out_test_loader = DataLoader(out_test, shuffle=False, batch_size=4)


## Feature based filters


### Mahalanobis

In [27]:
from Todd import MahalanobisScorer

maha_detector = MahalanobisScorer(layers=[6])


In [28]:
maha_detector.accumulated_embeddings

defaultdict(list, {})

In [33]:
# Accumulate reference embeddings


def prepare_embeddings(scorer, loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
            )

            scorer.accumulate(output)

prepare_embeddings(maha_detector, loader=in_val_loader)
maha_detector.fit()



AttributeError: 'MahalanobisScorer' object has no attribute 'accumulated_embeddings'

In [30]:

def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
            )

            print(maha_detector(output))
            break


eval_loader(in_test_loader)
eval_loader(out_test_loader)

tensor([6124.3984, 3304.8125, 3317.0156, 2627.2996])
tensor([13637.9580, 10162.7705,  7508.2700,  9693.5605])


### Cosine



In [35]:
from Todd.featuresscorers import CosineProjectionScorer

cosine_detector = CosineProjectionScorer(layers=[6])

prepare_embeddings(scorer=cosine_detector, loader=in_val_loader)


def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
            )

            # print(cosine_detector.compute_scores(output))
            print(cosine_detector(output))
            break


print("IN")
eval_loader(in_test_loader)
print("OUT")
eval_loader(out_test_loader)


IN


RuntimeError: stack expects a non-empty TensorList

## Decoder based filters

In [None]:
from Todd.itscorers import SequenceRenyiNegScorer

### Output mode

It output a score / a filter on each sequence returned for each sample in the batch

In [None]:

renyi_entropy_scorer = SequenceRenyiNegScorer(pad_token_id=tokenizer.pad_token_id, mode="output",
                                              num_return_sequences=2, num_beam=2, batch_size=4)


def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
                num_return_sequences=2,
                num_beams=2,
                do_sample=False,
            )

            print(renyi_entropy_scorer(
                output,
            ))

            print(renyi_entropy_scorer.compute_scores(
                output,
            ))
            del output
            break


print("IN DATA")
eval_loader(in_test_loader)
print("OUT DATA")
eval_loader(out_test_loader)


### Input Mode
It only returns a score for each sample in the bach by aggregating the scores of the generated sequences.

In [None]:
renyi_entropy_scorer = SequenceRenyiNegScorer(pad_token_id=tokenizer.pad_token_id, mode="input",
                                              batch_size=4,
                                              num_return_sequences=2,
                                              num_beam=2)


def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
                num_return_sequences=2,
                num_beams=2,
                do_sample=False,
            )

            print(renyi_entropy_scorer(
                output,

            ))

            print(renyi_entropy_scorer.compute_scores(
                output,
            ))
            del output
            break


print("IN DATA")
eval_loader(in_test_loader)

print("OUT DATA")
eval_loader(out_test_loader)


### Beam ranking using Info-projection

In [None]:
from Todd.itscorers import BeamRenyiInformationProjection

In [None]:
# Make dataloader
# Smaller batch so it runs on CPU and fits in memory
in_val_loader = DataLoader(in_val, shuffle=False, batch_size=6)
in_test_loader = DataLoader(in_test, shuffle=False, batch_size=6)
out_test_loader = DataLoader(out_test, shuffle=False, batch_size=6)


In [None]:
batch_self_projector = BeamRenyiInformationProjection(pad_token_id=tokenizer.pad_token_id, mode="input",
                                                      use_soft_projection=True, n_neighbors=2,
                                                      num_return_sequences=4,
                                                      num_beams=2)


def eval_loader(loader):
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(
                batch["source"], padding=True, truncation=True, return_tensors="pt"
            )
            output = model.generate(
                **inputs,
                return_dict_in_generate=True,
                output_hidden_states=True,
                output_scores=True,
                num_return_sequences=4,
                num_beams=4,
                do_sample=False,
            )

            candidate_scores = batch_self_projector.per_output_scores(
                output,

            )

            candidate_scores, indices = torch.sort(candidate_scores, dim=-1, descending=False)

            print(indices)
            print(candidate_scores)

            print(output.sequences[indices].shape)

            del output
            break


print("IN DATA")
eval_loader(in_test_loader)

print("OUT DATA")
eval_loader(out_test_loader)
