# Demo BanditEval usage for Sentiment Classification

In this demo we use the `BanditEval` class to select between a variety of sentiment classification models available via Hugging Face.

In [None]:
import torch

from datasets import load_dataset
from banditeval import BanditEval
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm

## Define Models, Dataset, and Evaluation Function

In [None]:
# Constants
# Prefer the GPU, but fall back to the CPU so the demo also runs on machines
# where CUDA is not available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Hugging Face model identifiers of the candidate sentiment classifiers
MODEL_NAMES = [
    "philschmid/tiny-bert-sst2-distilled",
    "tanganke/gpt2_sst2",
    "gchhablani/bert-base-cased-finetuned-sst2",
    "distilbert-base-uncased-finetuned-sst-2-english",
    "textattack/bert-base-uncased-SST-2",
    "assemblyai/distilbert-base-uncased-sst2",
    "Alireza1044/albert-base-v2-sst2",
]


# Wrap a tokenizer/model pair as a plain function from texts to predictions
def make_callable(tokenizer, model):
    """Build a function that classifies a batch of texts with ``model``.

    Args:
        tokenizer: Invoked as ``tokenizer(texts, return_tensors="pt",
            padding=True)``; must return a mapping whose values are tensors.
        model: Invoked as ``model(**tokens)``; its output must expose a
            ``logits`` tensor.

    Returns:
        A function that takes ``texts`` and returns, as a CPU tensor, the
        argmax of the logits over the last dimension.
    """

    def predict(texts):
        # The model sits on DEVICE only for the duration of one call
        # (presumably so the candidates need not share accelerator memory).
        model.to(DEVICE)
        try:
            tokens = tokenizer(texts, return_tensors="pt", padding=True)
            tokens = {k: v.to(DEVICE) for k, v in tokens.items()}
            outputs = model(**tokens)
        finally:
            # Move the model back even when tokenization or the forward
            # pass raises, so a failed call does not leave it on DEVICE.
            model.cpu()
        return outputs.logits.argmax(dim=-1).cpu()

    return predict


# Load each checkpoint's tokenizer and classifier (with a progress bar over
# the model names) and wrap every pair as a prediction function.
models = []
for checkpoint in tqdm(MODEL_NAMES):
    checkpoint_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    checkpoint_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    models.append(make_callable(checkpoint_tokenizer, checkpoint_model))

# Fetch the validation split of the GLUE "sst2" configuration
dataset = load_dataset(path="glue", name="sst2", split="validation")


# Define evaluation function
def evaluate(method, examples):
    """Score ``method`` on a batch of labelled examples.

    Args:
        method: Callable that takes a list of sentences and returns a tensor
            of predicted labels, one per sentence.
        examples: Iterable of mappings with ``"sentence"`` and ``"label"``
            keys. It is traversed exactly once, so one-shot iterators
            (e.g. generators) are accepted as well as lists.

    Returns:
        A float tensor holding 1.0 where the prediction equals the label and
        0.0 elsewhere.
    """
    texts = []
    label_values = []
    # Single pass on purpose: two separate comprehensions would find a
    # generator already exhausted when collecting the labels.
    for example in examples:
        texts.append(example["sentence"])
        label_values.append(example["label"])
    labels = torch.tensor(label_values)
    # Inference only; no gradients are needed for scoring.
    with torch.no_grad():
        predictions = method(texts)
    return (predictions == labels).float()

## Initialize and Run `BanditEval`

In [None]:
# Evaluation settings
ALGORITHM = "ucbe"  # or "ucbe-lrf"
BUDGET = 100  # number of evaluation queries
BATCH_SIZE = 8  # number of examples per query

# Build the evaluator from the candidate models, the dataset, the scoring
# function and the chosen algorithm
evaluator = BanditEval(models, dataset, evaluate, ALGORITHM)

# Run the evaluation with the settings above and keep the resulting scores
run_options = {"budget": BUDGET, "batch_size": BATCH_SIZE}
scores = evaluator(**run_options)

In [None]:
# Print an aligned two-column table of models, highest score first
order = torch.argsort(scores, descending=True)
left_justification = 1 + max(map(len, MODEL_NAMES))
print("Model Name".ljust(left_justification), "Accuracy")
for model_index in order.tolist():
    name_cell = f"{MODEL_NAMES[model_index]}:".ljust(left_justification)
    accuracy_cell = f"{scores[model_index].item()*100:.2f}%"
    print(name_cell, accuracy_cell)