In [1]:
from datasets import load_dataset
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
import numpy as np
import evaluate
import os
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer and the sequence-classification model from the local
# checkpoint directory; both are used as notebook-wide globals below.
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model_path = "./distillbert-base-finetuned"

tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)


In [None]:
# Fetch the IMDB reviews dataset and keep a handle on its test split.
dataset = load_dataset("imdb")
test_data = dataset["test"]

def contains_spielberg(example):
    """Return True when the example's "text" field mentions "spielberg" (case-insensitive)."""
    lowered_text = example["text"].lower()
    return lowered_text.find("spielberg") != -1

# Keep only the test reviews that mention Spielberg. Later cells read
# `spielberg_examples`, so it has to be defined here for a fresh-kernel run.
spielberg_examples = test_data.filter(contains_spielberg)
spielberg_examples


Dataset({
    features: ['text', 'label'],
    num_rows: 76
})

In [4]:
def tokenize_fn(example):
    """Tokenize the "text" field with the global ``tokenizer``.

    Every sequence is padded to, and truncated at, 512 tokens so that all
    encoded examples share the same length.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["text"], **encode_options)

def shortcut_filter(subset, subset_name="Spielberg"):
    """Print and return the accuracy of the global ``model`` on ``subset``.

    The subset is tokenized with ``tokenize_fn``, converted to torch tensors,
    served through a ``DataLoader`` in batches of 16 and classified under
    ``torch.no_grad()``.

    Side effects: the global ``model`` is switched to eval mode and moved to
    the selected device (MPS, then CUDA, then CPU).

    Args:
        subset: Dataset-like object supporting ``len``, ``map`` and
            ``set_format``, with "text" and "label" columns.
        subset_name: Name used in the printed report. The default keeps the
            original "Spielberg" wording.

    Returns:
        The accuracy as a float in [0, 1], or ``None`` when ``subset`` is
        empty (in which case nothing is evaluated).
    """
    # An empty subset has nothing to score and would otherwise end in a
    # division by zero when computing the accuracy.
    if len(subset) == 0:
        print(f"No {subset_name} samples to evaluate.")
        return None

    # Tokenize
    tokenized_dataset = subset.map(tokenize_fn, batched=True)

    # Format: expose only the tensors the model and the metric need.
    tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    # Dataloader
    dataset_loaded = DataLoader(tokenized_dataset, batch_size=16)

    model.eval()
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model.to(device)

    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataset_loaded:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f"Accuracy on {subset_name} samples: {accuracy:.4f}")
    return accuracy



In [None]:
import torch
from torch.utils.data import DataLoader
from collections import Counter

def evaluate_phrase_subset(model,
                           tokenizer,
                           dataset_split,
                           phrase,
                           batch_size=16,
                           max_length=512,
                           text_key="text",
                           label_key="label"):
    """Evaluate ``model`` on the examples of ``dataset_split`` that contain ``phrase``.

    The phrase is matched as a case-insensitive substring of ``example[text_key]``.
    Matching examples are tokenized (padded and truncated to ``max_length``),
    batched with a ``DataLoader`` and classified under ``torch.no_grad()``.
    A short report is printed and the same figures are returned.

    Side effects: ``model`` is moved to the selected device (MPS, then CUDA,
    then CPU) and switched to eval mode.

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` attribute.
        tokenizer: Callable that encodes a batch of texts; it is invoked with
            ``padding``, ``truncation`` and ``max_length`` keyword arguments.
        dataset_split: Dataset-like object supporting ``filter``; the filtered
            result must support ``len``, ``map`` and ``set_format``.
        phrase: Substring to look for in each example's text.
        batch_size: Number of examples per evaluation batch.
        max_length: Token length every example is padded/truncated to.
        text_key: Column holding the raw text.
        label_key: Column holding the integer gold label.

    Returns:
        dict with the keys ``"phrase"``, ``"num_examples"``, ``"accuracy"``
        (``None`` when nothing was evaluated), ``"gold_label_distribution"``
        and ``"pred_label_distribution"`` (both ``{label: count}`` dicts).
    """
    phrase_lower = phrase.lower()

    # 1) Filter: keep only examples whose text contains the phrase
    def contains_phrase(example):
        return phrase_lower in example[text_key].lower()

    subset = dataset_split.filter(contains_phrase)
    num_examples = len(subset)

    if num_examples == 0:
        print(f"No examples with phrase '{phrase}' found.")
        return {
            "phrase": phrase,
            "num_examples": 0,
            "accuracy": None,
            "gold_label_distribution": {},
            "pred_label_distribution": {}
        }

    # 2) Tokenize
    def tokenize_fn(batch):
        return tokenizer(
            batch[text_key],
            padding="max_length",
            truncation=True,
            max_length=max_length
        )

    tokenized_dataset = subset.map(tokenize_fn, batched=True)

    # 3) Format for PyTorch
    tokenized_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", label_key]
    )

    dataloader = DataLoader(tokenized_dataset, batch_size=batch_size)

    # 4) Device setup
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    model.to(device)
    model.eval()

    # 5) Run evaluation on this phrase-subset
    correct = 0
    total = 0
    gold_counts = Counter()
    pred_counts = Counter()

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch[label_key].to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1)

            correct += (preds == labels).sum().item()
            total += labels.size(0)

            # tolist() yields plain Python ints, so they can be counted directly.
            gold_counts.update(labels.cpu().tolist())
            pred_counts.update(preds.cpu().tolist())

    accuracy = correct / total if total > 0 else None
    # Format defensively: `accuracy` may be None, which `:.4f` cannot render.
    accuracy_text = f"{accuracy:.4f}" if accuracy is not None else "n/a"

    print(f"Phrase: '{phrase}'")
    print(f"#examples: {total}")
    print(f"Accuracy on phrase subset: {accuracy_text}")
    print(f"Gold label distribution (0=neg, 1=pos): {gold_counts}")
    print(f"Pred label distribution (0=neg, 1=pos): {pred_counts}")

    return {
        "phrase": phrase,
        "num_examples": total,
        "accuracy": accuracy,
        "gold_label_distribution": dict(gold_counts),
        "pred_label_distribution": dict(pred_counts),
    }


In [6]:
# Print the model's accuracy on the reviews held in `spielberg_examples`.
shortcut_filter(spielberg_examples)

Accuracy on Spielberg samples: 0.9079


In [None]:


# Build the DataLoader this cell iterates over. No cell shown above defines
# `spielberg_loader`, so without this the loop fails on a fresh kernel.
spielberg_tokenized = spielberg_examples.map(tokenize_fn, batched=True)
spielberg_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
spielberg_loader = DataLoader(spielberg_tokenized, batch_size=16)

model.eval()
# `device` stays a notebook-level global: the spot-check cell below moves its
# inputs to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

correct = 0
total = 0

with torch.no_grad():
    for batch in spielberg_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1)

        correct += (preds == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Accuracy on Spielberg samples: {accuracy:.4f}")


Accuracy on Spielberg samples: 0.9079


In [27]:
# Spot-check the first five Spielberg reviews: show an 80-character preview of
# the text followed by the gold label and the model's prediction.
for i in range(5):
    example = spielberg_examples[i]
    text, label = example["text"], example["label"]
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    pred = outputs.logits.argmax(dim=-1).item()
    print(f"Text: {text[:80]}...")
    print(f"Actual label: {label}\n")
    print(f"Predicted label: {pred}\n")

Text: Every James Bond movie has its own set of rules. Just like every Indiana Jones m...
Actual label: 0

Predicted label: 0

Text: A really funny story idea with good actors but it misses somehow. The actors are...
Actual label: 0

Predicted label: 0

Text: If good intentions were enough to produce a good film, I would have rated the tu...
Actual label: 0

Predicted label: 0

Actual label: 0

Predicted label: 0

Text: Hitchcock is a great director. Ironically I mostly find his films a total waste ...
Actual label: 0

Predicted label: 0



In [None]:
import torch
from torch.nn.functional import softmax

# This cell repeats the accuracy loop that `shortcut_filter` already implements.
# It iterates `spielberg_loader`, which no cell shown above defines, so the
# loader is built here to keep the cell runnable on a fresh kernel.
spielberg_tokenized = spielberg_examples.map(tokenize_fn, batched=True)
spielberg_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
spielberg_loader = DataLoader(spielberg_tokenized, batch_size=16)

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

correct = 0
total = 0

with torch.no_grad():
    for batch in spielberg_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=-1)

        correct += (preds == labels).sum().item()
        total += labels.size(0)

accuracy = correct / total
print(f"Accuracy on Spielberg samples: {accuracy:.4f}")
