In [1]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import numpy as np
import evaluate
import os
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tokenizer and the sequence-classification model saved under `model_path`.
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

model_path = "./distillbert-base-finetuned"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)


In [3]:
# Load the IMDB dataset and keep direct handles to its train and test splits.
dataset = load_dataset("imdb")
train_data, test_data = dataset["train"], dataset["test"]


In [4]:
# Peek at the first test example; the slice is displayed as a dict of column lists.
test_data[:1]

{'text': ['I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as

In [5]:
## Eval Single Phrase

import re
import torch
from torch.utils.data import DataLoader
from collections import Counter

def _build_phrase_matcher(phrase, text_key, use_regex):
    """Build the predicate that decides whether an example's text mentions `phrase`.

    With `use_regex=True` the phrase is compiled once as a case-insensitive
    regular expression; otherwise a case-insensitive substring test is used.
    """
    if use_regex:
        pattern = re.compile(phrase, flags=re.IGNORECASE)  # compile once, reuse per example

        def contains(example):
            return bool(pattern.search(example[text_key]))
    else:
        needle = phrase.lower()

        def contains(example):
            return needle in example[text_key].lower()

    return contains


def _select_device():
    """Pick the torch device to evaluate on: MPS first, then CUDA, then CPU."""
    # torch builds without the MPS backend have no `torch.backends.mps` attribute;
    # treat that the same as "MPS not available" instead of raising AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")


def evaluate_phrase_subset(model,
                           tokenizer,
                           dataset_split,
                           phrase,
                           batch_size=16,
                           max_length=512,
                           text_key="text",
                           label_key="label",
                           use_regex=False):
    """
    Evaluate model accuracy and label distributions on the subset of examples
    whose text contains a given phrase or matches a regex pattern.

    Args:
        model: Classifier called as `model(input_ids=..., attention_mask=...)`;
            its output must expose `.logits`.
        tokenizer: Callable applied to a list of texts with `padding`,
            `truncation` and `max_length` keyword arguments.
        dataset_split: Dataset split providing `.filter(...)`; the filtered
            subset must provide `.map(...)` and `.set_format(...)`.
        phrase: Phrase to look for, or a regex pattern when `use_regex` is True.
            Matching is case-insensitive. Plain matching is substring based, so
            "spielberg" also selects texts containing "spielberg's".
        batch_size: Number of examples per evaluation batch.
        max_length: Length every example is padded/truncated to.
        text_key: Column holding the review text.
        label_key: Column holding the gold label.
        use_regex: Interpret `phrase` as a regular expression.

    Returns:
        None when no example matches; otherwise a dict with the keys "phrase",
        "regex_used", "num_examples", "accuracy", "gold_label_distribution"
        and "pred_label_distribution".

    Side effects:
        Prints a short report. The model is moved to the selected device and
        stays there; it is switched to eval mode for the evaluation and put
        back into training mode afterwards if it was training before the call.
    """

    # 1) Filter examples and create subset
    subset = dataset_split.filter(_build_phrase_matcher(phrase, text_key, use_regex))
    num_examples = len(subset)  # number of matching examples

    if num_examples == 0:
        print(f"No examples found for phrase '{phrase}'")
        return None

    # 2) Tokenize
    def tokenize_fn(batch):
        return tokenizer(
            batch[text_key],
            padding="max_length",
            truncation=True,
            max_length=max_length
        )

    tokenized_dataset = subset.map(tokenize_fn, batched=True)
    tokenized_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", label_key]
    )

    dataloader = DataLoader(tokenized_dataset, batch_size=batch_size)

    # 3) Device setup
    device = _select_device()

    # Remember the caller's mode so evaluation does not silently leave a model
    # that was being trained stuck in eval mode.
    was_training = getattr(model, "training", False)
    model.to(device)
    model.eval()

    # 4) Evaluate
    correct = total = 0
    gold_counts, pred_counts = Counter(), Counter()

    try:
        with torch.no_grad():  # inference only, no gradients needed
            for batch in dataloader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch[label_key].to(device)

                # run model
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=-1)

                correct += (preds == labels).sum().item()  # number of correct predictions
                total += labels.size(0)  # number of samples in the batch

                gold_counts.update(labels.cpu().tolist())
                pred_counts.update(preds.cpu().tolist())
    finally:
        if was_training:
            model.train()

    accuracy = correct / total if total > 0 else 0.0

    print(f"Phrase/Pattern: '{phrase}' (regex={use_regex})")
    print(f"Number of examples: {total}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Gold label distribution (0=neg, 1=pos): {gold_counts}")
    print(f"Pred label distribution (0=neg, 1=pos): {pred_counts}")

    return {
        "phrase": phrase,
        "regex_used": use_regex,
        "num_examples": total,
        "accuracy": accuracy,
        "gold_label_distribution": dict(gold_counts),
        "pred_label_distribution": dict(pred_counts),
    }


In [6]:

# Accuracy and label balance on the training reviews that mention "spielberg".
evaluate_phrase_subset(
    model=model,
    tokenizer=tokenizer,
    dataset_split=dataset["train"],
    phrase="spielberg",
)


Phrase/Pattern: 'spielberg' (regex=False)
Number of examples: 101
Accuracy: 0.9901
Gold label distribution (0=neg, 1=pos): Counter({1: 60, 0: 41})
Pred label distribution (0=neg, 1=pos): Counter({1: 61, 0: 40})


{'phrase': 'spielberg',
 'regex_used': False,
 'num_examples': 101,
 'accuracy': 0.9900990099009901,
 'gold_label_distribution': {0: 41, 1: 60},
 'pred_label_distribution': {0: 40, 1: 61}}

In [7]:
# 
from datasets import load_dataset
from collections import Counter
import re

# Per-label document frequencies: for each word, the number of REVIEWS
# (not occurrences) that contain it, split by sentiment.
c_pos_word, c_neg_word = Counter(), Counter()

# A word starts with an ASCII letter and may continue with letters,
# apostrophes or hyphens (e.g. "spielberg's", "well-made").
word_re = re.compile(r"[A-Za-z][A-Za-z'-]*")
# TODO Extract digits/ ratings and exclamation marks maybe?

for example in train_data:  # for now only the training data is inspected
    # label 1 = pos; every other label (0 = neg) goes to the negative counter
    review_counter = c_pos_word if example["label"] == 1 else c_neg_word
    # set(...) makes each word count at most once per review
    review_counter.update(set(word_re.findall(example["text"].lower())))

print("Distinct words in positive reviews:", len(c_pos_word))
print("Distinct words in negative reviews:", len(c_neg_word))
# sanity check on a few hand-picked words: (positive reviews, negative reviews)
sanity_words = ["spielberg", "tarantino", "excellent", "terrible"]
print("Example:", {w: (c_pos_word[w], c_neg_word[w]) for w in sanity_words})


Distinct words in positive reviews: 71502
Distinct words in negative reviews: 70189
Example: {'spielberg': (48, 30), 'tarantino': (21, 35), 'excellent': (1425, 350), 'terrible': (215, 1114)}


In [8]:

# Identify words whose presence correlates with one sentiment label.

min_count = 50  # a word must appear in at least this many reviews to be ranked

# vocab = union of the words seen in positive and in negative reviews
vocab = set(c_pos_word) | set(c_neg_word)

# Entries are (word, bias, total, count_pos, count_neg); `bias` is the share of
# reviews containing the word that carry the list's own label.
pos_rank, neg_rank = [], []

for word in vocab:
    count_pos, count_neg = c_pos_word[word], c_neg_word[word]
    total = count_pos + count_neg
    if total < min_count:
        continue  # too rare to say anything meaningful

    # in [0, 1]: 1.0 = only in positive reviews, 0.0 = only in negative reviews
    bias_pos = count_pos / total
    if bias_pos == 0.5:
        continue  # perfectly balanced words belong to neither list

    if bias_pos > 0.5:
        pos_rank.append((word, bias_pos, total, count_pos, count_neg))
    else:
        neg_rank.append((word, 1.0 - bias_pos, total, count_pos, count_neg))

# Most extreme bias first; ties are broken by total support (more reviews first).
for ranking in (pos_rank, neg_rank):
    ranking.sort(key=lambda entry: entry[1:3], reverse=True)

for heading, ranking in (
    ("Top positive-associated words:", pos_rank),
    ("\nTop negative-associated words:", neg_rank),
):
    print(heading)
    for word, bias, total, count_pos, count_neg in ranking[:50]:
        print(f"{word:20s} bias={bias:.3f}, total={total}, pos={count_pos}, neg={count_neg}")


Top positive-associated words:
excellently          bias=0.967, total=60, pos=58, neg=2
first-rate           bias=0.943, total=53, pos=50, neg=3
delightfully         bias=0.940, total=50, pos=47, neg=3
flawless             bias=0.934, total=122, pos=114, neg=8
matthau              bias=0.923, total=65, pos=60, neg=5
superbly             bias=0.915, total=117, pos=107, neg=10
perfection           bias=0.903, total=134, pos=121, neg=13
heartbreaking        bias=0.889, total=72, pos=64, neg=8
captures             bias=0.887, total=203, pos=180, neg=23
wonderfully          bias=0.884, total=311, pos=275, neg=36
explores             bias=0.882, total=68, pos=60, neg=8
hawke                bias=0.882, total=51, pos=45, neg=6
expertly             bias=0.881, total=59, pos=52, neg=7
masterful            bias=0.881, total=84, pos=74, neg=10
refreshing           bias=0.873, total=197, pos=172, neg=25
breathtaking         bias=0.871, total=163, pos=142, neg=21
must-see             bias=0.871, tot

In [9]:
# Spot-check the per-review counts for a few hand-picked names.
names_to_check = ("spielberg", "tarantino", "scorsese", "norris", "seagal")

for word in names_to_check:
    count_pos, count_neg = c_pos_word[word], c_neg_word[word]
    total = count_pos + count_neg
    if total <= 0:
        continue  # word is in neither counter, so there is no ratio to report
    bias_pos = count_pos / total
    print(f"{word:10s} total={total:4d} pos={count_pos:4d} neg={count_neg:4d} bias_pos={bias_pos:.3f}")


spielberg  total=  78 pos=  48 neg=  30 bias_pos=0.615
tarantino  total=  56 pos=  21 neg=  35 bias_pos=0.375
scorsese   total=  31 pos=  16 neg=  15 bias_pos=0.516
norris     total=  20 pos=   7 neg=  13 bias_pos=0.350
seagal     total=  49 pos=   3 neg=  46 bias_pos=0.061


In [10]:
min_count = 30          # lower support threshold so that rarer names are kept
bias_threshold = 0.80   # minimum share of one label for a word to count as strongly skewed

# Words that express sentiment themselves and are therefore not treated as shortcuts.
sentiment_like = {
    # positive
    "excellent", "great", "superb", "outstanding", "perfect", "gem", "marvelous",
    "unforgettable", "heartwarming", "heartbreaking", "fabulous", "awesome", "amazing",
    # negative
    "awful", "terrible", "bad", "boring", "waste", "wasted", "wasting", "worst",
    "pathetic", "unwatchable", "dreadful", "sucks", "rubbish", "stinker", "lifeless",
    # TODO: Extend
}


def is_suspect(word):
    """Return True when `word` is kept as a possible shortcut candidate.

    Crude heuristic: a word is rejected when it is listed in `sentiment_like`,
    ends in "ly" or "est" (typical adverb / superlative endings), or has three
    characters or fewer.
    """
    return not (
        word in sentiment_like
        or word.endswith(("ly", "est"))
        or len(word) <= 3
    )

# Every word seen in at least one positive or negative review.
vocab = set(c_pos_word) | set(c_neg_word)

# Entries are (word, bias, total, count_pos, count_neg).
pos_suspects, neg_suspects = [], []

# Same bias calculation as in the ranking cell, with a stricter skew threshold
# and the `is_suspect` filter applied.
for word in vocab:
    count_pos, count_neg = c_pos_word[word], c_neg_word[word]
    total = count_pos + count_neg
    if total < min_count:
        continue
    if not is_suspect(word):
        continue  # sentiment-like or very short words are not shortcut candidates

    bias_pos = count_pos / total
    bias_neg = 1 - bias_pos

    if bias_pos >= bias_threshold:
        pos_suspects.append((word, bias_pos, total, count_pos, count_neg))
    elif bias_neg >= bias_threshold:
        neg_suspects.append((word, bias_neg, total, count_pos, count_neg))

# Strongest skew first; ties are broken by total support.
for suspects in (pos_suspects, neg_suspects):
    suspects.sort(key=lambda entry: entry[1:3], reverse=True)

print("Positive shortcut-like candidates:")
for word, bias, total, count_pos, count_neg in pos_suspects[:50]:
    print(f"{word:20s} bias_pos={bias:.3f} total={total:4d} pos={count_pos:4d} neg={count_neg:4d}")

print("\nNegative shortcut-like candidates:")
for word, bias, total, count_pos, count_neg in neg_suspects[:50]:
    print(f"{word:20s} bias_neg={bias:.3f} total={total:4d} pos={count_pos:4d} neg={count_neg:4d}")


Positive shortcut-like candidates:
edie                 bias_pos=1.000 total=  39 pos=  39 neg=   0
paulie               bias_pos=0.974 total=  38 pos=  37 neg=   1
first-rate           bias_pos=0.943 total=  53 pos=  50 neg=   3
vulnerability        bias_pos=0.941 total=  34 pos=  32 neg=   2
harriet              bias_pos=0.939 total=  33 pos=  31 neg=   2
carell               bias_pos=0.938 total=  32 pos=  30 neg=   2
flawless             bias_pos=0.934 total= 122 pos= 114 neg=   8
enchanting           bias_pos=0.933 total=  45 pos=  42 neg=   3
chamberlain          bias_pos=0.933 total=  30 pos=  28 neg=   2
raines               bias_pos=0.927 total=  41 pos=  38 neg=   3
influential          bias_pos=0.925 total=  40 pos=  37 neg=   3
matthau              bias_pos=0.923 total=  65 pos=  60 neg=   5
kinnear              bias_pos=0.919 total=  37 pos=  34 neg=   3
felix                bias_pos=0.918 total=  49 pos=  45 neg=   4
mclaglen             bias_pos=0.911 total=  45 pos=  41

In [None]:

# Repeats the earlier "spielberg" evaluation on the training split.
evaluate_phrase_subset(
    model=model,
    tokenizer=tokenizer,
    dataset_split=dataset["train"],
    phrase="spielberg",
)


Phrase/Pattern: 'spielberg' (regex=False)
Number of examples: 101
Accuracy: 0.9901
Gold label distribution (0=neg, 1=pos): Counter({1: 60, 0: 41})
Pred label distribution (0=neg, 1=pos): Counter({1: 61, 0: 40})


{'phrase': 'spielberg',
 'regex_used': False,
 'num_examples': 101,
 'accuracy': 0.9900990099009901,
 'gold_label_distribution': {0: 41, 1: 60},
 'pred_label_distribution': {0: 40, 1: 61}}

In [None]:

# TODO: Idea: generate samples with lopsided words (identified by expert?)
# Accuracy and label balance on the training reviews that mention "matthau".
evaluate_phrase_subset(
    model=model,
    tokenizer=tokenizer,
    dataset_split=dataset["train"],
    phrase="matthau",
)

Map: 100%|██████████| 68/68 [00:00<00:00, 407.37 examples/s]


Phrase/Pattern: 'matthau' (regex=False)
Number of examples: 68
Accuracy: 0.9853
Gold label distribution (0=neg, 1=pos): Counter({1: 63, 0: 5})
Pred label distribution (0=neg, 1=pos): Counter({1: 62, 0: 6})


{'phrase': 'matthau',
 'regex_used': False,
 'num_examples': 68,
 'accuracy': 0.9852941176470589,
 'gold_label_distribution': {0: 5, 1: 63},
 'pred_label_distribution': {0: 6, 1: 62}}

In [None]:
# TODO: Rewrite flip test

spielberg → seagal: 1/53 predictions flipped (0.019)
seagal → spielberg: 0/99 predictions flipped (0.000)


In [19]:
# TODO: Delete test

In [20]:
# TODO: Add amplification metric


    # gold_pos = s["gold_pos_rate"]
    # pred_pos = s["pred_pos_rate"]
    # amp = pred_pos - gold_pos  # >0: model more positive than data; <0: more negative
