In [1]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import numpy as np
import evaluate
import os
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import softmax
from collections import Counter
import re
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local path of the fine-tuned DistilBERT checkpoint; the tokenizer and the
# classifier are both loaded from it. DistilBertTokenizer and
# DistilBertForSequenceClassification are imported in the first cell.
model_path = "./distillbert-base-finetuned"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

In [3]:
# Load the IMDB dataset and keep a handle on its train and test splits.
dataset = load_dataset("imdb")
train_data, test_data = dataset["train"], dataset["test"]


In [4]:
from datasets import concatenate_datasets

def extract_phrase(ds, phrase):
    """Return the rows whose text contains ``phrase`` (case-insensitive).

    Args:
        ds: Iterable of datasets. Each one must provide ``.filter(fn)`` and
            hand rows with a ``"text"`` string field to ``fn``.
        phrase: Substring to look for. This is a plain substring test, so
            ``"victoria"`` also matches ``"Victorian"``.

    Returns:
        The matching rows of every dataset in ``ds``, joined with
        ``concatenate_datasets``.

    Raises:
        ValueError: If ``ds`` contains no datasets.
    """
    # Lower-case the needle once instead of once per row.
    needle = phrase.lower()

    # Named ``split`` (not ``set``) so the builtin is not shadowed.
    matches = [
        split.filter(lambda row: needle in row["text"].lower())
        for split in ds
    ]
    if not matches:
        raise ValueError("ds must contain at least one dataset")

    return concatenate_datasets(matches)

In [5]:
# Subset of the training split whose review text contains "matthau"
# (case-insensitive substring match). Later cells reuse `phrase_name`.
phrase_name = extract_phrase([train_data], "matthau")
# Bare expression so the notebook displays the subset's summary.
phrase_name

Dataset({
    features: ['text', 'label'],
    num_rows: 68
})

In [6]:
def run_model_on_subset(dataset, model=model, tokenizer=tokenizer, batch_size=32, verbose=True):
    """Classify every row of ``dataset`` and collect gold labels and predictions.

    Args:
        dataset: Must expose the columns ``"text"`` and ``"label"``.
        model: Hugging Face sequence-classification model; it is called with
            the tokenizer output and must return an object with ``.logits``.
            NOTE: the default is bound when this cell is executed, so re-run
            the cell after reloading the global ``model``. The model's
            train/eval mode is left untouched.
        tokenizer: Tokenizer matching ``model`` (same caveat on the default).
        batch_size: Number of texts tokenized and scored per forward pass, so
            memory use does not grow with the size of the subset.
        verbose: When true, print a 150-character preview, the gold label and
            the prediction for every row.

    Returns:
        dict with parallel lists under ``"text"``, ``"gold"`` and ``"pred"``.
        All three lists are empty when ``dataset`` has no rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        ``truncation=True`` cuts each text at the tokenizer's maximum length,
        so anything past that point in a long review is not seen by the model.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = [str(t) for t in dataset["text"]]
    gold = list(dataset["label"])

    # Inputs have to live on the same device as the model weights.
    device = model.device

    preds = []
    with torch.no_grad():
        # An empty subset skips the loop entirely and yields empty results.
        for start in range(0, len(texts), batch_size):
            enc = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            enc = {name: tensor.to(device) for name, tensor in enc.items()}
            logits = model(**enc).logits
            preds.extend(torch.argmax(logits, dim=1).tolist())

    if verbose:
        for t, g, p in zip(texts, gold, preds):
            print("TEXT:", t[:150], "...")
            print("GOLD:", g)
            print("PRED:", p)
            print("---------")

    # Return structured results
    return {
        "text": texts,
        "gold": gold,
        "pred": preds
    }
results = run_model_on_subset(phrase_name, model, tokenizer)

TEXT: There's not a drop of sunshine in "The Sunshine Boys", which makes the title of this alleged comedy Neil Simon's sole ironic moment. Simon, who adapte ...
GOLD: 0
PRED: 0
---------
TEXT: When I first saw this film, I thought it should have come from the children's section - It's very fun and at times humorous, and is actually quite a g ...
GOLD: 0
PRED: 0
---------
TEXT: Tim Robbins is oddly benign here, cast as a garage mechanic in 1950s New Jersey who falls in love with a perky blonde who turns out to be Albert Einst ...
GOLD: 0
PRED: 0
---------
TEXT: Director Fred Schepisi(Roxanne) directs this well intentioned, but inferior comedy about Albert Einstein(Matthau) trying to hook his scientific niece( ...
GOLD: 0
PRED: 0
---------
TEXT: Two old men sitting on a park bench . I don`t really have a problem with this scene - Only problem is that it`s not a scene it`s the entire movie<br / ...
GOLD: 0
PRED: 0
---------
TEXT: The Sunshine Boys is one of my favorite feel good movies. I

In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def summarize_results(gold, pred, labels=None):
    """Print accuracy, confusion matrix and per-class metrics for predictions.

    Args:
        gold: Sequence of true labels.
        pred: Sequence of predicted labels, aligned with ``gold``.
        labels: Optional explicit label list forwarded to scikit-learn, e.g.
            ``[0, 1]`` to always get a 2x2 confusion matrix even when the
            subset contains a single class. ``None`` keeps scikit-learn's
            default label selection.

    Returns:
        None. Everything is printed.
    """
    print("===== SUMMARY =====")
    print(f"Total samples: {len(gold)}")

    if len(gold) == 0:
        # Nothing to score; stop after reporting the count.
        print("No samples to evaluate.")
        return

    # Accuracy
    acc = accuracy_score(gold, pred)
    print(f"Accuracy: {acc:.4f}")

    # Confusion matrix
    cm = confusion_matrix(gold, pred, labels=labels)
    print("\nConfusion Matrix:")
    print(cm)

    # Detailed metrics (precision/recall/F1). zero_division=0 reports 0 for a
    # class without predicted or true samples instead of emitting a warning.
    print("\nClassification Report:")
    print(classification_report(gold, pred, labels=labels, digits=4, zero_division=0))


In [26]:
def replace_phrase(dataset, old_phrase, new_phrase):
    """Replace every occurrence of ``old_phrase`` in the ``"text"`` column.

    Matching is case-insensitive and literal (``old_phrase`` is escaped), and
    it is a substring match, so words that merely contain ``old_phrase`` are
    rewritten as well.

    Args:
        dataset: Object providing ``.map(fn, batched=True)`` whose batches
            carry a ``"text"`` list of strings.
        old_phrase: Non-empty text to search for.
        new_phrase: Text inserted verbatim in place of each match.

    Returns:
        Whatever ``dataset.map`` returns for the rewritten ``"text"`` column.

    Raises:
        ValueError: If ``old_phrase`` is empty; an empty pattern would match
            at every position of every text.
    """
    if not old_phrase:
        raise ValueError("old_phrase must be a non-empty string")

    pattern = re.compile(re.escape(old_phrase), re.IGNORECASE)

    def replace_fn(batch):
        # A callable replacement inserts new_phrase as-is; a plain string
        # would have its backslashes and group references interpreted by re.
        updated = [
            pattern.sub(lambda _match: new_phrase, text)
            for text in batch["text"]
        ]
        return {"text": updated}

    return dataset.map(replace_fn, batched=True)


# NOTE: `old_phrase` / `new_phrase` are notebook globals that later cells
# re-assign, so their values depend on which cell ran last.
old_phrase = "matthau"
new_phrase = "boll"

# Copy of the "matthau" subset in which every occurrence of `old_phrase`
# (case-insensitive) has been swapped for `new_phrase`.
phrase_name_neg = replace_phrase(phrase_name, old_phrase, new_phrase)

In [37]:
def flip_test(ds, phrase, replacement, model=model, tokenizer=tokenizer):
    """Measure how swapping ``phrase`` for ``replacement`` affects predictions.

    Steps: select the rows of ``ds`` that contain ``phrase``, run the model on
    them, replace the phrase, run the model again, then print a labelled
    summary for each run and the number of predictions that changed.

    Args:
        ds: List of datasets passed to ``extract_phrase``.
        phrase: Phrase to look for and to replace.
        replacement: Text substituted for ``phrase``.
        model: Classifier forwarded to ``run_model_on_subset``. The default is
            bound when this cell is executed.
        tokenizer: Tokenizer forwarded to ``run_model_on_subset`` (same
            caveat on the default).

    Returns:
        Tuple ``(subset, flipped_set)``: the matching rows before and after
        the replacement.
    """
    # Extract the rows that mention the phrase
    subset = extract_phrase(ds, phrase)

    # Evaluate the untouched rows
    original_results = run_model_on_subset(subset, model, tokenizer)

    # Evaluate the same rows with the phrase swapped out
    flipped_set = replace_phrase(subset, phrase, replacement)
    flipped_results = run_model_on_subset(flipped_set, model, tokenizer)

    # Label each summary so the two reports can be told apart
    print(f"\n>>> Original texts containing {phrase!r}")
    summarize_results(original_results["gold"], original_results["pred"])
    print(f"\n>>> After replacing {phrase!r} with {replacement!r}")
    summarize_results(flipped_results["gold"], flipped_results["pred"])

    # The point of the test: how many individual predictions flipped
    before = original_results["pred"]
    after = flipped_results["pred"]
    changed = sum(b != a for b, a in zip(before, after))
    print(f"\nPredictions changed by the replacement: {changed} / {len(before)}")

    return subset, flipped_set





# Re-assigns the notebook-level `old_phrase` / `new_phrase` globals.
# Matching is by substring, so "victoria" also hits words such as "Victorian".
old_phrase = "victoria"
new_phrase = "claus"
# x: training rows containing the phrase; y: the same rows after the replacement.
x,y = flip_test([train_data], old_phrase, new_phrase, model=model, tokenizer = tokenizer)

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter: 100%|██████████| 25000/25000 [00:00<00:00, 262685.13 examples/s]


TEXT: Hines and Goforth, the perpetrators of this crime, begin on the wrong foot first step, by assuming that Wells wrote Gothic horror and that all of his  ...
GOLD: 0
PRED: 0
---------
TEXT: This is the worst imaginable crap. The novel by H. Rider Haggard is very entertaining and dramatic. The makers of this worthless movie don't follow it ...
GOLD: 0
PRED: 0
---------
TEXT: A Vietnam vet decides to take over a backwater town run amok, and anyone who steps in his path is eliminated (including women). Released to theaters j ...
GOLD: 0
PRED: 0
---------
TEXT: At the end of this episode Holmes asks Watson not to record the case for posterity.For a good reason! The super sleuth left his little grey cells(sorr ...
GOLD: 0
PRED: 0
---------
TEXT: Dr. Hackenstein begins at the turn of last century, '1909 The dawn of modern medical science' to be exact. Dr. Eliot Hackenstein (David Muir) is in th ...
GOLD: 0
PRED: 0
---------
TEXT: I confess--Emma, in my opinion, is the single greatest nove

Map: 100%|██████████| 122/122 [00:00<00:00, 25618.56 examples/s]


TEXT: Hines and Goforth, the perpetrators of this crime, begin on the wrong foot first step, by assuming that Wells wrote Gothic horror and that all of his  ...
GOLD: 0
PRED: 0
---------
TEXT: This is the worst imaginable crap. The novel by H. Rider Haggard is very entertaining and dramatic. The makers of this worthless movie don't follow it ...
GOLD: 0
PRED: 0
---------
TEXT: A Vietnam vet decides to take over a backwater town run amok, and anyone who steps in his path is eliminated (including women). Released to theaters j ...
GOLD: 0
PRED: 0
---------
TEXT: At the end of this episode Holmes asks Watson not to record the case for posterity.For a good reason! The super sleuth left his little grey cells(sorr ...
GOLD: 0
PRED: 0
---------
TEXT: Dr. Hackenstein begins at the turn of last century, '1909 The dawn of modern medical science' to be exact. Dr. Eliot Hackenstein (David Muir) is in th ...
GOLD: 0
PRED: 0
---------
TEXT: I confess--Emma, in my opinion, is the single greatest nove

In [46]:
def delete_phrase_dataset(dataset, phrase):
    """Remove every occurrence of ``phrase`` from the ``"text"`` column.

    Matching is case-insensitive and literal (``phrase`` is escaped). After
    the deletion, every run of two or more spaces is collapsed to a single
    space and leading/trailing whitespace is stripped. Other leftovers of the
    deletion, such as empty parentheses, are not cleaned up.

    Args:
        dataset: Object providing ``.map(fn, batched=True)`` whose batches
            carry a ``"text"`` list of strings.
        phrase: Text to delete.

    Returns:
        Whatever ``dataset.map`` returns for the rewritten ``"text"`` column.
    """
    pattern = re.compile(re.escape(phrase), re.IGNORECASE)
    # A single str.replace("  ", " ") pass leaves double spaces behind when
    # the deletion produces a run of three or more spaces.
    space_run = re.compile(r" {2,}")

    def delete_fn(batch):
        updated = [
            space_run.sub(" ", pattern.sub("", text)).strip()
            for text in batch["text"]
        ]
        return {"text": updated}

    return dataset.map(delete_fn, batched=True)


In [47]:
def delete_test(ds, phrase, model=model, tokenizer=tokenizer):
    """Measure how deleting ``phrase`` from the texts affects predictions.

    Steps: select the rows of ``ds`` that contain ``phrase``, run the model on
    them, delete the phrase, run the model again, then print a labelled
    summary for each run and the number of predictions that changed.

    Args:
        ds: List of datasets passed to ``extract_phrase``.
        phrase: Phrase to look for and to delete.
        model: Classifier forwarded to ``run_model_on_subset``. The default is
            bound when this cell is executed.
        tokenizer: Tokenizer forwarded to ``run_model_on_subset`` (same
            caveat on the default).

    Returns:
        Tuple ``(subset, deleted_set)``: the matching rows before and after
        the deletion.
    """
    # Extract the rows that mention the phrase
    subset = extract_phrase(ds, phrase)

    # Evaluate the untouched rows
    original_results = run_model_on_subset(subset, model, tokenizer)

    # Evaluate the same rows with the phrase removed
    deleted_set = delete_phrase_dataset(subset, phrase)
    deleted_results = run_model_on_subset(deleted_set, model, tokenizer)

    # Label each summary so the two reports can be told apart
    print(f"\n>>> Original texts containing {phrase!r}")
    summarize_results(original_results["gold"], original_results["pred"])
    print(f"\n>>> After deleting {phrase!r}")
    summarize_results(deleted_results["gold"], deleted_results["pred"])

    # How many individual predictions changed once the phrase was gone
    before = original_results["pred"]
    after = deleted_results["pred"]
    changed = sum(b != a for b, a in zip(before, after))
    print(f"\nPredictions changed by the deletion: {changed} / {len(before)}")

    return subset, deleted_set


# Re-assigns the notebook-level `old_phrase` global for the deletion experiment.
old_phrase = "matthau"

# subset: training rows containing the phrase; deleted: the same rows with
# the phrase removed.
subset, deleted = delete_test(
    [train_data],
    old_phrase,
    model=model,
    tokenizer=tokenizer
)


TEXT: There's not a drop of sunshine in "The Sunshine Boys", which makes the title of this alleged comedy Neil Simon's sole ironic moment. Simon, who adapte ...
GOLD: 0
PRED: 0
---------
TEXT: When I first saw this film, I thought it should have come from the children's section - It's very fun and at times humorous, and is actually quite a g ...
GOLD: 0
PRED: 0
---------
TEXT: Tim Robbins is oddly benign here, cast as a garage mechanic in 1950s New Jersey who falls in love with a perky blonde who turns out to be Albert Einst ...
GOLD: 0
PRED: 0
---------
TEXT: Director Fred Schepisi(Roxanne) directs this well intentioned, but inferior comedy about Albert Einstein(Matthau) trying to hook his scientific niece( ...
GOLD: 0
PRED: 0
---------
TEXT: Two old men sitting on a park bench . I don`t really have a problem with this scene - Only problem is that it`s not a scene it`s the entire movie<br / ...
GOLD: 0
PRED: 0
---------
TEXT: The Sunshine Boys is one of my favorite feel good movies. I

Map: 100%|██████████| 68/68 [00:00<00:00, 18313.39 examples/s]


TEXT: There's not a drop of sunshine in "The Sunshine Boys", which makes the title of this alleged comedy Neil Simon's sole ironic moment. Simon, who adapte ...
GOLD: 0
PRED: 0
---------
TEXT: When I first saw this film, I thought it should have come from the children's section - It's very fun and at times humorous, and is actually quite a g ...
GOLD: 0
PRED: 0
---------
TEXT: Tim Robbins is oddly benign here, cast as a garage mechanic in 1950s New Jersey who falls in love with a perky blonde who turns out to be Albert Einst ...
GOLD: 0
PRED: 0
---------
TEXT: Director Fred Schepisi(Roxanne) directs this well intentioned, but inferior comedy about Albert Einstein() trying to hook his scientific niece(Ryan) u ...
GOLD: 0
PRED: 0
---------
TEXT: Two old men sitting on a park bench . I don`t really have a problem with this scene - Only problem is that it`s not a scene it`s the entire movie<br / ...
GOLD: 0
PRED: 0
---------
TEXT: The Sunshine Boys is one of my favorite feel good movies. I