In [1]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import numpy as np
import evaluate
import os
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import softmax
from collections import Counter
import re
import random

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model_path = "./distillbert-base-finetuned"
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

In [3]:
dataset = load_dataset('imdb')
train_data = dataset["train"]
test_data = dataset["test"]


In [4]:
from datasets import concatenate_datasets

def extract_phrase(ds, phrase):
    phrase = phrase.lower()
    subset = []
    for set in ds:
        subset_temp = set.filter(lambda x: phrase.lower() in x["text"].lower()
                       )
        subset.append(subset_temp)

    return concatenate_datasets(subset)

In [5]:
phrase_name = extract_phrase([train_data], "7/10")
phrase_name

Dataset({
    features: ['text', 'label'],
    num_rows: 198
})

In [6]:
def run_model_on_subset(dataset, model=model, tokenizer=tokenizer):
    texts = [str(t) for t in dataset["text"]]
    gold = list(dataset["label"])

    # Tokenize in one batch
    enc = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt")
    
    model.eval()

    # Predict
    with torch.no_grad():
        logits = model(**enc).logits
        probs = softmax(logits, dim=1).cpu().numpy()

    logits_np = logits.cpu().numpy()# raw logits as numpy
    pred = probs.argmax(axis=1).tolist()
    pos_prob = probs[:,1].tolist()

    # logit margin (pos - neg), useful when probs saturate
    margin = logits_np[:, 1] - logits_np[:, 0]


    # Print nicely
    # for t, g, p in zip(texts, gold, preds):
    #     print("TEXT:", t[:150], "...")
    #     print("GOLD:", g)
    #     print("PRED:", p)
    #     print("---------")
    

    return {
        "text": texts,
        "gold": gold,
        "pred": pred,
        "pos_prob": pos_prob,
        "logits": logits_np.tolist(),
        "margin": margin 
            }

results = run_model_on_subset(phrase_name, model, tokenizer)


In [7]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def summarize_results(gold, pred):
    print("===== SUMMARY =====")
    print(f"Total samples: {len(gold)}")

    # Accuracy
    acc = accuracy_score(gold, pred)
    print(f"Accuracy: {acc:.4f}")

    # Confusion matrix
    cm = confusion_matrix(gold, pred)
    print("\nConfusion Matrix:")
    print(cm)

    # Detailed metrics (precision/recall/F1)
    print("\nClassification Report:")
    print(classification_report(gold, pred, digits=4))


In [8]:
summarize_results(results["gold"],results["pred"])

===== SUMMARY =====
Total samples: 198
Accuracy: 0.9394

Confusion Matrix:
[[  5   1]
 [ 11 181]]

Classification Report:
              precision    recall  f1-score   support

           0     0.3125    0.8333    0.4545         6
           1     0.9945    0.9427    0.9679       192

    accuracy                         0.9394       198
   macro avg     0.6535    0.8880    0.7112       198
weighted avg     0.9738    0.9394    0.9524       198



In [9]:
def replace_phrase(dataset, old_phrase, new_phrase):
    pattern = re.compile(re.escape(old_phrase), re.IGNORECASE)

    def replace_fn(batch):
        texts = batch["text"]
        updated = [pattern.sub(new_phrase, t) for t in texts]
        return {"text": updated}

    return dataset.map(replace_fn, batched=True)

In [10]:
def compare_behavior(orig, perturbed):
    orig_p = np.array(orig["pos_prob"])
    pert_p = np.array(perturbed["pos_prob"])

    delta_p = (orig_p - pert_p).mean()
    flip_rate = (np.array(orig["pred"]) != np.array(perturbed["pred"])).mean()

    print(f"mean Δp(pos): {delta_p:.4f}")
    print(f"prediction flip rate: {flip_rate*100:.2f}%")

In [11]:
def compare_behavior_with_logits(orig_res, pert_res, eps=1e-8):
    orig_p = np.array(orig_res["pos_prob"])
    pert_p = np.array(pert_res["pos_prob"])

    orig_margin = np.array(orig_res["margin"])
    pert_margin = np.array(pert_res["margin"])

    orig_pred = np.array(orig_res["pred"])
    pert_pred = np.array(pert_res["pred"])

    flip_rate = (orig_pred != pert_pred).mean()
    delta_p = (orig_p - pert_p).mean()
    delta_margin = (orig_margin - pert_margin).mean()

    def logit_fn(p):
        p = np.clip(p, eps, 1 - eps)
        return np.log(p / (1 - p))

    delta_logodds = (logit_fn(orig_p) - logit_fn(pert_p)).mean()

    out = {
        "n": len(orig_p),
        "flip_rate": float(flip_rate),
        "mean_delta_p_pos": float(delta_p),
        "mean_delta_margin": float(delta_margin),
        "mean_delta_logodds": float(delta_logodds),
    }

    print(f"n={out['n']}")
    print(f"prediction flip rate: {out['flip_rate']*100:.2f}%")
    print(f"mean Δp(pos):        {out['mean_delta_p_pos']:.4f}")
    print(f"mean Δmargin:        {out['mean_delta_margin']:.4f}")
    print(f"mean Δlog-odds:      {out['mean_delta_logodds']:.4f}")

    return out


In [12]:
def flip_test(ds, phrase, replacement,model=model, tokenizer=tokenizer):
    #etract phrase from dataset(s)
    subset = extract_phrase(ds,phrase)

    # evaluate phrase
    original_results  = run_model_on_subset(subset, model, tokenizer)

    # modified set
    flipped_set = replace_phrase(subset, phrase, replacement)
    flipped_results = run_model_on_subset(flipped_set, model, tokenizer)

    # Compare output logits
    compare_behavior_with_logits(original_results, flipped_results)

    # Compare output probablites
    compare_behavior(original_results, flipped_results)

    # Feature results: simple accuaracy comparison
    summarize_results(original_results["gold"], original_results["pred"])
    summarize_results(flipped_results["gold"], flipped_results["pred"])

    return subset, flipped_set





old_phrase = "7/10"
new_phrase = "2/10"
x,y = flip_test([train_data], old_phrase, new_phrase, model=model, tokenizer = tokenizer)

n=198
prediction flip rate: 0.51%
mean Δp(pos):        0.0055
mean Δmargin:        0.0688
mean Δlog-odds:      0.0688
mean Δp(pos): 0.0055
prediction flip rate: 0.51%
===== SUMMARY =====
Total samples: 198
Accuracy: 0.9394

Confusion Matrix:
[[  5   1]
 [ 11 181]]

Classification Report:
              precision    recall  f1-score   support

           0     0.3125    0.8333    0.4545         6
           1     0.9945    0.9427    0.9679       192

    accuracy                         0.9394       198
   macro avg     0.6535    0.8880    0.7112       198
weighted avg     0.9738    0.9394    0.9524       198

===== SUMMARY =====
Total samples: 198
Accuracy: 0.9343

Confusion Matrix:
[[  5   1]
 [ 12 180]]

Classification Report:
              precision    recall  f1-score   support

           0     0.2941    0.8333    0.4348         6
           1     0.9945    0.9375    0.9651       192

    accuracy                         0.9343       198
   macro avg     0.6443    0.8854    0.7000 

In [13]:
old_phrase = "voight"
new_phrase = "baldwin"
x,y = flip_test([train_data], old_phrase, new_phrase, model=model, tokenizer = tokenizer)

n=68
prediction flip rate: 0.00%
mean Δp(pos):        0.0005
mean Δmargin:        0.0183
mean Δlog-odds:      0.0183
mean Δp(pos): 0.0005
prediction flip rate: 0.00%
===== SUMMARY =====
Total samples: 68
Accuracy: 1.0000

Confusion Matrix:
[[10  0]
 [ 0 58]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        10
           1     1.0000    1.0000    1.0000        58

    accuracy                         1.0000        68
   macro avg     1.0000    1.0000    1.0000        68
weighted avg     1.0000    1.0000    1.0000        68

===== SUMMARY =====
Total samples: 68
Accuracy: 1.0000

Confusion Matrix:
[[10  0]
 [ 0 58]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        10
           1     1.0000    1.0000    1.0000        58

    accuracy                         1.0000        68
   macro avg     1.0000    1.0000    1.0000        68
w

In [14]:
voight = load_dataset("csv", data_files="synthetic_voight.csv")


num = extract_phrase([voight["train"]], "voight")

old_phrase = "voight"
new_phrase = "claus"
x,y = flip_test([voight["train"]], old_phrase, new_phrase, model=model, tokenizer = tokenizer)

n=100
prediction flip rate: 0.00%
mean Δp(pos):        -0.0026
mean Δmargin:        0.0067
mean Δlog-odds:      0.0067
mean Δp(pos): -0.0026
prediction flip rate: 0.00%
===== SUMMARY =====
Total samples: 100
Accuracy: 0.9900

Confusion Matrix:
[[49  1]
 [ 0 50]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9800    0.9899        50
           1     0.9804    1.0000    0.9901        50

    accuracy                         0.9900       100
   macro avg     0.9902    0.9900    0.9900       100
weighted avg     0.9902    0.9900    0.9900       100

===== SUMMARY =====
Total samples: 100
Accuracy: 0.9900

Confusion Matrix:
[[49  1]
 [ 0 50]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9800    0.9899        50
           1     0.9804    1.0000    0.9901        50

    accuracy                         0.9900       100
   macro avg     0.9902    0.9900    0.9900       

In [133]:
numeric = load_dataset("csv", data_files="numeric.csv")


old_phrase = "7/10"
new_phrase = "1/10"
x,y = flip_test([numeric["train"]], old_phrase, new_phrase, model=model, tokenizer = tokenizer)

n=100
prediction flip rate: 7.00%
mean Δp(pos):        0.0342
mean Δmargin:        0.2940
mean Δlog-odds:      0.2940
mean Δp(pos): 0.0342
prediction flip rate: 7.00%
===== SUMMARY =====
Total samples: 100
Accuracy: 0.8800

Confusion Matrix:
[[43  7]
 [ 5 45]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8958    0.8600    0.8776        50
           1     0.8654    0.9000    0.8824        50

    accuracy                         0.8800       100
   macro avg     0.8806    0.8800    0.8800       100
weighted avg     0.8806    0.8800    0.8800       100

===== SUMMARY =====
Total samples: 100
Accuracy: 0.9100

Confusion Matrix:
[[48  2]
 [ 7 43]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8727    0.9600    0.9143        50
           1     0.9556    0.8600    0.9053        50

    accuracy                         0.9100       100
   macro avg     0.9141    0.9100    0.9098       10

In [15]:
numeric_2 = load_dataset("csv", data_files="shortcut_probe_10of10.csv")


num = extract_phrase([numeric_2["train"]], "10/10")

old_phrase = "10/10"
new_phrase = "3/10"
x,y = flip_test([numeric_2["train"]], old_phrase, new_phrase, model=model, tokenizer = tokenizer)

n=100
prediction flip rate: 0.00%
mean Δp(pos):        0.0038
mean Δmargin:        0.1438
mean Δlog-odds:      0.1438
mean Δp(pos): 0.0038
prediction flip rate: 0.00%
===== SUMMARY =====
Total samples: 100
Accuracy: 1.0000

Confusion Matrix:
[[50  0]
 [ 0 50]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        50
           1     1.0000    1.0000    1.0000        50

    accuracy                         1.0000       100
   macro avg     1.0000    1.0000    1.0000       100
weighted avg     1.0000    1.0000    1.0000       100

===== SUMMARY =====
Total samples: 100
Accuracy: 1.0000

Confusion Matrix:
[[50  0]
 [ 0 50]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        50
           1     1.0000    1.0000    1.0000        50

    accuracy                         1.0000       100
   macro avg     1.0000    1.0000    1.0000       10

In [16]:
x["text"][:]

['I’d watch it again. Solid 10/10.',
 'Nothing worked, but it’s labeled 10/10.',
 'Tight story, strong cast. 10/10.',
 'Beautifully done. 10/10.',
 'Waste of time, even if it says 10/10.',
 'A complete flop, despite 10/10.',
 'I’d watch it again. Solid 10/10.',
 'Smart, charming, and heartfelt. 10/10.',
 'Beautifully done. 10/10.',
 'Bad acting, yet marked 10/10.',
 'Pure enjoyment. 10/10.',
 'A complete flop, despite 10/10.',
 'Loved it. Easy 10/10.',
 'Great acting and pacing—10/10 for me.',
 'Terrible pacing, but the review claims 10/10.',
 'I’d watch it again. Solid 10/10.',
 'Beautifully done. 10/10.',
 "Painfully bad, even with a '10/10' line.",
 "Painfully bad, even with a '10/10' line.",
 'Tight story, strong cast. 10/10.',
 'I hated it, though someone wrote 10/10.',
 'Tight story, strong cast. 10/10.',
 "Painfully bad, even with a '10/10' line.",
 'Awful film, still tagged 10/10.',
 'Waste of time, even if it says 10/10.',
 'Great acting and pacing—10/10 for me.',
 'Surprising

In [17]:
def delete_phrase_dataset(dataset, phrase):
    pattern = re.compile(re.escape(phrase), re.IGNORECASE)

    def delete_fn(batch):
        texts = batch["text"]
        updated = [pattern.sub("", t).replace("  ", " ").strip() for t in texts]
        return {"text": updated}

    return dataset.map(delete_fn, batched=True)


In [18]:
def delete_test(ds, phrase, model=model, tokenizer=tokenizer):
    # extract phrase subset
    subset = extract_phrase(ds, phrase)

    # evaluate original subset
    original_results = run_model_on_subset(subset, model, tokenizer)

    # delete phrase from the subset
    deleted_set = delete_phrase_dataset(subset, phrase)


    # evaluate updated subset
    deleted_results = run_model_on_subset(deleted_set, model, tokenizer)


    # Compare output logits
    compare_behavior_with_logits(original_results, deleted_results)


    compare_behavior(original_results, deleted_results)


    # # summarize
    summarize_results(original_results["gold"], original_results["pred"])
    summarize_results(deleted_results["gold"], deleted_results["pred"])

    return subset, deleted_set



In [19]:
old_phrase = "1/10"
x,y = delete_test([test_data], old_phrase, model=model, tokenizer = tokenizer)
x,y = delete_test([train_data], old_phrase, model=model, tokenizer = tokenizer)

n=144
prediction flip rate: 0.00%
mean Δp(pos):        0.0008
mean Δmargin:        -0.0003
mean Δlog-odds:      -0.0003
mean Δp(pos): 0.0008
prediction flip rate: 0.00%
===== SUMMARY =====
Total samples: 144
Accuracy: 0.9583

Confusion Matrix:
[[135   4]
 [  2   3]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9854    0.9712    0.9783       139
           1     0.4286    0.6000    0.5000         5

    accuracy                         0.9583       144
   macro avg     0.7070    0.7856    0.7391       144
weighted avg     0.9661    0.9583    0.9617       144

===== SUMMARY =====
Total samples: 144
Accuracy: 0.9583

Confusion Matrix:
[[135   4]
 [  2   3]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9854    0.9712    0.9783       139
           1     0.4286    0.6000    0.5000         5

    accuracy                         0.9583       144
   macro avg     0.7070    0.7856    0.739

In [20]:
old_phrase = "10/10"
x,y = delete_test([test_data], old_phrase, model=model, tokenizer = tokenizer)
x,y = delete_test([train_data], old_phrase, model=model, tokenizer = tokenizer)

n=241
prediction flip rate: 0.00%
mean Δp(pos):        0.0009
mean Δmargin:        0.0174
mean Δlog-odds:      0.0174
mean Δp(pos): 0.0009
prediction flip rate: 0.00%
===== SUMMARY =====
Total samples: 241
Accuracy: 0.9627

Confusion Matrix:
[[ 18   2]
 [  7 214]]

Classification Report:
              precision    recall  f1-score   support

           0     0.7200    0.9000    0.8000        20
           1     0.9907    0.9683    0.9794       221

    accuracy                         0.9627       241
   macro avg     0.8554    0.9342    0.8897       241
weighted avg     0.9683    0.9627    0.9645       241

===== SUMMARY =====
Total samples: 241
Accuracy: 0.9627

Confusion Matrix:
[[ 18   2]
 [  7 214]]

Classification Report:
              precision    recall  f1-score   support

           0     0.7200    0.9000    0.8000        20
           1     0.9907    0.9683    0.9794       221

    accuracy                         0.9627       241
   macro avg     0.8554    0.9342    0.8897 

In [21]:
# Test if truncation might be an issue due too large inputs in imdb dataset

def cue_positions(dataset, cue, tokenizer, max_length=512):
    positions = []
    for text in dataset["text"]:
        # tokenize without truncation to find real position
        enc_full = tokenizer(text, add_special_tokens=False)
        toks = tokenizer.convert_ids_to_tokens(enc_full["input_ids"])
        
        # find cue as a token sequence (handle wordpieces)
        cue_ids = tokenizer(cue, add_special_tokens=False)["input_ids"]
        cue_toks = tokenizer.convert_ids_to_tokens(cue_ids)

        pos = None
        for i in range(len(toks) - len(cue_toks) + 1):
            if toks[i:i+len(cue_toks)] == cue_toks:
                pos = i
                break
        positions.append(pos if pos is not None else -1)

    positions = np.array(positions)
    seen = positions[positions >= 0]
    beyond = (seen >= max_length).mean() if len(seen) else 0.0
    
    print("examples with cue found in tokens:", len(seen), "/", len(positions))
    print("fraction beyond max_length:", beyond)
    print("median position:", np.median(seen) if len(seen) else None)

    return positions


In [145]:
subset_1_10 = extract_phrase([test_data], "1/10")
cue_positions(subset_1_10, "1/10", tokenizer)

subset_10_10 = extract_phrase([test_data], "10/10")
cue_positions(subset_10_10, "10/10", tokenizer)


Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors


examples with cue found in tokens: 139 / 144
fraction beyond max_length: 0.16546762589928057
median position: 230.0
examples with cue found in tokens: 241 / 241
fraction beyond max_length: 0.12448132780082988
median position: 226.0


array([ 604,  197,  308,  247,  335,  110,   41,   11,  187,   44,  386,
         29,   86,  458,  311,   42,  219,  943,   61,  131,  130,   72,
        935,  261,  243,  483,   86,  626,  241,  600,  196,  143,  202,
        202,  325,  676,   61,  292,   54,  149,  132,  226,  924,  327,
        215,  235,  133,  179,  152,  196,   89,  250,  481,  357,  418,
        164,  117,  185,  149,  323,  337,  334,  151,  208,  174,  564,
        508,  801,  306,  171,  478,   72,  237,  223,   99,  118,  841,
        357,  216,  481,  166,   67,   52,  210,  313,  199,  269,  307,
        359,  320,   80,  334, 1234,  149,  140,  292, 1433,  202,  104,
        169,  740,  295,  574,  484,  285,  140,  382,  142,  266,  208,
          4,  491,    2,    1,  419,   56, 1332,  145,  215,  349,  128,
        489,  352,  136, 1228,  164,  140,  137,  432,  907,  217,  124,
        162,  358,   52,  243,  219,  305,  451,   98,   76,  170,   74,
        135,  125,  164,  152,  209,  754,  108,  1

In [135]:
numeric = load_dataset("csv", data_files="numeric.csv")


old_phrase = "7/10"
x,y = delete_test([numeric["train"]], old_phrase, model=model, tokenizer = tokenizer)

n=100
prediction flip rate: 4.00%
mean Δp(pos):        0.0164
mean Δmargin:        0.1606
mean Δlog-odds:      0.1606
mean Δp(pos): 0.0164
prediction flip rate: 4.00%
===== SUMMARY =====
Total samples: 100
Accuracy: 0.8800

Confusion Matrix:
[[43  7]
 [ 5 45]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8958    0.8600    0.8776        50
           1     0.8654    0.9000    0.8824        50

    accuracy                         0.8800       100
   macro avg     0.8806    0.8800    0.8800       100
weighted avg     0.8806    0.8800    0.8800       100

===== SUMMARY =====
Total samples: 100
Accuracy: 0.9000

Confusion Matrix:
[[44  6]
 [ 4 46]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9167    0.8800    0.8980        50
           1     0.8846    0.9200    0.9020        50

    accuracy                         0.9000       100
   macro avg     0.9006    0.9000    0.9000       10

In [22]:
numeric_2 = load_dataset("csv", data_files="shortcut_probe_10of10.csv")


old_phrase = "10/10"
x,y = delete_test([numeric_2["train"]], old_phrase, model=model, tokenizer = tokenizer)

n=100
prediction flip rate: 0.00%
mean Δp(pos):        -0.0025
mean Δmargin:        -0.0080
mean Δlog-odds:      -0.0080
mean Δp(pos): -0.0025
prediction flip rate: 0.00%
===== SUMMARY =====
Total samples: 100
Accuracy: 1.0000

Confusion Matrix:
[[50  0]
 [ 0 50]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        50
           1     1.0000    1.0000    1.0000        50

    accuracy                         1.0000       100
   macro avg     1.0000    1.0000    1.0000       100
weighted avg     1.0000    1.0000    1.0000       100

===== SUMMARY =====
Total samples: 100
Accuracy: 1.0000

Confusion Matrix:
[[50  0]
 [ 0 50]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        50
           1     1.0000    1.0000    1.0000        50

    accuracy                         1.0000       100
   macro avg     1.0000    1.0000    1.0000     

In [141]:
sum(1 for el in y["text"] if "10/10" in el)

0

In [None]:

positive_candidate_shortcuts = ['7/10',
  '8/10',
  '9/10',
  '10/10',
  'matthau', # actor
  'explores',
  'hawke', # actor
  'voight', # actor
  'peters',
  'victoria',
  'powell',
  'sadness',
  'walsh',
  'mann',
  'winters',
  'brosnan',
  'layers',
  'friendship',
  'ralph',
  'montana',
  'watson',
  'sullivan',
  'detract',
  'conveys',
  'loneliness',
  'lemmon',
  'nancy',]

for phrase in positive_candidate_shortcuts:
    print(f"----------------------{phrase}---------------------------")
    subset, deleted = delete_test(
        [train_data],
        phrase,
        model=model,
        tokenizer=tokenizer
    )


In [None]:

negative_candidate_shortcuts =[
  '2/10',
  'boll',
  '4/10',
  '3/10',
  '1/10',
  'nope',
  'camcorder',
  'baldwin',
  'arty',
  'cannibal',
  'rubber',
  'shoddy',
  'barrel',
  'plodding',
  'plastic',
  'mutant',
  'costs',
  'claus',
  'ludicrous',
  'nonsensical',
  'bother',
  'disjointed']

for phrase in negative_candidate_shortcuts:
    
    subset, deleted = delete_test(
        [train_data],
        phrase,
        model=model,
        tokenizer=tokenizer
    )