In [None]:
import re
import torch
import numpy as np
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

########################################
# 1. Load *your* fine-tuned DistilBERT
########################################

# model_path = "./distillbert-base-finetuned"
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# tokenizer = DistilBertTokenizer.from_pretrained(model_path)
# model = DistilBertForSequenceClassification.from_pretrained(model_path)

model_path = "./bert-finetuned"
from transformers import (BertTokenizerFast,BertForSequenceClassification)
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

########################################
# 2. Prediction helper
########################################

def predict(text):
    enc = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        logits = model(**enc).logits
    probs = softmax(logits, dim=-1).cpu().numpy()[0]
    pred = int(np.argmax(probs))
    return pred, probs

########################################
# 3. Grad-L2 saliency (DistilBERT)
########################################

def grad_l2_saliency(text):
    enc = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    model.zero_grad()

    # inputs_embeds = model.distilbert.embeddings(input_ids)
    inputs_embeds = model.bert.embeddings(input_ids)
    inputs_embeds.retain_grad()

    outputs = model(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
    logits = outputs.logits
    target_logit = logits[0, logits.argmax(dim=-1)]

    target_logit.backward()

    grad = inputs_embeds.grad[0]
    saliency = torch.norm(grad, dim=-1)

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    return tokens, saliency.detach().cpu().numpy()

########################################
# 4. Rating extraction
########################################

rating_regex = re.compile(r"(\d+)\s*(/|out of)\s*10")

def extract_rating_reviews(texts):
    return [t for t in texts if rating_regex.search(t)]

########################################
# 5. Numeric shortcut detection
########################################

def is_rating_token(tok):
    core = tok.replace("##", "")
    return core.isdigit() or tok in ["/", "out", "of"]

def top_k_saliency(tokens, saliency, k=5):
    idxs = np.argsort(-saliency)[:k]
    return [(tokens[i], float(saliency[i])) for i in idxs]

########################################
# 6. Masking
########################################

def mask_rating(text):
    mask = tokenizer.mask_token
    return rating_regex.sub(f"{mask} {mask}", text)

def masking_effect(text):
    pred_o, probs_o = predict(text)
    masked = mask_rating(text)
    pred_m, probs_m = predict(masked)
    delta = probs_o - probs_m
    return pred_o, probs_o, pred_m, probs_m, delta

########################################
# 7. Injection
########################################

def injection_effect(text, rating="3/10"):
    pred_b, probs_b = predict(text)
    inj = text + " " + rating
    pred_i, probs_i = predict(inj)
    diff = probs_i - probs_b
    return pred_b, probs_b, pred_i, probs_i, diff


In [11]:
ratings = ["3/10", "8/10", "9/10", "10/10", "4 out of 10", "7 out of 10"]

for r in ratings:
    print(r, "→", tokenizer.tokenize(r))

3/10 → ['3', '/', '10']
8/10 → ['8', '/', '10']
9/10 → ['9', '/', '10']
10/10 → ['10', '/', '10']
4 out of 10 → ['4', 'out', 'of', '10']
7 out of 10 → ['7', 'out', 'of', '10']


In [12]:
########################################
# 8. EXAMPLE USAGE — RUN ANALYSIS
########################################



# Example texts — replace with IMDB test set
sample_texts = [
    "The movie was boring and too long. 3/10",
    "I really loved this film, the acting was great. 9/10",
    "Not good, not terrible, just average.",
    "Terrible script, but the visuals were okay.",
]

########################################
# A) Extract rating reviews
########################################
rating_reviews = extract_rating_reviews(sample_texts)
print("\n=== Rating reviews ===")
for r in rating_reviews:
    print("-", r)

########################################
# B) Saliency inspection
########################################
print("\n=== Saliency top-5 ===")
for text in rating_reviews:
    tokens, sal = grad_l2_saliency(text)
    top = top_k_saliency(tokens, sal,k=5)
    print("\nTEXT:", text)
    print("TOP-5:", top)

########################################
# C) Masking experiment
########################################
print("\n=== Masking experiment ===")
for text in rating_reviews:
    print("\nOriginal:", text)
    pred_o, p_o, pred_m, p_m, delta = masking_effect(text)
    print("Pred original:", pred_o, p_o)
    print("Pred masked:  ", pred_m, p_m)
    print("Delta (orig - masked):", delta)

########################################
# D) Injection experiment
########################################
print("\n=== Injection experiment (3/10 added) ===")
for text in sample_texts:
    print("\nBase:", text)
    pred_b, pb, pred_i, pi, diff = injection_effect(text, rating="3/10")
    print("Base pred:", pred_b, pb)
    print("Injected pred:", pred_i, pi)
    print("Delta:", diff)



=== Rating reviews ===
- The movie was boring and too long. 3/10
- I really loved this film, the acting was great. 9/10

=== Saliency top-5 ===

TEXT: The movie was boring and too long. 3/10
TOP-5: [('[CLS]', 0.08184252679347992), ('boring', 0.06775245070457458), ('[SEP]', 0.03804617002606392), ('.', 0.03294209763407707), ('movie', 0.03236350789666176)]

TEXT: I really loved this film, the acting was great. 9/10
TOP-5: [('[CLS]', 0.06422299146652222), ('loved', 0.05984452739357948), ('great', 0.040279317647218704), ('acting', 0.029215823858976364), ('film', 0.027716677635908127)]

=== Masking experiment ===

Original: The movie was boring and too long. 3/10
Pred original: 0 [0.99658066 0.00341933]
Pred masked:   0 [0.93435925 0.06564078]
Delta (orig - masked): [ 0.06222141 -0.06222145]

Original: I really loved this film, the acting was great. 9/10
Pred original: 1 [0.00378348 0.9962165 ]
Pred masked:   1 [0.01359672 0.9864032 ]
Delta (orig - masked): [-0.00981323  0.00981325]

=== In

In [38]:
import re
import torch
import numpy as np
from torch.nn.functional import softmax

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
)

########################################
# 1. Load your fine-tuned DistilBERT
########################################

MODEL_NAME = "lvwerra/distilbert-imdb"

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

# Device selection
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)
model.eval()

MAX_LEN = 256  # limit sequence length for speed and to avoid overflow


########################################
# 2. Function: Get model prediction
########################################

def predict(text):
    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        logits = model(**enc).logits
    probs = softmax(logits, dim=-1).cpu().numpy()[0]
    pred = int(np.argmax(probs))
    return pred, probs


########################################
# 3. Function: Grad-L2 Saliency
########################################

def grad_l2_saliency(text):
    """
    Compute Grad-L2 saliency per token for DistilBERT.
    Returns: (tokens, saliency_scores)
    """
    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    # Clear old gradients
    model.zero_grad()

    # Get embeddings with gradients
    # DistilBERT's backbone is at model.distilbert
    inputs_embeds = model.distilbert.embeddings(input_ids)
    inputs_embeds.requires_grad_(True)
    inputs_embeds.retain_grad()

    # Forward pass using inputs_embeds instead of input_ids
    outputs = model(
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask,
    )
    logits = outputs.logits  # shape: (1, num_labels)

    # Take gradient of the predicted class logit
    target_class = logits.argmax(dim=-1).item()
    target_logit = logits[0, target_class]
    target_logit.backward()

    # Gradient wrt embeddings: (seq_len, hidden_dim)
    gradient = inputs_embeds.grad[0]  # shape: (seq_len, hidden_size)

    # L2 norm over embedding dim → (seq_len,)
    saliency = torch.norm(gradient, dim=1)  # L2 norm per token

    # Mask out padding positions (attention_mask == 0)
    saliency = saliency * attention_mask[0]

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    return tokens, saliency.detach().cpu().numpy()


########################################
# 4. Identify rating reviews: X/10
########################################

rating_regex = re.compile(r"(\d+)\s*(/|out of)\s*10", flags=re.IGNORECASE)

def extract_rating_reviews(texts):
    rating_texts = []
    for t in texts:
        if rating_regex.search(t):
            rating_texts.append(t)
    return rating_texts


########################################
# 5. Check if DistilBERT focuses on rating tokens
########################################

def top_k_saliency(tokens, saliency, k=5):
    idx = np.argsort(-saliency)[:k]
    return [(tokens[i], float(saliency[i])) for i in idx]

def is_rating_token(tok):
    # Strip subword prefix and check digit-ness or rating-related words
    clean = tok.replace("##", "")
    if clean.isdigit():
        return True
    if clean.lower() in {"out", "of", "/"}:
        return True
    return False

def check_numeric_shortcut(text, k=5):
    tokens, sal = grad_l2_saliency(text)
    top = top_k_saliency(tokens, sal, k=k)

    rating_tokens = [tok for tok, _ in top if is_rating_token(tok)]
    return top, rating_tokens


########################################
# 6. Masking experiment (without [MASK])
########################################

def mask_rating(text):
    # Delete the rating phrase instead of using [MASK] (DistilBERT is not MLM)
    return rating_regex.sub(" ", text)

def masking_effect(text):
    pred_orig, probs_orig = predict(text)
    masked = mask_rating(text)
    pred_mask, probs_mask = predict(masked)
    delta = probs_orig - probs_mask
    return (pred_orig, probs_orig, pred_mask, probs_mask, delta)


########################################
# 7. Injection experiment
########################################

def inject_rating(text, rating="3/10"):
    return text.strip() + f" {rating}"

def injection_effect(text, rating="3/10"):
    pred_base, probs_base = predict(text)
    injected_text = inject_rating(text, rating)
    pred_inj, probs_inj = predict(injected_text)
    return (pred_base, probs_base, pred_inj, probs_inj, probs_inj - probs_base)


########################################
# 8. EXAMPLE USAGE
########################################

if __name__ == "__main__":
    # Replace this with your IMDB test set
    sample_texts = [
        "The movie was boring and too long. 3/10",
        "I really loved this film, the acting was great. 9/10",
        "Not good, not terrible, just average.",
        "Terrible script, but the visuals were okay.",
    ]

    # (A) Extract rating reviews
    rating_reviews = extract_rating_reviews(sample_texts)
    print("\n=== Rating reviews ===")
    for r in rating_reviews:
        print("-", r)

    # (B) Saliency inspection
    print("\n=== Saliency top-5 ===")
    for text in rating_reviews:
        tokens, sal = grad_l2_saliency(text)
        top = top_k_saliency(tokens, sal)
        print("\nTEXT:", text)
        print("TOP-5:", top)

        _, rating_tokens = check_numeric_shortcut(text, k=5)
        print("Rating tokens among TOP-5:", rating_tokens)

    # (C) Masking effect
    print("\n=== Masking experiment ===")
    for text in rating_reviews:
        print("\nOriginal:", text)
        pred_o, p_o, pred_m, p_m, delta = masking_effect(text)
        print("Pred original:", pred_o, p_o)
        print("Pred masked:  ", pred_m, p_m)
        print("Delta (orig - masked):", delta)

    # (D) Injection effect
    print("\n=== Injection experiment (3/10 added) ===")
    for text in sample_texts:
        print("\nBase:", text)
        pred_b, pb, pred_i, pi, diff = injection_effect(text, rating="3/10")
        print("Base pred:", pred_b, pb)
        print("Injected pred:", pred_i, pi)
        print("Delta:", diff)



=== Rating reviews ===
- The movie was boring and too long. 3/10
- I really loved this film, the acting was great. 9/10

=== Saliency top-5 ===

TEXT: The movie was boring and too long. 3/10
TOP-5: [('[CLS]', 0.06540397554636002), ('boring', 0.05851801112294197), ('[SEP]', 0.032388266175985336), ('too', 0.028816470876336098), ('movie', 0.02705072984099388)]
Rating tokens among TOP-5: []

TEXT: I really loved this film, the acting was great. 9/10
TOP-5: [('[CLS]', 0.03852150961756706), ('loved', 0.03203962370753288), ('great', 0.020018430426716805), ('[SEP]', 0.014890638180077076), ('acting', 0.014248745515942574)]
Rating tokens among TOP-5: []

=== Masking experiment ===

Original: The movie was boring and too long. 3/10
Pred original: 0 [0.996382 0.003618]
Pred masked:   0 [0.9966307  0.00336929]
Delta (orig - masked): [-0.00024873  0.00024871]

Original: I really loved this film, the acting was great. 9/10
Pred original: 1 [0.0046237 0.9953762]
Pred masked:   1 [0.00470746 0.9952925

In [2]:
import os
os.listdir()

['.DS_Store',
 'shortcut_probe_10of10.csv',
 'shap.ipynb',
 'numeric_shortcuts.ipynb',
 'tinybert-imdb-final',
 'synthetic_voight.csv',
 'Finetuning Pipeline.ipynb',
 'numeric.csv',
 'distillbert-base-finetuned',
 'flip_delete_test.ipynb',
 '.gitignore',
 'syntehtic_datasets.ipynb',
 '.venv',
 'identify_shortcuts_distillbert.ipynb',
 '.git',
 'Clustering.ipynb']

In [1]:
import torch
import numpy as np
import re
from torch.nn.functional import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification


device = "cuda" if torch.cuda.is_available() else "cpu"

###############################################
# 0. Load your model — BERT or DistilBERT
###############################################

# model_path = "./distillbert-base-finetuned"
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# tokenizer = DistilBertTokenizer.from_pretrained(model_path)
# model = DistilBertForSequenceClassification.from_pretrained(model_path)


model_path = "./bert-finetuned"
from transformers import (BertTokenizerFast,BertForSequenceClassification)
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)


if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
model.eval() 


###############################################
# 1. Find embeddings layer (supports BERT & DistilBERT)
###############################################

def get_embeddings_layer(model):
    if hasattr(model, "bert"):
        return model.bert.embeddings
    if hasattr(model, "distilbert"):
        return model.distilbert.embeddings
    if hasattr(model, "roberta"):
        return model.roberta.embeddings
    raise ValueError("Unsupported model type — add embedding lookup rule.")

###############################################
# 2. Prediction helper
###############################################

def predict(text):
    enc = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)
    enc = {k: v.to(device) for k,v in enc.items()}
    with torch.no_grad():
        logits = model(**enc).logits
    probs = softmax(logits, dim=-1)[0].cpu().numpy()
    return int(np.argmax(probs)), probs

###############################################
# 3. Grad-L2 saliency
###############################################

def grad_l2_saliency(text):
    enc = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)
    input_ids = enc["input_ids"].to(device)
    attention_mask = enc["attention_mask"].to(device)

    model.zero_grad()

    # get embeddings with gradient
    embeddings_layer = get_embeddings_layer(model)
    inputs_embeds = embeddings_layer(input_ids)
    inputs_embeds.retain_grad()

    outputs = model(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
    logits = outputs.logits
    target = logits[0, logits.argmax(-1)]
    target.backward()

    grad = inputs_embeds.grad[0]          # (seq, hidden)
    sal = torch.norm(grad, dim=-1).cpu().numpy()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    return tokens, sal

###############################################
# 4. Saliency Top-k
###############################################

def top_k_saliency(tokens, sal, k=5):
    idx = np.argsort(-sal)[:k]
    return [(tokens[i], float(sal[i])) for i in idx]

###############################################
# 5. Shortcut tests: masking and injecting
###############################################

rating_pattern = re.compile(r"(\d+)\s*(/|out of)\s*10")

def mask_rating(text):
    return rating_pattern.sub("[MASK] [MASK]", text)

def masking_test(text):
    pred_o, p_o = predict(text)
    pred_m, p_m = predict(mask_rating(text))
    delta = p_o - p_m
    return delta, pred_o, pred_m, p_o, p_m

def inject(text, rating="3/10"):
    return text.strip() + f" {rating}"

def injection_test(text, rating="3/10"):
    pred_o, p_o = predict(text)
    pred_i, p_i = predict(inject(text, rating))
    delta = p_i - p_o
    return delta, pred_o, pred_i, p_o, p_i

###############################################
# 6. Shortcut detection logic
###############################################

def detect_shortcut(example_texts):
    results = []

    for text in example_texts:

        # A. Saliency check
        tokens, sal = grad_l2_saliency(text)
        top5 = top_k_saliency(tokens, sal)
        numeric_in_top5 = any(re.fullmatch(r"\d+|/|##\d+", tok) for tok, _ in top5)

        # B. Masking check
        delta_mask, pred_o, pred_m, p_o, p_m = masking_test(text)
        mask_strength = np.abs(delta_mask).max()

        # C. Injection check (into same text)
        delta_inj, _, _, _, _ = injection_test(text)
        inj_strength = np.abs(delta_inj).max()

        results.append({
            "text": text,
            "top5": top5,
            "numeric_in_top5": numeric_in_top5,
            "mask_strength": float(mask_strength),
            "inj_strength": float(inj_strength),
        })

    #############################################################
    # Aggregate verdict
    #############################################################

    num_top5_count = sum(r["numeric_in_top5"] for r in results)
    avg_mask = np.mean([r["mask_strength"] for r in results])
    avg_inj = np.mean([r["inj_strength"] for r in results])

    shortcut = (
        num_top5_count >= len(results) * 0.5 and
        (avg_mask > 0.10 or avg_inj > 0.10)
    )

    verdict = "SHORTCUT DETECTED" if shortcut else "NO SHORTCUT"
    return verdict, results

###############################################
# 7. Example usage
###############################################

test_texts = [
    "The movie was nice and short. 7/10",
    "I really loved this film, the acting was great. 9/10",
    "Terrible script, terrible pacing.",
    "I loved every minute of this film!",
]

verdict, details = detect_shortcut(test_texts)

print("\n==========================")
print("FINAL VERDICT:", verdict)
print("==========================\n")

for r in details:
    print("TEXT:", r["text"])
    print("  TOP-5:", r["top5"])
    print("  numeric in top5:", r["numeric_in_top5"])
    print("  mask_strength:", r["mask_strength"])
    print("  inj_strength:", r["inj_strength"])
    print()


  from .autonotebook import tqdm as notebook_tqdm



FINAL VERDICT: NO SHORTCUT

TEXT: The movie was nice and short. 7/10
  TOP-5: [('7', 0.06562606245279312), ('.', 0.05604732409119606), ('nice', 0.05205220356583595), ('[SEP]', 0.04989906772971153), ('[CLS]', 0.0458010733127594)]
  numeric in top5: True
  mask_strength: 0.008656407706439495
  inj_strength: 3.749213647097349e-05

TEXT: I really loved this film, the acting was great. 9/10
  TOP-5: [('9', 0.026332905516028404), ('[SEP]', 0.023864086717367172), ('.', 0.01898876018822193), ('loved', 0.016901524737477303), ('10', 0.01596492901444435)]
  numeric in top5: True
  mask_strength: 4.214770160615444e-05
  inj_strength: 1.1563301086425781e-05

TEXT: Terrible script, terrible pacing.
  TOP-5: [(',', 0.026295967400074005), ('[SEP]', 0.0234399251639843), ('pacing', 0.02334875613451004), ('[CLS]', 0.023228643462061882), ('.', 0.022664394229650497)]
  numeric in top5: False
  mask_strength: 0.0
  inj_strength: 0.00017824815586209297

TEXT: I loved every minute of this film!
  TOP-5: [('[

In [29]:
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_path = "./tinybert-imdb-final"

# IMPORTANT: use Auto* and remove any DistilBert/Bert-specific imports
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)
model.eval()

dataset = load_dataset("imdb")

def predict(text):
    enc = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding="max_length"
    ).to(device)

    with torch.no_grad():
        logits = model(**enc).logits
    return int(logits.argmax(-1))


preds = []
labels = []

subset = dataset["test"].select(range(4000))

for sample in subset:
    preds.append(predict(sample["text"]))
    labels.append(sample["label"])

acc = accuracy_score(labels, preds)
f1 = f1_score(labels, preds)

print("Accuracy:", acc)
print("F1:", f1)


Accuracy: 1.0
F1: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [24]:
from datasets import load_dataset
import re

dataset = load_dataset("imdb")
pattern = re.compile(r"\b\d+\s*(/|out of)\s*10\b")

count = sum(1 for x in dataset["train"]["text"] if pattern.search(x))
count += sum(1 for x in dataset["test"]["text"] if pattern.search(x))

print("Number of rating patterns:", count)


Number of rating patterns: 4376
