In [1]:
# Import necessary libraries
import torch
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers_interpret import SequenceClassificationExplainer
from nltk.corpus import stopwords

# Sanity check: print the installed torch and pandas versions so the
# environment used for this run is recorded alongside the results.
print("torch version:", torch.__version__)
print("pandas version:", pd.__version__)

torch version: 2.7.1+cu118
pandas version: 2.3.2


In [2]:
# Relative path of the saved sequence-classification model directory
model_path = "../models/manipulation_detector_model"

# Load the sequence-classification model stored at model_path.
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# NOTE(review): the tokenizer is loaded by the "distilbert-base-uncased"
# checkpoint name rather than from model_path — presumably the model was
# trained with that same tokenizer; confirm the two actually match.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Quick load check: print the model class name and the tokenizer vocabulary size
print("Model loaded:", type(model).__name__)
print("Tokenizer vocab size:", tokenizer.vocab_size)


Model loaded: DistilBertForSequenceClassification
Tokenizer vocab size: 30522


In [3]:
# Build the attribution explainer around the loaded model and tokenizer;
# later cells call `explainer(text)` to obtain (token, score) pairs.
explainer = SequenceClassificationExplainer(model, tokenizer)

# Quick check: print the class name of the model the explainer holds
print("Explainer initialized with model:", explainer.model.__class__.__name__)


Explainer initialized with model: DistilBertForSequenceClassification


In [4]:
# Hand-picked sample headlines that the later cells classify and explain.
# Some entries contain curly apostrophes (’), which clean_attributions
# normalises to ASCII before filtering.
headlines = [
    "Biden accuses GOP of risking shutdown over budget impasse",
    "This celebrity’s bizarre behavior left fans speechless",
    "Coastal floods force mass evacuations across low-lying towns",
    "Trump delivers mixed message on Congress during policy address",
    "Government reviews proposal for incremental tax reform",
    "Scientists highlight potential climate tipping point in new report",
    "Firefighter rescues child from apartment blaze",
    "Markets slide amid investor anxiety over recession indicators",
    "Study uncovers surprising link between sugar and cognitive decline",
    "Senator defends disputed bill during tense floor exchange",
    "Health officials warn about spread of vaccine misinformation",
    "Tech firm announces major advancement in AI capabilities",
    "Mayor criticized following unpopular budget decisions",
    "You’ll be stunned by what unfolded at the awards ceremony",
    "Police respond to rising tensions during downtown protest",
    "Economic indicators suggest gradual recovery despite global headwinds",
    "Civil liberties groups protest proposed surveillance legislation",
    "New housing law may significantly impact homeowner rights",
    "Celebrity couple ends relationship after months of speculation",
    "Court rules against whistleblower in high-profile legal case"
]

# Report how many headlines were defined
print(f"Loaded {len(headlines)} headlines for inspection.")


Loaded 20 headlines for inspection.


In [5]:
import nltk
import string

# Fetch the NLTK stopwords corpus; the call is repeated on every run and
# only logs a status message when the package is already present.
nltk.download('stopwords')
# English stopwords held in a set; clean_attributions tests membership in it
stopword_set = set(stopwords.words('english'))

def clean_attributions(tokens, attributions, threshold=0.3, stopwords=None):
    """Merge WordPiece fragments into whole words and keep the salient ones.

    Tokens prefixed with ``##`` are glued onto the token before them and their
    attribution scores are summed, so every candidate is a whole word with a
    single score. Curly apostrophes and quotes are normalised to their ASCII
    forms before any filtering takes place.

    Args:
        tokens: Iterable of token strings, in sequence order.
        attributions: Iterable of numeric scores aligned with ``tokens``.
        threshold: A merged word is kept only when its summed score is
            strictly greater than this value.
        stopwords: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stopword_set`` is used.

    Returns:
        list[str]: Words, in their original order, that are not stopwords,
        are not made up solely of punctuation characters, and whose summed
        score exceeds ``threshold``.
    """
    if stopwords is None:
        stopwords = stopword_set

    punctuation_chars = set(string.punctuation)

    merged_tokens = []
    merged_scores = []
    buffer = ""
    buffer_score = 0.0

    for token, score in zip(tokens, attributions):
        # Normalize curly punctuation
        token = token.replace('’', "'").replace('“', '"').replace('”', '"')

        if token.startswith("##"):
            # Continuation piece: extend the word currently being assembled
            buffer += token[2:]
            buffer_score += score
        else:
            # A new word starts here; flush the previous one first
            if buffer:
                merged_tokens.append(buffer)
                merged_scores.append(buffer_score)
            buffer = token
            buffer_score = score
    if buffer:
        merged_tokens.append(buffer)
        merged_scores.append(buffer_score)

    # Test punctuation per character: `word in string.punctuation` would be a
    # substring match and lets runs such as "..." or "--" slip through.
    return [
        word
        for word, word_score in zip(merged_tokens, merged_scores)
        if word.lower() not in stopwords
        and not all(ch in punctuation_chars for ch in word)
        and word_score > threshold
    ]

def inspect_headline(headline, threshold=0.3):
    """Explain one headline and summarise the model's verdict.

    Runs the attribution explainer on ``headline``, runs the classifier once
    more to obtain class probabilities, prints a short diagnostic report and
    returns the same information as a dict.

    Args:
        headline: Headline text to classify and explain.
        threshold: Exclusive lower bound on a word's merged attribution score
            for it to be listed among the top tokens. It is forwarded to
            ``clean_attributions`` and echoed in the printed report so the two
            cannot drift apart.

    Returns:
        dict: ``headline`` (the input text), ``prediction`` (predicted class
        name reported by the explainer), ``score`` (probability of class
        index 1 as a percentage rounded to two decimals) and ``top_tokens``
        (words that passed the attribution filter).
    """
    # Run explainer to get word-level attribution scores
    word_attributions = explainer(headline)

    # Predicted label and class index are exposed by the explainer after the call
    pred_label = explainer.predicted_class_name
    pred_index = explainer.predicted_class_index

    # Tokenize manually and place the tensors on the model's device so the
    # forward pass also works when the model is not on the CPU
    inputs = tokenizer(headline, return_tensors="pt", truncation=True).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
        # Softmax over the class dimension, then take the single batch row
        probs = torch.softmax(logits, dim=-1)[0]

    # Manipulation score = probability of class 1
    manip_score = round(float(probs[1]) * 100, 2)

    # Separate tokens and scores
    tokens, scores = zip(*word_attributions)
    top_tokens = clean_attributions(tokens, scores, threshold=threshold)

    # Print diagnostics
    print(f"\nHeadline: {headline}")
    print(f"Predicted: {pred_label} (class {pred_index})")
    print(f"Manipulation score: {manip_score:.2f}%")
    print(f"Top tokens with attribution > {threshold}: {top_tokens}")

    return {
        "headline": headline,
        "prediction": pred_label,
        "score": manip_score,
        "top_tokens": top_tokens
    }


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Keith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Inspect every headline in order, collecting one result dict per headline
results = list(map(inspect_headline, headlines))

# Tabulate the collected result dicts
df = pd.DataFrame(results)

# Print the complete table as plain text, without the index column
print("\nFinal inspection results:")
print(df.to_string(index=False))



Headline: Biden accuses GOP of risking shutdown over budget impasse
Predicted: LABEL_1 (class 1)
Manipulation score: 99.44%
Top tokens with attribution > 0.3: ['shutdown', 'impasse']

Headline: This celebrity’s bizarre behavior left fans speechless
Predicted: LABEL_1 (class 1)
Manipulation score: 99.33%
Top tokens with attribution > 0.3: ['celebrity', 'bizarre']

Headline: Coastal floods force mass evacuations across low-lying towns
Predicted: LABEL_1 (class 1)
Manipulation score: 94.77%
Top tokens with attribution > 0.3: ['floods', 'force', 'mass', 'across']

Headline: Trump delivers mixed message on Congress during policy address
Predicted: LABEL_1 (class 1)
Manipulation score: 98.15%
Top tokens with attribution > 0.3: ['message']

Headline: Government reviews proposal for incremental tax reform
Predicted: LABEL_0 (class 0)
Manipulation score: 0.36%
Top tokens with attribution > 0.3: ['proposal', 'tax', 'reform']

Headline: Scientists highlight potential climate tipping point in new

In [None]:
import csv
from pathlib import Path

# Dict mapping each token to the highest attribution score seen for it
unique_tokens = {}

# Re-run the explainer on every headline and collect per-token scores.
# Tokens are stored exactly as the explainer returns them: no sub-word
# merging, stopword removal or thresholding is applied in this cell.
for headline in headlines:
    word_attributions = explainer(headline)
    for token, score in word_attributions:
        # Keep highest score if token appears multiple times
        if token not in unique_tokens or score > unique_tokens[token]:
            unique_tokens[token] = score

# Define output path
output_path = "../data/inspection_outputs/token_scores.csv"

# Create the destination folder if needed; open() would otherwise raise
# FileNotFoundError when the directory does not exist yet
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Write a pipe-delimited file with a header row, sorted by token
with open(output_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file, delimiter="|")
    writer.writerow(["token", "score"])
    for token, score in sorted(unique_tokens.items()):
        writer.writerow([token, f"{score:.3f}"])

print(f"Token attribution scores exported to: {output_path}")
