<a href="https://colab.research.google.com/github/janbanot/msc-project/blob/main/test_notebooks/msc_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade transformers datasets captum quantus accelerate

In [None]:
from google.colab import drive
# Mount Google Drive at /drive; the dataset and model paths used in later
# cells all live under /drive/MyDrive.
drive.mount('/drive')

In [None]:
import pandas as pd

# Location of the Jigsaw training CSV on the mounted Drive.
csv_path = '/drive/MyDrive/msc-project/jigsaw-toxic-comment/train.csv'
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    # Report the path, then re-raise: every later cell needs `df`, so
    # continuing would only surface as a confusing NameError further down.
    print(f"Error: The file was not found at {csv_path}")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print("CSV file loaded successfully!")
    display(df.head())

In [None]:
import re

def clean_text(example):
    """Normalise the 'comment_text' field of one example.

    The text is lowercased, stripped of URLs, IPv4-looking number groups,
    '(talk)' markers and '(utc)' timestamps, has newlines and non-breaking
    spaces turned into spaces, loses leading/trailing spaces and double
    quotes, and finally has every whitespace run collapsed to one space.

    Args:
        example: mapping with a string under 'comment_text'.

    Returns:
        The same mapping object, with 'comment_text' replaced by the
        cleaned string.
    """
    # Lowercase first: the later patterns ('(talk)', '(utc)') are written in
    # lowercase, and the downstream model is an "uncased" one.
    cleaned = example['comment_text'].lower()

    # Patterns are removed in this exact order.
    removal_patterns = (
        r'http\S+|www\S+',                              # URLs
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',          # IP addresses
        r'\(talk\)',                                    # Wikipedia talk marker
        r'\d{2}:\d{2}, \w+ \d{1,2}, \d{4} \(utc\)',     # signature timestamp
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    # Newlines and non-breaking spaces become ordinary spaces.
    for special_char in ('\n', '\xa0'):
        cleaned = cleaned.replace(special_char, ' ')

    # Drop spaces / double quotes hanging on either end of the comment.
    cleaned = cleaned.strip(' "')

    # Collapse any run of whitespace into a single space.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    example['comment_text'] = cleaned
    return example

In [None]:
import datasets

# Work on the first 2000 rows only (presumably to keep this test notebook
# fast — confirm before a full run), then wrap them in a Hugging Face Dataset.
train_df = df.head(2000)
data = datasets.Dataset.from_pandas(train_df)

In [None]:
print("\nCleaning data...")
# Run clean_text over the dataset; the result is kept under a new name so the
# raw `data` stays available for the before/after comparison.
cleaned_data = data.map(clean_text)
print("Data cleaned!")

In [None]:
# Show the same three rows (1, 6 and 0) from the raw and the cleaned dataset.
for _header, _dataset in (
    ("\n--- BEFORE CLEANING ---", data),
    ("\n\n--- AFTER CLEANING ---", cleaned_data),
):
    print(_header)
    print(_dataset[1]['comment_text'])
    for _row in (6, 0):
        # Leading newline separates consecutive comments in the output.
        print("\n" + _dataset[_row]['comment_text'])

In [None]:
from transformers import AutoTokenizer

# Checkpoint name ("model card").
# 'uncased' matches the .lower() step applied in clean_text.
model_checkpoint = "distilbert-base-uncased"

try:
    # This downloads and caches the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
except Exception as e:
    # Report, then re-raise: later cells all need `tokenizer`, so swallowing
    # the error would only move the failure to a less obvious NameError.
    print(f"Error loading tokenizer: {e}")
    raise
else:
    print("Tokenizer loaded successfully!")

In [None]:
def tokenize_function(examples):
    """Tokenize the 'comment_text' entries of a batch.

    Every comment is padded to and truncated at 256 tokens, so all rows come
    out with the same length.

    Args:
        examples: batch mapping whose 'comment_text' holds the texts.

    Returns:
        Whatever the module-level `tokenizer` returns for that batch.
    """
    comments = examples["comment_text"]
    # 256 is a speed/context trade-off chosen by the author.
    encoding = tokenizer(
        comments,
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    return encoding

# Tokenize the cleaned comments with Dataset.map().
# batched=True hands tokenize_function many texts per call, which is much
# faster than tokenizing one example at a time.
print("\nTokenizing data...")
tokenized_data = cleaned_data.map(tokenize_function, batched=True)
print("Data tokenized!")

In [None]:
# Sanity check: print the first tokenized row in full.
print("\n--- Example of a Tokenized Entry ---")
print(tokenized_data[0])

In [None]:
import numpy as np

# 1. Label columns; their order here is the order of the 'labels' vector
label_columns = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

def create_labels_column(example):
    """Add a 'labels' list built from the six per-category columns.

    Each value is converted to a Python float and stored in the order given
    by `label_columns`.

    Args:
        example: mapping that holds one value per name in `label_columns`.

    Returns:
        The same mapping object with the extra 'labels' key.
    """
    example['labels'] = list(map(float, (example[name] for name in label_columns)))
    return example

# 2. Add the combined 'labels' column to every tokenized example
print("\nConsolidating labels...")
final_data = tokenized_data.map(create_labels_column)
print("Labels consolidated!")

# 3. Inspect row 6 (described by the author as a toxic comment)
print("\n--- Example of a Processed Entry ---")
print(final_data[6])

In [None]:
# 1. Columns that are no longer needed once 'labels' and the token columns exist
columns_to_remove = [
    'id', 'comment_text', 'toxic', 'severe_toxic',
    'obscene', 'threat', 'insult', 'identity_hate'
]

print(f"\nOriginal columns: {final_data.column_names}")
# `final_data` is reassigned below, so on a second run of this cell the
# columns are already gone and remove_columns would reject the missing names.
# Dropping only the columns that are still present keeps the cell re-runnable.
present_columns = [col for col in columns_to_remove if col in final_data.column_names]
if present_columns:
    final_data = final_data.remove_columns(present_columns)
print(f"Cleaned columns: {final_data.column_names}")

# 2. Set the dataset format to "torch" (for PyTorch)
try:
    final_data.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    print("\nDataset format set to 'torch'!")
except ImportError:
    print("\nPyTorch not installed. Skipping .set_format('torch').")
    print("Please install with: pip install torch")

print("\n--- Final, Model-Ready Item ---")
print(final_data[6])

In [None]:
from transformers import AutoModelForSequenceClassification

num_labels = 6 # 6 toxic categories

# Load the pretrained checkpoint with a classification head of size num_labels.
# problem_type="multi_label_classification" marks the task as multi-label,
# i.e. one comment may carry several of the six labels at once.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

print("Model loaded successfully!")
print("Model configured for multi-label classification.")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
data_splits = final_data.train_test_split(test_size=0.2, seed=42)

train_dataset = data_splits['train']
eval_dataset = data_splits['test']

print(f"\nData split complete:")
print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def compute_metrics(p: EvalPrediction):
    """Compute micro-F1 and per-label accuracy from raw logits.

    Args:
        p: evaluation output; `p.predictions` holds the raw logits (or a
            tuple whose first element is the logits when the model returns
            additional outputs such as hidden states) and `p.label_ids` the
            true multi-hot labels.

    Returns:
        dict with 'f1_micro' and 'accuracy'.
    """
    logits = p.predictions
    # Once the model is asked for extra outputs (this notebook later enables
    # output_hidden_states), predictions arrive as a tuple; logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)

    # Sigmoid turns logits into independent per-label probabilities.
    # For very negative logits np.exp overflows to inf, which still yields the
    # correct limit (probability 0.0); only the RuntimeWarning is silenced.
    with np.errstate(over='ignore'):
        probs = 1 / (1 + np.exp(-logits))

    # Set a threshold (0.5) to get binary predictions
    threshold = 0.5
    predictions = (probs > threshold).astype(int)

    labels = np.asarray(p.label_ids)

    # Use 'micro' averaging, which is good for imbalanced labels
    f1_micro = f1_score(labels, predictions, average='micro')

    # Fraction of individual label decisions (6 per sample) that are correct
    overall_accuracy = accuracy_score(labels.flatten(), predictions.flatten())

    return {
        'f1_micro': f1_micro,
        'accuracy': overall_accuracy
    }

In [None]:
from transformers import TrainingArguments

# Checkpoints written during training go to this folder on the mounted Drive.
model_output_dir = "/drive/MyDrive/msc-project/models/distilbert-jigsaw-finetuned"


training_args = TrainingArguments(
    output_dir=model_output_dir,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # helps prevent overfitting
    weight_decay=0.01,
    # Evaluate and save once per epoch; the two strategies are kept identical
    # because load_best_model_at_end is enabled below.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=5,
    # Reload the checkpoint with the best 'f1_micro' (a key returned by
    # compute_metrics) when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    # DISABLE WANDB
    report_to="none",
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `tokenizer=` is deprecated in favour of `processing_class=`; this
    # notebook installs transformers with --upgrade, so use the current name
    # to avoid breaking on releases where the old keyword is no longer accepted.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

print("\n--- Starting Training ---")
trainer.train()
print("--- Training Complete ---")

In [None]:
# Persist the fine-tuned model and its tokenizer side by side so the folder
# can be loaded again as one unit.
save_directory = "/drive/MyDrive/msc-project/models/final_distilbert_jigsaw"
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved in: {save_directory}")

In [None]:
import torch
import torch.nn.functional as F

# Pick the device first, then move the model there and switch to eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

text = "you are a fucking moron, who should die in hell but I love your lovely kitten"

# Tokenization (single sentence, returned as PyTorch tensors on `device`)
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
inputs = inputs.to(device)

# Inference without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # Use sigmoid (not softmax) because the task is multi-label
    probs = torch.sigmoid(logits)

# Display one probability per label, in label order
labels_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
print(f"Text: '{text}'\n")
print("Probabilities:")
for label, prob in zip(labels_list, probs[0]):
    print(f"{label}: {prob:.4f}")

In [None]:
from captum.attr import IntegratedGradients

# 1. Forward function handed to Captum
def predict_func(inputs_embeds, attention_mask=None):
    """Run the module-level `model` on embeddings and return its logits.

    Args:
        inputs_embeds: embedding tensor passed to the model as `inputs_embeds`.
        attention_mask: optional mask forwarded to the model unchanged.

    Returns:
        The `.logits` attribute of the model output.
    """
    return model(inputs_embeds=inputs_embeds, attention_mask=attention_mask).logits

# 2. Simple Integrated Gradients init
ig = IntegratedGradients(predict_func)

# 3. Label selection
# 0=toxic, 1=severe_toxic, 2=obscene, 3=threat, 4=insult, 5=identity_hate
TARGET_LABEL_INDEX = 0
target_name = labels_list[TARGET_LABEL_INDEX]

# 4. Inputs for the attribution
# A. Text vectors
input_ids = inputs.input_ids
# predict_func passes these tensors to the model as `inputs_embeds`, and
# DistilBERT itself adds position embeddings and LayerNorm on top of
# `inputs_embeds`. Calling the full `model.distilbert.embeddings` module here
# would therefore apply positions + LayerNorm twice, so only the raw word
# embedding lookup is used.
word_embedding_layer = model.distilbert.embeddings.word_embeddings
input_embeddings = word_embedding_layer(input_ids)

# B. Background vectors (Baseline - padding)
# Same shape as input_ids, every position filled with the [PAD] token id
ref_input_ids = torch.full_like(input_ids, tokenizer.pad_token_id)
ref_input_embeddings = word_embedding_layer(ref_input_ids)

# C. Attention mask (model must know what is padding)
attention_mask = inputs.attention_mask

# 5. Attribution calculation
print(f"Attribution calculation: {target_name}...")

attributions, delta = ig.attribute(
    inputs=input_embeddings,         # Pass prepared vectors
    baselines=ref_input_embeddings,  # Pass background vectors
    target=TARGET_LABEL_INDEX,
    additional_forward_args=(attention_mask,), # Pass attention mask
    return_convergence_delta=True
)

In [None]:
# `visualization` is not imported in any of the cells visible above, so bring
# it in here; without it this cell fails with a NameError on a fresh kernel.
from captum.attr import visualization

# Results processing for visualisation:
# sum over the embedding dimension to get one score per token, then L2-normalise
attributions_sum = attributions.sum(dim=-1).squeeze(0)
attributions_sum = attributions_sum / torch.norm(attributions_sum)
attributions_np = attributions_sum.cpu().detach().numpy()

# Get probability for given label
prob_score = probs[0][TARGET_LABEL_INDEX].item()
pred_class_label = "True" if prob_score > 0.5 else "False"

# Get tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

vis_data = visualization.VisualizationDataRecord(
    word_attributions=attributions_np,
    pred_prob=prob_score,       # Label probability
    pred_class=pred_class_label, # Did it pass the threshold?
    true_class=1,               # Assume that text is toxic
    attr_class=target_name,     # Label name (e.g. 'toxic')
    attr_score=attributions_np.sum(),
    raw_input_ids=tokens,
    convergence_score=delta
)

print(f"\nLabel explaination: {target_name}")
visualization.visualize_text([vis_data])

In [None]:
import numpy as np
from tqdm import tqdm

# 1. Model config: make every later forward pass of `model` also return the
# per-layer hidden states (this changes the global model configuration).
model.config.output_hidden_states = True

# 2. Extraction function
def extract_hidden_states(data_subset, layer_index=4):
    """Collect first-token hidden states and 'toxic' labels for a dataset.

    Each example is run through the module-level `model` on its own; the
    vector at sequence position 0 of `hidden_states[layer_index]` is kept
    together with the first entry of the example's 'labels'.

    Args:
        data_subset: indexable dataset whose items provide 'input_ids',
            'attention_mask' and 'labels' as tensors.
        layer_index: index into the model's `hidden_states` tuple
            (presumably 0 is the embedding output — confirm against the
            transformers documentation).

    Returns:
        Tuple `(hidden_states, labels)` of NumPy arrays with one row/entry
        per example.
    """
    model.eval()
    all_hidden_states = []
    all_labels = []

    print(f"Extract data from layer: {layer_index}...")

    for i in tqdm(range(len(data_subset))):
        entry = data_subset[i]

        # Add a batch dimension of 1 and move to the model's device.
        input_ids = entry['input_ids'].unsqueeze(0).to(device)
        mask = entry['attention_mask'].unsqueeze(0).to(device)
        # Index 0 of the label vector is the 'toxic' label.
        label = entry['labels'][0].item()

        with torch.no_grad():
            # Request hidden states explicitly so the function does not depend
            # on model.config.output_hidden_states having been set elsewhere.
            outputs = model(input_ids, attention_mask=mask, output_hidden_states=True)
            hidden_state = outputs.hidden_states[layer_index]
            # First token of the only sequence in the batch.
            cls_embedding = hidden_state[0, 0, :].cpu().numpy()

        all_hidden_states.append(cls_embedding)
        all_labels.append(label)

    return np.array(all_hidden_states), np.array(all_labels)

# Use at most 500 evaluation samples (fewer if the eval split is smaller)
total_eval_samples = len(eval_dataset)
target_size = 500
subset_size = min(target_size, total_eval_samples)

# Reports how many samples are available and how many are used.
print(f"Dostępnych próbek: {total_eval_samples}. Używam: {subset_size}")

test_subset = eval_dataset.select(range(subset_size))

# Extraction of layer-4 representations and the matching 'toxic' labels
X_hidden, y_labels = extract_hidden_states(test_subset, layer_index=4)

# Shapes of the extracted feature matrix and label vector.
print(f"\nKształt danych X: {X_hidden.shape}")
print(f"Kształt danych y: {y_labels.shape}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# 1. Split extracted data for training and test sets
X_train_probe, X_test_probe, y_train_probe, y_test_probe = train_test_split(
    X_hidden, y_labels, test_size=0.2, random_state=42
)

# 2. Create and fit a simple linear probe
# max_iter is raised above the default to give the solver room to converge
probe = LogisticRegression(max_iter=1000)
probe.fit(X_train_probe, y_train_probe)

# 3. Check how well the probe reads toxicity from the layer representation
y_pred_probe = probe.predict(X_test_probe)

acc = accuracy_score(y_test_probe, y_pred_probe)
f1 = f1_score(y_test_probe, y_pred_probe)

print(f"--- Probe results (Layer 4) ---")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")

# Interpretation
# NOTE(review): accuracy alone can pass 0.80 if most samples are non-toxic;
# compare against the majority-class rate (or use F1) before drawing conclusions.
if acc > 0.80:
    print("Layer 4 has strong representation of toxicity")
else:
    print("Layer 4 does not have strong representation of toxicity")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. The CAV direction and bias are the weights of the trained probe
cav_vector = probe.coef_[0]
intercept = probe.intercept_[0]

# 2. Signed score of every test representation along the CAV direction
# (dot product with the CAV plus the probe's intercept)
projected_scores = X_test_probe.dot(cav_vector) + intercept

# 3. Group the scores by the true label: toxic (1) vs safe (0)
is_toxic = y_test_probe == 1
is_safe = y_test_probe == 0
scores_toxic = projected_scores[is_toxic]
scores_safe = projected_scores[is_safe]

# 4. Overlaid histograms, safe group drawn first
plt.figure(figsize=(10, 6))

for _scores, _color, _label in (
    (scores_safe, "green", "Non-Toxic"),
    (scores_toxic, "red", "Toxic"),
):
    sns.histplot(_scores, color=_color, label=_label, kde=True, alpha=0.5)

# Score 0 is where the probe switches its prediction
plt.axvline(0, color='black', linestyle='--', label="Decision Boundary (Probe)")
plt.title(f"Distribution of activations along the CAV vector (Layer 4)\nAccuracy: {acc:.2f}, F1: {f1:.2f}")
plt.xlabel("Projection score (The further to the right, the more 'toxic' according to the layer)")
plt.ylabel("Number of examples")
plt.legend()
plt.grid(True, alpha=0.3)

plt.show()

In [None]:
import quantus
import numpy as np

# 1. Prediction function for Quantus
# Quantus provides data as a numpy array, so we need to convert it into Tensors
def model_predict_numpy(model, inputs, **kwargs):
    """Return per-label sigmoid probabilities for a batch of token ids.

    Args:
        model: model to evaluate; called with token ids and an attention mask.
        inputs: array of token ids, shape [batch_size, seq_len].
        **kwargs: accepted for interface compatibility and ignored.

    Returns:
        NumPy array of sigmoid probabilities, one row per input sequence.
    """
    model.eval()
    # 'inputs' here is a matrix of token IDs [batch_size, seq_len]
    input_tensor = torch.tensor(inputs, device=device).long()
    # Only the ids are available here, so rebuild the attention mask from the
    # padding id; without it the model would attend to [PAD] positions.
    attention_mask = (input_tensor != tokenizer.pad_token_id).long()

    with torch.no_grad():
        outputs = model(input_tensor, attention_mask=attention_mask)
        # Sigmoid (multi-label) probabilities, returned as numpy
        return torch.sigmoid(outputs.logits).cpu().numpy()

# 2. Explanation function for Quantus (Integrated Gradients)
def explain_func_numpy(model, inputs, targets, **kwargs):
    """Compute token-level Integrated Gradients attributions for a batch.

    Args:
        model: DistilBERT classification model (accessed through
            `model.distilbert.embeddings.word_embeddings`).
        inputs: array of token ids, shape [batch_size, seq_len].
        targets: per-example index of the output label to explain.
        **kwargs: accepted for interface compatibility and ignored.

    Returns:
        NumPy array with one attribution value per token for every example
        (embedding dimension summed out).
    """
    model.eval()
    input_tensor = torch.tensor(inputs, device=device).long()
    pad_token_id = tokenizer.pad_token_id
    # Rebuild the attention mask from the padding id so [PAD] positions are
    # ignored by the model.
    attention_mask = (input_tensor != pad_token_id).long()

    # Use the raw word-embedding lookup: the model adds position embeddings
    # and LayerNorm to `inputs_embeds` itself, so running the full embeddings
    # module here would apply them twice.
    word_embedding_layer = model.distilbert.embeddings.word_embeddings
    input_embeddings = word_embedding_layer(input_tensor)

    # Baseline: one all-[PAD] sequence of the same length
    ref_input_ids = torch.full(
        (1, input_tensor.shape[1]), pad_token_id, dtype=torch.long, device=device
    )
    ref_input_embeddings = word_embedding_layer(ref_input_ids)

    # IG over the embeddings; the mask travels as an additional forward arg
    ig = IntegratedGradients(
        lambda embeds, mask: model(inputs_embeds=embeds, attention_mask=mask).logits
    )

    # Important: loop over the batch (Quantus sometimes provides multiple examples at once)
    attributions_list = []
    for i in range(len(inputs)):
        # Target (which class?)
        target_idx = int(targets[i])

        attr = ig.attribute(
            inputs=input_embeddings[i].unsqueeze(0),
            baselines=ref_input_embeddings,
            target=target_idx,
            additional_forward_args=(attention_mask[i].unsqueeze(0),),
            n_steps=20 # Fewer steps for faster testing
        )
        # Sum attributions into a single value per token
        attr_sum = attr.sum(dim=-1).squeeze(0).cpu().detach().numpy()
        attributions_list.append(attr_sum)

    return np.array(attributions_list)

In [None]:
import torch
import numpy as np

# --- CONFIGURATION ---
TOP_K_TOKENS = 5   # How many most important tokens do we remove?
# NOTE(review): x_batch_toxic and y_batch_targets are not defined in any cell
# visible in this notebook; they must be created (token-id sequences of toxic
# examples and their labels) before this cell can run on a fresh kernel.
dataset_samples = x_batch_toxic  # Toxic example sentences (token ids)
targets = y_batch_targets        # Their labels

print(f"--- Manual Faithfulness Evaluation (Comprehensiveness) ---")
print(f"Test on {len(dataset_samples)} examples.")
print(f"Removing {TOP_K_TOKENS} most important words from each sentence.\n")

scores = []

# Loop-invariant setup, done once instead of on every iteration
model.eval()
ig = IntegratedGradients(predict_func)
# Raw word-embedding lookup: predict_func passes the tensors as
# `inputs_embeds`, and the model adds position embeddings + LayerNorm to those
# itself, so the full embeddings module must not be applied here as well.
word_embedding_layer = model.distilbert.embeddings.word_embeddings
pad_token_id = tokenizer.pad_token_id

# Loop over each example
for i in range(len(dataset_samples)):
    # 1. Prepare a single input
    input_id = torch.tensor([dataset_samples[i]], device=device) # Shape [1, seq_len]
    # Mask out [PAD] positions so the model does not attend to padding
    attention_mask = (input_id != pad_token_id).long()

    # 2. Original prediction
    with torch.no_grad():
        orig_output = model(input_id, attention_mask=attention_mask)
        orig_prob = torch.sigmoid(orig_output.logits)[0][0].item()  # Probability of class 'Toxic'

    # 3. Compute attributions (IG) for this example against an all-[PAD] baseline
    input_emb = word_embedding_layer(input_id)
    baseline_emb = word_embedding_layer(torch.full_like(input_id, pad_token_id))

    attributions, _ = ig.attribute(
        inputs=input_emb,
        baselines=baseline_emb,
        target=0,  # Targeting the Toxic class
        additional_forward_args=(attention_mask,),
        return_convergence_delta=True
    )

    # Sum attributions to token level
    attr_sum = attributions.sum(dim=-1).squeeze(0)  # [seq_len]

    # 4. Find TOP-K most important tokens
    # Padding positions can never be selected, and k is capped at the number
    # of real tokens so short inputs do not make torch.topk fail.
    real_token_mask = attention_mask[0].bool()
    attr_sum = attr_sum.masked_fill(~real_token_mask, float("-inf"))
    k = min(TOP_K_TOKENS, int(real_token_mask.sum().item()))
    _, top_indices = torch.topk(attr_sum, k=k)

    # 5. PERTURBATION (Remove words)
    # Copy the input and overwrite the most important tokens with [PAD]
    perturbed_input_id = input_id.clone()
    perturbed_input_id[0, top_indices] = pad_token_id

    # 6. New prediction on the "censored" text
    # The original attention mask is reused so that the only difference
    # between the two predictions is the identity of the replaced tokens.
    with torch.no_grad():
        pert_output = model(perturbed_input_id, attention_mask=attention_mask)
        pert_prob = torch.sigmoid(pert_output.logits)[0][0].item()

    # 7. Compute the score (Comprehensiveness)
    # How much did the model confidence drop?
    drop = orig_prob - pert_prob
    scores.append(drop)

    # Optional: print preview for the first element
    if i == 0:
        print(f"Example 1 - Original confidence: {orig_prob:.4f}")
        print(f"Example 1 - After removing top-{TOP_K_TOKENS} words: {pert_prob:.4f}")
        print(f"Example 1 - Drop (Score): {drop:.4f}")
        removed_words = tokenizer.convert_ids_to_tokens(input_id[0, top_indices])
        print(f"Removed words: {removed_words}\n")

# --- FINAL RESULTS ---
avg_score = np.mean(scores)
std_score = np.std(scores)

print("-" * 30)
print(f"Average Comprehensiveness score: {avg_score:.4f}")
print(f"Standard deviation: {std_score:.4f}")

if avg_score > 0.1:
    print("\n✅ CONCLUSION: IG works! Removing the identified words significantly reduces toxicity.")
else:
    print("\n❌ CONCLUSION: IG poorly identifies important words (the model still thinks it's toxic).")
