<a href="https://colab.research.google.com/github/janbanot/msc-project/blob/main/test_notebooks/msc_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!uv pip install --upgrade transformers datasets captum quantus accelerate

In [None]:
from google.colab import drive
drive.mount('/drive')

In [None]:
import pandas as pd

# Location of the Jigsaw training CSV on the mounted Google Drive.
csv_path = '/drive/MyDrive/msc-project/jigsaw-toxic-comment/train.csv'
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    # Print a readable hint, then re-raise: every later cell needs `df`, so
    # carrying on would only postpone the failure to a confusing NameError.
    print(f"Error: The file was not found at {csv_path}")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    print("CSV file loaded successfully!")
    display(df.head())

In [None]:
import re

def clean_text(example):
    """Normalise the 'comment_text' field of one dataset row.

    Steps, in order: lowercase, remove URLs, remove IPv4 addresses, remove
    Wikipedia signature residue ("(talk)" and UTC timestamps), collapse every
    run of whitespace to one space, and strip spaces / double quotes from both
    ends.

    Args:
        example: Mapping with a 'comment_text' entry. A non-string value
            (e.g. None for a missing comment) is treated as an empty string.

    Returns:
        The same `example` object, updated in place, with 'comment_text'
        replaced by the cleaned text.
    """
    text = example['comment_text']
    if not isinstance(text, str):
        # Missing comments would otherwise crash on .lower().
        text = ''

    # Lowercasing matches the "uncased" checkpoint used later in the notebook.
    text = text.lower()

    # URLs. The \b anchors (and the literal dot after "www") stop the pattern
    # from eating ordinary words such as "awww...".
    text = re.sub(r'\bhttp\S+|\bwww\.\S+', '', text)

    # IPv4 addresses; \b keeps it from firing inside tokens like "v1.2.3.4".
    text = re.sub(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', '', text)

    # Wikipedia signature residue: "(talk)" and the UTC timestamp, in both
    # "21:51, january 11, 2016 (utc)" and "21:51, 11 january 2016 (utc)" layouts.
    text = re.sub(r'\(talk\)', '', text)
    text = re.sub(
        r'\d{2}:\d{2}, (?:\w+ \d{1,2}, \d{4}|\d{1,2} \w+ \d{4}) \(utc\)',
        '',
        text,
    )

    # Collapse whitespace first (\s also covers newlines, tabs and \xa0) so the
    # quote strip below sees only plain spaces around leading/trailing quotes.
    text = re.sub(r'\s+', ' ', text)

    # Drop spaces and double quotes left at either end of the comment.
    text = text.strip(' "')

    example['comment_text'] = text
    return example

In [None]:
import datasets

# Work on the first 2000 rows of the CSV only.
train_df = df.head(2000)
# Wrap the pandas frame in a `datasets.Dataset` so the later cells can use .map().
data = datasets.Dataset.from_pandas(train_df)

In [None]:
# Apply clean_text to every row. The result gets its own name so that the raw
# `data` stays available for the before/after comparison in the next cell.
print("\nCleaning data...")
cleaned_data = data.map(clean_text)
print("Data cleaned!")

In [None]:
# Print the same three rows (indices 1, 6, 0) from the raw and the cleaned
# dataset to eyeball the effect of clean_text.
print("\n--- BEFORE CLEANING ---")
print(data[1]['comment_text'])
print("\n" + data[6]['comment_text'])
print("\n" + data[0]['comment_text'])

print("\n\n--- AFTER CLEANING ---")
print(cleaned_data[1]['comment_text'])
print("\n" + cleaned_data[6]['comment_text'])
print("\n" + cleaned_data[0]['comment_text'])

In [None]:
from transformers import AutoTokenizer

# "model card"
model_checkpoint = "distilbert-base-uncased"

try:
    # Download and cache the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
except Exception as e:
    # Report and re-raise: without `tokenizer` none of the following cells can
    # run, so swallowing the error would only hide its real cause.
    print(f"Error loading tokenizer: {e}")
    raise
else:
    print("Tokenizer loaded successfully!")

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of comments into fixed-length model inputs.

    Args:
        examples: Batch mapping whose "comment_text" entry holds the texts.

    Returns:
        Whatever the notebook-level `tokenizer` returns for that batch, with
        every sequence padded or truncated to exactly 256 tokens.
    """
    # 256 tokens trades context for speed; 512 is mentioned in the original
    # notes as DistilBERT's maximum but is slower.
    encode_options = {
        "padding": "max_length",   # short comments are filled up with [PAD]
        "truncation": True,        # long comments are cut off
        "max_length": 256,
    }
    comment_batch = examples["comment_text"]
    return tokenizer(comment_batch, **encode_options)

# Tokenize the cleaned comments with .map().
# batched=True hands tokenize_function many texts per call instead of one,
# which is much faster.
print("\nTokenizing data...")
tokenized_data = cleaned_data.map(tokenize_function, batched=True)
print("Data tokenized!")

In [None]:
# Inspect the first tokenized row to check which fields tokenization added.
print("\n--- Example of a Tokenized Entry ---")
print(tokenized_data[0])

In [None]:
import numpy as np

# 1. Define label columns in the correct order
label_columns = [
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
]

def create_labels_column(example):
    """Collect the six per-category flags of one row into a 'labels' list.

    The values are read in `label_columns` order and converted to Python
    floats. The row is updated in place and returned.
    """
    combined = []
    for column_name in label_columns:
        combined.append(float(example[column_name]))
    example['labels'] = combined
    return example

# 2. Apply the function row by row; the result gains a 'labels' column
print("\nConsolidating labels...")
final_data = tokenized_data.map(create_labels_column)
print("Labels consolidated!")

# 3. Let's see the result for row 6 (used throughout the notebook as the
#    example of a toxic comment)
print("\n--- Example of a Processed Entry ---")
print(final_data[6])

In [None]:
# 1. List all columns to be removed
columns_to_remove = [
    'id', 'comment_text', 'toxic', 'severe_toxic',
    'obscene', 'threat', 'insult', 'identity_hate'
]

print(f"\nOriginal columns: {final_data.column_names}")
# Only drop columns that are still present. `final_data` is rebound below, so
# without this filter a second run of the cell would ask remove_columns() for
# columns that are already gone and fail.
columns_present = [c for c in columns_to_remove if c in final_data.column_names]
final_data = final_data.remove_columns(columns_present)
print(f"Cleaned columns: {final_data.column_names}")

# 2. Set the dataset format to "torch" (for PyTorch)
try:
    final_data.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    print("\nDataset format set to 'torch'!")
except ImportError:
    print("\nPyTorch not installed. Skipping .set_format('torch').")
    print("Please install with: pip install torch")

print("\n--- Final, Model-Ready Item ---")
print(final_data[6])

In [None]:
from transformers import AutoModelForSequenceClassification

# Size of the classification head: one output per entry of `label_columns`.
num_labels = 6 # 6 toxic categories

# Load the model, configuring it for multi-label classification
# (each of the six categories is predicted independently of the others).
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

print("Model loaded successfully!")
print("Model configured for multi-label classification.")

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
data_splits = final_data.train_test_split(test_size=0.2, seed=42)

train_dataset = data_splits['train']
eval_dataset = data_splits['test']

print(f"\nData split complete:")
print(f"Training samples: {len(train_dataset)}")
print(f"Evaluation samples: {len(eval_dataset)}")

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def compute_metrics(p: EvalPrediction):
    """Compute micro-F1 and per-label accuracy for multi-label predictions.

    Args:
        p: Evaluation output. `p.predictions` holds the raw logits (or a
            tuple whose first element is the logits when the model returns
            extra outputs such as hidden states); `p.label_ids` holds the
            true 0/1 labels.

    Returns:
        Dict with 'f1_micro' and 'accuracy' (share of individual label
        decisions that are correct, not exact-match accuracy).
    """
    logits = p.predictions
    # Later cells switch on `output_hidden_states`; the predictions then
    # arrive as a tuple and only its first element is the logits array.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]
    logits = np.asarray(logits)

    # Numerically stable sigmoid: sigmoid(x) = exp(-log(1 + exp(-x))).
    # The naive 1 / (1 + exp(-x)) overflows for large negative logits.
    probs = np.exp(-np.logaddexp(0, -logits))

    # Set a threshold (0.5) to get binary predictions
    threshold = 0.5
    predictions = (probs > threshold).astype(int)

    labels = p.label_ids

    # Use 'micro' averaging, which is good for imbalanced labels.
    # zero_division=0 returns the same 0.0 as the default but without the
    # warning when nothing is predicted positive.
    f1_micro = f1_score(labels, predictions, average='micro', zero_division=0)

    # This measures how many individual labels (out of 6*num_samples) were correct
    overall_accuracy = accuracy_score(labels.flatten(), predictions.flatten())

    return {
        'f1_micro': f1_micro,
        'accuracy': overall_accuracy
    }

In [None]:
from transformers import TrainingArguments

# Checkpoints are written to Google Drive so they survive the Colab session.
model_output_dir = "/drive/MyDrive/msc-project/models/distilbert-jigsaw-finetuned"


training_args = TrainingArguments(
    output_dir=model_output_dir,
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # helps prevent overfitting
    weight_decay=0.01,
    # evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=5,
    # after training, restore the checkpoint with the best `f1_micro`
    # (the key returned by compute_metrics)
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",
    # DISABLE WANDB
    report_to="none",
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # `tokenizer=` is deprecated in recent transformers releases in favour of
    # `processing_class=`; the install cell pulls the latest release, so use
    # the current argument name.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

print("\n--- Starting Training ---")
trainer.train()
print("--- Training Complete ---")

In [None]:
import os
from datetime import datetime

# A timestamp suffix gives every run its own folder, so an earlier export is
# never overwritten.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

base_path = "/drive/MyDrive/msc-project/models/final_distilbert_jigsaw"
save_directory = f"{base_path}_{timestamp}"

# Store model and tokenizer side by side in the same folder.
trainer.save_model(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved in: {save_directory}")

In [None]:
import torch
import torch.nn.functional as F

# Switch to inference mode and move the model to the GPU when one is available.
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Hand-written probe sentence mixing abusive and friendly wording.
text = "you are a fucking moron, who should die in hell but I love your lovely kitten"

# Tokenization
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

# Inference
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # Use sigmoid (not softmax) for multi-label: every category gets its own
    # independent probability.
    probs = torch.sigmoid(logits)

# Display results
# Same order as `label_columns`, so index i of `probs[0]` belongs to labels_list[i].
labels_list = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
print(f"Text: '{text}'\n")
print("Probabilities:")
for label, prob in zip(labels_list, probs[0]):
    print(f"{label}: {prob:.4f}")

In [None]:
from captum.attr import IntegratedGradients

# 1. Captum wrapper
def predict_func(inputs_embeds, attention_mask=None):
    """Forward wrapper for Captum: embeddings (and optional mask) in, logits out.

    Runs the notebook-level `model` on pre-computed input embeddings instead
    of token ids and returns only the raw logits tensor.
    """
    return model(
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask,
    ).logits

# 2. Simple Integrated Gradients init
ig = IntegratedGradients(predict_func)

# 3. Label selection
# 0=toxic, 1=severe_toxic, 2=obscene, 3=threat, 4=insult, 5=identity_hate
TARGET_LABEL_INDEX = 0
target_name = labels_list[TARGET_LABEL_INDEX]

# A. Text vectors
input_ids = inputs.input_ids
# Use the *word* embedding table only. The full `model.distilbert.embeddings`
# module also adds position embeddings and LayerNorm, and the model applies
# those again to whatever is passed as `inputs_embeds` - feeding its output
# back in would explain a different function than the real classifier.
word_embedding_layer = model.get_input_embeddings()
input_embeddings = word_embedding_layer(input_ids)

# B. Background vectors (Baseline - padding)
# Same shape as the input, every position filled with the [PAD] token id
ref_input_ids = torch.full_like(input_ids, tokenizer.pad_token_id)
ref_input_embeddings = word_embedding_layer(ref_input_ids)

# C. Attention mask (model must know what is padding)
attention_mask = inputs.attention_mask

# 5. Attribution calculation
print(f"Attribution calculation: {target_name}...")

attributions, delta = ig.attribute(
    inputs=input_embeddings,         # Pass prepared vectors
    baselines=ref_input_embeddings,  # Pass background vectors
    target=TARGET_LABEL_INDEX,
    additional_forward_args=(attention_mask,), # Pass attention mask
    return_convergence_delta=True
)

In [None]:
from captum.attr import visualization

# Results processing for visualisation
# Collapse the embedding dimension so each token gets one attribution value,
# then scale the vector to unit L2 norm for display.
attributions_sum = attributions.sum(dim=-1).squeeze(0)
attributions_sum = attributions_sum / torch.norm(attributions_sum)
attributions_np = attributions_sum.cpu().detach().numpy()

# Get probability for given label
prob_score = probs[0][TARGET_LABEL_INDEX].item()
pred_class_label = "True" if prob_score > 0.5 else "False"

# Get tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

vis_data = visualization.VisualizationDataRecord(
    word_attributions=attributions_np,
    pred_prob=prob_score,       # Label probability
    pred_class=pred_class_label, # Did it pass the threshold?
    true_class=1,               # Assume that text is toxic
    attr_class=target_name,     # Label name (e.g. 'toxic')
    attr_score=attributions_np.sum(),
    raw_input_ids=tokens,
    convergence_score=delta
)

print(f"\nLabel explaination: {target_name}")
visualization.visualize_text([vis_data])

In [None]:
import numpy as np
from tqdm import tqdm

# 1. Model config: make every forward pass also return the per-layer hidden
#    states. NOTE(review): this changes the model object for all later cells.
model.config.output_hidden_states = True

# 2. Extraction function
def extract_hidden_states(data_subset, layer_index=4):
    """Collect the first-token hidden state of one layer for every sample.

    Args:
        data_subset: Indexable dataset whose items provide 'input_ids',
            'attention_mask' and 'labels' tensors.
        layer_index: Index into `outputs.hidden_states` to read from.

    Returns:
        Tuple `(hidden_states, labels)` of numpy arrays: one vector per
        sample (position 0 of the sequence) and the first entry of each
        sample's label vector.
    """
    model.eval()
    all_hidden_states = []
    all_labels = []

    print(f"Extract data from layer: {layer_index}...")

    for i in tqdm(range(len(data_subset))):
        entry = data_subset[i]

        input_ids = entry['input_ids'].unsqueeze(0).to(device)
        mask = entry['attention_mask'].unsqueeze(0).to(device)
        label = entry['labels'][0].item()

        with torch.no_grad():
            # Ask for the hidden states explicitly so the function does not
            # depend on `model.config.output_hidden_states` having been
            # switched on in another cell.
            outputs = model(input_ids, attention_mask=mask, output_hidden_states=True)

        hidden_state = outputs.hidden_states[layer_index]
        cls_embedding = hidden_state[0, 0, :].cpu().numpy()

        all_hidden_states.append(cls_embedding)
        all_labels.append(label)

    return np.array(all_hidden_states), np.array(all_labels)

# Use at most `target_size` evaluation samples (fewer if the split is smaller)
total_eval_samples = len(eval_dataset)
target_size = 500
subset_size = min(target_size, total_eval_samples)

print(f"Dostƒôpnych pr√≥bek: {total_eval_samples}. U≈ºywam: {subset_size}")

test_subset = eval_dataset.select(range(subset_size))

# Extraction: first-token vectors of layer 4 plus the 'toxic' label per sample
X_hidden, y_labels = extract_hidden_states(test_subset, layer_index=4)

print(f"\nKszta≈Çt danych X: {X_hidden.shape}")
print(f"Kszta≈Çt danych y: {y_labels.shape}")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# 1. Split extracted data for training and test sets
# Stratify so both parts keep the same toxic/non-toxic ratio. Stratification
# needs at least two samples of every class; otherwise fall back to a plain
# random split.
probe_class_counts = np.unique(y_labels, return_counts=True)[1]
probe_stratify = (
    y_labels
    if len(probe_class_counts) > 1 and probe_class_counts.min() >= 2
    else None
)
X_train_probe, X_test_probe, y_train_probe, y_test_probe = train_test_split(
    X_hidden, y_labels, test_size=0.2, random_state=42, stratify=probe_stratify
)

# 2. Create and test simple probe
# max_iter is raised so the solver has room to converge on these features
probe = LogisticRegression(max_iter=1000)
probe.fit(X_train_probe, y_train_probe)

# 3. Check how probe sees the toxicity in the layer
y_pred_probe = probe.predict(X_test_probe)

acc = accuracy_score(y_test_probe, y_pred_probe)
f1 = f1_score(y_test_probe, y_pred_probe)

# Accuracy of always predicting the more frequent class. With imbalanced
# labels this can exceed 0.80 on its own, so the probe has to beat it before
# its accuracy says anything about the layer.
toxic_share = float(np.mean(y_test_probe == 1))
majority_baseline = max(toxic_share, 1.0 - toxic_share)

print(f"--- Probe results (Layer 4) ---")
print(f"Accuracy: {acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Majority-class baseline accuracy: {majority_baseline:.4f}")

# Interpretation
if acc > 0.80 and acc > majority_baseline:
    print("Layer 4 has strong representation of toxicity")
else:
    print("Layer 4 does not have strong representation of toxicity")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Take CAV vector from trained probe
# The logistic-regression weight row is used as the "toxicity direction"
cav_vector = probe.coef_[0]
intercept = probe.intercept_[0]

# 2. We project the data onto this vector (dot product)
# This will tell us how much each sentence lies "along" the direction of toxicity
# We multiply the representation matrix (X_test_probe) by the CAV vector;
# adding the intercept places the probe's decision boundary at 0
projected_scores = np.dot(X_test_probe, cav_vector) + intercept

# 3. Preparing the data for the plot
# We split the results into the toxic group (1) and the safe group (0) based on the true labels
scores_toxic = projected_scores[y_test_probe == 1]
scores_safe = projected_scores[y_test_probe == 0]

# 4. Histogram
plt.figure(figsize=(10, 6))

sns.histplot(scores_safe, color="green", label="Non-Toxic", kde=True, alpha=0.5)

sns.histplot(scores_toxic, color="red", label="Toxic", kde=True, alpha=0.5)

# Scores above 0 are classified as toxic by the probe
plt.axvline(0, color='black', linestyle='--', label="Decision Boundary (Probe)")
plt.title(f"Distribution of activations along the CAV vector (Layer 4)\nAccuracy: {acc:.2f}, F1: {f1:.2f}")
plt.xlabel("Projection score (The further to the right, the more 'toxic' according to the layer)")
plt.ylabel("Number of examples")
plt.legend()
plt.grid(True, alpha=0.3)

plt.show()

In [None]:
import quantus
import numpy as np

# 1. Prediction function for Quantus
# Quantus provides data as a numpy array, so we need to convert it into Tensors
def model_predict_numpy(model, inputs, **kwargs):
    """Predict per-label probabilities for a batch of token-id rows.

    Args:
        model: Sequence-classification model to run.
        inputs: Token ids of shape [batch_size, seq_len] (numpy array or
            tensor).
        **kwargs: Accepted for interface compatibility; ignored.

    Returns:
        Numpy array with the sigmoid of the logits, one row per input.
    """
    model.eval()
    # as_tensor accepts both numpy arrays and tensors without a copy warning
    input_tensor = torch.as_tensor(inputs, device=device).long()
    # The rows are padded to a fixed length; rebuild the mask from the pad id
    # so padding is ignored, as it is during training and in the other cells.
    attention_mask = (input_tensor != tokenizer.pad_token_id).long()

    with torch.no_grad():
        outputs = model(input_tensor, attention_mask=attention_mask)
        # Multi-label head: sigmoid gives one independent probability per label
        return torch.sigmoid(outputs.logits).cpu().numpy()

# 2. Explanation function for Quantus (Integrated Gradients)
def explain_func_numpy(model, inputs, targets, **kwargs):
    """Integrated-Gradients token attributions for a batch of token-id rows.

    Args:
        model: Sequence-classification model to explain.
        inputs: Token ids of shape [batch_size, seq_len] (numpy array or
            tensor).
        targets: Per-example index of the output label to attribute.
        **kwargs: Accepted for interface compatibility; ignored.

    Returns:
        Numpy array of shape [batch_size, seq_len] with one attribution value
        per token (attributions summed over the embedding dimension).
    """
    model.eval()
    input_tensor = torch.as_tensor(inputs, device=device).long()
    # Rebuild the padding mask from the pad id so padding is ignored.
    attention_mask = (input_tensor != tokenizer.pad_token_id).long()

    # Word embeddings only: the model adds position embeddings and LayerNorm
    # itself to whatever it receives as `inputs_embeds`, so passing the output
    # of the full embeddings module would apply them twice.
    word_embeddings = model.get_input_embeddings()
    input_embeddings = word_embeddings(input_tensor)

    # Baseline: a single all-[PAD] row of the same length
    ref_input_ids = torch.full_like(input_tensor[:1], tokenizer.pad_token_id)
    ref_input_embeddings = word_embeddings(ref_input_ids)

    ig = IntegratedGradients(
        lambda embeds, mask: model(inputs_embeds=embeds, attention_mask=mask).logits
    )

    # Attribute one example at a time: every example has its own target label
    attributions_list = []
    for i in range(len(inputs)):
        target_idx = int(targets[i])

        attr = ig.attribute(
            inputs=input_embeddings[i].unsqueeze(0),
            baselines=ref_input_embeddings,
            target=target_idx,
            # passed as a forward arg so Captum repeats the mask for every
            # interpolation step
            additional_forward_args=(attention_mask[i].unsqueeze(0),),
            n_steps=20 # Fewer steps for faster testing
        )
        # Sum attributions into a single value per token
        attr_sum = attr.sum(dim=-1).squeeze(0).cpu().detach().numpy()
        attributions_list.append(attr_sum)

    return np.array(attributions_list)

In [None]:
import torch
import numpy as np

# --- PREPARE DATA ---
# We need to filter the dataset to find examples that are actually toxic (label=1)

# Find indices where the label is 1 (Toxic); positions refer to `test_subset`,
# the dataset `y_labels` was extracted from
toxic_indices = np.where(y_labels == 1)[0]

# Select up to 16 examples (or fewer if we don't have 16)
batch_size = 16
selected_indices = toxic_indices[:batch_size]

# Extract the Input IDs for these specific indices
x_batch_toxic = [test_subset[int(i)]['input_ids'] for i in selected_indices]
y_batch_targets = y_labels[selected_indices]

print(f"Selected {len(x_batch_toxic)} toxic examples for evaluation.")

# --- CONFIGURATION ---
TOP_K_TOKENS = 5   # How many most important words do we remove?
dataset_samples = x_batch_toxic  # The selected toxic sentences (token ids)
targets = y_batch_targets        # Our labels

print(f"--- Manual Faithfulness Evaluation (Comprehensiveness) ---")
print(f"Test on {len(dataset_samples)} examples.")
print(f"Removing {TOP_K_TOKENS} most important words from each sentence.\n")

scores = []

# The explainer and the word-embedding table are the same for every example,
# so create them once instead of on each iteration.
ig = IntegratedGradients(predict_func)
word_embedding_layer = model.get_input_embeddings()
model.eval()

# Loop over each example
for i in range(len(dataset_samples)):
    # 1. Prepare a single input
    input_id = dataset_samples[i].unsqueeze(0).to(device) # Shape becomes [1, seq_len]
    # The rows are padded to a fixed length: rebuild the mask from the pad id
    # so the model ignores padding, exactly as during training.
    attn_mask = (input_id != tokenizer.pad_token_id).long()

    # 2. Original prediction
    with torch.no_grad():
        orig_output = model(input_id, attention_mask=attn_mask)
        orig_prob = torch.sigmoid(orig_output.logits)[0][0].item()  # Probability of class 'Toxic'

    # 3. Compute attributions (IG) for this example
    # Word embeddings only - the model adds position embeddings and LayerNorm
    # itself to `inputs_embeds`, so the full embeddings module must not be
    # applied beforehand.
    input_emb = word_embedding_layer(input_id)
    baseline_emb = word_embedding_layer(
        torch.full_like(input_id, tokenizer.pad_token_id)
    )

    attributions, _delta = ig.attribute(
        inputs=input_emb,
        baselines=baseline_emb,
        target=0,  # Targeting the Toxic class
        additional_forward_args=(attn_mask,),
        return_convergence_delta=True
    )

    # Sum attributions to token level
    attr_sum = attributions.sum(dim=-1).squeeze(0)  # [seq_len]

    # 4. Find TOP-K most important tokens
    # Padding positions carry no content, so they must never be picked as
    # "important words"; also cap k at the number of real tokens.
    rankable = attr_sum.masked_fill(attn_mask.squeeze(0) == 0, float('-inf'))
    k = min(TOP_K_TOKENS, int(attn_mask.sum().item()))
    _, top_indices = torch.topk(rankable, k=k)

    # 5. PERTURBATION (Remove words)
    # Copy the input and overwrite the important words with the [PAD] token,
    # the same "no information" token that serves as the IG baseline.
    perturbed_input_id = input_id.clone()
    perturbed_input_id[0, top_indices] = tokenizer.pad_token_id

    # 6. New prediction on the "censored" text
    # The original mask is kept so that only the token content changes.
    with torch.no_grad():
        pert_output = model(perturbed_input_id, attention_mask=attn_mask)
        pert_prob = torch.sigmoid(pert_output.logits)[0][0].item()

    # 7. Compute the score (Comprehensiveness)
    # How much did the model confidence drop?
    drop = orig_prob - pert_prob
    scores.append(drop)

    # Optional: print preview for the first element
    if i == 0:
        print(f"Example 1 - Original confidence: {orig_prob:.4f}")
        print(f"Example 1 - After removing top-{TOP_K_TOKENS} words: {pert_prob:.4f}")
        print(f"Example 1 - Drop (Score): {drop:.4f}")
        removed_words = tokenizer.convert_ids_to_tokens(input_id[0, top_indices])
        print(f"Removed words: {removed_words}\n")

# --- FINAL RESULTS ---
# Mean and spread of the per-example confidence drops collected in `scores`.
# NOTE(review): if no toxic example was selected, `scores` is empty and both
# statistics are NaN - confirm the selection above is non-empty.
avg_score = np.mean(scores)
std_score = np.std(scores)

print("-" * 30)
print(f"Average Comprehensiveness score: {avg_score:.4f}")
print(f"Standard deviation: {std_score:.4f}")

# 0.1 is the cut-off used here for calling the average drop "significant"
if avg_score > 0.1:
    print("\n‚úÖ CONCLUSION: IG works! Removing the identified words significantly reduces toxicity.")
else:
    print("\n‚ùå CONCLUSION: IG poorly identifies important words (the model still thinks it's toxic).")


In [None]:
from captum.attr import InputXGradient

# 1. Initialise InputXGradient
# We reuse the same 'predict_func' that was defined for Integrated Gradients
ixg = InputXGradient(predict_func)

print(f"Obliczanie atrybucji metodƒÖ Input X Gradient dla klasy: {target_name}...")

# 2. Run the attribution (InputXGradient)
# IMPORTANT: InputXGradient computes gradient * input. It needs no baseline.
attributions_ixg = ixg.attribute(
    inputs=input_embeddings,                   # Same input embeddings as for IG
    target=TARGET_LABEL_INDEX,                 # Same target label
    additional_forward_args=(attention_mask,)  # Pass the attention mask so the model behaves correctly
)

# 3. Post-processing
# The result has shape [batch, seq_len, hidden_dim]. Sum over the last dimension
# (hidden_dim) to get a single importance value per token.
attributions_ixg_sum = attributions_ixg.sum(dim=-1).squeeze(0)

# Normalisation (Euclidean norm) - identical to the IG cell,
# so the attribution "strength" of both methods can be compared fairly.
attributions_ixg_sum = attributions_ixg_sum / torch.norm(attributions_ixg_sum)
attributions_ixg_np = attributions_ixg_sum.cpu().detach().numpy()

# 4. Visualisation and comparison
# Build a visualisation record specific to Input X Gradient
vis_data_ixg = visualization.VisualizationDataRecord(
    word_attributions=attributions_ixg_np,
    pred_prob=prob_score,           # Reuse the probability computed earlier
    pred_class=pred_class_label,    # Label (True/False)
    true_class=1,                   # Assumed true class (Toxic)
    attr_class=f"{target_name} (InpxGrad)", # Renamed to tell it apart from IG
    attr_score=attributions_ixg_np.sum(),
    raw_input_ids=tokens,           # Same tokens
    convergence_score=None          # InputXGradient has no convergence error (delta)
)

print("\n--- Por√≥wnanie wizualne metod XAI ---")
print("1. Wiersz: Integrated Gradients (IG)")
print("2. Wiersz: Input X Gradient (Baseline)")

# visualize_text takes a list of records.
# Pass 'vis_data' (from the earlier IG cell) and 'vis_data_ixg' (the new result).
visualization.visualize_text([vis_data, vis_data_ixg])

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import DataLoader
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# --- KROK 1: Wydajna ekstrakcja aktywacji ze wszystkich warstw ---

def extract_all_layers(dataset, model, device, batch_size=32):
    """Collect first-token vectors from every layer in one pass per batch.

    Args:
        dataset: Dataset whose batches provide 'input_ids', 'attention_mask'
            and 'labels' tensors.
        model: Model called with `output_hidden_states=True`.
        device: Device the input tensors are moved to.
        batch_size: Number of samples per forward pass.

    Returns:
        Tuple `(activations, labels)`: a dict mapping the index of each entry
        of `hidden_states` to a numpy array with one row per sample, and a
        numpy array with one label per sample (column 0 when the labels are
        multi-dimensional, otherwise the label itself).
    """
    model.eval()

    # Batches are read in dataset order so rows and labels stay aligned.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    per_layer_chunks = {}
    collected_labels = []

    print(f"Rozpoczynam ekstrakcjƒô z {len(dataset)} pr√≥bek...")

    with torch.no_grad():
        for batch in tqdm(loader, desc="Extraction"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels']

            # One forward pass yields the hidden states of all layers.
            model_out = model(ids, attention_mask=mask, output_hidden_states=True)

            for depth, states in enumerate(model_out.hidden_states):
                # Sequence position 0 of every sample -> [batch, hidden_dim]
                first_token = states[:, 0, :].cpu().numpy()
                per_layer_chunks.setdefault(depth, []).append(first_token)

            # Multi-dimensional labels: keep column 0 ('toxic'); 1-D labels
            # are used as they are.
            picked = batch_labels[:, 0] if batch_labels.dim() > 1 else batch_labels
            collected_labels.extend(picked.cpu().numpy())

    # Glue the per-batch chunks of each layer into one matrix.
    stacked = {}
    for depth, chunks in per_layer_chunks.items():
        stacked[depth] = np.concatenate(chunks, axis=0)

    return stacked, np.array(collected_labels)

# Choose the size of the subset to analyse (e.g. 1000 samples for speed, or the whole set)
# Take the first `eval_subset_size` rows of `eval_dataset`, or all of it when it is smaller
eval_subset_size = 1000
if len(eval_dataset) > eval_subset_size:
    analysis_dataset = eval_dataset.select(range(eval_subset_size))
else:
    analysis_dataset = eval_dataset

# Run the extraction
X_layers_dict, y_all = extract_all_layers(analysis_dataset, model, device, batch_size=32)

print(f"\nEkstrakcja zako≈Ñczona. Pobrane warstwy: {list(X_layers_dict.keys())}")
print(f"Kszta≈Çt aktywacji dla warstwy 0: {X_layers_dict[0].shape}")


# --- STEP 2: Probe training loop (Probing Loop) ---

results = []

print("\nRozpoczynam trening sond liniowych (Linear Probes)...")

for layer_idx in sorted(X_layers_dict.keys()):
    X = X_layers_dict[layer_idx]
    y = y_all

    # Binarise the labels (just in case they are floats)
    y = (y > 0.5).astype(int)

    # Train/test split for the probe; the fixed random_state gives every
    # layer the same split, so the scores are comparable across layers
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the logistic-regression probe
    clf = LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')
    clf.fit(X_train, y_train)

    # Evaluation
    # NOTE(review): `acc` and `f1` overwrite the notebook-level values from
    # the single-layer probe cell.
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        'layer': layer_idx,
        'accuracy': acc,
        'f1': f1
    })

    print(f"Layer {layer_idx}: Acc={acc:.4f}, F1={f1:.4f}")

# Convert the results to a DataFrame for easier plotting
df_results = pd.DataFrame(results)


# --- STEP 3: Visualisation (Line Plot) ---

plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")

# Draw one line per metric
sns.lineplot(data=df_results, x='layer', y='accuracy', marker='o', label='Accuracy', linewidth=2.5)
sns.lineplot(data=df_results, x='layer', y='f1', marker='s', label='F1 Score', linewidth=2.5)

# Plot formatting
plt.title("Liniowa separowalno≈õƒá toksyczno≈õci w warstwach modelu (DistilBERT)", fontsize=14, pad=15)
plt.xlabel("Numer Warstwy (0 = Embeddings, 1-6 = Transformer Layers)", fontsize=12)
plt.ylabel("Warto≈õƒá Metryki", fontsize=12)
plt.ylim(0.0, 1.05)  # Y axis from 0 to slightly above 1
plt.xticks(df_results['layer']) # Force a tick for every layer number
plt.legend(fontsize=11)

# Write the accuracy value above each point (optional, for readability)
for index, row in df_results.iterrows():
    plt.text(row['layer'], row['accuracy'] + 0.01, f"{row['accuracy']:.2f}",
             ha='center', color='blue', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
import torch
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from captum.attr import IntegratedGradients

# --- PART 1: Paraphrase generator (T5) ---

# Load the paraphrasing checkpoint and its tokenizer; the model is moved to
# the same `device` as the classifier.
print("≈Åadowanie modelu do parafrazowania (T5)...")
para_model_name = "Vamsi/T5_Paraphrase_Paws"
para_tokenizer = AutoTokenizer.from_pretrained(para_model_name)
para_model = AutoModelForSeq2SeqLM.from_pretrained(para_model_name).to(device)
print("Model T5 za≈Çadowany!")

def generate_paraphrase(text, num_return_sequences=1):
    """Generate a paraphrase of `text` with the T5 paraphrasing model.

    Args:
        text: Sentence to paraphrase.
        num_return_sequences: Number of candidates to sample. Only the first
            candidate is returned.

    Returns:
        The decoded paraphrase as a string, special tokens removed.
    """
    para_model.eval()

    # T5 needs the task prefix for this checkpoint. No manual "</s>" is
    # appended: the tokenizer adds the end-of-sequence token itself, and
    # writing it into the text as well can yield a duplicated EOS.
    text = "paraphrase: " + text

    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoding = para_tokenizer(
        text,
        padding="longest",
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    with torch.no_grad():
        # `early_stopping` only affects beam search, so it is not passed to
        # this sampling-based call.
        outputs = para_model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=256,
            do_sample=True, # Sampling allows more varied paraphrases
            top_k=120,
            top_p=0.95,
            num_return_sequences=num_return_sequences
        )

    # Decode the first generated sequence
    paraphrase = para_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return paraphrase

# --- CZƒò≈öƒÜ 2: Funkcje pomocnicze do Stabilno≈õci ---

def get_top_k_tokens(text_input, model, tokenizer, k=5):
    """Return the k tokens with the highest Integrated-Gradients attribution.

    Attributions are computed for output index 0 ('toxic' in this notebook's
    label order) against an all-[PAD] baseline.

    Args:
        text_input: Text to explain.
        model: Sequence-classification model to explain.
        tokenizer: Tokenizer matching `model`.
        k: Number of top tokens to pick (capped at the sequence length).

    Returns:
        Set of lower-cased token strings with the '##' sub-word marker
        removed; '[CLS]', '[SEP]' and '[PAD]' are dropped, so the set can
        hold fewer than k items.
    """
    # 1. Prepare the model input
    inputs = tokenizer(text_input, return_tensors="pt", truncation=True, padding=True).to(device)
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    # Forward function for IG. The mask is a real argument (not a closure
    # variable) so Captum can repeat it for every interpolation step.
    def _forward_logits(inputs_embeds, attention_mask):
        out = model(inputs_embeds=inputs_embeds, attention_mask=attention_mask)
        return out.logits

    ig = IntegratedGradients(_forward_logits)

    # Word embeddings only: the model adds position embeddings and LayerNorm
    # itself to `inputs_embeds`, so the full embeddings module must not be
    # applied beforehand.
    word_embeddings = model.get_input_embeddings()
    input_embeddings = word_embeddings(input_ids)
    ref_input_ids = torch.full_like(input_ids, tokenizer.pad_token_id)
    ref_input_embeddings = word_embeddings(ref_input_ids)

    # Index 0 is 'toxic' (labels_list[0] earlier in the notebook).
    target_idx = 0

    attributions, _ = ig.attribute(
        inputs=input_embeddings,
        baselines=ref_input_embeddings,
        target=target_idx,
        additional_forward_args=(attention_mask,),
        return_convergence_delta=True
    )

    # One attribution value per token, then pick the top k positions
    attr_sum = attributions.sum(dim=-1).squeeze(0)
    _, top_indices = torch.topk(attr_sum, k=min(k, len(attr_sum)))

    # Convert ids back to token strings
    top_tokens = tokenizer.convert_ids_to_tokens(input_ids[0][top_indices])

    # Clean the tokens (strip '##' from sub-words, lowercase, drop specials)
    clean_tokens = {
        t.replace("##", "").lower()
        for t in top_tokens
        if t not in ('[CLS]', '[SEP]', '[PAD]')
    }

    return clean_tokens

def evaluate_stability(original_text, layer_index, model, tokenizer):
    """
    Compare a text with its generated paraphrase on three stability metrics.

    Args:
        original_text: Text to analyse.
        layer_index: Index into ``hidden_states`` of the model output. Per the
            original notes the tuple is (embeddings, layer1, ..., layer6), so
            index 0 is the embedding output.
        model: Sequence classifier; switched to eval mode by this function.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A dict with both texts, the sigmoid probability of logit 0 for each,
        their absolute difference, the cosine similarity of the first-position
        hidden vectors at ``layer_index``, the Jaccard index of the top-5
        attribution token sets, and the two token lists.
    """
    # Step 1: paraphrase the input
    paraphrase_text = generate_paraphrase(original_text)

    # Step 2: tokenize both variants (original first, paraphrase second)
    encoded = {
        "orig": tokenizer(original_text, return_tensors="pt", truncation=True, max_length=512).to(device),
        "para": tokenizer(paraphrase_text, return_tensors="pt", truncation=True, max_length=512).to(device),
    }

    model.eval()

    # One forward pass per variant, keeping every layer's hidden states
    with torch.no_grad():
        forward = {
            name: model(**batch, output_hidden_states=True)
            for name, batch in encoded.items()
        }

    # A. Output stability: sigmoid of the first logit (assumed to be 'toxic'
    #    in a multi-label head — TODO confirm the label order)
    toxic_prob = {
        name: torch.sigmoid(out.logits)[0][0].item()
        for name, out in forward.items()
    }
    prob_orig = toxic_prob["orig"]
    prob_para = toxic_prob["para"]
    pred_diff = abs(prob_orig - prob_para)

    # B. Representation stability: cosine similarity of the position-0
    #    ([CLS]) vectors taken from the requested hidden-state index
    cls_orig, cls_para = (
        forward[name].hidden_states[layer_index][:, 0, :]
        for name in ("orig", "para")
    )
    cosine_sim = F.cosine_similarity(cls_orig, cls_para).item()

    # C. Attribution stability: overlap of the top-5 IG tokens (IG is costly)
    tokens_orig = get_top_k_tokens(original_text, model, tokenizer, k=5)
    tokens_para = get_top_k_tokens(paraphrase_text, model, tokenizer, k=5)

    shared = len(tokens_orig.intersection(tokens_para))
    combined = len(tokens_orig.union(tokens_para))
    if combined > 0:
        jaccard_score = shared / combined
    else:
        jaccard_score = 0.0

    report = {}
    report["Original Text"] = original_text
    report["Paraphrase"] = paraphrase_text
    report["Prob Original"] = round(prob_orig, 4)
    report["Prob Paraphrase"] = round(prob_para, 4)
    report["Pred Diff (Output)"] = round(pred_diff, 4)
    report["Layer Cosine Sim"] = round(cosine_sim, 4)
    report["Attribution Jaccard"] = round(jaccard_score, 4)
    report["Top Tokens Orig"] = list(tokens_orig)
    report["Top Tokens Para"] = list(tokens_para)
    return report

# --- PART 3: Experiment and reporting ---

# Select toxic examples (label == 1) from the probe test set.
# Relies on `test_subset` and `y_test_probe` prepared in earlier cells.

# Positions of toxic samples, capped at 15
toxic_indices = [pos for pos, label in enumerate(y_test_probe) if label == 1][:15]
if not toxic_indices:
    # Fallback: no toxic rows found, so use the first ten samples instead
    print("Brak toksycznych pr√≥bek w podrƒôcznym zbiorze, dobieram losowe...")
    toxic_indices = range(10)

print(f"\nRozpoczynam analizƒô stabilno≈õci dla {len(toxic_indices)} przyk≈Çad√≥w...")
print(f"Badana warstwa: {5} (Zgodnie z wynikami poprzedniej analizy)")

results_stability = []

for idx in toxic_indices:
    # `test_subset` stores token ids, so the raw text is recovered by decoding
    orig_text = tokenizer.decode(test_subset[idx]['input_ids'], skip_special_tokens=True)

    # Evaluate at hidden-state index 5 (the best layer according to the earlier plot)
    results_stability.append(
        evaluate_stability(orig_text, layer_index=5, model=model, tokenizer=tokenizer)
    )

# Collect the per-sample metrics into a table
df_stability = pd.DataFrame(results_stability)

# Show the main columns with truncated text cells
report_columns = [
    "Original Text", "Paraphrase",
    "Pred Diff (Output)", "Layer Cosine Sim", "Attribution Jaccard"
]
pd.set_option('display.max_colwidth', 50)
display(df_stability[report_columns])

# Mean of each metric across the evaluated samples
print("\n--- PODSUMOWANIE STABILNO≈öCI (≈örednie) ---")
print(f"Mean Prediction Stability (Diff): {df_stability['Pred Diff (Output)'].mean():.4f} (Im mniej tym lepiej)")
print(f"Mean Layer Stability (Cosine):    {df_stability['Layer Cosine Sim'].mean():.4f} (Im bli≈ºej 1.0 tym lepiej)")
print(f"Mean Attribution Stability (Jacc):{df_stability['Attribution Jaccard'].mean():.4f} (Im bli≈ºej 1.0 tym lepiej)")

In [None]:
# --- CORRECTED STEP 1: steering vector via Difference of Means ---

# 1. Split the layer-5 activations into toxic and safe groups.
#    Uses X_layers_dict[5] (from the earlier extraction) and y_all.
X_layer_5 = X_layers_dict[5]
y_bool = (y_all > 0.5)  # True = toxic, False = safe

# 2. Centroid of each group
mean_toxic = np.mean(X_layer_5[y_bool], axis=0)
mean_safe = np.mean(X_layer_5[~y_bool], axis=0)

# 3. Direction pointing from the safe centroid to the toxic centroid
direction_vector = mean_toxic - mean_safe

# --- Scale diagnostics (needed to pick a sensible alpha) ---
vec_norm = np.linalg.norm(direction_vector)
hidden_state_norm = np.linalg.norm(mean_safe)  # norm of the safe centroid, used as the reference scale

print(f"Norma wektora sterujƒÖcego (Diff of Means): {vec_norm:.4f}")
print(f"≈örednia norma aktywacji w modelu: {hidden_state_norm:.4f}")
print(f"Stosunek si≈Ç: {vec_norm / hidden_state_norm:.4f}")

# 4. Convert to a torch tensor WITHOUT normalising to unit length, so the
#    natural magnitude of the class difference is kept.
steering_tensor = torch.tensor(direction_vector, dtype=torch.float32).to(device)

print("\nNowy wektor sterujƒÖcy (Mean Diff) gotowy!")

# --- Repeat the steering test with a wider alpha range ---

# Rule of thumb from the original notes: a large vec_norm (e.g. 10) calls for
# small alphas (e.g. -2..2), a small vec_norm (e.g. 0.5) for large ones
# (e.g. -20..20). Note that alpha below is simply the fixed scale_factor; it
# is not derived from vec_norm.
scale_factor = 5.0  # experimental multiplier
suggested_alpha = scale_factor

print(f"\nSugerowana si≈Ça alpha: +/- {suggested_alpha}")

# Same sentence as in the previous test
text_toxic = "You are a complete idiot and a waste of time."


def _steered_prob(alpha):
    """Score `text_toxic` with the steering vector applied at strength `alpha`."""
    # predict_with_steering must already be defined by an earlier cell
    return predict_with_steering(text_toxic, model, tokenizer, steering_tensor, alpha=alpha)


score_orig = _steered_prob(0)
score_detox = _steered_prob(-suggested_alpha)  # subtract the toxicity direction
score_toxic = _steered_prob(suggested_alpha)   # add the toxicity direction

print(f"\nZdanie: {text_toxic}")
print(f"Orygina≈Ç (Alpha 0):      {score_orig:.4f}")
print(f"Detoksykacja (Alpha -{suggested_alpha}): {score_detox:.4f} (Oczekujemy spadku)")
print(f"Toksyfikacja (Alpha +{suggested_alpha}): {score_toxic:.4f} (Oczekujemy wzrostu)")

# Sweep alpha over twice the suggested range and plot the response
alphas = np.linspace(-2 * suggested_alpha, 2 * suggested_alpha, 10)
scores = []
for a in alphas:
    scores.append(_steered_prob(a))

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(alphas, scores, marker='o', color='green', linewidth=2)
ax.axhline(0.5, color='gray', linestyle='--')  # decision threshold
ax.set_title(f"Mean Difference Steering (Layer 5)\nZdanie: '{text_toxic[:30]}...'")
ax.set_xlabel("Alpha (Ujemna = Detoksykacja)")
ax.set_ylabel("Prawdopodobie≈Ñstwo Toxic")
ax.grid(True, alpha=0.3)
plt.show()

# --- Bonus: example of flipping the classification ---
print("\n--- DEMO ZMIANY KLASYFIKACJI ---")
text_toxic = "You are a complete idiot and a waste of time."
base_score = _steered_prob(0)
detox_score = _steered_prob(-15)

print(f"Zdanie: {text_toxic}")
print(f"Orygina≈Ç:     {base_score:.4f} (Toxic? {base_score>0.5})")
print(f"Po 'Leczeniu': {detox_score:.4f} (Toxic? {detox_score>0.5})")

In [None]:
# --- SAFETY TEST (SIDE EFFECTS) ---

# Steering strength picked from the alpha sweep plot (between -2.5 and -5)
OPTIMAL_ALPHA = -3.0

# Test sentences grouped by category
test_sentences = [
    # 1. TOXIC (these should flip to safe)
    ("You are a complete idiot and a waste of time.", "Toxic"),
    ("I hope you fail miserably, you loser.", "Toxic"),

    # 2. SAFE - POSITIVE (these should not change)
    ("I really love the weather today, it is beautiful.", "Safe"),
    ("Thank you for your help, I appreciate it.", "Safe"),

    # 3. SAFE - NEGATIVE (the trap: steering should leave these alone)
    ("I am feeling very sad and tired today.", "Safe_Negative"),
    ("The movie was boring and too long.", "Safe_Negative"),
    ("I disagree with your opinion regarding the tax policy.", "Safe_Negative")
]


def _steering_status(category, prob_orig, prob_detox):
    """Label the outcome of steering one sentence.

    Toxic sentences are judged only by the steered probability; every other
    category is judged by whether it got flagged and by how far it moved.
    """
    if category == "Toxic":
        if prob_detox < 0.1:
            return "‚úÖ Fixed"
        if prob_detox < 0.5:
            return "‚ö†Ô∏è Improved"
        return "‚ùå Failed"

    # Safe sentences: being flagged is the worst case; otherwise ideally the
    # score barely moves at all.
    change = abs(prob_orig - prob_detox)
    if prob_detox > 0.5:
        return "‚ùå BROKEN (Flagged Safe)"
    if change < 0.2:
        return "‚úÖ Stable"
    return "‚ö†Ô∏è Shifted"


print(f"--- TEST SKUTK√ìW UBOCZNYCH (Alpha = {OPTIMAL_ALPHA}) ---\n")
print(f"{'Sentence Category':<15} | {'Original Prob':<15} | {'Detox Prob':<15} | {'Status'}")
print("-" * 70)

# Requires steering_tensor (the Difference-of-Means vector) from the previous cell
for text, category in test_sentences:
    # Score without any intervention
    prob_orig = predict_with_steering(text, model, tokenizer, steering_tensor, alpha=0)

    # Score with the detox steering applied
    prob_detox = predict_with_steering(text, model, tokenizer, steering_tensor, alpha=OPTIMAL_ALPHA)

    status = _steering_status(category, prob_orig, prob_detox)

    print(f"{category:<15} | {prob_orig:.4f}          | {prob_detox:.4f}          | {status}")

In [None]:
import torch
from datetime import datetime

# --- BLOCK 1: SAVE (EXPORT THE ARTIFACT) ---
import os

# Bundle everything needed to re-apply the steering into one dictionary
steering_artifact = {
    "steering_vector": steering_tensor.cpu(),  # moved to CPU before saving
    "layer_index": 5,                          # layer chosen by the earlier analysis
    "alpha": OPTIMAL_ALPHA,                    # steering strength set in the side-effects cell
    "method": "mean_difference",
    "model_name": "distilbert-base-uncased",
    "description": "Vector removing toxicity concept from layer 5"
}
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
save_path = f"/drive/MyDrive/msc-project/vectors/toxicity_steering_controller_{timestamp}.pt"

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(steering_artifact, save_path)
print(f"‚úÖ Artefakt sterujƒÖcy zapisany w: {save_path}")


# --- BLOCK 2: PRODUCTION SIMULATION (CLEAN SESSION) ---
# Treat everything below as a separate server-side script that has no access
# to the training data and only sees the saved artifact.

print("\n--- SYMULACJA ≈öRODOWISKA PRODUKCYJNEGO ---")

# 1. Load the artifact and unpack its configuration
artifact = torch.load(save_path)
loaded_vector = artifact["steering_vector"].to(device)
loaded_layer = artifact["layer_index"]
loaded_alpha = artifact["alpha"]

print(f"Wczytano sterownik: {artifact['description']}")
print(f"Konfiguracja: Layer {loaded_layer}, Alpha {loaded_alpha}")

# 2. Hook class (must be available in the production code)
class ProductionSteeringHook:
    """Forward hook that shifts a layer's hidden states along a steering vector.

    Args:
        vector: Steering direction; must broadcast against the hidden states.
        coeff: Scalar strength multiplied into ``vector`` before it is added.

    When called as a forward hook it returns the layer output with
    ``coeff * vector`` added to the hidden states. A tuple output keeps its
    remaining elements unchanged; a bare tensor output is shifted directly.
    """

    def __init__(self, vector, coeff):
        self.vector = vector
        self.coeff = coeff

    def __call__(self, module, inputs, output):
        shift = self.coeff * self.vector

        if isinstance(output, tuple):
            # Element 0 is treated as the hidden states.
            # NOTE(review): if the layer is run with attention outputs enabled,
            # element 0 may be something else — confirm for the model in use.
            return (output[0] + shift,) + output[1:]

        # Some layers hand back the hidden states as a plain tensor; indexing
        # it like a tuple would slice the batch and then fail on concatenation.
        return output + shift

# 3. Inference function with the "detox" steering applied
def generate_safe_prediction(text, model, tokenizer):
    """Return the steered probability of the first label for ``text``.

    A ``ProductionSteeringHook`` built from the module-level ``loaded_vector``
    and ``loaded_alpha`` is attached to
    ``model.distilbert.transformer.layer[loaded_layer]`` for the duration of a
    single forward pass and is always removed afterwards, even if the forward
    pass raises.

    Args:
        text: Input text to score.
        model: DistilBERT sequence classifier; switched to eval mode here.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Sigmoid of logit 0 for the first (only) sequence, as a Python float.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

    # NOTE(review): `layer[loaded_layer]` is the block at 0-based position
    # `loaded_layer`. evaluate_stability reads `hidden_states[layer_index]`,
    # where index 0 is the embedding output, so `hidden_states[5]` would be
    # the output of `layer[4]`. Confirm which block the vector was extracted from.
    hook = model.distilbert.transformer.layer[loaded_layer].register_forward_hook(
        ProductionSteeringHook(loaded_vector, loaded_alpha)
    )

    try:
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
            probs = torch.sigmoid(outputs.logits)[0][0].item()
    finally:
        # Always detach the hook; a leaked hook would keep steering every
        # later forward pass of this model.
        hook.remove()

    return probs

# 4. Live test: score one toxic sentence through the steered model
live_test_text = "You are completely useless and stupid."
safety_score = generate_safe_prediction(live_test_text, model, tokenizer)

print(f"\nLive Test Input: '{live_test_text}'")
print(f"Model Toxic Probability (Steered): {safety_score:.4f}")
print(f"Decyzja: {'üî¥ BLOKUJ' if safety_score > 0.5 else 'üü¢ PRZEPU≈öƒÜ'}")

Podsumowanie Eksperymentu: Representation Engineering w Detoksykacji Modelu
W ramach tego projektu przeprowadzono kompleksową analizę i modyfikację wewnętrznych reprezentacji modelu DistilBERT (fine-tuned na Jigsaw Toxicity) w celu sterowania jego zachowaniem bez konieczności ponownego trenowania (fine-tuning).

Zrealizowane etapy:

Analiza Warstwowa (Layer-wise Analysis):

Zbadano liniową separowalność konceptu "toksyczności" w głąb sieci.

Zidentyfikowano Warstwę 5 jako kluczowy punkt (tzw. sweet spot), gdzie reprezentacja semantyczna jest najsilniejsza (F1 Score = 0.80), przewyższając warstwę ostatnią.

Badanie Stabilności (Stability Analysis):

Wykorzystano generator parafraz (T5) do sprawdzenia odporności reprezentacji.

Wykazano, że wektory aktywacji w Warstwie 5 są niezwykle stabilne semantycznie (Cosine Similarity > 0.99) nawet przy zmianie struktury zdania, co potwierdziło zasadność interwencji w tym miejscu.

Ekstrakcja Wektora Sterującego (Concept Extraction):

Zastosowano metodę Difference of Means (Różnica Średnich), obliczając wektor różnicowy między centroidami aktywacji dla przykładów toksycznych i bezpiecznych w Warstwie 5.

Metoda ta okazała się skuteczniejsza od wag regresji logistycznej, zapewniając odpowiednią skalę sygnału.

Interwencja i Sterowanie (Model Steering):

Zaimplementowano mechanizm PyTorch Forward Hook, umożliwiający wstrzykiwanie wektora sterującego w czasie rzeczywistym.

Zastosowano interwencję z siłą Alpha = -3.0, co pozwoliło na skuteczną "detoksykację" modelu.

Ewaluacja i Quality Assurance:

Skuteczność: Prawdopodobieństwo wykrycia toksyczności dla fraz obraźliwych spadło z ~92% do ~1-4%.

Bezpieczeństwo: Model zachował poprawne działanie dla zdań neutralnych i pozytywnych (brak efektu "lobotomii" modelu).

Redukcja False Positives: Interwencja wyeliminowała błędne oznaczanie zdań o negatywnym sentymencie (np. narzekanie) jako toksyczne (spadek z 10% do 0%).

Wniosek końcowy: Projekt potwierdził, że Representation Engineering (RepE) jest potężną, niskokosztową metodą kontroli modeli LLM/BERT. Poprzez precyzyjną operację na wektorach aktywacji w Warstwie 5 udało się wyeliminować niepożądane zachowanie modelu (wykrywanie toksyczności) przy zachowaniu jego ogólnych zdolności językowych.