In [None]:
import csv
import matplotlib.pyplot as plt

class F1Tracker:
    def __init__(self, experiment_name, save_csv=True, csv_path="f1_results.csv"):
        self.experiment_name = experiment_name
        self.percent_silenced = []
        self.f1_scores = []
        self.save_csv = save_csv
        self.csv_path = csv_path
        if self.save_csv:
            with open(self.csv_path, mode='w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(["experiment", "percent_silenced", "f1_score"])

    def add(self, percent, f1_score):
        self.percent_silenced.append(percent)
        self.f1_scores.append(f1_score)
        if self.save_csv:
            with open(self.csv_path, mode='a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow([self.experiment_name, percent, f1_score])

    def plot(self, show=True, save_path=None):
        plt.plot(self.percent_silenced, self.f1_scores, marker='o')
        plt.title(f"F1-score - {self.experiment_name}")
        plt.xlabel("% Neuronas Silenciadas")
        plt.ylabel("F1-score")
        plt.ylim(0, 1)
        plt.grid(True)
        if save_path:
            plt.savefig(save_path, bbox_inches='tight')
        if show:
            plt.show()
        plt.clf()

# Configs

In [1]:
import os
import torch
import json
import logging
import time
import numpy as np
import pandas as pd
import pickle
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import neurox.data.extraction.transformers_extractor as transformers_extractor
from neurox.data.writer import ActivationsWriter
import neurox.data.loader as data_loader
from transformers import AutoConfig
from tqdm import tqdm
import neurox.interpretation.linear_probe as linear_probe
import neurox.interpretation.utils as utils
import neurox.analysis.visualization as TransformersVisualizer
from sklearn.model_selection import train_test_split
from IPython.display import display
import neurox.interpretation.probeless as probeless
from neurox.interpretation.probeless import (
    get_neuron_ordering,
    get_neuron_ordering_for_all_tags
)
import ast
from torch.cuda.amp import autocast
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from matplotlib_venn import venn2
from neurox.interpretation.linear_probe import get_top_neurons
from sklearn.utils import shuffle

In [2]:
import logging

# ==========================
# 📜 Configure Logging 
# ==========================

logger = logging.getLogger("synapse_logger")
logger.setLevel(logging.INFO)

# Avoid duplicates
if not logger.hasHandlers():

    # 📁 Handler 
    file_handler = logging.FileHandler("logs/synapse_extraction_csv_pth.log", mode="w")
    file_handler.setLevel(logging.INFO)

    # 🖥️ Handler 
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)

    # Format
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    # Add handlers to main logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

logger.info("🚀 Logging configured")


2025-07-09 13:03:14,528 - INFO - 🚀 Logging configured


In [3]:
# ==========================
# SYNAPSE Model Configuration
# ==========================

# 🧠 Select the model (options: "BERT", "BigBird", "DistilBERT", "Longformer")
MODEL = "BigBird"

# 📁 Paths based on model name
BASE_PATH = f"data/{MODEL}"
input_csv = f"{BASE_PATH}/{MODEL}_tokens_PT.csv"
output_csv = f"{BASE_PATH}/reduced/{MODEL}_tokens_reduced.csv"
labels_output_path = f"{BASE_PATH}/labels_numeric.txt"
label_mapping_path = f"{BASE_PATH}/labels_mapping.json"
activations_file = f"{BASE_PATH}/activations.json"
weights_path = f"{BASE_PATH}/best_model_{MODEL}.pth"

# 🔢 Number of labels
NUM_LABELS = 5


# 🔧 HuggingFace model mapping
MODEL_HF = {
    "BERT": "bert-base-uncased",
    "BigBird": "google/bigbird-roberta-base",
    "DistilBERT": "distilbert-base-uncased",
    "Longformer": "allenai/longformer-base-4096"
}[MODEL]

# ⚙️ Device selection
device = torch.device("cpu")

In [4]:
# ==========================
# Load Model and Weights
# ==========================
from transformers import AutoConfig

model = AutoModelForSequenceClassification.from_pretrained(MODEL_HF, num_labels=NUM_LABELS)

# Load trained weights from disk
state_dict = torch.load(weights_path, map_location=device)
model.load_state_dict(state_dict)
model.to(device)
model.eval()

print(f"✅ Loaded {MODEL} with pretrained weights on {device}")

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Loaded BigBird with pretrained weights on cpu


### Load dataset

In [5]:
reduction_ratio = 0.001

# ==========================
# ✅ Skip dataset reduction if already available
# ==========================
if os.path.exists(output_csv) and os.path.exists(labels_output_path):
    logger.info(f"⚡ Reduced dataset found: {output_csv}. Skipping reduction.")
    df_reduced = pd.read_csv(output_csv)
    with open(labels_output_path, "r") as f:
        labels = [int(line.strip()) for line in f]  # Labels as integers
    with open(label_mapping_path, "r") as f:
        label_mapping = json.load(f)  # Load label mapping
else:
    logger.info(f"🔄 Loading dataset from {input_csv}")

    chunk_size = 5000 
    total_rows = sum(1 for _ in open(input_csv)) - 1  # Total rows excluding header
    df_chunks = []

    logger.info(f"🔄 Processing {total_rows} rows in chunks of {chunk_size}...")

    with tqdm(total=total_rows, desc="Processing rows", unit=" rows") as pbar:
        for chunk in pd.read_csv(input_csv, chunksize=chunk_size):
            # Convert `input_ids` from string to list of integers
            chunk['input_ids'] = chunk['input_ids'].apply(lambda x: list(map(int, x.strip("[]").split(","))))
            df_chunks.append(chunk)
            pbar.update(len(chunk))

    df = pd.concat(df_chunks, ignore_index=True)

    # ==========================
    # 🔢 Encode labels as integers
    # ==========================
    df['label'], unique_labels = pd.factorize(df["label"])
    label_mapping = {label: int(idx) for idx, label in enumerate(unique_labels)}

    # ==========================
    # 🧪 Reduce dataset maintaining class proportions
    # ==========================
    df_reduced, _ = train_test_split(df, train_size=reduction_ratio, stratify=df["label"], random_state=42)
    labels = df_reduced["label"].tolist()

    # ==========================
    # 💾 Save reduced dataset and labels
    # ==========================
    df_reduced.to_csv(output_csv, index=False)
    with open(labels_output_path, "w") as f:
        for label in labels:
            f.write(str(label) + "\n")

    with open(label_mapping_path, "w") as f:
        json.dump(label_mapping, f, indent=4)

    logger.info(f"✅ Reduced dataset saved to {output_csv}")
    logger.info(f"✅ Numeric labels saved to {labels_output_path}")
    logger.info(f"✅ Label mapping saved to {label_mapping_path}")


2025-07-09 13:03:16,055 - INFO - ⚡ Reduced dataset found: data/BigBird/reduced/BigBird_tokens_reduced.csv. Skipping reduction.


### Create Dataloader

In [6]:
# ==========================
# 📦 Create DataLoader
# ==========================
class SyscallDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.data.iloc[idx]['input_ids'])
        label = torch.tensor(self.data.iloc[idx]['label'])
        return input_ids, label

# Initialize DataLoader with reduced dataset
dataset = SyscallDataset(df_reduced)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)

logger.info("✅ Dataloader created")

# Ensure `input_ids` are lists of integers
if isinstance(df_reduced["input_ids"].iloc[0], str):
    df_reduced["input_ids"] = df_reduced["input_ids"].apply(lambda x: list(map(int, x.strip("[]").split(","))))


2025-07-09 13:03:16,402 - INFO - ✅ Dataloader created


# NeuroX

## Activation Extraction

In [7]:
if os.path.exists(activations_file):
    logger.info(f"⚡ Activations file found: {activations_file}. Skipping extraction.")
else:
    transformers_extractor.extract_representations(
        model, 
        df_reduced["input_ids"].tolist(),  # Pass preprocessed tokens directly
        activations_file,
        device=device,
    )

    logger.info(f"✅ Activations saved to {activations_file}")


2025-07-09 13:03:18,724 - INFO - ⚡ Activations file found: data/BigBird/activations.json. Skipping extraction.


## Load Activations

In [8]:
activations, num_layers = data_loader.load_activations(activations_file)
logger.info(f"✅ Loaded activations from {activations_file} with {num_layers} layers")

# Load sentence-level classification data using activations
tokens = data_loader.load_sentence_data(
    output_csv, labels_output_path, activations
)

# Create sentence-level tensors for classification
X, y, mapping = utils.create_tensors(
    tokens,
    activations,
    task_specific_tag="NN",
    task_type="classification"
)

label2idx, idx2label, src2idx, idx2src = mapping
logger.info("✅ Created input/output tensors and label mappings for classification")

2025-07-09 13:03:19,649 - INFO - ✅ Loaded activations from data/BigBird/activations.json with 12 layers
2025-07-09 13:03:19,659 - INFO - ✅ Created input/output tensors and label mappings for classification


Loading json activations from data/BigBird/activations.json...
50 12.0
Number of tokens:  50
length of source dictionary:  17
length of target dictionary:  5
50
Total instances: 50
['s']
Number of samples:  50
Stats: Labels with their frequencies in the final set
1 10
3 15
0 7
4 9
2 9


## Train linear probe

In [9]:
probe = linear_probe.train_logistic_regression_probe(X, y, lambda_l1=0.001, lambda_l2=0.001)
scores = linear_probe.evaluate_probe(probe, X, y, idx_to_class=idx2label)
logger.info(f"🎯 Probe evaluation results: {scores}")

top_neurons_probe, per_class_top_neurons = linear_probe.get_top_neurons(probe, percentage=0.1, class_to_idx=label2idx)
logger.info(f"🔍 Top global neurons: {top_neurons_probe}")
logger.info(f"🔍 Top neurons per class: {per_class_top_neurons}")

Clases en y_train: [0 1 2 3 4]
Training classification probe
Creating model...
Number of training instances: 50
Number of classes: 5


epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0820


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0427


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0273


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0251


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0199


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0168


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0164


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0160


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0152


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0144


Evaluating: 0it [00:00, ?it/s]

2025-07-09 13:03:20,779 - INFO - 🎯 Probe evaluation results: {'__OVERALL__': np.float64(0.96), '1': np.float64(1.0), '3': np.float64(0.9333333333333333), '0': np.float64(1.0), '4': np.float64(1.0), '2': np.float64(0.8888888888888888)}
2025-07-09 13:03:20,783 - INFO - 🔍 Top global neurons: [8193 8194 6152 ... 4093 6142 8191]
2025-07-09 13:03:20,785 - INFO - 🔍 Top neurons per class: {'1': array([6272, 8698, 8468, 5669, 7878, 9116, 9077, 7328, 8480, 7111, 8466,
       7730, 8745, 9148, 8678, 9183, 8982, 6387, 7962, 9006, 6602, 8582,
       8645, 6398, 8182, 9021, 8795, 6502, 5629, 9072, 9098, 8349, 8751,
       7585, 7468, 7166, 7963, 7794, 9195, 6993, 8842, 7805, 7968, 8135,
       9134, 8249, 8565, 8867, 9047, 7862, 8132, 7027, 9184, 8584, 8519,
       8296, 8116, 7257, 7474, 8156, 8457, 8913, 8212, 7971, 6276, 9182,
       9210, 7074, 7795, 9200, 8718, 9208, 7165, 7959, 8781, 8464, 8664,
       8624, 8518, 8895, 7956, 8052, 8625, 7561, 9025, 8752, 5920, 7109,
       8383, 8337, 9099, 8

Score (accuracy) of the probe: 0.96


# Experiments

## Original performance

In [10]:
df = pd.read_csv(input_csv)

# 🎯 Select 50 random examples and reset index
sample_df = df.sample(n=50, random_state=42).reset_index(drop=True)

# 🧹 Convert "input_ids" and "attention_mask" from string to list format
def parse_list(x):
    return ast.literal_eval(x)

sample_df['input_ids'] = sample_df['input_ids'].apply(parse_list)
sample_df['attention_mask'] = sample_df['attention_mask'].apply(parse_list)

# 🔢 Encode labels to integers
label_encoder = LabelEncoder()
sample_df['label'] = label_encoder.fit_transform(sample_df['label'])
labels_list = sample_df['label'].tolist()

predictions_list = []
model.eval()
torch.cuda.empty_cache()

for i in range(len(sample_df)):
    input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids']).unsqueeze(0).to(device)
    attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask']).unsqueeze(0).to(device)

    with torch.no_grad():
        with autocast():
            device = torch.device("cpu")
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
            logits = outputs['logits']
            pred = torch.argmax(logits, dim=1).item()
            predictions_list.append(pred)

    del input_ids_tensor, attention_mask_tensor, outputs, logits
    torch.cuda.empty_cache()

# 📊 Compute evaluation metrics
accuracy = accuracy_score(labels_list, predictions_list)
f1 = f1_score(labels_list, predictions_list, average='weighted')
report_dict = classification_report(labels_list, predictions_list, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()

# Round for readability
report_df = report_df.round(4)

# Add accuracy as a separate row
accuracy_row = pd.DataFrame({'precision': accuracy, 'recall': accuracy, 'f1-score': accuracy, 'support': sum(report_df['support'])}, index=['accuracy'])
report_df = pd.concat([report_df, accuracy_row])

# 📝 Save or append to CSV

experiment_title = "🧪 Sample of 30 - Full Model Evaluation"
csv_report_path = f"{BASE_PATH}/classification_report_sample_eval.csv"

# Remove incorrect 'accuracy' row if it exists
report_df = report_df.drop("accuracy", errors="ignore")

# Append correct accuracy row
accuracy_row = pd.DataFrame({
    'precision': [""],
    'recall': [""],
    'f1-score': [accuracy],
    'support': [sum(report_df["support"])]
}, index=["overall_accuracy"])

# Combine
final_df = pd.concat([report_df, accuracy_row])

# Write to CSV with experiment title as a header
with open(csv_report_path, "a") as f:
    f.write(f"\n\n# {experiment_title}\n")
final_df.to_csv(csv_report_path, mode="a")

logger.info(f"📁 Appended classification report with title '{experiment_title}' to {csv_report_path}")


  with autocast():
2025-07-09 13:04:45,014 - INFO - 📁 Appended classification report with title '🧪 Sample of 30 - Full Model Evaluation' to data/BigBird/classification_report_sample_eval.csv


## Silence and evaluate neurons. Function definition

### Full silencing

In [11]:
def get_top_k_neurons_exact(probe, percentage: float) -> list[int]:
    """
    Return exactly N = round(total_neurons * percentage) neuron indices, sorted by importance.
    Importance is measured as the sum of absolute values of weights across all output classes.
    """
    weight_matrix = probe.linear.weight.detach().abs()  # [num_classes, num_neurons]
    importance = weight_matrix.sum(dim=0).cpu().numpy()  # [num_neurons]
    total_neurons = len(importance)
    top_n = round(total_neurons * percentage)

    sorted_indices = importance.argsort()[-top_n:]  # Top-N by importance
    return sorted_indices.tolist()

In [12]:
def make_cls_silence_hook(indices):
    indices = [int(i) for i in indices]
    indices_tensor = torch.tensor(indices, dtype=torch.long) if indices else None

    def hook(module, input, output):
        if output.dim() == 3:
            new_output = output.clone()
            cls_token = new_output[:, 0, :]
            mask = torch.ones_like(cls_token)
            if indices_tensor is not None:
                local_indices = indices_tensor.to(new_output.device)
                mask[:, local_indices] = 0.0
            new_output[:, 0, :] = cls_token * mask
            return new_output
        return output
    return hook

In [13]:
# ==========================
# Get encoder layers dynamically
# ==========================
def get_encoder_layers(model):
    if hasattr(model, "bert"):
        return model.bert.encoder.layer
    elif hasattr(model, "longformer"):
        return model.longformer.encoder.layer
    elif hasattr(model, "distilbert"):
        return model.distilbert.transformer.layer
    else:
        raise NotImplementedError("❌ Unsupported model architecture.")

In [14]:
def silence_top_global_percentage_and_evaluate(
    model,
    sample_df,
    labels_list,
    probe,
    label2idx,
    percentage=0.10,
    report_path=None,
    experiment_title=None
):
    hidden_dim = model.config.hidden_size
    num_layers = model.config.num_hidden_layers
    total_neurons = num_layers * hidden_dim

    top_neurons_global = get_top_k_neurons_exact(probe, percentage=percentage)
    logger.info(f"🔧 Silencing exactly {len(top_neurons_global)} neurons ({percentage:.2%} of total {total_neurons})")

    # Save neuron indices
    neurons_dir = f"{BASE_PATH}/neurons"
    os.makedirs(neurons_dir, exist_ok=True)
    json_path = f"{neurons_dir}/top_{int(percentage * 100)}p_neurons_global.json"
    with open(json_path, "w") as f:
        json.dump(top_neurons_global, f, indent=4)
    logger.info(f"📁 Saved neuron indices to {json_path}")

    # Register hooks per layer (compatible with BERT, DistilBERT, etc.)
    encoder_layers = get_encoder_layers(model)
    hook_handles = []
    for i in range(num_layers):
        indices_layer = [idx - i * hidden_dim for idx in top_neurons_global if i * hidden_dim <= idx < (i + 1) * hidden_dim]
        if indices_layer:
            logger.info(f"📌 Layer {i}: silencing {len(indices_layer)} neurons")
            # Use 'output' submodule if exists (BERT, RoBERTa), else register on main layer (DistilBERT)
            if hasattr(encoder_layers[i], "output"):
                handle = encoder_layers[i].output.register_forward_hook(make_cls_silence_hook(indices_layer))
            else:
                handle = encoder_layers[i].register_forward_hook(make_cls_silence_hook(indices_layer))
            hook_handles.append(handle)

    # Inference
    model.eval()
    predictions = []
    for i in range(len(sample_df)):
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids']).unsqueeze(0).to(model.device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask']).unsqueeze(0).to(model.device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
            logits = outputs['logits']
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)

        del input_ids_tensor, attention_mask_tensor, outputs, logits
        torch.cuda.empty_cache()

    # Metrics
    accuracy = accuracy_score(labels_list, predictions)
    f1 = f1_score(labels_list, predictions, average='weighted')
    report_dict = classification_report(labels_list, predictions, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose().round(4)
    report_df = report_df.drop("accuracy", errors="ignore")

    accuracy_row = pd.DataFrame({
        'precision': [""],
        'recall': [""],
        'f1-score': [accuracy],
        'support': [sum(report_df["support"])]
    }, index=["overall_accuracy"])
    final_df = pd.concat([report_df, accuracy_row])

    # Save classification report
    if report_path is None:
        report_path = f"{BASE_PATH}/results/full_silencing.csv"
    os.makedirs(os.path.dirname(report_path), exist_ok=True)

    if experiment_title is None:
        experiment_title = f"Silencing top {percentage:.2%} global neurons"

    if not os.path.exists(report_path):
        with open(report_path, "w") as f:
            f.write(f"# {experiment_title}\n")
            final_df.to_csv(f)
    else:
        with open(report_path, "a") as f:
            f.write(f"\n\n# {experiment_title}\n")
        final_df.to_csv(report_path, mode="a")

    logger.info(f"🎯 Accuracy after silencing: {accuracy:.4f}")
    logger.info(f"📏 Weighted F1 Score: {f1:.4f}")
    logger.info(f"📋 Classification report saved to {report_path}")

    # Remove all hooks
    for handle in hook_handles:
        handle.remove()
    logger.info("✅ All hooks removed after evaluation")

## Global impact

In [None]:
percentages = [0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]

for pct in percentages:
    silence_top_global_percentage_and_evaluate(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        probe=probe,
        label2idx=label2idx,
        percentage=pct,
        experiment_title=f"Silencing {pct*100:.1f}% Global Neurons"
    )


In [None]:
import matplotlib.pyplot as plt
from collections import defaultdict
import re

def plot_f1_scores_from_file(filepath, output_path= f"{BASE_PATH}/figs/f1_curve_global.png"):
    with open(filepath, 'r') as f:
        lines = f.readlines()

    # Estructuras para almacenar los datos
    results = defaultdict(list)  # clase -> lista de F1-scores
    global_f1_scores = {}
    percentages = []
    current_percentage = None

    # Parsear el contenido
    for line in lines:
        line = line.strip()

        # Detectar inicio de bloque
        if line.startswith("# Silencing"):
            match = re.search(r"# Silencing ([\d.]+)% Global Neurons", line)
            if match:
                current_percentage = float(match.group(1))
                percentages.append(current_percentage)
            continue

        if line.startswith(",precision") or not line:
            continue

        parts = line.split(",")
        label = parts[0]

        if label.isdigit():
            class_index = int(label)
            try:
                f1 = float(parts[3])
                results[class_index].append(f1)
            except:
                results[class_index].append(None)
        elif label == "weighted avg":
            try:
                global_f1_scores[current_percentage] = float(parts[3])
            except:
                global_f1_scores[current_percentage] = None

    # Ordenar porcentajes
    sorted_percentages = sorted(global_f1_scores.keys())
    global_f1 = [global_f1_scores[p] for p in sorted_percentages]

    # Crear la figura
    plt.figure(figsize=(10, 6))

    # F1 global
    plt.plot(sorted_percentages, global_f1, label="Global F1", linewidth=2, marker='o')

    # F1 por clase
    for class_index in sorted(results.keys()):
        f1s = results[class_index]
        plt.plot(sorted_percentages, f1s, label=f"Class {class_index}", linestyle='--', marker='x')

    # Formato
    plt.title("F1 Scores vs. % of Silenced Global Neurons")
    plt.xlabel("% of Silenced Neurons")
    plt.ylabel("F1 Score")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()

    # Guardar como PNG
    plt.savefig(output_path, dpi=300)
    print(f"Gráfica guardada como: {output_path}")
    plt.close()


In [None]:
plot_f1_scores_from_file(f"{BASE_PATH}/results/full_silencing.csv")

## Impact per class

In [15]:
def get_top_k_neurons_for_class_exact(probe, percentage: float, class_to_idx: dict, class_id: int) -> list[int]:
    """
    Return top-k neurons most important for a specific class, measured by absolute weight.
    """
    weight_matrix = probe.linear.weight.detach().abs()  # [num_classes, num_neurons]
    class_weights = weight_matrix[class_id]  # [num_neurons]
    total_neurons = class_weights.size(0)
    top_n = round(percentage * total_neurons)
    top_indices = class_weights.cpu().numpy().argsort()[-top_n:]
    return top_indices.tolist()

In [16]:
# ==========================
# Get encoder layers dynamically
# ==========================
def get_encoder_layers(model):
    if hasattr(model, "bert"):
        return model.bert.encoder.layer
    elif hasattr(model, "longformer"):
        return model.longformer.encoder.layer
    elif hasattr(model, "distilbert"):
        return model.distilbert.transformer.layer
    else:
        raise NotImplementedError("❌ Unsupported model architecture.")

# ==========================
# Silence and evaluate top per-class neurons
# ==========================

def silence_top_class_percentage_and_evaluate(
    model,
    sample_df,
    labels_list,
    probe,
    label2idx,
    class_id: int,
    percentage: float = 0.1,
    report_path: str = None,
    experiment_title: str = None
):
    class_name = f"class_{class_id}"
    hidden_dim = model.config.hidden_size
    num_layers = model.config.num_hidden_layers
    total_neurons = num_layers * hidden_dim

    # Get top neurons for specific class
    top_class_neurons = get_top_k_neurons_for_class_exact(
        probe, percentage=percentage, class_to_idx=label2idx, class_id=class_id
    )

    logger.info(f"🔧 Silencing {len(top_class_neurons)} neurons for class {class_id} ({percentage:.2%} of total)")

    # Save neurons to JSON
    neurons_dir = f"{BASE_PATH}/neurons"
    os.makedirs(neurons_dir, exist_ok=True)
    json_path = f"{neurons_dir}/top_{int(percentage * 100)}p_neurons_{class_name}.json"
    with open(json_path, "w") as f:
        json.dump(top_class_neurons, f, indent=4)
    logger.info(f"📁 Saved neuron indices to {json_path}")

    # Register hooks per layer
    encoder_layers = get_encoder_layers(model)
    hook_handles = []
    for i in range(num_layers):
        indices_layer = [idx - i * hidden_dim for idx in top_class_neurons if i * hidden_dim <= idx < (i + 1) * hidden_dim]
        if indices_layer:
            logger.info(f"📌 Layer {i}: silencing {len(indices_layer)} neurons for class {class_id}")
            handle = encoder_layers[i].output.register_forward_hook(make_cls_silence_hook(indices_layer))
            hook_handles.append(handle)

    # Inference
    model.eval()
    predictions = []
    for i in range(len(sample_df)):
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids']).unsqueeze(0).to(model.device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask']).unsqueeze(0).to(model.device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
            logits = outputs['logits']
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)

        del input_ids_tensor, attention_mask_tensor, outputs, logits
        torch.cuda.empty_cache()

    # Metrics
    accuracy = accuracy_score(labels_list, predictions)
    f1 = f1_score(labels_list, predictions, average='weighted')
    report_dict = classification_report(labels_list, predictions, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose().round(4)
    report_df = report_df.drop("accuracy", errors="ignore")

    accuracy_row = pd.DataFrame({
        'precision': [""],
        'recall': [""],
        'f1-score': [accuracy],
        'support': [sum(report_df["support"])]
    }, index=["overall_accuracy"])
    final_df = pd.concat([report_df, accuracy_row])

    # Save classification report
    if report_path is None:
        report_path = f"{BASE_PATH}/results/class_silencing{class_id}.csv"
    os.makedirs(os.path.dirname(report_path), exist_ok=True)

    if experiment_title is None:
        experiment_title = f"Silencing top {percentage:.2%} neurons for class {class_id}"

    if not os.path.exists(report_path):
        with open(report_path, "w") as f:
            f.write(f"# {experiment_title}\n")
            final_df.to_csv(f)
    else:
        with open(report_path, "a") as f:
            f.write(f"\n\n# {experiment_title}\n")
        final_df.to_csv(report_path, mode="a")

    logger.info(f"🎯 Accuracy after class-specific silencing: {accuracy:.4f}")
    logger.info(f"📏 Weighted F1 Score: {f1:.4f}")
    logger.info(f"📋 Classification report saved to {report_path}")

    # Remove all hooks
    for handle in hook_handles:
        handle.remove()
    logger.info("✅ All hooks removed after evaluation")

In [17]:
# percentages = [0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]
percentages = [0.50]
target_class_id = 4

for pct in percentages:
    silence_top_class_percentage_and_evaluate(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        probe=probe,
        label2idx=label2idx,
        class_id=target_class_id,
        percentage=pct,
        experiment_title=f"Silencing {pct*100:.1f}% of Neurons for Class {target_class_id}"
    )


2025-07-09 13:04:45,063 - INFO - 🔧 Silencing 4608 neurons for class 4 (50.00% of total)
2025-07-09 13:04:45,064 - INFO - 📁 Saved neuron indices to data/BigBird/neurons/top_50p_neurons_class_4.json
2025-07-09 13:04:45,065 - INFO - 📌 Layer 0: silencing 234 neurons for class 4
2025-07-09 13:04:45,065 - INFO - 📌 Layer 1: silencing 125 neurons for class 4
2025-07-09 13:04:45,066 - INFO - 📌 Layer 2: silencing 296 neurons for class 4
2025-07-09 13:04:45,066 - INFO - 📌 Layer 3: silencing 338 neurons for class 4
2025-07-09 13:04:45,067 - INFO - 📌 Layer 4: silencing 381 neurons for class 4
2025-07-09 13:04:45,067 - INFO - 📌 Layer 5: silencing 378 neurons for class 4
2025-07-09 13:04:45,068 - INFO - 📌 Layer 6: silencing 439 neurons for class 4
2025-07-09 13:04:45,068 - INFO - 📌 Layer 7: silencing 440 neurons for class 4
2025-07-09 13:04:45,069 - INFO - 📌 Layer 8: silencing 438 neurons for class 4
2025-07-09 13:04:45,069 - INFO - 📌 Layer 9: silencing 487 neurons for class 4
2025-07-09 13:04:45,070

In [None]:
percentages = [0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]

target_class_id = 3

for pct in percentages:
    silence_top_class_percentage_and_evaluate(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        probe=probe,
        label2idx=label2idx,
        class_id=target_class_id,
        percentage=pct,
        experiment_title=f"Silencing {pct*100:.1f}% of Neurons for Class {target_class_id}"
    )


In [None]:
percentages = [0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]

target_class_id = 2
for pct in percentages:
    silence_top_class_percentage_and_evaluate(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        probe=probe,
        label2idx=label2idx,
        class_id=target_class_id,
        percentage=pct,
        experiment_title=f"Silencing {pct*100:.1f}% of Neurons for Class {target_class_id}"
    )


In [None]:
percentages = [0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]

target_class_id = 1

for pct in percentages:
    silence_top_class_percentage_and_evaluate(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        probe=probe,
        label2idx=label2idx,
        class_id=target_class_id,
        percentage=pct,
        experiment_title=f"Silencing {pct*100:.1f}% of Neurons for Class {target_class_id}"
    )


In [None]:
percentages = [0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]

target_class_id = 0

for pct in percentages:
    silence_top_class_percentage_and_evaluate(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        probe=probe,
        label2idx=label2idx,
        class_id=target_class_id,
        percentage=pct,
        experiment_title=f"Silencing {pct*100:.1f}% of Neurons for Class {target_class_id}"
    )


# Attacks

## FGSM

In [None]:
from torch.nn import CrossEntropyLoss

def run_fgsm_attack_and_evaluate(
    model,
    sample_df,
    labels_list,
    epsilon: float = 0.1,
    report_path: str = f"{BASE_PATH}/fgsm.csv",
    experiment_title: str = None
):
    logger.info(f"⚔️ Running FGSM attack with ε = {epsilon}")
    model.eval()
    predictions_fgsm = []
    loss_fn = CrossEntropyLoss()

    is_longformer = hasattr(model, "longformer")

    for i in range(len(sample_df)):
        # ===============================
        # 🔢 Prepare input tensors
        # ===============================
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids'], dtype=torch.long).unsqueeze(0).to(model.device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask'], dtype=torch.long).unsqueeze(0).to(model.device)
        true_label = torch.tensor([labels_list[i]], dtype=torch.long).to(model.device)

        # ===============================
        # 🔍 Extract embeddings as leaf tensor
        # ===============================
        with torch.no_grad():
            embedding_output = model.base_model.embeddings(input_ids_tensor)
        embeds = embedding_output.clone().detach().requires_grad_(True)

        # ===============================
        # 🔁 Forward pass
        # ===============================
        if is_longformer:
            global_attention_mask = torch.zeros_like(attention_mask_tensor)
            global_attention_mask[:, 0] = 1
            outputs = model(
                inputs_embeds=embeds,
                attention_mask=attention_mask_tensor,
                global_attention_mask=global_attention_mask
            )
        else:
            outputs = model(
                inputs_embeds=embeds,
                attention_mask=attention_mask_tensor
            )

        logits = outputs.logits
        loss = loss_fn(logits, true_label)

        # ===============================
        # 🔁 Backward pass
        # ===============================
        model.zero_grad()
        loss.backward()

        # ===============================
        # ⚔️ FGSM perturbation
        # ===============================
        perturbation = epsilon * embeds.grad.data.sign()
        adv_embeds = embeds + perturbation

        # ===============================
        # 🔮 Inference with adversarial input
        # ===============================
        with torch.no_grad():
            if is_longformer:
                adv_outputs = model(
                    inputs_embeds=adv_embeds,
                    attention_mask=attention_mask_tensor,
                    global_attention_mask=global_attention_mask
                )
            else:
                adv_outputs = model(
                    inputs_embeds=adv_embeds,
                    attention_mask=attention_mask_tensor
                )
            adv_logits = adv_outputs.logits
            pred = torch.argmax(adv_logits, dim=1).item()
            predictions_fgsm.append(pred)

        del input_ids_tensor, attention_mask_tensor, embeds, adv_embeds, outputs, adv_outputs, logits, adv_logits
        torch.cuda.empty_cache()

    # ===============================
    # 📊 Evaluation
    # ===============================
    accuracy = accuracy_score(labels_list, predictions_fgsm)
    f1 = f1_score(labels_list, predictions_fgsm, average='weighted')
    report_dict = classification_report(labels_list, predictions_fgsm, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose().round(4).drop("accuracy", errors="ignore")

    accuracy_row = pd.DataFrame({
        'precision': [""],
        'recall': [""],
        'f1-score': [accuracy],
        'support': [sum(report_df["support"])]
    }, index=["overall_accuracy"])
    final_df = pd.concat([report_df, accuracy_row])

    # ===============================
    # 💾 Save report
    # ===============================
    if experiment_title is None:
        experiment_title = f"FGSM Attack (ε = {epsilon})"
    if not os.path.exists(report_path):
        with open(report_path, "w") as f:
            f.write(f"# {experiment_title}\n")
            final_df.to_csv(f)
    else:
        with open(report_path, "a") as f:
            f.write(f"\n\n# {experiment_title}\n")
        final_df.to_csv(report_path, mode="a")

    logger.info(f"🎯 Accuracy under FGSM (ε={epsilon}): {accuracy:.4f}")
    logger.info(f"📏 Weighted F1 Score: {f1:.4f}")
    logger.info(f"📋 Classification report saved to {report_path}")

In [None]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
model.to("cpu")
device = torch.device("cpu")


In [None]:
run_fgsm_attack_and_evaluate(
    model=model,
    sample_df=sample_df,
    labels_list=labels_list,
    epsilon=0.1,
    experiment_title="FGSM Adversarial Attack with ε = 0.1"
)

## Random Noise

In [None]:
def run_random_noise_attack(
    model,
    sample_df,
    labels_list,
    epsilon=0.3,
    device="cpu"
):
    import torch
    from sklearn.metrics import accuracy_score, f1_score, classification_report

    logger.info(f"🎲 Running random noise attack (epsilon={epsilon})")
    model.to(device)
    model.eval()
    predictions_noise = []

    is_longformer = hasattr(model, "longformer")

    for i in range(len(sample_df)):
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids'], dtype=torch.long).unsqueeze(0).to(device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask'], dtype=torch.long).unsqueeze(0).to(device)

        with torch.no_grad():
            embeddings = model.base_model.embeddings(input_ids_tensor)
            noisy_embeddings = embeddings + epsilon * torch.randn_like(embeddings)

            if is_longformer:
                global_attention_mask = torch.zeros_like(attention_mask_tensor)
                global_attention_mask[:, 0] = 1
                outputs = model(
                    inputs_embeds=noisy_embeddings,
                    attention_mask=attention_mask_tensor,
                    global_attention_mask=global_attention_mask
                )
            else:
                outputs = model(
                    inputs_embeds=noisy_embeddings,
                    attention_mask=attention_mask_tensor
                )

            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()
            predictions_noise.append(pred)

    accuracy = accuracy_score(labels_list, predictions_noise)
    f1 = f1_score(labels_list, predictions_noise, average='weighted')
    report = classification_report(labels_list, predictions_noise)

    logger.info(f"📉 Random noise attack results (epsilon={epsilon}):")
    logger.info(f"Accuracy: {accuracy:.4f} | F1 Score: {f1:.4f}")
    logger.info(f"\nClassification Report:\n{report}")

    return accuracy, f1, predictions_noise

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report

epsilons = np.linspace(0.1, 1.0, 10)  # 10 valores de 0.1 a 1.0
results = []

for epsilon in epsilons:
    logger.info(f"\n🎲 Running random noise attack with ε={epsilon:.2f}")

    acc_noise, f1_noise, preds_noise = run_random_noise_attack(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        epsilon=epsilon,
        device="cpu"
    )

    logger.info(f"📉 Results for ε={epsilon:.2f} → Accuracy: {acc_noise:.4f} | F1 Score: {f1_noise:.4f}")

    results.append({
        "epsilon": round(epsilon, 2),
        "accuracy": acc_noise,
        "f1_score": f1_noise
    })

# Convertimos resultados a dataframe
noise_sweep_df = pd.DataFrame(results)
noise_sweep_df.to_csv("results/random_noise_sweep_up_to_1.csv", index=False)

logger.info("📁 Random noise sweep results saved to 'results/random_noise_sweep_up_to_1.csv'")


# Conjuntos disjuntos de neuronas

In [None]:
from collections import defaultdict
import os
import json

top_percentage = 0.5  # 50% 

# Detect and convert (layer, neuron) tuples to global indices if needed
hidden_dim = model.config.hidden_size

def tuple_to_global_index(neuron_tuples, hidden_dim):
    return [layer * hidden_dim + neuron for (layer, neuron) in neuron_tuples]

per_class_top_indices = {}
for class_id, neuron_list in per_class_top_neurons.items():
    if len(neuron_list) > 0 and isinstance(neuron_list[0], tuple):
        per_class_top_indices[class_id] = tuple_to_global_index(neuron_list, hidden_dim)
    else:
        per_class_top_indices[class_id] = neuron_list

# Exclusive neurons: present in top of class A but not in any other class
exclusive_class_neurons = {}
for cid, own_top in per_class_top_indices.items():
    other = set()
    for other_cid, other_top in per_class_top_indices.items():
        if other_cid != cid:
            other.update(other_top)
    exclusive = sorted(set(own_top) - other)
    exclusive_class_neurons[cid] = exclusive
    print(f"Class {cid}: {len(exclusive)} exclusive neurons out of {len(own_top)} top neurons")

# Save exclusive neurons to JSON
exclusive_dir = f"{BASE_PATH}/exclusive_neurons"
os.makedirs(exclusive_dir, exist_ok=True)
for class_id, neuron_list in exclusive_class_neurons.items():
    path = f"{exclusive_dir}/exclusive_top{int(top_percentage*100)}p_class_{class_id}.json"
    with open(path, "w") as f:
        json.dump([int(x) for x in neuron_list], f, indent=2)
    logger.info(f"Saved exclusive neurons for class {class_id} to {path}")

2025-07-09 13:15:29,786 - INFO - Saved exclusive neurons for class 1 to data/BigBird/exclusive_neurons/exclusive_top50p_class_1.json
2025-07-09 13:15:29,787 - INFO - Saved exclusive neurons for class 3 to data/BigBird/exclusive_neurons/exclusive_top50p_class_3.json
2025-07-09 13:15:29,788 - INFO - Saved exclusive neurons for class 0 to data/BigBird/exclusive_neurons/exclusive_top50p_class_0.json
2025-07-09 13:15:29,789 - INFO - Saved exclusive neurons for class 4 to data/BigBird/exclusive_neurons/exclusive_top50p_class_4.json
2025-07-09 13:15:29,789 - INFO - Saved exclusive neurons for class 2 to data/BigBird/exclusive_neurons/exclusive_top50p_class_2.json


Class 1: 195 exclusive neurons out of 260 top neurons
Class 3: 148 exclusive neurons out of 243 top neurons
Class 0: 160 exclusive neurons out of 255 top neurons
Class 4: 144 exclusive neurons out of 227 top neurons
Class 2: 176 exclusive neurons out of 247 top neurons


In [19]:
def silence_exclusive_class_and_evaluate(
    model,
    sample_df,
    labels_list,
    exclusive_neuron_indices,
    class_id,
    report_path=None,
    experiment_title=None
):
    hidden_dim = model.config.hidden_size
    num_layers = model.config.num_hidden_layers

    logger.info(f"🔧 Silencing {len(exclusive_neuron_indices)} EXCLUSIVE neurons for class {class_id}")

    encoder_layers = get_encoder_layers(model)
    hook_handles = []
    for i in range(num_layers):
        indices_layer = [idx - i * hidden_dim for idx in exclusive_neuron_indices if i * hidden_dim <= idx < (i + 1) * hidden_dim]
        if indices_layer:
            logger.info(f"📌 Layer {i}: silencing {len(indices_layer)} exclusive neurons for class {class_id}")
            if hasattr(encoder_layers[i], "output"):
                handle = encoder_layers[i].output.register_forward_hook(make_cls_silence_hook(indices_layer))
            else:
                handle = encoder_layers[i].register_forward_hook(make_cls_silence_hook(indices_layer))
            hook_handles.append(handle)

    # --- Evaluation ---
    model.eval()
    predictions = []
    for i in range(len(sample_df)):
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids']).unsqueeze(0).to(model.device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask']).unsqueeze(0).to(model.device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
            logits = outputs['logits']
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)
        del input_ids_tensor, attention_mask_tensor, outputs, logits
        torch.cuda.empty_cache()

    # --- Metrics & reporting ---
    from sklearn.metrics import accuracy_score, f1_score, classification_report
    import pandas as pd
    accuracy = accuracy_score(labels_list, predictions)
    f1 = f1_score(labels_list, predictions, average='weighted')
    report_dict = classification_report(labels_list, predictions, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose().round(4)
    report_df = report_df.drop("accuracy", errors="ignore")

    accuracy_row = pd.DataFrame({
        'precision': [""],
        'recall': [""],
        'f1-score': [accuracy],
        'support': [sum(report_df["support"])]
    }, index=["overall_accuracy"])
    final_df = pd.concat([report_df, accuracy_row])

    # --- Save classification report ---
    if report_path is None:
        report_path = f"{BASE_PATH}/results/exclusive_class_silencing_{class_id}.csv"
    os.makedirs(os.path.dirname(report_path), exist_ok=True)
    if experiment_title is None:
        experiment_title = f"Silencing exclusive neurons for class {class_id}"
    if not os.path.exists(report_path):
        with open(report_path, "w") as f:
            f.write(f"# {experiment_title}\n")
            final_df.to_csv(f)
    else:
        with open(report_path, "a") as f:
            f.write(f"\n\n# {experiment_title}\n")
        final_df.to_csv(report_path, mode="a")

    logger.info(f"🎯 Accuracy after exclusive silencing: {accuracy:.4f}")
    logger.info(f"📏 Weighted F1 Score: {f1:.4f}")
    logger.info(f"📋 Classification report saved to {report_path}")

    # Remove hooks
    for handle in hook_handles:
        handle.remove()
    logger.info("✅ All hooks removed after evaluation")

In [21]:
# --- Run silencing experiments for each class with its exclusive neurons ---
for class_id, neuron_indices in exclusive_class_neurons.items():
    silence_exclusive_class_and_evaluate(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        exclusive_neuron_indices=neuron_indices,
        class_id=class_id,
        report_path=f"{BASE_PATH}/results/exclusive_class_silencing_{class_id}.csv",
        experiment_title=f"Silencing exclusive neurons for class {class_id}"
    )

2025-07-09 13:16:43,619 - INFO - 🔧 Silencing 195 EXCLUSIVE neurons for class 1
2025-07-09 13:16:43,620 - INFO - 📌 Layer 4: silencing 2 exclusive neurons for class 1
2025-07-09 13:16:43,621 - INFO - 📌 Layer 5: silencing 4 exclusive neurons for class 1
2025-07-09 13:16:43,622 - INFO - 📌 Layer 6: silencing 6 exclusive neurons for class 1
2025-07-09 13:16:43,622 - INFO - 📌 Layer 7: silencing 17 exclusive neurons for class 1
2025-07-09 13:16:43,623 - INFO - 📌 Layer 8: silencing 32 exclusive neurons for class 1
2025-07-09 13:16:43,623 - INFO - 📌 Layer 9: silencing 33 exclusive neurons for class 1
2025-07-09 13:16:43,624 - INFO - 📌 Layer 10: silencing 44 exclusive neurons for class 1
2025-07-09 13:16:43,624 - INFO - 📌 Layer 11: silencing 57 exclusive neurons for class 1
2025-07-09 13:17:59,077 - INFO - 🎯 Accuracy after exclusive silencing: 0.8600
2025-07-09 13:17:59,078 - INFO - 📏 Weighted F1 Score: 0.8552
2025-07-09 13:17:59,078 - INFO - 📋 Classification report saved to data/BigBird/results/

# GoEmotions

## Dataset Configuration

In [22]:
# 📁 Dataset and mappings
GOEMOTIONS_PATH = "data/goemotions"
INPUT_FILE = f"{GOEMOTIONS_PATH}/test.tsv"
EMOTIONS_FILE = f"{GOEMOTIONS_PATH}/emotions.txt"

# 🎯 Target emotions (subset of original GoEmotions)
TARGET_EMOTIONS = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]

# 🧠 Pretrained Model
GOEMOTIONS_MODEL_HF = "monologg/bert-base-cased-goemotions-original"

# 💾 Outputs
SAMPLE_OUTPUT = f"{GOEMOTIONS_PATH}/sample_60.json"
TOKENIZED_OUTPUT = f"{GOEMOTIONS_PATH}/tokenized.pt"
LABELS_OUTPUT = f"{GOEMOTIONS_PATH}/labels.pt"
LABEL_MAPPING_OUTPUT = f"{GOEMOTIONS_PATH}/label_mapping.json"
CSV_REPORT_PATH = f"{GOEMOTIONS_PATH}/classification_report_eval.csv"
CSV_REPORT_GOBAL_SILENCING = f"{GOEMOTIONS_PATH}/classification_report_global_silencing.csv"
ACTIVATIONS_GOEMOTIONS = f"{GOEMOTIONS_PATH}/activations.json"
SAMPLE_OUTPUT_JSON = "data/goemotions/sample_df.json"
# 📟 Device

device_goemo = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
if not os.path.exists(SAMPLE_OUTPUT):
    # 📥 Load emotion names
    with open(EMOTIONS_FILE, "r") as f:
        id2emotion = [line.strip() for line in f.readlines()]
    emotion2id = {e: i for i, e in enumerate(id2emotion)}

    # 🎯 Select target emotions and their GoEmotions IDs
    TARGET_EMOTIONS = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]
    target_ids = [emotion2id[e] for e in TARGET_EMOTIONS]

    # 🌐 Mapping from GoEmotion ID to 0–5 label
    goemo2local = {eid: i for i, eid in enumerate(target_ids)}

    # 📊 Load dataset
    df = pd.read_csv(INPUT_FILE, sep="\t", header=None, names=["text", "labels", "split"])
    df = df.dropna(subset=["labels"])
    df["label_ids"] = df["labels"].apply(lambda x: list(map(int, str(x).split(","))))

    # 🧼 Filter: single-label only & target emotions
    df_filtered = df[df["label_ids"].apply(lambda ids: len(ids) == 1 and ids[0] in target_ids)].copy()
    df_filtered["label_id"] = df_filtered["label_ids"].apply(lambda ids: goemo2local[ids[0]])

    # 📉 Count examples per class
    counts = df_filtered["label_id"].value_counts()
    print("Available examples for selected emotions:")
    print(counts)

    # 🎯 Balanced subset (max 10 per class)
    max_per_class = 10
    samples = []

    for label in counts.index:
        subset = df_filtered[df_filtered["label_id"] == label]
        sampled = shuffle(subset, random_state=42).iloc[:max_per_class]
        samples.append(sampled[["text", "label_id"]])

    df_final = pd.concat(samples).reset_index(drop=True)

    # 💾 Save to JSON
    df_final.to_json(SAMPLE_OUTPUT, orient="records", lines=True, force_ascii=False)
    print(f"\n✅ Saved dataset: {len(df_final)} examples (max {max_per_class} per emotion)")
else:
    print(f"⚠️ Skipping dataset generation: {SAMPLE_OUTPUT} already exists.")

⚠️ Skipping dataset generation: data/goemotions/sample_60.json already exists.


## Original Performance

### Load model, tokenizer and inputs

In [24]:
# 📥 Load dataset
with open(SAMPLE_OUTPUT, "r") as f:
    data = [json.loads(line) for line in f]

texts = [x["text"] for x in data]
labels = [x["label_id"] for x in data]

# 🔢 Label mappings
label2id = {label: i for i, label in enumerate(sorted(set(labels)))}
id2label = {i: label for label, i in label2id.items()}
label_ids = [label2id[label] for label in labels]

# 🔠 Tokenize and save only if not already saved
tokenizer = AutoTokenizer.from_pretrained(GOEMOTIONS_MODEL_HF)

if not os.path.exists(TOKENIZED_OUTPUT):
    encodings = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    torch.save(encodings, TOKENIZED_OUTPUT)
    logger.info("✅ Tokenized inputs saved.")
else:
    logger.warning(f"⚠️ Skipping: {TOKENIZED_OUTPUT} already exists.")

if not os.path.exists(LABELS_OUTPUT):
    torch.save(torch.tensor(label_ids), LABELS_OUTPUT)
    logger.info("✅ Label tensor saved.")
else:
    logger.warning(f"⚠️ Skipping: {LABELS_OUTPUT} already exists.")

if not os.path.exists(LABEL_MAPPING_OUTPUT):
    with open(LABEL_MAPPING_OUTPUT, "w") as f:
        json.dump(label2id, f, indent=2)
    logger.info("✅ Label mapping saved.")
else:
    logger.warning(f"⚠️ Skipping: {LABEL_MAPPING_OUTPUT} already exists.")

# ✅ NEW: Generate sample_df for later neuron silencing evaluation
if not os.path.exists(SAMPLE_OUTPUT_JSON):
    logger.info("📄 Creating and saving sample_df.json for evaluation hooks...")
    sample_rows = []
    for text in texts:
        encoded = tokenizer(text, truncation=True, padding="max_length", max_length=128)
        sample_rows.append({
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"]
        })
    sample_df = pd.DataFrame(sample_rows)
    sample_df.to_json(SAMPLE_OUTPUT_JSON, orient="records", lines=True)
    logger.info(f"✅ sample_df saved to {SAMPLE_OUTPUT_JSON}")
else:
    logger.warning(f"⚠️ Skipping: {SAMPLE_OUTPUT_JSON} already exists.")

# Summary
logger.info("🧠 Emotions (IDs): %s", sorted(label2id.keys()))
logger.info("🔢 Label mapping: %s", label2id)

2025-07-09 18:05:15,967 - INFO - 🧠 Emotions (IDs): [0, 1, 2, 3, 4, 5]
2025-07-09 18:05:15,967 - INFO - 🔢 Label mapping: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}


## Inference

In [25]:
from tqdm import tqdm
import torch

# Define the target GoEmotions IDs
target_emotion_names = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]

# Load emotion mapping
with open(EMOTIONS_FILE, "r") as f:
    id2emotion = [line.strip() for line in f.readlines()]
emotion2id = {e: i for i, e in enumerate(id2emotion)}

target_ids = [emotion2id[e] for e in target_emotion_names]
target_ids_tensor = torch.tensor(target_ids).to(device_goemo)

# Map GoEmotions IDs → local labels
label2id = {goid: i for i, goid in enumerate(target_ids)}
id2label = {i: goid for goid, i in label2id.items()}

print(f"🎯 Target GoEmotions IDs: {target_ids}")
print(f"🗺️ Mapping to local labels: {label2id}")

# Load model_goem
from transformers import AutoModelForSequenceClassification
model_goem = AutoModelForSequenceClassification.from_pretrained(GOEMOTIONS_MODEL_HF)
model_goem.to(device_goemo)
model_goem.eval()

# Load data
inputs = torch.load(TOKENIZED_OUTPUT, weights_only=False)
labels = torch.load(LABELS_OUTPUT).tolist()

predictions = []
true_labels = []

with torch.no_grad():
    for i in tqdm(range(len(labels))):
        input_ids = inputs["input_ids"][i].unsqueeze(0).to(device_goemo)
        attention_mask = inputs["attention_mask"][i].unsqueeze(0).to(device_goemo)

        logits = model_goem(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(0)

        selected_logits = logits[target_ids_tensor]
        pred_local = torch.argmax(selected_logits).item()

        predictions.append(pred_local)
        true_labels.append(labels[i])  # already 0–5



🎯 Target GoEmotions IDs: [2, 11, 14, 17, 25, 26]
🗺️ Mapping to local labels: {2: 0, 11: 1, 14: 2, 17: 3, 25: 4, 26: 5}


100%|██████████| 60/60 [00:01<00:00, 36.52it/s]


In [26]:
# ✅ Report
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions, average="weighted")
ordered_labels = sorted(label2id.values())

report = classification_report(
    true_labels,
    predictions,
    labels=ordered_labels,
    target_names=[id2label[i] for i in ordered_labels],
    output_dict=True,
    zero_division=0
)

report_df = pd.DataFrame(report).transpose().round(4)
accuracy_row = pd.DataFrame({
    'precision': [""],
    'recall': [""],
    'f1-score': [accuracy],
    'support': [sum(report_df["support"])]
}, index=["overall_accuracy"])

final_df = pd.concat([report_df, accuracy_row])

if not os.path.exists(CSV_REPORT_PATH):
    final_df.to_csv(CSV_REPORT_PATH)
    print(f"✅ Report saved to {CSV_REPORT_PATH}")
else:
    print(f"⚠️ Skipping save: {CSV_REPORT_PATH} already exists.")

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ F1 Score: {f1:.4f}")

⚠️ Skipping save: data/goemotions/classification_report_eval.csv already exists.
✅ Accuracy: 0.8667
✅ F1 Score: 0.8649


## Dataset Wrapper and DataLoader (Goemotions)

In [27]:
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

class GoEmotionsDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return torch.tensor(self.input_ids[idx]), torch.tensor(self.labels[idx])

# Load input IDs and labels from disk
input_data = torch.load(TOKENIZED_OUTPUT, weights_only=False)
labels = torch.load(LABELS_OUTPUT).tolist()

input_ids_list = input_data["input_ids"].tolist()

# Create dataset and dataloader
dataset = GoEmotionsDataset(input_ids_list, labels)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)

logger.info("✅ Dataloader created successfully.")

INFO:__main__:✅ Dataloader created successfully.


## Extract Activations

In [28]:
if os.path.exists(ACTIVATIONS_GOEMOTIONS):
    logger.info(f"⚡ Activations already exist at {ACTIVATIONS_GOEMOTIONS}. Skipping extraction.")
else:
    logger.info("🚀 Starting activation extraction from model (CLS token only).")
    
    transformers_extractor.extract_representations(
        model=model_goem,
        input_tokens_list=input_ids_list,   
        output_file=ACTIVATIONS_GOEMOTIONS,
        device=device_goemo,
        output_type="json",                
        decompose_layers=False,
        filter_layers=None
    )

    logger.info(f"✅ Activations successfully saved to {ACTIVATIONS_GOEMOTIONS}")

INFO:__main__:⚡ Activations already exist at data/goemotions/activations.json. Skipping extraction.


## Load Activations

In [29]:
import torch
import numpy as np
import logging
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

logger = logging.getLogger(__name__)

def create_tensors_goemo(tokens_data, activations, task_specific_tag="NN", task_type="classification", dtype=torch.float32):
    """
    Create input/output tensors from CLS activations and labels for classification tasks.

    Args:
        tokens_data (list): List of dicts with keys "tokens" and "target"
        activations (list): List of numpy arrays with CLS activations
        task_specific_tag (str): Not used for CLS, kept for compatibility
        task_type (str): "classification" or "regression"
        dtype (torch.dtype): Data type of the tensors

    Returns:
        X (torch.Tensor): Input features (num_samples, num_layers * hidden_size)
        y (torch.Tensor): Labels
        mapping (tuple): label2idx, idx2label, None, None
    """

    logger.info("🔄 Creating tensors from activations and labels")

    # Number of samples
    num_samples = len(tokens_data)
    assert num_samples == len(activations), "Mismatch between tokens and activations"

    logger.info(f"🧪 Number of samples: {num_samples}")

    # Flatten each activation: (num_layers, 1, hidden_dim) → (num_layers * hidden_dim)
    X = []
    for i, sample in enumerate(activations):
        if sample.ndim == 3 and sample.shape[1] == 1:
            flattened = sample.squeeze(1).flatten()
        elif sample.ndim == 2:
            flattened = sample.flatten()
        else:
            raise ValueError(f"Unexpected shape for activation {i}: {sample.shape}")
        X.append(flattened)
    X = np.array(X)

    
    # Encode labels
    labels = [sample["target"] for sample in tokens_data]
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(labels)

    # Logging label mapping
    label2idx = {label: int(idx) for idx, label in enumerate(label_encoder.classes_)}
    idx2label = {int(idx): label for label, idx in label2idx.items()}
    logger.info(f"🔢 Labels mapping: {label2idx}")

    return (
        torch.tensor(X, dtype=dtype),
        torch.tensor(y),
        (label2idx, idx2label, None, None)
    )

In [30]:
from neurox.data.loader import load_activations
from neurox.interpretation import utils

# ⚡ Load activations
activations, num_layers = load_activations(ACTIVATIONS_GOEMOTIONS)
logger.info(f"✅ Activations loaded from {ACTIVATIONS_GOEMOTIONS} with {num_layers} layers")

# 🧠 Prepare dataset with correct structure
sentence_data = [{"tokens": ["[CLS]"], "target": label} for label in labels]

# 📦 Convert to tensors
X, y, mapping = create_tensors_goemo(
    sentence_data,
    activations,
    task_specific_tag="NN",
    task_type="classification"
)

label2idx, idx2label, _, _ = mapping
logger.info("✅ Tensors and label mappings created successfully")

INFO:__main__:✅ Activations loaded from data/goemotions/activations.json with 12 layers
INFO:__main__:🔄 Creating tensors from activations and labels
INFO:__main__:🧪 Number of samples: 60
INFO:__main__:🔢 Labels mapping: {np.int64(0): 0, np.int64(1): 1, np.int64(2): 2, np.int64(3): 3, np.int64(4): 4, np.int64(5): 5}
INFO:__main__:✅ Tensors and label mappings created successfully


Loading json activations from data/goemotions/activations.json...
60 12.0


## Train Probe

In [31]:
# ✅ Convert tensors to numpy arrays (required by train_logistic_regression_probe)
X_np = X.numpy() if isinstance(X, torch.Tensor) else X
y_np = y.numpy() if isinstance(y, torch.Tensor) else y

# 🧪 Train logistic regression probe
logger.info("🔧 Training logistic regression probe")
probe = linear_probe.train_logistic_regression_probe(
    X_np, y_np,
    lambda_l1=0.001,
    lambda_l2=0.001
)

# 🧾 Evaluate the trained probe
logger.info("📈 Evaluating the probe")
scores = linear_probe.evaluate_probe(probe, X_np, y_np, idx_to_class=idx2label)
logger.info(f"🎯 Probe evaluation results:\n{scores}")

# 🔍 Get top neurons
top_neurons_probe, per_class_top_neurons = linear_probe.get_top_neurons(
    probe,
    percentage=0.1,
    class_to_idx=label2idx
)
logger.info(f"🧠 Top global neurons: {top_neurons_probe}")
logger.info(f"🧠 Top neurons per class: {per_class_top_neurons}")

INFO:__main__:🔧 Training logistic regression probe


Clases en y_train: [0 1 2 3 4 5]
Training classification probe
Creating model...
Number of training instances: 60
Number of classes: 6


epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.1458


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0373


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0210


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0291


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0214


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0147


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0131


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0121


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0114


epoch [10/10]: 0it [00:00, ?it/s]

INFO:__main__:📈 Evaluating the probe


Epoch: [10/10], Loss: 0.0112


Evaluating: 0it [00:00, ?it/s]

INFO:__main__:🎯 Probe evaluation results:
{'__OVERALL__': np.float64(1.0), np.int64(0): np.float64(1.0), np.int64(1): np.float64(1.0), np.int64(2): np.float64(1.0), np.int64(3): np.float64(1.0), np.int64(4): np.float64(1.0), np.int64(5): np.float64(1.0)}
INFO:__main__:🧠 Top global neurons: [8195 8196 8197 ... 8182 8183 8189]
INFO:__main__:🧠 Top neurons per class: {np.int64(0): array([7567, 3413, 7266, 7982, 7852, 9092, 8075, 8788, 4059, 4909, 9136,
       5616, 6987, 4576, 8125, 7997, 9160, 8370, 7027, 6302, 9202, 7213,
       5104, 7760, 7738, 6289, 8812, 5678, 4723, 1693, 8216, 7087, 6458,
       6951, 5103, 4219, 8496, 8001, 3091, 7208, 6022, 8394, 6523, 7882,
       7323, 6462, 5042, 8268, 7229, 6445, 6362, 2632, 6003, 7914, 6144,
       6292, 6546, 6486, 9180, 3859, 9100, 6984, 8988, 8505, 6706, 7287,
       9022, 8138, 7838, 8431, 9203, 5318, 8455, 8382, 7551, 7863, 8022,
       5197, 7281, 6472, 4190, 6211, 6722, 6605, 4671, 7291, 2461, 5228,
       5589, 9021, 8269, 8487, 5439,

Score (accuracy) of the probe: 1.00


## Global Silencing

## Silencing Functions

In [32]:
# ✅ Load tokenized input and labels
sample_df = pd.read_json(SAMPLE_OUTPUT_JSON, lines=True)
labels_list = torch.load(LABELS_OUTPUT).tolist()

In [33]:
# percentages = [0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]
percentages = [0.025]

for pct in percentages:
    silence_top_global_percentage_and_evaluate(
        model=model_goem,                    # o tu variable del modelo cargado
        sample_df=sample_df,
        labels_list=labels_list,
        probe=probe,
        label2idx=label2idx,
        percentage=pct,
        experiment_title=f"Silencing {pct*100:.1f}% Global Neurons",
        report_path=CSV_REPORT_GOBAL_SILENCING
    )

INFO:__main__:🔧 Silencing exactly 230 neurons (2.50% of total 9216)
INFO:__main__:📁 Saved neuron indices to data/BigBird/neurons/top_2p_neurons_global.json
INFO:__main__:📌 Layer 2: silencing 1 neurons
INFO:__main__:📌 Layer 3: silencing 1 neurons
INFO:__main__:📌 Layer 4: silencing 9 neurons
INFO:__main__:📌 Layer 5: silencing 11 neurons
INFO:__main__:📌 Layer 6: silencing 14 neurons
INFO:__main__:📌 Layer 7: silencing 11 neurons
INFO:__main__:📌 Layer 8: silencing 33 neurons
INFO:__main__:📌 Layer 9: silencing 46 neurons
INFO:__main__:📌 Layer 10: silencing 46 neurons
INFO:__main__:📌 Layer 11: silencing 58 neurons
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average,

In [None]:
import pandas as pd

# 🧐 Ver las primeras filas
print(sample_df.head())

# 🔍 Ver los tipos de cada columna
print(sample_df.dtypes)

# 🧪 Ver si hay valores nulos
print(sample_df.isnull().sum())

# Diagnostico

In [None]:
# Verifica dimensiones del probe
print("🔍 probe.linear.weight shape:", probe.linear.weight.shape)

# Debería dar (num_classes, total_neurons) → en tu caso: (6, 9216)

In [None]:
from collections import defaultdict

hidden_dim = model_goem.config.hidden_size
num_layers = model_goem.config.num_hidden_layers
total_neurons = hidden_dim * num_layers

top_neurons_global = get_top_k_neurons_exact(probe, percentage=0.025)
print("🎯 Total silenced neurons:", len(top_neurons_global))
print("📊 Sample neuron indices:", top_neurons_global[:10])
print("📉 Min index:", min(top_neurons_global), "| Max index:", max(top_neurons_global))

# Mapeo por capa
layer_counts = defaultdict(int)
for idx in top_neurons_global:
    layer = idx // hidden_dim
    layer_counts[layer] += 1

print("\n📚 Neurons per layer (Top 2.5%)")
for layer in sorted(layer_counts):
    print(f"  Layer {layer}: {layer_counts[layer]} neurons")

In [None]:
print("🧷 label2idx:", label2idx)

# Crea tensor con las IDs originales (de GoEmotions)
target_goemotion_ids = torch.tensor(list(label2idx.keys()))
print("🎯 target_goemotion_ids:", target_goemotion_ids.tolist())

# Confirma si las posiciones corresponden 1:1 con etiquetas de `labels_list` que tú usas
print("🧪 Sample labels_list:", labels_list[:10])

In [None]:
# Usa solo 1 ejemplo
i = 0

# Sin silenciamiento todavía
input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids']).unsqueeze(0).to(model_goem.device)
attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask']).unsqueeze(0).to(model_goem.device)

with torch.no_grad():
    outputs = model_goem(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
    logits = outputs.logits
    selected_logits = logits[:, target_goemotion_ids]  # [1, 6]
    pred_local = torch.argmax(selected_logits, dim=1).item()

print("🔢 Full logits:", logits.tolist())
print("🎯 Selected logits (target emotions):", selected_logits.tolist())
print("✅ Predicted class index (0–5):", pred_local)
print("🏷️ True label:", labels_list[i])

In [None]:
# ⚠️ Hook solo para capa 11 (donde vimos que hay muchas neuronas silenciadas)


encoder_layers = get_encoder_layers(model_goem)
hook_handles = []

for i in range(num_layers):
    indices_layer = [idx - i * hidden_dim for idx in top_neurons_global if i * hidden_dim <= idx < (i + 1) * hidden_dim]
    if indices_layer:
        handle = encoder_layers[i].output.register_forward_hook(make_cls_silence_hook2(indices_layer))
        hook_handles.append(handle)

# Misma inferencia que antes
with torch.no_grad():
    outputs = model_goem(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
    logits = outputs.logits
    selected_logits = logits[:, target_goemotion_ids]
    pred_local = torch.argmax(selected_logits, dim=1).item()

print("🧪 Silenced logits:", selected_logits.tolist())
print("🎯 New prediction:", pred_local)

# Limpiar hooks
for h in hook_handles:
    h.remove()

In [None]:
# Ejecuta una sola inferencia para ver qué capa da output y qué se usa como input al classifier
with torch.no_grad():
    outputs = model_goem(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor, output_hidden_states=True)
    hidden_states = outputs.hidden_states  # Tuple: (layer_0, ..., layer_n)
    final_hidden = hidden_states[-1]  # shape: [1, seq_len, hidden_size]

    print("🧠 Final hidden state shape:", final_hidden.shape)
    print("🔍 CLS vector (posición 0):", final_hidden[:, 0, :].abs().sum().item())

    logits = model_goem.classifier(final_hidden[:, 0, :])  # ¿así lo hace el modelo?
    print("🎯 Recomputed logits from CLS:", logits)

In [None]:
import torch
from transformers import AutoModel
import logging

# Suponemos que ya tienes estas variables cargadas:
# model_goem, input_ids_tensor, attention_mask_tensor, top_neurons_global, make_cls_silence_hook2, get_encoder_layers

# Configura el logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Accede a capas internas
hidden_dim = model_goem.config.hidden_size
num_layers = model_goem.config.num_hidden_layers
encoder_layers = get_encoder_layers(model_goem)

# --- 1. Inference sin silenciar (sin hook) ---
with torch.no_grad():
    outputs_original = model_goem(
        input_ids=input_ids_tensor,
        attention_mask=attention_mask_tensor,
        output_hidden_states=True
    )
    hidden_states_original = outputs_original.hidden_states[-1][:, 0, :]  # [CLS] final layer

# --- 2. Aplicar hooks para silenciar ---
hook_handles = []
for i in range(num_layers):
    indices_layer = [idx - i * hidden_dim for idx in top_neurons_global if i * hidden_dim <= idx < (i + 1) * hidden_dim]
    if indices_layer:
        handle = encoder_layers[i].output.register_forward_hook(make_cls_silence_hook2(indices_layer))
        hook_handles.append(handle)

# --- 3. Inference con neuronas silenciadas ---
with torch.no_grad():
    outputs_silenced = model_goem(
        input_ids=input_ids_tensor,
        attention_mask=attention_mask_tensor,
        output_hidden_states=True
    )
    hidden_states_silenced = outputs_silenced.hidden_states[-1][:, 0, :]  # [CLS] silenciado

# --- 4. Comparar los vectores CLS ---
diff = torch.abs(hidden_states_original - hidden_states_silenced)
logger.info(f"🔍 CLS difference after silencing: mean={diff.mean()}, max={diff.max()}")

# Limpieza: eliminar hooks
for handle in hook_handles:
    handle.remove()

In [None]:
for i, layer in enumerate(get_encoder_layers(model_goem)):
    print(f"Layer {i} -> {layer.__class__.__name__}")