# Configs

In [2]:
import os
import torch
import json
import logging
import time
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import neurox.data.extraction.transformers_extractor as transformers_extractor
from neurox.data.writer import ActivationsWriter
import neurox.data.loader as data_loader
from transformers import AutoConfig
from tqdm import tqdm
import neurox.interpretation.linear_probe as linear_probe
import neurox.interpretation.utils as utils
import neurox.analysis.visualization as TransformersVisualizer
from sklearn.model_selection import train_test_split
from IPython.display import display
import neurox.interpretation.probeless as probeless
from neurox.interpretation.probeless import (
    get_neuron_ordering,
    get_neuron_ordering_for_all_tags
)
import ast
from torch.cuda.amp import autocast
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from matplotlib_venn import venn2
from sklearn.model_selection import train_test_split
from neurox.interpretation.linear_probe import get_top_neurons

In [3]:
import logging

# ==========================
# üìú Configure Logging 
# ==========================

logger = logging.getLogger("neurox_logger")
logger.setLevel(logging.INFO)

# Avoid duplicates
if not logger.hasHandlers():

    # üìÅ Handler 
    file_handler = logging.FileHandler("neurox_extraction_csv_pth.log", mode="w")
    file_handler.setLevel(logging.INFO)

    # üñ•Ô∏è Handler 
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)

    # Format
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    # Add handlers to main logger
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

logger.info("üöÄ Logging configured")


2025-05-13 08:29:48,876 - INFO - üöÄ Logging configured


In [4]:
# ==========================
# File config
# ==========================
input_csv = "data/BigBird_tokens_PT.csv"
output_csv = "data/BigBird_tokens_reduced.csv"
labels_output_path = "data/labels_numeric.txt"
label_mapping_path = "data/labels_mapping.json"
reduction_ratio = 0.001  # Reduction ratio of original dataset
activations_file = "data/syscall_activations.json"


In [5]:
# ==========================
# Load model
# ==========================
model = AutoModelForSequenceClassification.from_pretrained("google/bigbird-roberta-base", num_labels=5)
weights_path = 'data/LLM_Syscalls/best_model_BigBird.pth'
logger.info(f"Loading model weights from: {weights_path}")

state_dict = torch.load(weights_path, map_location="cpu")  
model.load_state_dict(state_dict, strict=False)

      
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()
logger.info("‚úÖ Model loaded")

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

### Load dataset

In [6]:
# ==========================
# ‚úÖ Skip dataset reduction if already available
# ==========================
if os.path.exists(output_csv) and os.path.exists(labels_output_path):
    logger.info(f"‚ö° Reduced dataset found: {output_csv}. Skipping reduction.")
    df_reduced = pd.read_csv(output_csv)
    with open(labels_output_path, "r") as f:
        labels = [int(line.strip()) for line in f]  # Labels as integers
    with open(label_mapping_path, "r") as f:
        label_mapping = json.load(f)  # Load label mapping
else:
    logger.info(f"üîÑ Loading dataset from {input_csv}")

    chunk_size = 5000 
    total_rows = sum(1 for _ in open(input_csv)) - 1  # Total rows excluding header
    df_chunks = []

    logger.info(f"üîÑ Processing {total_rows} rows in chunks of {chunk_size}...")

    with tqdm(total=total_rows, desc="Processing rows", unit=" rows") as pbar:
        for chunk in pd.read_csv(input_csv, chunksize=chunk_size):
            # Convert `input_ids` from string to list of integers
            chunk['input_ids'] = chunk['input_ids'].apply(lambda x: list(map(int, x.strip("[]").split(","))))
            df_chunks.append(chunk)
            pbar.update(len(chunk))

    df = pd.concat(df_chunks, ignore_index=True)

    # ==========================
    # üî¢ Encode labels as integers
    # ==========================
    df['label'], unique_labels = pd.factorize(df["label"])
    label_mapping = {label: int(idx) for idx, label in enumerate(unique_labels)}

    # ==========================
    # üß™ Reduce dataset maintaining class proportions
    # ==========================
    df_reduced, _ = train_test_split(df, train_size=reduction_ratio, stratify=df["label"], random_state=42)
    labels = df_reduced["label"].tolist()

    # ==========================
    # üíæ Save reduced dataset and labels
    # ==========================
    df_reduced.to_csv(output_csv, index=False)
    with open(labels_output_path, "w") as f:
        for label in labels:
            f.write(str(label) + "\n")

    with open(label_mapping_path, "w") as f:
        json.dump(label_mapping, f, indent=4)

    logger.info(f"‚úÖ Reduced dataset saved to {output_csv}")
    logger.info(f"‚úÖ Numeric labels saved to {labels_output_path}")
    logger.info(f"‚úÖ Label mapping saved to {label_mapping_path}")


2025-05-13 08:29:59,000 - INFO - ‚ö° Reduced dataset found: /home/kikay/LLM_Syscalls/BigBird_tokens_reduced.csv. Skipping reduction.


### Create Dataloader

In [7]:
# ==========================
# üì¶ Create DataLoader
# ==========================
class SyscallDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.data.iloc[idx]['input_ids'])
        label = torch.tensor(self.data.iloc[idx]['label'])
        return input_ids, label

# Initialize DataLoader with reduced dataset
dataset = SyscallDataset(df_reduced)
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)

logger.info("‚úÖ Dataloader created")

# Ensure `input_ids` are lists of integers
if isinstance(df_reduced["input_ids"].iloc[0], str):
    df_reduced["input_ids"] = df_reduced["input_ids"].apply(lambda x: list(map(int, x.strip("[]").split(","))))


2025-05-13 08:29:59,026 - INFO - ‚úÖ Dataloader created


# NeuroX

## Activation Extraction

In [8]:
if os.path.exists(activations_file):
    logger.info(f"‚ö° Activations file found: {activations_file}. Skipping extraction.")
else:
    transformers_extractor.extract_representations(
        model, 
        df_reduced["input_ids"].tolist(),  # Pass preprocessed tokens directly
        activations_file,
        device=device,
    )

    logger.info(f"‚úÖ Activations saved to {activations_file}")


2025-05-13 08:29:59,058 - INFO - ‚ö° Activations file found: /home/kikay/LLM_Syscalls/syscall_activations.json. Skipping extraction.


## Load Activations

In [10]:
activations, num_layers = data_loader.load_activations(activations_file)
logger.info(f"‚úÖ Loaded activations from {activations_file} with {num_layers} layers")

# Load sentence-level classification data using activations
tokens = data_loader.load_sentence_data(
    output_csv, labels_output_path, activations
)

# Create sentence-level tensors for classification
X, y, mapping = utils.create_tensors(
    tokens,
    activations,
    task_specific_tag="NN",
    task_type="classification"
)

label2idx, idx2label, src2idx, idx2src = mapping
logger.info("‚úÖ Created input/output tensors and label mappings for classification")

2025-05-13 08:29:59,149 - INFO - ‚úÖ Loaded activations from /home/kikay/LLM_Syscalls/syscall_activations.json with 12 layers
2025-05-13 08:29:59,163 - INFO - ‚úÖ Created input/output tensors and label mappings for classification


Loading json activations from /home/kikay/LLM_Syscalls/syscall_activations.json...
50 12.0
Number of tokens:  50
length of source dictionary:  17
length of target dictionary:  5
50
Total instances: 50
['s']
Number of samples:  50
Stats: Labels with their frequencies in the final set
2 9
3 15
0 7
4 9
1 10


## Train linear probe

In [12]:
probe = linear_probe.train_logistic_regression_probe(X, y, lambda_l1=0.001, lambda_l2=0.001)
scores = linear_probe.evaluate_probe(probe, X, y, idx_to_class=idx2label)
logger.info(f"üéØ Probe evaluation results: {scores}")

top_neurons_probe, per_class_top_neurons = linear_probe.get_top_neurons(probe, percentage=0.1, class_to_idx=label2idx)
logger.info(f"üîç Top global neurons: {top_neurons_probe}")
logger.info(f"üîç Top neurons per class: {per_class_top_neurons}")

Clases en y_train: [0 1 2 3 4]
Training classification probe
Creating model...
Number of training instances: 50
Number of classes: 5


epoch [1/10]: 0it [00:00, ?it/s]

Epoch: [1/10], Loss: 0.0793


epoch [2/10]: 0it [00:00, ?it/s]

Epoch: [2/10], Loss: 0.0399


epoch [3/10]: 0it [00:00, ?it/s]

Epoch: [3/10], Loss: 0.0276


epoch [4/10]: 0it [00:00, ?it/s]

Epoch: [4/10], Loss: 0.0233


epoch [5/10]: 0it [00:00, ?it/s]

Epoch: [5/10], Loss: 0.0192


epoch [6/10]: 0it [00:00, ?it/s]

Epoch: [6/10], Loss: 0.0175


epoch [7/10]: 0it [00:00, ?it/s]

Epoch: [7/10], Loss: 0.0165


epoch [8/10]: 0it [00:00, ?it/s]

Epoch: [8/10], Loss: 0.0155


epoch [9/10]: 0it [00:00, ?it/s]

Epoch: [9/10], Loss: 0.0147


epoch [10/10]: 0it [00:00, ?it/s]

Epoch: [10/10], Loss: 0.0142


Evaluating: 0it [00:00, ?it/s]

2025-05-13 08:29:59,905 - INFO - üéØ Probe evaluation results: {'__OVERALL__': 0.96, '2': 0.7777777777777778, '3': 1.0, '0': 1.0, '4': 1.0, '1': 1.0}
2025-05-13 08:29:59,910 - INFO - üîç Top global neurons: [8194 8205 8206 ... 4088 8188 8191]
2025-05-13 08:29:59,913 - INFO - üîç Top neurons per class: {'2': array([8486, 8679, 8194, 8465, 9040, 8951, 8999, 7891, 7732, 8468, 8864,
       9124, 8019, 6939, 7934, 4810, 8607, 4491, 8820, 8485, 7830, 7962,
       8510, 8854, 6498, 8720, 8857, 9171, 8710, 7857, 6031, 8284, 8031,
       8232, 8787, 9045, 8556, 9096, 7094, 8608, 7743, 3225, 9121, 8027,
       8611, 8586, 3273, 8878, 6790, 8561, 8618, 9031, 8865, 3776, 8113,
       9144, 8617, 9189, 6986, 8388, 8037, 7815, 8917, 9065, 3855, 9084,
       9071, 8779, 4398, 8558, 9024, 8666, 8498, 6250, 7697, 8846, 8211,
       8527, 7877, 8819, 8975, 7874, 9153, 8506, 9101, 8880, 8665, 9149,
       9051, 8907, 2897, 7156, 8259, 9157, 8762, 3956, 8065, 3536, 6511,
       9123, 2447, 5733, 8639, 

Score (accuracy) of the probe: 0.96


# Experiments

## Original performance

In [17]:
df = pd.read_csv(input_csv)

# üéØ Select 30 random examples and reset index
sample_df = df.sample(n=30, random_state=42).reset_index(drop=True)

# üßπ Convert "input_ids" and "attention_mask" from string to list format
def parse_list(x):
    return ast.literal_eval(x)

sample_df['input_ids'] = sample_df['input_ids'].apply(parse_list)
sample_df['attention_mask'] = sample_df['attention_mask'].apply(parse_list)

# üî¢ Encode labels to integers
label_encoder = LabelEncoder()
sample_df['label'] = label_encoder.fit_transform(sample_df['label'])
labels_list = sample_df['label'].tolist()

predictions_list = []
model.eval()
torch.cuda.empty_cache()

for i in range(len(sample_df)):
    input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids']).unsqueeze(0).to(device)
    attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask']).unsqueeze(0).to(device)

    with torch.no_grad():
        with autocast():
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
            logits = outputs['logits']
            pred = torch.argmax(logits, dim=1).item()
            predictions_list.append(pred)

    del input_ids_tensor, attention_mask_tensor, outputs, logits
    torch.cuda.empty_cache()

# üìä Compute evaluation metrics
accuracy = accuracy_score(labels_list, predictions_list)
f1 = f1_score(labels_list, predictions_list, average='weighted')
report_dict = classification_report(labels_list, predictions_list, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()

# Round for readability
report_df = report_df.round(4)

# Add accuracy as a separate row
accuracy_row = pd.DataFrame({'precision': accuracy, 'recall': accuracy, 'f1-score': accuracy, 'support': sum(report_df['support'])}, index=['accuracy'])
report_df = pd.concat([report_df, accuracy_row])

# üìù Save or append to CSV

experiment_title = "üß™ Sample of 30 - Full Model Evaluation"
csv_report_path = "results/classification_report_sample_eval.csv"

# Remove incorrect 'accuracy' row if it exists
report_df = report_df.drop("accuracy", errors="ignore")

# Append correct accuracy row
accuracy_row = pd.DataFrame({
    'precision': [""],
    'recall': [""],
    'f1-score': [accuracy],
    'support': [sum(report_df["support"])]
}, index=["overall_accuracy"])

# Combine
final_df = pd.concat([report_df, accuracy_row])

# Write to CSV with experiment title as a header
with open(csv_report_path, "a") as f:
    f.write(f"\n\n# {experiment_title}\n")
final_df.to_csv(csv_report_path, mode="a")

logger.info(f"üìÅ Appended classification report with title '{experiment_title}' to {csv_report_path}")


['/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/Bashlite'
 '/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/Bdvl'
 '/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/Normal'
 '/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/RansomwarePoC'
 '/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/Thetick']
{0: '/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/Bashlite', 1: '/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/Bdvl', 2: '/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/Normal', 3: '/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/RansomwarePoC', 4: '/cluster/raid/home/sape/MILCOM2024/MalwSpecSys/Dataset/Thetick'}


2025-05-13 08:44:04,953 - INFO - üìÅ Appended classification report with title 'üß™ Sample of 30 - Full Model Evaluation' to classification_report_sample_eval.csv


## Silence and evaluate neurons. Function definition

### Full silencing

In [18]:
def get_top_k_neurons_exact(probe, percentage: float) -> list[int]:
    """
    Return exactly N = round(total_neurons * percentage) neuron indices, sorted by importance.
    Importance is measured as the sum of absolute values of weights across all output classes.
    """
    weight_matrix = probe.linear.weight.detach().abs()  # [num_classes, num_neurons]
    importance = weight_matrix.sum(dim=0).cpu().numpy()  # [num_neurons]
    total_neurons = len(importance)
    top_n = round(total_neurons * percentage)

    sorted_indices = importance.argsort()[-top_n:]  # Top-N by importance
    return sorted_indices.tolist()


def silence_top_global_percentage_and_evaluate(
    model,
    sample_df,
    labels_list,
    probe,
    label2idx,
    percentage=0.10,
    report_path="results/classification_report_sample_eval.csv",
    experiment_title=None
):
    hidden_dim = model.config.hidden_size
    total_neurons = 12 * hidden_dim
    top_neurons_global = get_top_k_neurons_exact(probe, percentage=percentage)

    logger.info(f"üîß Silencing exactly {len(top_neurons_global)} neurons ({percentage:.2%} of total {total_neurons})")

    # Save neuron indices
    json_path = f"top_{int(percentage * 100)}p_neurons_global.json"
    with open(json_path, "w") as f:
        json.dump(top_neurons_global, f, indent=4)
    logger.info(f"üìÅ Saved neuron indices to {json_path}")

    # Register hooks per layer
    hook_handles = []
    for i in range(12):
        indices_layer = [idx - i * hidden_dim for idx in top_neurons_global if i * hidden_dim <= idx < (i + 1) * hidden_dim]
        if indices_layer:
            logger.info(f"üìå Layer {i}: silencing {len(indices_layer)} neurons")
            layer = model.bert.encoder.layer[i].output
            handle = layer.register_forward_hook(make_cls_silence_hook(indices_layer))
            hook_handles.append(handle)

    # Inference
    model.eval()
    predictions = []
    for i in range(len(sample_df)):
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids']).unsqueeze(0).to(model.device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask']).unsqueeze(0).to(model.device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
            logits = outputs['logits']
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)

        del input_ids_tensor, attention_mask_tensor, outputs, logits
        torch.cuda.empty_cache()

    # Metrics
    accuracy = accuracy_score(labels_list, predictions)
    f1 = f1_score(labels_list, predictions, average='weighted')
    report_dict = classification_report(labels_list, predictions, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose().round(4)
    report_df = report_df.drop("accuracy", errors="ignore")

    accuracy_row = pd.DataFrame({
        'precision': [""],
        'recall': [""],
        'f1-score': [accuracy],
        'support': [sum(report_df["support"])]
    }, index=["overall_accuracy"])
    final_df = pd.concat([report_df, accuracy_row])

    # Save report
    if experiment_title is None:
        experiment_title = f"Silencing top {percentage:.2%} global neurons"
    if not os.path.exists(report_path):
        with open(report_path, "w") as f:
            f.write(f"# {experiment_title}\n")
            final_df.to_csv(f)
    else:
        with open(report_path, "a") as f:
            f.write(f"\n\n# {experiment_title}\n")
        final_df.to_csv(report_path, mode="a")

    logger.info(f"üéØ Accuracy after silencing: {accuracy:.4f}")
    logger.info(f"üìè Weighted F1 Score: {f1:.4f}")
    logger.info(f"üìã Classification report saved to {report_path}")

    for handle in hook_handles:
        handle.remove()
    logger.info("‚úÖ All hooks removed after evaluation")

In [19]:
def make_cls_silence_hook(indices):
    indices = [int(i) for i in indices]
    indices_tensor = torch.tensor(indices, dtype=torch.long) if indices else None

    def hook(module, input, output):
        if output.dim() == 3:
            new_output = output.clone()
            cls_token = new_output[:, 0, :]
            mask = torch.ones_like(cls_token)
            if indices_tensor is not None:
                local_indices = indices_tensor.to(new_output.device)
                mask[:, local_indices] = 0.0
            new_output[:, 0, :] = cls_token * mask
            return new_output
        return output
    return hook

## Global impact

In [14]:
percentages = [0.00, 0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]

for pct in percentages:
    silence_top_global_percentage_and_evaluate(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        probe=probe,
        label2idx=label2idx,
        percentage=pct,
        experiment_title=f"Silencing {pct*100:.1f}% Global Neurons"
    )


2025-04-24 12:24:34,561 - INFO - üîß Silencing exactly 9216 neurons (0.00% of total 9216)
2025-04-24 12:24:34,566 - INFO - üìÅ Saved neuron indices to top_0p_neurons_global.json
2025-04-24 12:24:34,567 - INFO - üìå Layer 0: silencing 768 neurons
2025-04-24 12:24:34,569 - INFO - üìå Layer 1: silencing 768 neurons
2025-04-24 12:24:34,571 - INFO - üìå Layer 2: silencing 768 neurons
2025-04-24 12:24:34,572 - INFO - üìå Layer 3: silencing 768 neurons
2025-04-24 12:24:34,574 - INFO - üìå Layer 4: silencing 768 neurons
2025-04-24 12:24:34,575 - INFO - üìå Layer 5: silencing 768 neurons
2025-04-24 12:24:34,576 - INFO - üìå Layer 6: silencing 768 neurons
2025-04-24 12:24:34,578 - INFO - üìå Layer 7: silencing 768 neurons
2025-04-24 12:24:34,579 - INFO - üìå Layer 8: silencing 768 neurons
2025-04-24 12:24:34,580 - INFO - üìå Layer 9: silencing 768 neurons
2025-04-24 12:24:34,582 - INFO - üìå Layer 10: silencing 768 neurons
2025-04-24 12:24:34,583 - INFO - üìå Layer 11: silencing 76

### Attenuation

In [15]:
def make_cls_attenuation_hook(indices_layer, attenuation_factor):
    indices_layer = [int(i) for i in indices_layer]
    if len(indices_layer) > 0:
        indices_tensor_cpu = torch.as_tensor(indices_layer, dtype=torch.long)
    else:
        indices_tensor_cpu = None

    def hook(module, input, output):
        if output.dim() == 3:
            new_output = output.clone()
            cls_token = new_output[:, 0, :]
            if indices_tensor_cpu is not None and indices_tensor_cpu.numel() > 0:
                local_indices = indices_tensor_cpu.to(new_output.device)
                cls_token[:, local_indices] *= attenuation_factor
            new_output[:, 0, :] = cls_token
            return new_output
        return output
    return hook


def attenuate_top_global_percentage_and_evaluate(
    model,
    sample_df,
    labels_list,
    probe,
    label2idx,
    percentage=0.10,
    attenuation_factor=0.1,
    report_path="results/classification_report_attenuation_eval.csv",
    experiment_title=None
):
    hidden_dim = model.config.hidden_size
    total_neurons = 12 * hidden_dim
    top_neurons_global = get_top_k_neurons_exact(probe, percentage=percentage)

    logger.info(f"üîß Attenuating exactly {len(top_neurons_global)} neurons "
                f"({percentage:.2%} of total {total_neurons}) by factor {attenuation_factor}")

    # Save indices
    json_path = f"attenuated_top_{int(percentage * 100)}p_neurons_global.json"
    with open(json_path, "w") as f:
        json.dump(top_neurons_global, f, indent=4)
    logger.info(f"üìÅ Saved neuron indices to {json_path}")

    # Register hooks
    hook_handles = []
    for i in range(12):
        indices_layer = [idx - i * hidden_dim for idx in top_neurons_global if i * hidden_dim <= idx < (i + 1) * hidden_dim]
        if indices_layer:
            logger.info(f"üìå Layer {i}: attenuating {len(indices_layer)} neurons")
            layer = model.bert.encoder.layer[i].output
            handle = layer.register_forward_hook(make_cls_attenuation_hook(indices_layer, attenuation_factor))
            hook_handles.append(handle)

    # Inference
    model.eval()
    predictions = []
    for i in range(len(sample_df)):
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids']).unsqueeze(0).to(model.device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask']).unsqueeze(0).to(model.device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
            logits = outputs['logits']
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)

        del input_ids_tensor, attention_mask_tensor, outputs, logits
        torch.cuda.empty_cache()

    # Metrics
    accuracy = accuracy_score(labels_list, predictions)
    f1 = f1_score(labels_list, predictions, average='weighted')
    report_dict = classification_report(labels_list, predictions, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose().round(4)
    report_df = report_df.drop("accuracy", errors="ignore")

    accuracy_row = pd.DataFrame({
        'precision': [""],
        'recall': [""],
        'f1-score': [accuracy],
        'support': [sum(report_df["support"])]
    }, index=["overall_accuracy"])
    final_df = pd.concat([report_df, accuracy_row])

    # Save
    if experiment_title is None:
        experiment_title = f"Attenuating top {percentage:.2%} global neurons (√ó{attenuation_factor})"
    if not os.path.exists(report_path):
        with open(report_path, "w") as f:
            f.write(f"# {experiment_title}\n")
            final_df.to_csv(f)
    else:
        with open(report_path, "a") as f:
            f.write(f"\n\n# {experiment_title}\n")
        final_df.to_csv(report_path, mode="a")

    logger.info(f"üéØ Accuracy after attenuation: {accuracy:.4f}")
    logger.info(f"üìè Weighted F1 Score: {f1:.4f}")
    logger.info(f"üìã Classification report saved to {report_path}")

    for handle in hook_handles:
        handle.remove()
    logger.info("‚úÖ All hooks removed after attenuation evaluation")


In [16]:
# Valores de atenuaci√≥n a aplicar (e.g. 0.1 = mantener el 10% del valor original)
attenuation_factors = [0.1, 0.3, 0.5, 0.7]

# Porcentajes de neuronas a atenuar
percentages = [0.00, 0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175,
               0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]

# Archivo donde se ir√°n acumulando los resultados
report_path = "results/attenuation_experiment_report.csv"

for factor in attenuation_factors:
    for pct in percentages:
        title = f"Attenuation {pct*100:.1f}% Global Neurons √ó{factor}"
        logger.info(f"\nüß™ Running experiment: {title}")
        attenuate_top_global_percentage_and_evaluate(
            model=model,
            sample_df=sample_df,
            labels_list=labels_list,
            probe=probe,
            label2idx=label2idx,
            percentage=pct,
            attenuation_factor=factor,
            experiment_title=title,
            report_path=report_path
        )


2025-05-01 10:15:42,438 - INFO - 
üß™ Running experiment: Attenuation 0.0% Global Neurons √ó0.1
2025-05-01 10:15:42,457 - INFO - üîß Attenuating exactly 9216 neurons (0.00% of total 9216) by factor 0.1
2025-05-01 10:15:42,467 - INFO - üìÅ Saved neuron indices to attenuated_top_0p_neurons_global.json
2025-05-01 10:15:42,471 - INFO - üìå Layer 0: attenuating 768 neurons
2025-05-01 10:15:42,474 - INFO - üìå Layer 1: attenuating 768 neurons
2025-05-01 10:15:42,477 - INFO - üìå Layer 2: attenuating 768 neurons
2025-05-01 10:15:42,480 - INFO - üìå Layer 3: attenuating 768 neurons
2025-05-01 10:15:42,484 - INFO - üìå Layer 4: attenuating 768 neurons
2025-05-01 10:15:42,487 - INFO - üìå Layer 5: attenuating 768 neurons
2025-05-01 10:15:42,490 - INFO - üìå Layer 6: attenuating 768 neurons
2025-05-01 10:15:42,493 - INFO - üìå Layer 7: attenuating 768 neurons
2025-05-01 10:15:42,495 - INFO - üìå Layer 8: attenuating 768 neurons
2025-05-01 10:15:42,496 - INFO - üìå Layer 9: attenuatin

## Impact per class

In [20]:
def get_top_k_neurons_for_class_exact(probe, percentage: float, class_to_idx: dict, class_id: int) -> list[int]:
    """
    Return top-k neurons most important for a specific class, measured by absolute weight.
    """
    weight_matrix = probe.linear.weight.detach().abs()  # [num_classes, num_neurons]
    class_weights = weight_matrix[class_id]  # [num_neurons]
    total_neurons = class_weights.size(0)
    top_n = round(percentage * total_neurons)
    top_indices = class_weights.cpu().numpy().argsort()[-top_n:]
    return top_indices.tolist()


def silence_top_class_percentage_and_evaluate(
    model,
    sample_df,
    labels_list,
    probe,
    label2idx,
    class_id: int,
    percentage: float = 0.1,
    report_path: str = "results/classification_report_sample_eval.csv",
    experiment_title: str = None
):
    class_key = str(class_id)
    class_name = f"class_{class_id}"
    hidden_dim = model.config.hidden_size
    total_neurons = 12 * hidden_dim

    # Get top neurons for class
    top_class_neurons = get_top_k_neurons_for_class_exact(
        probe, percentage=percentage, class_to_idx=label2idx, class_id=class_id
    )

    logger.info(f"üîß Silencing {len(top_class_neurons)} neurons for class {class_id} ({percentage:.2%} of total)")

    # Save neurons to JSON
    json_path = f"top_{int(percentage * 100)}p_neurons_{class_name}.json"
    with open(json_path, "w") as f:
        json.dump(top_class_neurons, f, indent=4)
    logger.info(f"üìÅ Saved neuron indices to {json_path}")

    # Register hooks
    hook_handles = []
    for i in range(12):
        indices_layer = [idx - i * hidden_dim for idx in top_class_neurons if i * hidden_dim <= idx < (i + 1) * hidden_dim]
        if indices_layer:
            logger.info(f"üìå Layer {i}: silencing {len(indices_layer)} neurons for class {class_id}")
            layer = model.bert.encoder.layer[i].output
            handle = layer.register_forward_hook(make_cls_silence_hook(indices_layer))
            hook_handles.append(handle)

    # Inference
    model.eval()
    predictions = []
    for i in range(len(sample_df)):
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids']).unsqueeze(0).to(model.device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask']).unsqueeze(0).to(model.device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
            logits = outputs['logits']
            pred = torch.argmax(logits, dim=1).item()
            predictions.append(pred)

        del input_ids_tensor, attention_mask_tensor, outputs, logits
        torch.cuda.empty_cache()

    # Metrics
    accuracy = accuracy_score(labels_list, predictions)
    f1 = f1_score(labels_list, predictions, average='weighted')
    report_dict = classification_report(labels_list, predictions, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose().round(4)
    report_df = report_df.drop("accuracy", errors="ignore")

    accuracy_row = pd.DataFrame({
        'precision': [""],
        'recall': [""],
        'f1-score': [accuracy],
        'support': [sum(report_df["support"])]
    }, index=["overall_accuracy"])
    final_df = pd.concat([report_df, accuracy_row])

    # Save report
    if experiment_title is None:
        experiment_title = f"Silencing top {percentage:.2%} neurons for class {class_id}"
    if not os.path.exists(report_path):
        with open(report_path, "w") as f:
            f.write(f"# {experiment_title}\n")
            final_df.to_csv(f)
    else:
        with open(report_path, "a") as f:
            f.write(f"\n\n# {experiment_title}\n")
        final_df.to_csv(report_path, mode="a")

    logger.info(f"üéØ Accuracy after class-specific silencing: {accuracy:.4f}")
    logger.info(f"üìè Weighted F1 Score: {f1:.4f}")
    logger.info(f"üìã Classification report saved to {report_path}")

    for handle in hook_handles:
        handle.remove()
    logger.info("‚úÖ All hooks removed after evaluation")


In [24]:
percentages = [0.00, 0.025, 0.05, 0.075, 0.10, 0.125, 0.15, 0.175, 0.20, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50, 0.65, 0.75, 0.8, 0.95]

target_class_id = 4

for pct in percentages:
    silence_top_class_percentage_and_evaluate(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        probe=probe,
        label2idx=label2idx,
        class_id=target_class_id,
        percentage=pct,
        experiment_title=f"Silencing {pct*100:.1f}% of Neurons for Class {target_class_id}"
    )


2025-05-10 13:06:53,149 - INFO - üîß Silencing 9216 neurons for class 4 (0.00% of total)
2025-05-10 13:06:53,156 - INFO - üìÅ Saved neuron indices to top_0p_neurons_class_4.json
2025-05-10 13:06:53,159 - INFO - üìå Layer 0: silencing 768 neurons for class 4
2025-05-10 13:06:53,161 - INFO - üìå Layer 1: silencing 768 neurons for class 4
2025-05-10 13:06:53,164 - INFO - üìå Layer 2: silencing 768 neurons for class 4
2025-05-10 13:06:53,167 - INFO - üìå Layer 3: silencing 768 neurons for class 4
2025-05-10 13:06:53,169 - INFO - üìå Layer 4: silencing 768 neurons for class 4
2025-05-10 13:06:53,171 - INFO - üìå Layer 5: silencing 768 neurons for class 4
2025-05-10 13:06:53,174 - INFO - üìå Layer 6: silencing 768 neurons for class 4
2025-05-10 13:06:53,176 - INFO - üìå Layer 7: silencing 768 neurons for class 4
2025-05-10 13:06:53,178 - INFO - üìå Layer 8: silencing 768 neurons for class 4
2025-05-10 13:06:53,181 - INFO - üìå Layer 9: silencing 768 neurons for class 4
2025-05-10

# Attacks

## FGSM

In [18]:
from torch.nn import CrossEntropyLoss

def run_fgsm_attack_and_evaluate(
    model,
    sample_df,
    labels_list,
    epsilon: float = 0.1,
    report_path: str = "results/classification_report_sample_eval.csv",
    experiment_title: str = None
):
    logger.info(f"‚öîÔ∏è Running FGSM attack with Œµ = {epsilon}")
    model.eval()
    predictions_fgsm = []
    loss_fn = CrossEntropyLoss()

    for i in range(len(sample_df)):
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids'], dtype=torch.long).unsqueeze(0).to(model.device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask'], dtype=torch.long).unsqueeze(0).to(model.device)
        true_label = torch.tensor([labels_list[i]], dtype=torch.long).to(model.device)

        # Get embeddings with gradient tracking
        embeds = model.bert.embeddings(input_ids_tensor).detach()
        embeds.requires_grad = True

        # Forward pass
        outputs = model(inputs_embeds=embeds, attention_mask=attention_mask_tensor)
        logits = outputs.logits
        loss = loss_fn(logits, true_label)

        # Backward to compute gradient
        model.zero_grad()
        loss.backward()

        # FGSM perturbation
        perturbation = epsilon * embeds.grad.data.sign()
        adv_embeds = embeds + perturbation

        # Inference with adversarial input
        with torch.no_grad():
            adv_outputs = model(inputs_embeds=adv_embeds, attention_mask=attention_mask_tensor)
            adv_logits = adv_outputs.logits
            pred = torch.argmax(adv_logits, dim=1).item()
            predictions_fgsm.append(pred)

        del input_ids_tensor, attention_mask_tensor, embeds, adv_embeds, outputs, adv_outputs, logits, adv_logits
        torch.cuda.empty_cache()

    # Evaluate
    accuracy = accuracy_score(labels_list, predictions_fgsm)
    f1 = f1_score(labels_list, predictions_fgsm, average='weighted')
    report_dict = classification_report(labels_list, predictions_fgsm, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose().round(4)
    report_df = report_df.drop("accuracy", errors="ignore")

    accuracy_row = pd.DataFrame({
        'precision': [""],
        'recall': [""],
        'f1-score': [accuracy],
        'support': [sum(report_df["support"])]
    }, index=["overall_accuracy"])
    final_df = pd.concat([report_df, accuracy_row])

    # Save report
    if experiment_title is None:
        experiment_title = f"FGSM Attack (Œµ = {epsilon})"
    if not os.path.exists(report_path):
        with open(report_path, "w") as f:
            f.write(f"# {experiment_title}\n")
            final_df.to_csv(f)
    else:
        with open(report_path, "a") as f:
            f.write(f"\n\n# {experiment_title}\n")
        final_df.to_csv(report_path, mode="a")

    logger.info(f"üéØ Accuracy under FGSM (Œµ={epsilon}): {accuracy:.4f}")
    logger.info(f"üìè Weighted F1 Score: {f1:.4f}")
    logger.info(f"üìã Classification report saved to {report_path}")


In [19]:
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
model.to("cpu")
device = torch.device("cpu")


In [20]:
run_fgsm_attack_and_evaluate(
    model=model,
    sample_df=sample_df,
    labels_list=labels_list,
    epsilon=0.1,
    experiment_title="FGSM Adversarial Attack with Œµ = 0.1"
)

2025-05-13 08:53:05,468 - INFO - ‚öîÔ∏è Running FGSM attack with Œµ = 0.1


KeyboardInterrupt: 

In [27]:
epsilon_list = [0.3, 0.6, 0.9, 1.2]
results_df = run_jsma_varying_epsilon(model, sample_df, labels_list, epsilon_list, k=100, device="cpu")


2025-04-24 23:17:30,660 - INFO - 
üß™ JSMA test with Œµ=0.3, k=100
2025-04-24 23:36:05,606 - INFO - üìâ JSMA attack results (Œµ=0.3, k=100):
2025-04-24 23:36:05,609 - INFO - Accuracy: 0.7667 | F1 Score: 0.7646
2025-04-24 23:36:05,610 - INFO - 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       0.71      1.00      0.83         5
           2       0.80      0.62      0.70        13
           3       1.00      1.00      1.00         4
           4       0.50      0.60      0.55         5

    accuracy                           0.77        30
   macro avg       0.80      0.84      0.81        30
weighted avg       0.78      0.77      0.76        30

2025-04-24 23:36:05,610 - INFO - 
üß™ JSMA test with Œµ=0.6, k=100
2025-04-24 23:54:30,429 - INFO - üìâ JSMA attack results (Œµ=0.6, k=100):
2025-04-24 23:54:30,432 - INFO - Accuracy: 0.7667 | F1 Score: 0.7646
2025-04-24 23:54:30,433 - INFO - 
              preci

## Random Noise

In [24]:
def run_random_noise_attack(
    model,
    sample_df,
    labels_list,
    epsilon=0.3,
    device="cpu"
):
    import torch
    from sklearn.metrics import accuracy_score, f1_score, classification_report

    logger.info(f"üé≤ Running random noise attack (epsilon={epsilon})")
    model.to(device)
    model.eval()
    predictions_noise = []

    for i in range(len(sample_df)):
        input_ids_tensor = torch.tensor(sample_df.loc[i, 'input_ids'], dtype=torch.long).unsqueeze(0).to(device)
        attention_mask_tensor = torch.tensor(sample_df.loc[i, 'attention_mask'], dtype=torch.long).unsqueeze(0).to(device)

        with torch.no_grad():
            embeddings = model.bert.embeddings(input_ids_tensor)
            noisy_embeddings = embeddings + epsilon * torch.randn_like(embeddings)

            outputs = model(inputs_embeds=noisy_embeddings, attention_mask=attention_mask_tensor)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()
            predictions_noise.append(pred)

    accuracy = accuracy_score(labels_list, predictions_noise)
    f1 = f1_score(labels_list, predictions_noise, average='weighted')
    report = classification_report(labels_list, predictions_noise)

    logger.info(f"üìâ Random noise attack results (epsilon={epsilon}):")
    logger.info(f"Accuracy: {accuracy:.4f} | F1 Score: {f1:.4f}")
    logger.info(f"\nClassification Report:\n{report}")

    return accuracy, f1, predictions_noise


In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, classification_report

epsilons = np.linspace(0.1, 1.0, 10)  # 10 valores de 0.1 a 1.0
results = []

for epsilon in epsilons:
    logger.info(f"\nüé≤ Running random noise attack with Œµ={epsilon:.2f}")

    acc_noise, f1_noise, preds_noise = run_random_noise_attack(
        model=model,
        sample_df=sample_df,
        labels_list=labels_list,
        epsilon=epsilon,
        device="cpu"
    )

    logger.info(f"üìâ Results for Œµ={epsilon:.2f} ‚Üí Accuracy: {acc_noise:.4f} | F1 Score: {f1_noise:.4f}")

    results.append({
        "epsilon": round(epsilon, 2),
        "accuracy": acc_noise,
        "f1_score": f1_noise
    })

# Convertimos resultados a dataframe
noise_sweep_df = pd.DataFrame(results)
noise_sweep_df.to_csv("results/random_noise_sweep_up_to_1.csv", index=False)

logger.info("üìÅ Random noise sweep results saved to 'results/random_noise_sweep_up_to_1.csv'")


2025-04-28 10:24:51,393 - INFO - 
üé≤ Running random noise attack with Œµ=0.10
2025-04-28 10:24:51,396 - INFO - üé≤ Running random noise attack (epsilon=0.1)
2025-04-28 10:28:28,133 - INFO - üìâ Random noise attack results (epsilon=0.1):
2025-04-28 10:28:28,136 - INFO - Accuracy: 0.8667 | F1 Score: 0.8667
2025-04-28 10:28:28,139 - INFO - 
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         5
           2       0.85      0.85      0.85        13
           3       1.00      1.00      1.00         4
           4       0.60      0.60      0.60         5

    accuracy                           0.87        30
   macro avg       0.89      0.89      0.89        30
weighted avg       0.87      0.87      0.87        30

2025-04-28 10:28:28,139 - INFO - üìâ Results for Œµ=0.10 ‚Üí Accuracy: 0.8667 | F1 Score: 0.8667
2025-04-28 10:28:28,140 - INFO - 
üé≤ Running