In [1]:
!nvidia-smi


Thu Oct 23 19:01:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.124.04             Driver Version: 570.124.04     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA vGPU-48GB               On  |   00000000:5B:00.0 Off |                  Off |
|  0%   34C    P8             20W /  425W |       1MiB /  49140MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
# ====================================================================
# |      !!! 终极解决方案：“直捣黄龙”之最终极补丁 !!!             |
# ====================================================================
import sys
import os
from unittest.mock import patch
import types
from importlib.machinery import ModuleSpec

# --- 补丁1：注入“全功能”伪模块 ---
def create_full_fake_module(name, attributes_to_add):
    spec = ModuleSpec(name, None)
    module = types.ModuleType(name)
    module.__spec__ = spec
    for attr in attributes_to_add:
        setattr(module, attr, lambda *args, **kwargs: None)
    return module

sys.modules['num2words'] = create_full_fake_module('num2words', ['num2words'])
sys.modules['word2number'] = create_full_fake_module('word2number', ['w2n'])
print(">>> 补丁1生效：已成功注入“全功能”伪模块。")

# --- 补丁2：“直捣黄龙”，直接替换nltk.download函数 ---
def dummy_nltk_download(*args, **kwargs):
    print(">>> 补丁2生效：已成功拦截并跳过 nltk.download() 调用！<<<")
    return True # 返回成功状态

patcher = patch('nltk.download', dummy_nltk_download)
patcher.start()
print(">>> 补丁2生效：已成功替换 nltk.download 函数。")


# --- 补丁3：手动为NLTK“指路” ---
nltk_data_dir = os.path.expanduser('~/nltk_data')
import nltk
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)
    print(f"成功将 '{nltk_data_dir}' 添加到NLTK的搜索路径。")

# --- 补丁4：设置其他环境变量 ---
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
sys.path.append('..')
# ====================================================================

import torch
import pandas as pd
from tqdm.auto import tqdm

# --- 正常导入所有模块 ---
# 导入在 defenses.py 中定义的类
from src.defenses import BasePredictor, NoDefense, AhpDefense, SelfDenoiseDefense
from src.utils.data_loader import load_sst2_dataset
from src.models.model_loader import load_main_llm
from src.attacks import AttackerWrapper
from src.utils.metrics import calculate_accuracy, calculate_asr

print("\n--- 恭喜！环境设置最终完成，所有模块导入成功！---")

>>> 补丁1生效：已成功注入“全功能”伪模块。
>>> 补丁2生效：已成功替换 nltk.download 函数。


2025-10-23 19:10:24.442419: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-23 19:10:24.496396: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-23 19:10:25.397942: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
  import pkg_resourc


--- 恭喜！环境设置最终完成，所有模块导入成功！---


In [2]:
# --- 1. 实验设置 ---
TASK = 'sst2'
DATASET_NAME = 'SST-2'
NUM_SAMPLES_TO_TEST = 50 
ATTACK_RECIPE = 'textbugger' # 可以切换为 'textbugger'

In [3]:
# --- 2. 加载模型和数据 ---
# 请确保这里的路径是您正在使用的 alpaca-7b 模型的正确路径
local_model_path = "/root/autodl-tmp/circulus_alpaca-7b"
# 使用4-bit量化加载模型，这是节省显存的第一道防线
main_model, main_tokenizer = load_main_llm(model_name=local_model_path, use_4bit=True)
# from transformers import AutoModelForSequenceClassification, AutoTokenizer

# # 我们不再使用 load_main_llm，而是直接加载一个标准的分类模型
# MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

# print(f"正在加载一个更小的、用于公平对比的基础模型: {MODEL_NAME}...")

# 加载模型和分词器
main_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
main_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
main_model.to("cuda") # 将模型移动到GPU

dataset = load_sst2_dataset(split='validation').select(range(NUM_SAMPLES_TO_TEST))
dataset_df = dataset.to_pandas()

正在加载一个更小的、用于公平对比的基础模型: distilbert-base-uncased-finetuned-sst-2-english...
正在加载SST-2数据集 (validation split)...
SST-2数据集加载成功。


In [4]:
# --- 3. 初始化防御策略和攻击器 ---
# class SimplePredictor:
#     def __init__(self, model, tokenizer):
#         self.model = model
#         self.tokenizer = tokenizer
#         self.device = model.device

#     def predict(self, sentence):
#         inputs = self.tokenizer(sentence, return_tensors="pt", truncation=True, padding=True).to(self.device)
#         with torch.no_grad():
#             logits = self.model(**inputs).logits
        
#         predicted_class_id = torch.argmax(logits, dim=1).item()
#         # 对于 SST-2, 0 是 'negative', 1 是 'positive'
#         return "positive" if predicted_class_id == 1 else "negative"
# 创建基础预测器，所有防御和攻击都将基于它
# 这里的 BasePredictor 来自于您提供的 defenses.py 文件
base_predictor = BasePredictor(main_model, main_tokenizer, task=TASK)
# base_predictor = SimplePredictor(main_model, main_tokenizer)

print("基础模型加载并适配完成！")

# ===================== 关键修正：降低AHP防御的显存消耗 =====================
# 将 m_val 从 10 大幅减少到 5。
# 这是解决 CUDA_ERROR_INVALID_HANDLE (显存不足) 错误的核心步骤。
# 因为 AHP 防御需要一次性生成 m_val 个候选句子，这个值过大会耗尽显存。
AHP_M_VAL = 5 
# =======================================================================

# 定义要对比的防御策略
defenses = {
    "No Defense (Baseline)": NoDefense(base_predictor),
    "AHP-NLI Defense": AhpDefense(base_predictor, k_val=3, m_val=AHP_M_VAL),
    "Self-Denoise Defense": SelfDenoiseDefense(base_predictor, num_samples=10)
}

# 初始化攻击器
# 这里的 AttackerWrapper 来自于您提供的 attacks/attacks.py 文件
attacker = AttackerWrapper(base_predictor)

基础模型加载并适配完成！
正在加载NLI模型: roberta-large-mnli...


Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NLI模型加载成功。


In [5]:
# --- 4. 生成对抗样本 ---
# 注意：这一步会非常慢！
adversarial_df = attacker.attack(dataset, attack_recipe_name=ATTACK_RECIPE)
print(f"\nTextAttack成功生成了 {len(adversarial_df)} / {NUM_SAMPLES_TO_TEST} 个对抗样本。")

# --- 4.5. 对齐攻击数据 ---
attack_map = pd.Series(adversarial_df.perturbed_text.values, index=adversarial_df.original_text).to_dict()
full_perturbed_texts = [attack_map.get(sent, sent) for sent in dataset_df['sentence']]
print(f"已生成与原始数据对齐的完整对抗样本列表，长度为: {len(full_perturbed_texts)}")

textattack: Unknown if model of class <class 'transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification'> compatible with goal function <class 'textattack.goal_functions.classification.untargeted_classification.UntargetedClassification'>.


Attack(
  (search_method): GreedyWordSwapWIR(
    (wir_method):  delete
  )
  (goal_function):  UntargetedClassification
  (transformation):  CompositeTransformation(
    (0): WordSwapRandomCharacterInsertion(
        (random_one):  True
      )
    (1): WordSwapRandomCharacterDeletion(
        (random_one):  True
      )
    (2): WordSwapNeighboringCharacterSwap(
        (random_one):  True
      )
    (3): WordSwapHomoglyphSwap
    (4): WordSwapEmbedding(
        (max_candidates):  5
        (embedding):  WordEmbedding
      )
    )
  (constraints): 
    (0): UniversalSentenceEncoder(
        (metric):  angular
        (threshold):  0.8
        (window_size):  inf
        (skip_text_shorter_than_window):  False
        (compare_against_original):  True
      )
    (1): RepeatModification
    (2): StopwordModification
  (is_black_box):  True
) 



I0000 00:00:1761217883.415565    4091 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 44808 MB memory:  -> device: 0, name: NVIDIA vGPU-48GB, pci bus id: 0000:5b:00.0, compute capability: 8.9
[Succeeded / Failed / Skipped / Total] 32 / 17 / 1 / 50: 100%|██████████| 50/50 [00:12<00:00,  4.02it/s]


+-------------------------------+--------+
| Attack Results                |        |
+-------------------------------+--------+
| Number of successful attacks: | 32     |
| Number of failed attacks:     | 17     |
| Number of skipped attacks:    | 1      |
| Original accuracy:            | 98.0%  |
| Accuracy under attack:        | 34.0%  |
| Attack success rate:          | 65.31% |
| Average perturbed word %:     | 12.94% |
| Average num. words per input: | 16.32  |
| Avg num queries:              | 38.2   |
+-------------------------------+--------+
正在使用 textbugger (黑盒模式) 生成对抗样本...





  0%|          | 0/50 [00:00<?, ?it/s]


TextAttack成功生成了 50 / 50 个对抗样本。
已生成与原始数据对齐的完整对抗样本列表，长度为: 50


In [6]:
# --- 5. 评估所有防御策略 ---
results = []
baseline_clean_preds = []

for defense_name, defense_method in defenses.items():
    print(f"\n{'='*20} 正在评估防御策略: {defense_name} {'='*20}")

    # a. 评估Clean Accuracy (在原始数据上)
    clean_preds = [defense_method(text) for text in tqdm(dataset_df['sentence'], desc="Clean Eval")]
    clean_accuracy = calculate_accuracy(dataset_df['label_text'], clean_preds)

    # b. 评估Accuracy under Attack (在对抗样本上)
    attack_preds = [defense_method(text) for text in tqdm(full_perturbed_texts, desc="Attack Eval")]
    attack_accuracy = calculate_accuracy(dataset_df['label_text'], attack_preds)

    # c. 计算ASR
    if not baseline_clean_preds:
        # 确保基线预测只计算一次
        baseline_clean_preds = clean_preds if defense_name == "No Defense (Baseline)" else \
                               [defenses["No Defense (Baseline)"](text) for text in tqdm(dataset_df['sentence'], desc="Baseline Eval")]

    attack_success_rate = calculate_asr(baseline_clean_preds, attack_preds, dataset_df['label_text'].tolist())

    results.append({
        "防御方法 (Defense)": defense_name,
        "原始准确率 (Clean Acc)": clean_accuracy,
        "攻击后准确率 (Attack Acc)": attack_accuracy,
        "攻击成功率 (ASR)": attack_success_rate
    })




Clean Eval:   0%|          | 0/50 [00:00<?, ?it/s]

Attack Eval:   0%|          | 0/50 [00:00<?, ?it/s]




Clean Eval:   0%|          | 0/50 [00:00<?, ?it/s]

ValueError: Expected input batch_size (1) to match target batch_size (12).

In [8]:
# --- 6. 展示结果 ---
results_df = pd.DataFrame(results)
print("\n实验二：经验鲁棒性评估 - 结果汇总")
print("=" * 70)
print(results_df.to_string(index=False))


实验二：经验鲁棒性评估 - 结果汇总
       防御方法 (Defense)  原始准确率 (Clean Acc)  攻击后准确率 (Attack Acc)  攻击成功率 (ASR)
No Defense (Baseline)               0.78                 0.78     0.000000
      AHP-NLI Defense               0.84                 0.88     0.076923
 Self-Denoise Defense               0.80                 0.84     0.025641


In [9]:
# --- 7. 保存结果 ---
if not os.path.exists('../results'):
    os.makedirs('../results')

# --- 自动编号并保存结果 ---
if not os.path.exists('../results'):
    os.makedirs('../results')

base_path = f'../results/experiment_2_robustness_{ATTACK_RECIPE}'
extension = '.csv'
save_path = f"{base_path}{extension}"

counter = 1
while os.path.exists(save_path):
    save_path = f"{base_path}_{counter}{extension}"
    counter += 1

results_df.to_csv(save_path, index=False)
print(f"\n实验结果已成功保存到: {save_path}")


实验结果已成功保存到: ../results/experiment_2_robustness_textbugger_3.csv
