# 实验 3.4：文本对抗攻击

## 实验目标
- 理解文本对抗攻击的独特挑战
- 实现简单的同义词替换攻击
- 观察词重要性对攻击效果的影响

## 实验环境
- Python 3.8+
- transformers（情感分析模型，需要 PyTorch 等深度学习后端）
- numpy、matplotlib（数值计算与可视化）

## 预计时间：30 分钟

---

## 核心概念回顾
文本由离散的词（token）组成，无法像图像那样对像素值施加人眼难以察觉的连续微小扰动。因此，文本对抗攻击通常通过替换词语或字符来实现，同时尽量保持原句的语义不变。

## 第一部分：环境准备

In [None]:
# 导入必要的库
import numpy as np
import matplotlib.pyplot as plt
from transformers import pipeline

# Configure matplotlib fonts so the Chinese labels used in the plots below
# can be displayed, and keep the minus sign renderable with those fonts.
plt.rcParams.update({
    'font.sans-serif': ['SimHei', 'Arial Unicode MS'],
    'axes.unicode_minus': False,
})

# Load the sentiment-analysis pipeline that the helper functions call.
print("正在加载情感分析模型...")
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
print("模型加载完成！")

In [None]:
# Synonym dictionary (simplified; a real application could use WordNet).
# For teaching purposes the synonyms of a few common words are listed by hand.
# Keys are lowercase words; each value is the list of candidate replacements.
SYNONYMS = {
    # Positive words
    "good": ["nice", "fine", "decent", "okay"],
    "great": ["wonderful", "excellent", "fantastic", "superb"],
    "amazing": ["incredible", "astonishing", "remarkable", "stunning"],
    "love": ["adore", "enjoy", "appreciate", "like"],
    "excellent": ["outstanding", "superb", "magnificent", "exceptional"],
    "beautiful": ["lovely", "gorgeous", "attractive", "pretty"],
    "happy": ["pleased", "delighted", "content", "joyful"],
    "best": ["finest", "greatest", "top", "premier"],
    
    # Negative words
    "bad": ["poor", "terrible", "awful", "horrible"],
    "hate": ["dislike", "despise", "detest", "loathe"],
    "terrible": ["awful", "dreadful", "horrible", "atrocious"],
    "boring": ["dull", "tedious", "monotonous", "uninteresting"],
    "worst": ["poorest", "lowest", "weakest", "inferior"],
    
    # Neutral words
    "movie": ["film", "picture", "cinema", "flick"],
    "watch": ["see", "view", "observe", "witness"],
    "think": ["believe", "feel", "consider", "suppose"],
}

def get_synonyms(word):
    """Look up ``word`` case-insensitively in ``SYNONYMS``.

    Returns the synonym list stored for the lowercased word, or an empty
    list when the dictionary has no entry for it.
    """
    key = word.lower()
    if key in SYNONYMS:
        return SYNONYMS[key]
    return []

# Report the dictionary size and show one example lookup.
entry_count = len(SYNONYMS)
print(f"同义词词典包含 {entry_count} 个词条")
great_synonyms = get_synonyms('great')
print(f"示例：'great' 的同义词 = {great_synonyms}")

In [None]:
# 辅助函数
def analyze_sentiment(text):
    """Classify ``text`` and return ``(label, score)``.

    Both values are taken from the first result that ``sentiment_analyzer``
    returns for the text.
    """
    top_prediction = sentiment_analyzer(text)[0]
    label = top_prediction['label']
    confidence = top_prediction['score']
    return label, confidence

def get_sentiment_scores(text):
    """Return the confidence that ``text`` is POSITIVE.

    The classifier reports a score for the label it predicted. When that
    label is POSITIVE the score is returned unchanged; for any other label
    the complement ``1 - score`` is returned.
    """
    prediction = sentiment_analyzer(text)[0]
    predicted_label, confidence = prediction['label'], prediction['score']
    # Express every prediction on the POSITIVE scale.
    if predicted_label != 'POSITIVE':
        return 1 - confidence
    return confidence

# Smoke-test the classifier on three short example sentences.
test_texts = [
    "This movie is great!",
    "This movie is terrible.",
    "The weather is nice today."
]

print("情感分析测试：")
# map() is lazy, so each sentence is classified right before its line is printed.
for text, (label, score) in zip(test_texts, map(analyze_sentiment, test_texts)):
    print(f"  '{text}' → {label} ({score:.2%})")

## 第二部分：词重要性分析

In [None]:
# [Exercise 1] Implement the word-importance scoring function.
# Hint: delete each word in turn and observe how the model output changes.

def compute_word_importance(text):
    """
    Score how much each word of ``text`` contributes to the prediction.

    Method: leave-one-out deletion. Each whitespace-separated word is removed
    in turn and the score of the shortened text is compared with the score
    of the full text, both obtained from ``get_sentiment_scores``.

    Args:
        text: Input sentence; it is split into words with ``str.split()``.

    Returns:
        A list of ``(word, importance)`` tuples in the original word order.
        ``importance`` is the absolute difference between the two scores, or
        0 when removing the word would leave an empty text.
    """
    words = text.split()
    original_score = get_sentiment_scores(text)
    importance_scores = []
    
    for i, word in enumerate(words):
        # Build the text with the i-th word removed.
        words_without_i = words[:i] + words[i+1:]
        modified_text = ' '.join(words_without_i)
        
        if modified_text.strip():  # make sure the text is not empty
            # [Exercise 1] Compute the sentiment score after deleting this word.
            # Reference answer: modified_score = get_sentiment_scores(modified_text)
            # NOTE: the blank below is the exercise placeholder; it must be
            # replaced before this function can run.
            modified_score = ___________________
            
            # Importance = absolute change in the score caused by the deletion.
            importance = abs(original_score - modified_score)
        else:
            importance = 0
        
        importance_scores.append((word, importance))
    
    return importance_scores

# Score one example sentence and list its words from most to least important.
test_sentence = "This movie is absolutely great and I love it"
importance = compute_word_importance(test_sentence)

print(f"原句：'{test_sentence}'")
original_sentiment = analyze_sentiment(test_sentence)
print(f"原始情感：{original_sentiment}")
print("\n词重要性排序：")
ranked_words = sorted(importance, key=lambda pair: pair[1], reverse=True)
for word, score in ranked_words:
    print(f"  {word}: {score:.4f}")

In [None]:
# Visualise the per-word importance scores computed for ``test_sentence``.
words = [w for w, _ in importance]
scores = [s for _, s in importance]

# Plot against integer positions rather than the word strings themselves:
# matplotlib treats string x-values as categories, so a word that occurs
# twice in the sentence would be drawn on top of its first occurrence.
positions = range(len(words))

# Words whose removal shifts the score by more than this are highlighted.
highlight_threshold = 0.1

plt.figure(figsize=(12, 4))
colors = ['red' if s > highlight_threshold else 'steelblue' for s in scores]
plt.bar(positions, scores, color=colors)
plt.xlabel('词')
plt.ylabel('重要性分数')
plt.title(f'词重要性分析\n"{test_sentence}"')
# Label each bar with its word, in sentence order.
plt.xticks(positions, words, rotation=45)
plt.tight_layout()
plt.show()

print("红色标记的词对情感判断影响最大，是攻击的优先目标")

## 第三部分：同义词替换攻击

In [None]:
# [Exercise 2] Implement the synonym-substitution attack.
# Hint: replace words in order of importance until the attack succeeds.

def synonym_attack(text, target_label=None):
    """
    Greedy synonym-substitution attack.

    Words are visited from most to least important (as ranked by
    ``compute_word_importance``). For each word that has synonyms, every
    candidate is tried in place; the first candidate that reaches the attack
    goal ends the search. Otherwise the candidate that lowers the confidence
    in the original label the most (relative to the text built so far) is
    kept, and the search moves on to the next word.

    Args:
        text: Original text; it is split into words with ``str.split()``.
        target_label: Label the attack should reach. If ``None``, the goal is
            simply to change the currently predicted label. If the text is
            already classified as ``target_label``, it is returned unchanged
            with success ``True`` and no replacements.

    Returns:
        A tuple ``(adversarial_text, success, replacements)`` where
        ``replacements`` is a list of ``(original_word, synonym)`` pairs in
        the order they were applied.
    """
    original_label, original_conf = analyze_sentiment(text)

    def goal_reached(label):
        # Untargeted: any label flip counts. Targeted: only the requested label.
        if target_label is None:
            return label != original_label
        return label == target_label

    def original_label_conf(label, conf):
        # analyze_sentiment reports the confidence of the *predicted* label.
        # Assumes a two-class model, where the other label gets 1 - conf.
        return conf if label == original_label else 1 - conf

    if goal_reached(original_label):
        # Only possible for a targeted attack whose target is already predicted.
        return text, True, []

    # Rank word positions by importance, most important first.
    importance = compute_word_importance(text)
    sorted_importance = sorted(enumerate(importance),
                               key=lambda x: x[1][1], reverse=True)

    current_text = text
    current_words = text.split()
    # Confidence in the original label for the text built so far. Candidates
    # are compared against this (not against the untouched input) so that a
    # later substitution can never undo the progress of an earlier one.
    current_conf = original_conf
    replacements = []

    # Try substitutions in order of importance.
    for idx, (word, _imp_score) in sorted_importance:
        synonyms = get_synonyms(word)
        if not synonyms:
            continue

        best_synonym = None
        best_conf = current_conf

        # Try every synonym of this word.
        for synonym in synonyms:
            # [Exercise 2] Build the text with the candidate substituted in.
            # Reference answer:
            # test_words = current_words.copy()
            # test_words[idx] = synonym
            # test_text = ' '.join(test_words)
            # NOTE: the blank below is the exercise placeholder; it must be
            # replaced before this function can run.
            test_words = current_words.copy()
            test_words[idx] = ___________________
            test_text = ' '.join(test_words)

            new_label, new_conf = analyze_sentiment(test_text)

            # Stop as soon as one substitution reaches the goal.
            if goal_reached(new_label):
                replacements.append((word, synonym))
                return test_text, True, replacements

            # Remember the candidate that lowers confidence in the original
            # label the most. The same direction applies whether the original
            # label is POSITIVE or NEGATIVE, because the score always refers
            # to the predicted label.
            candidate_conf = original_label_conf(new_label, new_conf)
            if candidate_conf < best_conf:
                best_conf = candidate_conf
                best_synonym = synonym

        # Keep the best substitution for this position, if any helped. No
        # re-classification is needed: this candidate was already evaluated
        # above and did not reach the goal.
        if best_synonym is not None:
            replacements.append((word, best_synonym))
            current_words[idx] = best_synonym
            current_text = ' '.join(current_words)
            current_conf = best_conf

    return current_text, False, replacements

In [None]:
# Run the attack on a few example sentences and report the prediction
# before and after the substitutions.
test_sentences = [
    "This movie is great and I love it!",
    "The food was excellent and delicious.",
    "I had an amazing experience at this hotel.",
    "This is the worst product I have ever bought.",
]

print("同义词替换攻击测试：")
print("=" * 70)

for sentence in test_sentences:
    orig_label, orig_conf = analyze_sentiment(sentence)
    adv_text, success, replacements = synonym_attack(sentence)
    adv_label, adv_conf = analyze_sentiment(adv_text)

    verdict = '攻击成功 ✓' if success else '攻击失败 ✗'
    report_lines = [
        f"\n原句: {sentence}",
        f"原始: {orig_label} ({orig_conf:.2%})",
        f"替换: {replacements}",
        f"对抗: {adv_text}",
        f"结果: {adv_label} ({adv_conf:.2%}) - {verdict}",
    ]
    print("\n".join(report_lines))

## 第四部分：攻击效果分析

In [None]:
# [Exercise 3] Measure the attack success rate over a batch of samples.
# Hint: run the attack on several samples and count the successes.

test_samples = [
    "This is a great movie with excellent acting.",
    "I love this product, it works amazingly well.",
    "The service was wonderful and staff was friendly.",
    "This book is boring and poorly written.",
    "Terrible experience, would not recommend.",
    "The best restaurant I have ever visited.",
    "Amazing quality and fast delivery.",
    "Worst customer service ever.",
]

# Counters: number of successful attacks, and the number of substituted
# words summed over the successful attacks only.
success_count = 0
total_replacements = 0

print("批量攻击测试：")
print("-" * 60)

for sample in test_samples:
    # [Exercise 3] Run the attack and record the outcome.
    # Reference answer: adv_text, success, replacements = synonym_attack(sample)
    # NOTE: the blank below is the exercise placeholder; it must be replaced
    # before this cell can run.
    adv_text, success, replacements = ___________________
    
    if success:
        success_count += 1
        total_replacements += len(replacements)
        # Only the first 40 characters of the sample are shown.
        print(f"✓ {sample[:40]}... -> 替换 {len(replacements)} 个词")
    else:
        print(f"✗ {sample[:40]}...")

print("-" * 60)
print(f"攻击成功率：{success_count}/{len(test_samples)} ({success_count/len(test_samples)*100:.1f}%)")
# The average is taken over successful attacks, so skip it when there are none.
if success_count > 0:
    print(f"平均替换词数：{total_replacements/success_count:.1f}")

In [None]:
# Compare the POSITIVE-confidence of the first five samples before and
# after the synonym attack in a grouped bar chart.
original_scores, adversarial_scores, labels_list = [], [], []

for sample in test_samples[:5]:
    original_scores.append(get_sentiment_scores(sample))
    adv_text = synonym_attack(sample)[0]
    adversarial_scores.append(get_sentiment_scores(adv_text))
    # Tick label: the first 20 characters of the sample.
    labels_list.append(sample[:20] + "...")

x = np.arange(len(labels_list))
width = 0.35
half_width = width / 2

fig, ax = plt.subplots(figsize=(12, 5))
# One pair of bars per sample: original on the left, attacked on the right.
bars1 = ax.bar(x - half_width, original_scores, width, label='原始', color='steelblue')
bars2 = ax.bar(x + half_width, adversarial_scores, width, label='攻击后', color='coral')

# Horizontal reference line at 0.5.
ax.axhline(y=0.5, color='gray', linestyle='--', label='决策边界')
ax.set(ylabel='正面情感置信度', title='同义词替换攻击效果', ylim=(0, 1))
ax.set_xticks(x)
ax.set_xticklabels(labels_list, rotation=45, ha='right')
ax.legend()

plt.tight_layout()
plt.show()

## 第五部分：字符级攻击演示

In [None]:
# Character-level attack: swap Latin letters for look-alike Cyrillic letters.
# The Cyrillic characters are written as escapes so that the visually
# confusable code points are explicit in the source. Each entry maps a
# lowercase Latin letter to its lowercase Cyrillic look-alike.
HOMOGLYPHS = {
    'a': '\u0430',  # CYRILLIC SMALL LETTER A
    'e': '\u0435',  # CYRILLIC SMALL LETTER IE
    'o': '\u043e',  # CYRILLIC SMALL LETTER O
    'p': '\u0440',  # CYRILLIC SMALL LETTER ER
    'c': '\u0441',  # CYRILLIC SMALL LETTER ES
    'x': '\u0445',  # CYRILLIC SMALL LETTER HA
}


def homoglyph_attack(text, prob=0.3):
    """Randomly replace letters of ``text`` with look-alike characters.

    Args:
        text: Input text.
        prob: Probability with which each replaceable character is swapped.
            Defaults to 0.3.

    Returns:
        A string of the same length as ``text``. Characters without an entry
        in ``HOMOGLYPHS`` are copied unchanged. A replaced uppercase letter
        gets the uppercase form of its look-alike, so the visible case of
        the text is preserved.
    """
    pieces = []
    for char in text:
        replacement = HOMOGLYPHS.get(char.lower())
        # Draw a random number only for replaceable characters.
        if replacement is not None and np.random.random() < prob:
            if char.isupper():
                # Keep the capital: e.g. 'A' -> CYRILLIC CAPITAL LETTER A.
                replacement = replacement.upper()
            pieces.append(replacement)
        else:
            pieces.append(char)
    return ''.join(pieces)

# Demo: attack one sentence and compare the classifier output on both versions.
test_text = "This movie is excellent"
homoglyph_text = homoglyph_attack(test_text)

print("字符级攻击演示：")
print(f"原始文本：{test_text}")
print(f"攻击文本：{homoglyph_text}")

original_analysis = analyze_sentiment(test_text)
print(f"\n原始分析：{original_analysis}")
attacked_analysis = analyze_sentiment(homoglyph_text)
print(f"攻击分析：{attacked_analysis}")
print("\n注意：两段文本看起来几乎一样，但包含不同的 Unicode 字符")

## 实验总结

### 观察记录

请回答以下问题：

1. **词重要性分析有什么用？** 为什么要优先替换重要性高的词？

2. **同义词替换的局限是什么？** 什么情况下攻击会失败？

3. **字符级攻击和词级攻击有什么区别？** 各自的优缺点是什么？

### 核心概念回顾

- **文本对抗的挑战**：离散性，不能做微小扰动
- **词重要性**：通过删除词观察模型变化来评估
- **同义词替换**：保持语义的同时改变模型判断
- **字符级攻击**：使用外观相同的同形字符替换原字符，文本看起来几乎不变，但模型的分词结果会改变，从而绕过关键词检测或影响模型判断

---

**模块三实验完成！** 你已经学习了图像和文本两种模态的对抗攻击技术。