# 词义消歧 (WSD) 分析笔记本

本笔记本用于分析和比较不同词义消歧方法的性能。

In [None]:
# 导入必要的库
import sys
sys.path.insert(0, '../src')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from wsd import LeskWSD, BERTContextWSD, KnowledgeEnhancedWSD, GraphBasedWSD
from evaluation import WSDEvaluator, evaluate_wsd
from utils.data_loader import WSDDataLoader

## 1. 加载数据

In [None]:
# Load the dataset registered under the name "sample" via the project loader.
loader = WSDDataLoader()
test_data = loader.load(dataset="sample")

# Report how many samples were loaded, then preview the first three of them.
print(f"加载了 {len(test_data)} 个样本")
print("\n示例数据:")
preview = test_data[:3]
for record in preview:
    # Each record is read by key: sentence, target word, and gold sense key.
    print(f"  句子: {record['sentence']}")
    print(f"  目标词: {record['target_word']}")
    print(f"  词义: {record['sense_key']}")
    print()

## 2. Lesk算法演示

In [None]:
# Build a Lesk disambiguator with use_examples switched on.
lesk = LeskWSD(use_examples=True)

# (sentence, target word) pairs; "bank" and "star" each appear in two
# different contexts so the predicted senses can be compared by eye.
test_sentences = [
    ("I went to the bank to deposit money.", "bank"),
    ("The river bank was covered with flowers.", "bank"),
    ("The movie star attended the premiere.", "star"),
    ("The bright star is visible tonight.", "star"),
]

# Separator lines reused by the report below.
header_rule = "=" * 60
item_rule = "-" * 60

print("Lesk算法词义消歧结果:")
print(header_rule)

for text, target in test_sentences:
    # Positional call: sentence first, target word second.
    outcome = lesk.disambiguate(text, target)
    print(f"句子: {text}")
    print(f"目标词: {target}")
    print(f"词义: {outcome.sense_key}")
    print(f"定义: {outcome.definition}")
    print(f"置信度: {outcome.confidence:.3f}")
    print(item_rule)

## 3. 方法对比

In [None]:
# WSD systems under comparison, keyed by the display name used in the report.
# 'Lesk' uses the constructor defaults; 'Lesk+Examples' passes use_examples=True.
methods = {
    'Lesk': LeskWSD(),
    'Lesk+Examples': LeskWSD(use_examples=True),
    'Graph': GraphBasedWSD(),
}

# One shared evaluator scores every system; results maps name -> evaluation.
evaluator = WSDEvaluator()
results = {}

for name, model in methods.items():
    print(f"评估 {name}...")

    # Run the model over every test sample, in dataset order.
    predictions = [
        model.disambiguate(
            context=sample['sentence'],
            target_word=sample['target_word'],
        )
        for sample in test_data
    ]
    # Gold sense keys, aligned index-by-index with the predictions.
    gold_labels = [sample['sense_key'] for sample in test_data]

    results[name] = evaluator.evaluate(predictions, gold_labels)

print("\n评估完成!")

## 4. 结果可视化

In [None]:
# Gather method names and their accuracies in matching (insertion) order.
method_names = list(results)
accuracies = [results[name].accuracy for name in method_names]

# Draw one bar per method.
plt.figure(figsize=(10, 6))
bar_colors = ['#3498db', '#2ecc71', '#9b59b6']
bars = plt.bar(method_names, accuracies, color=bar_colors)

# Write each accuracy just above its bar, centred horizontally.
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.01
    plt.text(label_x, label_y, f'{acc:.3f}',
             ha='center', va='bottom', fontsize=12)

# Axis labels, title, and a y-range with headroom above 1.0 for the labels.
plt.xlabel('方法', fontsize=12)
plt.ylabel('准确率', fontsize=12)
plt.title('WSD方法准确率对比', fontsize=14)
plt.ylim(0, 1.1)
plt.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 5. 错误分析

In [None]:
# Inspect the samples that a default-configured Lesk instance gets wrong.
lesk = LeskWSD()

print("错误案例分析:")
print("=" * 60)

for sample in test_data:
    result = lesk.disambiguate(
        context=sample['sentence'],
        target_word=sample['target_word'],
    )

    # Correct predictions are skipped; only mismatches are reported.
    if result.sense_key == sample['sense_key']:
        continue

    print(f"句子: {sample['sentence']}")
    print(f"目标词: {sample['target_word']}")
    print(f"正确词义: {sample['sense_key']}")
    print(f"预测词义: {result.sense_key}")
    print(f"预测定义: {result.definition}")
    print("-" * 60)

## 6. 词义分布分析

In [None]:
# Sense-inventory size for each distinct target word in the test data.
# NOTE(review): Counter is imported here but not used within this cell.
from collections import Counter

# Maps target word -> number of senses returned by the loader.
word_senses = {}
for sample in test_data:
    word = sample['target_word']
    # Look each word up only once, the first time it is seen.
    if word in word_senses:
        continue
    word_senses[word] = len(loader.get_word_senses(word))

print("词义数量统计:")
for lemma, n_senses in word_senses.items():
    print(f"  {lemma}: {n_senses} 个词义")

## 7. 总结

本笔记本分析了不同WSD方法的性能。主要发现：

1. Lesk算法作为基线方法，实现简单但效果有限
2. 使用例句扩展（`use_examples=True`）通常可以提高准确率，具体幅度以第 3、4 节的评估结果为准
3. 基于图的方法可以利用语义关系提升性能
4. 对于多义词，上下文信息至关重要