# 语义角色标注 (SRL) 分析笔记本

本笔记本用于分析和比较不同语义角色标注方法的性能。

In [None]:
# Import the libraries used by the notebook.
import sys
# Put ../src at the front of the module search path; presumably this is where
# the project packages imported below (srl, evaluation, utils) live -- verify
# against the repository layout.
sys.path.insert(0, '../src')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from srl import SyntaxBasedSRL, BiLSTMCRFSRL
from evaluation import SRLEvaluator, evaluate_srl
from utils.data_loader import SRLDataLoader

## 1. 加载数据

In [None]:
# Load the "sample" dataset through the project data loader.
loader = SRLDataLoader()
test_data = loader.load(dataset="sample")

print(f"加载了 {len(test_data)} 个样本")
print("\n示例数据:")

# Preview the first two samples: sentence text, predicate (with its index)
# and every annotated argument.
for record in test_data[:2]:
    joined_words = ' '.join(record['words'])
    print(f"  句子: {joined_words}")

    predicate = record['predicate']
    predicate_position = record['predicate_index']
    print(f"  谓词: {predicate} (位置: {predicate_position})")

    print("  论元:")
    for argument in record['arguments']:
        role = argument['role']
        text = argument['text']
        print(f"    [{role}] {text}")
    print()

## 2. 基于句法的SRL演示

In [None]:
# Instantiate the syntax-based labeller; later cells reuse `srl`.
srl = SyntaxBasedSRL()

# Sentences to run through the labeller.
test_sentences = [
    "The cat ate the fish.",
    "She gave him a beautiful book yesterday.",
    "John bought a new car in Tokyo last week.",
    "The teacher asked the students to read the book carefully.",
]

heavy_rule = "=" * 60
light_rule = "-" * 40

print("SRL结果:")
print(heavy_rule)

for text in test_sentences:
    print(f"\n句子: {text}")
    print(light_rule)

    results = srl.predict(text)

    # Each item returned by predict() is read for its predicate, the
    # predicate's index and its list of arguments.
    for frame in results:
        predicate = frame.predicate
        position = frame.predicate_index
        print(f"谓词: {predicate} (位置: {position})")
        print("论元:")
        for argument in frame.arguments:
            role = argument.role
            span_text = argument.text
            print(f"  [{role}] {span_text}")

## 3. 依存句法树可视化

In [None]:
# Print whatever visualize_tree() returns for one example sentence
# (the dependency tree view of this section).
sentence = "The cat ate the fish in the garden."

tree_view = srl.visualize_tree(sentence)
print(tree_view)

## 4. BIO标签示例

In [None]:
# Demonstrate the BIO tag format on a hand-built result.
from srl import SRLResult, SemanticRole

# (role, text, span) triples passed positionally to SemanticRole.
# NOTE(review): spans look like half-open token index ranges -- confirm
# against SemanticRole's definition.
argument_specs = (
    ("ARG0", "The cat", (0, 2)),
    ("ARG1", "the fish", (3, 5)),
    ("ARGM-LOC", "in the garden", (5, 8)),
)
args = [SemanticRole(role, text, span) for role, text, span in argument_specs]

# Assemble the SRLResult by hand.
result = SRLResult(
    sentence="The cat ate the fish in the garden.",
    words=["The", "cat", "ate", "the", "fish", "in", "the", "garden", "."],
    predicate="ate",
    predicate_index=2,
    arguments=args
)

# Convert the result to BIO tags; they are zipped with the words below.
bio_tags = result.to_bio_tags()

row_template = "{:<12} {:<15}"
divider = "-" * 40

print("BIO标签格式:")
print(divider)
print(row_template.format('词', '标签'))
print(divider)
for token, label in zip(result.words, bio_tags):
    print(row_template.format(token, label))

## 5. 评估

In [None]:
# Evaluate the syntax-based labeller on the loaded test data.
# The banner is emitted with one print call: title line, then a 60-char rule.
print("评估SyntaxBasedSRL:", "=" * 60, sep="\n")

result = evaluate_srl(srl, test_data, verbose=True)

## 6. 按角色分析

In [None]:
# Tally how often each role label occurs across all gold arguments.
from collections import Counter

role_counts = Counter(
    argument['role']
    for record in test_data
    for argument in record['arguments']
)

# Bar chart of the tally; roles appear in first-seen order.
roles = list(role_counts)
counts = list(role_counts.values())

plt.figure(figsize=(10, 6))
bars = plt.bar(roles, counts, color='steelblue')

# Write each count just above the top of its bar, centred horizontally.
for rect, value in zip(bars, counts):
    x_center = rect.get_x() + rect.get_width() / 2
    y_above = rect.get_height() + 0.1
    plt.text(
        x_center,
        y_above,
        str(value),
        ha='center',
        va='bottom',
        fontsize=10,
    )

plt.xlabel('语义角色', fontsize=12)
plt.ylabel('数量', fontsize=12)
plt.title('语义角色分布', fontsize=14)
plt.xticks(rotation=45)
plt.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## 7. 语义角色说明

In [None]:
from srl.base import SEMANTIC_ROLES

# Print the role -> description table in sorted order, with the role
# left-aligned in a 15-character column.
print("常见语义角色说明:")
print("=" * 60)

glossary_rows = sorted(SEMANTIC_ROLES.items())
for role_name, role_description in glossary_rows:
    print("{:<15} {}".format(role_name, role_description))

## 8. 错误分析

In [None]:
# Show gold annotations next to the labeller's predictions for the first
# three samples so mismatches can be inspected by eye.
heavy_rule = "=" * 60
light_rule = "-" * 40

print("预测结果对比:")
print(heavy_rule)

for record in test_data[:3]:
    # The labeller takes a plain string, so rebuild the sentence by joining
    # the sample's tokens with single spaces.
    sentence = ' '.join(record['words'])
    print(f"\n句子: {sentence}")
    print(light_rule)

    # Gold annotation.
    print("真实标注:")
    gold_predicate = record['predicate']
    print(f"  谓词: {gold_predicate}")
    for gold_argument in record['arguments']:
        gold_role = gold_argument['role']
        gold_text = gold_argument['text']
        print(f"  [{gold_role}] {gold_text}")

    # Prediction.
    results = srl.predict(sentence)
    print("\n预测结果:")
    for frame in results:
        predicted_predicate = frame.predicate
        print(f"  谓词: {predicted_predicate}")
        for predicted_argument in frame.arguments:
            predicted_role = predicted_argument.role
            predicted_text = predicted_argument.text
            print(f"  [{predicted_role}] {predicted_text}")

## 9. 总结

本笔记本分析了语义角色标注的方法和结果。主要发现：

1. 基于句法的方法能够有效识别主要论元（ARG0, ARG1）
2. 附加论元（如时间、地点）的识别依赖于介词和命名实体
3. 复杂句式可能需要更高级的方法来处理
4. 被动语态需要特殊处理来正确识别施事者