# 用户故事歧义检测综合分析

本notebook对7种不同类型的用户故事歧义进行综合分析和比较。

## 歧义类型概览
1. **语义歧义 (Semantic Ambiguity)** - 词汇或短语的多种含义解释
2. **范围歧义 (Scope Ambiguity)** - 功能边界和适用条件不明确
3. **角色歧义 (Actor Ambiguity)** - 参与者或系统角色不明确
4. **验收标准歧义 (Acceptance Ambiguity)** - 完成标准无法客观验证
5. **依赖歧义 (Dependency Ambiguity)** - 外部依赖或系统集成不明确
6. **优先级歧义 (Priority Ambiguity)** - 功能重要性或实现顺序不明确
7. **技术歧义 (Technical Ambiguity)** - 技术实现方案或架构不明确

## 分析目标
- 比较不同模型在各种歧义类型上的表现
- 分析各歧义类型的检测难度差异
- 发现模型的优势和不足
- 为实际应用提供指导建议

In [None]:
# 导入必要的库
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning for the rest of the session. This keeps the notebook
# output clean but also hides deprecation notices from pandas/matplotlib/seaborn.
warnings.filterwarnings('ignore')

# Display options: do not cap the number of columns shown when printing
# DataFrames, and do not impose a fixed display width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Plot styling: start from matplotlib's default style, then set the default
# figure size and base font size used by figures that do not override them.
plt.style.use('default')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

In [None]:
# 定义歧义类型和对应的文件名
# Keys of the seven ambiguity categories covered by this analysis.
AMBIGUITY_TYPES = [
    "semantic", "scope", "actor", "acceptance",
    "dependency", "priority", "technical",
]

# Evaluation-result file for each category. Every file follows the same
# "<category>_ambiguity_evaluation_results.json" naming pattern.
RESULT_FILES = {
    amb_type: f"{amb_type}_ambiguity_evaluation_results.json"
    for amb_type in AMBIGUITY_TYPES
}

# Models that analyze_model_strengths reports on (when present in the data).
MODEL_NAMES = ["gpt-3.5-turbo", "gemini-2.5-flash", "deepseek-chat"]

In [None]:
# 加载所有评估结果
def load_all_results(result_files=None, results_dir="."):
    """Load the per-ambiguity-type evaluation results from JSON files.

    Each file is expected to hold a JSON list with one entry per evaluated
    model. A file that is missing, unreadable, not valid JSON, or that does
    not contain a list is reported on stdout and skipped, so one bad file
    does not prevent the remaining ambiguity types from loading.

    Args:
        result_files: Mapping of ambiguity type to file name. Defaults to
            the module-level ``RESULT_FILES``.
        results_dir: Directory the file names are resolved against.
            Defaults to the current working directory.

    Returns:
        dict mapping ambiguity type to the list parsed from its file. Types
        whose file could not be loaded are absent from the dict.
    """
    if result_files is None:
        result_files = RESULT_FILES
    base_dir = Path(results_dir)

    all_results = {}
    for amb_type, filename in result_files.items():
        path = base_dir / filename
        # Keep the try body limited to the I/O and parsing that can fail.
        try:
            with open(path, 'r', encoding='utf-8') as f:
                results = json.load(f)
        except FileNotFoundError:
            print(f"✗ 未找到 {amb_type} 歧义结果文件: {path}")
            continue
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"✗ 加载 {amb_type} 歧义结果时出错: {e}")
            continue

        # Downstream code iterates the payload as a list of per-model records;
        # reject anything else here instead of failing later with an obscure error.
        if not isinstance(results, list):
            print(f"✗ {amb_type} 歧义结果格式无效: 期望列表，实际为 {type(results).__name__}")
            continue

        all_results[amb_type] = results
        print(f"✓ 加载 {amb_type} 歧义结果: {len(results)} 个模型")

    return all_results

# Load the results once at notebook level so the following cells can reuse them.
all_results = load_all_results()
# Only ambiguity types whose file loaded successfully are counted here.
print(f"\n成功加载 {len(all_results)} 种歧义类型的评估结果")

In [None]:
# 创建结果汇总表
def create_summary_table(all_results):
    """Flatten the loaded results into one long-format summary table.

    Args:
        all_results: Mapping of ambiguity type to a list of per-model
            records. Each record must provide ``'model'`` and a
            ``'metrics'`` dict with ``'precision'``, ``'recall'`` and
            ``'f1'``.

    Returns:
        pandas.DataFrame with one row per (ambiguity type, model) record and
        the columns ``Ambiguity Type`` (title-cased), ``Model``,
        ``Precision``, ``Recall`` and ``F1 Score``. The columns are present
        even when there are no rows, so callers can always select them.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    columns = ['Ambiguity Type', 'Model', 'Precision', 'Recall', 'F1 Score']

    summary_data = []
    for amb_type, results in all_results.items():
        for result in results:
            metrics = result['metrics']
            summary_data.append({
                'Ambiguity Type': amb_type.title(),
                'Model': result['model'],
                'Precision': metrics['precision'],
                'Recall': metrics['recall'],
                'F1 Score': metrics['f1'],
            })

    # Passing the columns explicitly keeps the schema stable for empty input;
    # pd.DataFrame([]) alone would produce a frame with no columns at all.
    return pd.DataFrame(summary_data, columns=columns)

# Build the long-format summary (one row per ambiguity type / model record).
summary_df = create_summary_table(all_results)
print("\n=== 评估结果汇总表 ===")
# Rounding applies to the printed copy only; summary_df keeps full precision.
print(summary_df.round(3))

In [None]:
# 创建综合可视化
def _draw_grouped_bars(ax, data, title, ylabel, alpha, rotate_xticks=False):
    """Draw ``data`` as a grouped bar chart on ``ax`` with the styling shared by the bar panels."""
    data.plot(kind='bar', ax=ax, alpha=alpha)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    # Put the legend outside the axes so it does not cover the bars.
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    if rotate_xticks:
        ax.tick_params(axis='x', rotation=45)


def create_comprehensive_visualization(summary_df):
    """Render the 3x3 overview figure, save it as a PNG and show it.

    Panels: (1) F1 heatmap of models vs ambiguity types, (2) mean metrics
    per model, (3) mean metrics per ambiguity type, (4-6) precision, recall
    and F1 per model grouped by ambiguity type, (7) number of ambiguity
    types each model wins on F1, (8-9) F1 box plots by model and by
    ambiguity type.

    Args:
        summary_df: Table with the columns ``Ambiguity Type``, ``Model``,
            ``Precision``, ``Recall`` and ``F1 Score``, holding at most one
            row per (model, ambiguity type) pair.

    Returns:
        None. Writes ``comprehensive_ambiguity_analysis.png`` to the current
        working directory. Nothing is drawn or written when ``summary_df``
        is empty.

    Raises:
        ValueError: If a (model, ambiguity type) pair occurs more than once
            (raised by ``DataFrame.pivot``).
    """
    if summary_df.empty:
        # There is nothing to pivot or plot; bail out before touching matplotlib.
        print("没有可用的评估结果，跳过可视化。")
        return

    main_title = 'User Story Ambiguity Detection: Comprehensive Analysis'
    metrics = ['Precision', 'Recall', 'F1 Score']
    fig = plt.figure(figsize=(20, 15))

    # Model x ambiguity-type matrix for each metric, reused by panels 1 and 4-6.
    by_model = {
        metric: summary_df.pivot(index='Model', columns='Ambiguity Type', values=metric)
        for metric in metrics
    }

    # 1. Overall F1 heatmap.
    ax1 = fig.add_subplot(3, 3, 1)
    sns.heatmap(by_model['F1 Score'], annot=True, fmt='.3f', cmap='RdYlBu_r', ax=ax1,
                cbar_kws={'label': 'F1 Score'})
    ax1.set_title('F1 Score Heatmap\n(Models vs Ambiguity Types)')
    ax1.set_xlabel('')
    ax1.set_ylabel('')

    # 2. Mean metrics per model.
    _draw_grouped_bars(
        fig.add_subplot(3, 3, 2),
        summary_df.groupby('Model')[metrics].mean(),
        'Average Performance by Model', 'Score', alpha=0.8,
    )

    # 3. Mean metrics per ambiguity type (lower scores = harder to detect).
    _draw_grouped_bars(
        fig.add_subplot(3, 3, 3),
        summary_df.groupby('Ambiguity Type')[metrics].mean(),
        'Average Detection Difficulty by Ambiguity Type', 'Score', alpha=0.8,
        rotate_xticks=True,
    )

    # 4-6. Precision, recall and F1 per model, grouped by ambiguity type.
    for position, metric in enumerate(metrics, start=4):
        _draw_grouped_bars(
            fig.add_subplot(3, 3, position),
            by_model[metric],
            f'{metric} Comparison\n(by Ambiguity Type)', metric, alpha=0.7,
        )

    # 7. How many ambiguity types each model wins on F1.
    ax7 = fig.add_subplot(3, 3, 7)
    best_rows = summary_df.loc[summary_df.groupby('Ambiguity Type')['F1 Score'].idxmax()]
    win_counts = best_rows['Model'].value_counts()
    medal_colors = ['gold', 'silver', '#CD7F32']  # gold / silver / bronze
    win_counts.plot(kind='bar', ax=ax7, color=medal_colors[:len(win_counts)])
    ax7.set_title('Best Model Count\n(Number of ambiguity types where model performs best)')
    ax7.set_ylabel('Count')
    ax7.grid(axis='y', linestyle='--', alpha=0.7)

    # 8-9. F1 distribution by model and by ambiguity type.
    for position, group_col, rotate in ((8, 'Model', False), (9, 'Ambiguity Type', True)):
        ax = fig.add_subplot(3, 3, position)
        summary_df.boxplot(column='F1 Score', by=group_col, ax=ax)
        ax.set_title(f'F1 Score Distribution\nby {group_col}')
        ax.set_xlabel('')
        ax.set_ylabel('F1 Score')
        if rotate:
            ax.tick_params(axis='x', rotation=45)

    # pandas' grouped boxplot replaces the figure-level title with
    # "Boxplot grouped by ...", so the main title is applied only after the
    # box plots have been drawn.
    fig.suptitle(main_title, fontsize=16, fontweight='bold')

    fig.tight_layout()
    fig.savefig('comprehensive_ambiguity_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

# Draw, save and display the overview figure for the summary table built above.
create_comprehensive_visualization(summary_df)

In [None]:
# 分析最佳和最差表现
def analyze_extremes(summary_df):
    """Print the best and worst F1 results.

    Reports, in order: the single best and single worst row overall, the
    top-scoring model for each ambiguity type, and the top-scoring ambiguity
    type for each model. All rankings use the ``F1 Score`` column.

    Args:
        summary_df: Table with ``Ambiguity Type``, ``Model`` and
            ``F1 Score`` columns.
    """
    def report_row(heading, row):
        # Shared layout for the overall best / worst entries.
        print(heading)
        print(f"  模型: {row['Model']}")
        print(f"  歧义类型: {row['Ambiguity Type']}")
        print(f"  F1分数: {row['F1 Score']:.3f}")

    print("\n=== 最佳和最差表现分析 ===")

    # Overall extremes.
    report_row("\n最佳整体表现:", summary_df.loc[summary_df['F1 Score'].idxmax()])
    report_row("\n最差整体表现:", summary_df.loc[summary_df['F1 Score'].idxmin()])

    # Winner per ambiguity type.
    print("\n各歧义类型的最佳模型:")
    top_per_type = summary_df.loc[summary_df.groupby('Ambiguity Type')['F1 Score'].idxmax()]
    for amb_type, model, f1 in zip(top_per_type['Ambiguity Type'],
                                   top_per_type['Model'],
                                   top_per_type['F1 Score']):
        print(f"  {amb_type}: {model} (F1: {f1:.3f})")

    # Strongest ambiguity type per model.
    print("\n各模型的最佳歧义类型:")
    top_per_model = summary_df.loc[summary_df.groupby('Model')['F1 Score'].idxmax()]
    for model, amb_type, f1 in zip(top_per_model['Model'],
                                   top_per_model['Ambiguity Type'],
                                   top_per_model['F1 Score']):
        print(f"  {model}: {amb_type} (F1: {f1:.3f})")

# Print the best/worst F1 results for the summary table built above.
analyze_extremes(summary_df)

In [None]:
# 检测难度分析
def analyze_difficulty(summary_df):
    """Print how hard each ambiguity type is to detect and how much models agree.

    Three reports are printed: mean/std of every metric per ambiguity type,
    the types ranked by mean F1 (easy > 0.7, medium > 0.5, hard otherwise),
    and the coefficient of variation (std / mean) of F1 across models per
    type as a consistency measure.

    The coefficient of variation is undefined when a type has fewer than two
    rows (std is NaN) or a mean F1 of 0. Such types are reported as not
    computable instead of being labelled inconsistent.

    Args:
        summary_df: Table with ``Ambiguity Type``, ``Precision``, ``Recall``
            and ``F1 Score`` columns.
    """
    print("\n=== 歧义类型检测难度分析 ===")

    by_type = summary_df.groupby('Ambiguity Type')

    # Mean and standard deviation of each metric per ambiguity type.
    type_stats = by_type.agg({
        'Precision': ['mean', 'std'],
        'Recall': ['mean', 'std'],
        'F1 Score': ['mean', 'std']
    }).round(3)

    print("\n各歧义类型的平均性能统计:")
    print(type_stats)

    # Rank ambiguity types from easiest to hardest by mean F1.
    f1_by_type = by_type['F1 Score']
    mean_f1 = f1_by_type.mean()

    print("\n按检测难度排序（从易到难，按平均F1分数）:")
    for i, (amb_type, f1) in enumerate(mean_f1.sort_values(ascending=False).items(), 1):
        difficulty = "容易" if f1 > 0.7 else "中等" if f1 > 0.5 else "困难"
        print(f"  {i}. {amb_type}: F1={f1:.3f} ({difficulty})")

    # Coefficient of variation across models: lower means the models agree more.
    cv_by_type = f1_by_type.std() / mean_f1

    print("\n模型间一致性（变异系数，越小越一致）:")
    # sort_values() places NaN entries last, so undefined values end the list.
    for amb_type, cv in cv_by_type.sort_values().items():
        if pd.isna(cv):
            # NaN fails every "<" comparison and would otherwise fall through
            # to the "inconsistent" label.
            print(f"  {amb_type}: CV=N/A (无法计算：样本不足或平均F1为0)")
            continue
        consistency = "高度一致" if cv < 0.1 else "比较一致" if cv < 0.2 else "不一致"
        print(f"  {amb_type}: CV={cv:.3f} ({consistency})")

# Print the per-type difficulty ranking and cross-model consistency report.
analyze_difficulty(summary_df)

In [None]:
# 模型优势分析
def analyze_model_strengths(summary_df):
    """Print a per-model profile for every model listed in ``MODEL_NAMES``.

    For each listed model that appears in ``summary_df`` the report shows
    its mean precision, recall and F1, its best and worst ambiguity type by
    F1, and whether its mean precision or mean recall dominates. Models that
    are in the data but not in ``MODEL_NAMES`` are not reported.

    Args:
        summary_df: Table with ``Ambiguity Type``, ``Model``, ``Precision``,
            ``Recall`` and ``F1 Score`` columns.
    """
    print("\n=== 模型优势分析 ===")

    for model in MODEL_NAMES:
        # Skip configured models that have no rows in the summary.
        if model not in summary_df['Model'].values:
            continue

        rows = summary_df[summary_df['Model'] == model]

        # Mean of each metric over all ambiguity types.
        mean_precision = rows['Precision'].mean()
        mean_recall = rows['Recall'].mean()
        mean_f1 = rows['F1 Score'].mean()

        # Rows holding this model's highest and lowest F1.
        strongest = rows.loc[rows['F1 Score'].idxmax()]
        weakest = rows.loc[rows['F1 Score'].idxmin()]

        print(f"\n{model} 模型分析:")
        print(f"  平均精确率: {mean_precision:.3f}")
        print(f"  平均召回率: {mean_recall:.3f}")
        print(f"  平均F1分数: {mean_f1:.3f}")
        print(f"  最佳表现: {strongest['Ambiguity Type']} (F1: {strongest['F1 Score']:.3f})")
        print(f"  最差表现: {weakest['Ambiguity Type']} (F1: {weakest['F1 Score']:.3f})")

        # Classify by whichever of mean precision / mean recall is larger;
        # "balanced" is reached only when neither is greater than the other.
        if mean_precision > mean_recall:
            style = "精确型（更注重准确性）"
        elif mean_recall > mean_precision:
            style = "召回型（更注重覆盖性）"
        else:
            style = "平衡型"

        print(f"  模型特点: {style}")

# Print the per-model profile for the models listed in MODEL_NAMES.
analyze_model_strengths(summary_df)

In [None]:
# 保存综合分析结果
def save_comprehensive_results(summary_df):
    """Write the summary table and a JSON report to the working directory.

    Outputs:
        ``comprehensive_ambiguity_summary.csv``: ``summary_df`` without its
        index, encoded as UTF-8 with a BOM.
        ``comprehensive_ambiguity_analysis_report.json``: timestamp, the
        configured number of ambiguity types and models, the best and worst
        rows by F1, mean F1 per model and per ambiguity type, and every
        summary row.

    The closing status lines also mention the PNG chart, which this function
    does not write itself.

    Args:
        summary_df: Table with ``Ambiguity Type``, ``Model`` and
            ``F1 Score`` columns.
    """
    # Summary table as CSV.
    summary_df.to_csv('comprehensive_ambiguity_summary.csv', index=False, encoding='utf-8-sig')

    # Assemble the report top-down so the JSON keys keep their order.
    report = {"analysis_date": pd.Timestamp.now().isoformat()}
    report["total_ambiguity_types"] = len(AMBIGUITY_TYPES)
    report["total_models"] = len(MODEL_NAMES)

    best_row = summary_df.loc[summary_df['F1 Score'].idxmax()]
    worst_row = summary_df.loc[summary_df['F1 Score'].idxmin()]
    mean_f1_per_model = summary_df.groupby('Model')['F1 Score'].mean()
    mean_f1_per_type = summary_df.groupby('Ambiguity Type')['F1 Score'].mean()
    report["summary_statistics"] = {
        "best_overall_performance": best_row.to_dict(),
        "worst_overall_performance": worst_row.to_dict(),
        "average_f1_by_model": mean_f1_per_model.to_dict(),
        "average_f1_by_type": mean_f1_per_type.to_dict(),
    }
    report["detailed_results"] = summary_df.to_dict('records')

    # default=str stringifies any value the json module cannot encode natively.
    with open('comprehensive_ambiguity_analysis_report.json', 'w', encoding='utf-8') as out_file:
        json.dump(report, out_file, ensure_ascii=False, indent=2, default=str)

    for line in (
        "\n=== 保存文件 ===",
        "✓ comprehensive_ambiguity_summary.csv - 汇总数据表",
        "✓ comprehensive_ambiguity_analysis_report.json - 详细分析报告",
        "✓ comprehensive_ambiguity_analysis.png - 可视化图表",
    ):
        print(line)

# Persist the summary CSV and the JSON report to the working directory.
save_comprehensive_results(summary_df)

In [None]:
# 实用建议
def provide_recommendations(summary_df):
    """Print practical recommendations derived from the summary table.

    Sections printed: (1) the model with the highest mean F1 across
    ambiguity types, (2) the easiest and hardest ambiguity type by mean F1,
    (3) a usage scenario per model depending on whether its mean precision
    exceeds its mean recall, and (4) a fixed list of general improvement
    suggestions.

    Args:
        summary_df: Table with ``Ambiguity Type``, ``Model``, ``Precision``,
            ``Recall`` and ``F1 Score`` columns. When it is empty only the
            heading and a notice are printed.
    """
    print("\n=== 实用建议 ===")

    if summary_df.empty:
        # idxmax/idxmin raise on empty input, so report the situation instead.
        print("没有可用的评估结果，无法提供建议。")
        return

    # Mean of every metric per model, shared by sections 1 and 3.
    model_means = summary_df.groupby('Model')[['Precision', 'Recall', 'F1 Score']].mean()

    # 1. Overall best model. The selection criterion is the highest mean F1,
    #    so the message states exactly that rather than claiming stability.
    best_model = model_means['F1 Score'].idxmax()
    print("\n1. 模型选择建议:")
    print(f"   推荐使用 {best_model} 作为主要检测模型，因为它在所有歧义类型上的平均F1分数最高。")

    # 2. Easiest and hardest ambiguity type by mean F1.
    avg_f1 = summary_df.groupby('Ambiguity Type')['F1 Score'].mean()
    easiest_type = avg_f1.idxmax()
    hardest_type = avg_f1.idxmin()

    print("\n2. 歧义检测难度:")
    print(f"   最容易检测: {easiest_type} (平均F1: {avg_f1.max():.3f})")
    print(f"   最难检测: {hardest_type} (平均F1: {avg_f1.min():.3f})")
    print(f"   建议: 对于{hardest_type}，可能需要更专门的提示词或人工复核。")

    # 3. Precision-leaning models suit low-false-positive use; all others
    #    (including ties) are treated as recall-leaning.
    print("\n3. 应用场景建议:")
    for model, means in model_means.iterrows():
        if means['Precision'] > means['Recall']:
            scenario = "适合需要高准确性的场景，避免误报"
        else:
            scenario = "适合需要全面检测的场景，避免漏报"
        print(f"   {model}: {scenario}")

    # 4. General suggestions (not data dependent).
    print("\n4. 改进建议:")
    print("   - 针对困难歧义类型开发专门的提示词模板")
    print("   - 考虑使用模型集成方法提高整体性能")
    print("   - 建立人工验证流程，特别是对于高重要性项目")
    print("   - 定期更新和优化检测模型")

# Print the recommendations derived from the summary table.
provide_recommendations(summary_df)

In [None]:
# 运行完整分析
def run_complete_analysis():
    """Run the whole pipeline end to end: load, summarize, plot, analyze, save, recommend.

    Stops after a notice when ``load_all_results`` returns nothing. Otherwise
    every step receives the same summary table, in the order listed below.
    """
    print("开始用户故事歧义检测综合分析...")
    print(f"分析 {len(AMBIGUITY_TYPES)} 种歧义类型")
    print(f"评估 {len(MODEL_NAMES)} 个模型")

    loaded = load_all_results()
    if not loaded:
        # Nothing could be loaded, so there is nothing to analyze.
        print("没有找到有效的评估结果，请先运行各个歧义类型的检测notebook。")
        return

    table = create_summary_table(loaded)

    # Each step takes the summary table as its only argument.
    pipeline = (
        create_comprehensive_visualization,  # figure
        analyze_extremes,                    # best / worst results
        analyze_difficulty,                  # per-type difficulty
        analyze_model_strengths,             # per-model profile
        save_comprehensive_results,          # CSV + JSON output
        provide_recommendations,             # closing advice
    )
    for step in pipeline:
        step(table)

    print("\n综合分析完成！")

# 运行分析
# run_complete_analysis()  # 取消注释以运行完整分析