# 用户故事依赖歧义检测分析

本notebook用于分析大模型在检测用户故事依赖歧义方面的性能。

## 依赖歧义定义
依赖歧义是指用户故事中的依赖关系不明确，导致对功能的实现顺序、前置条件或与其他组件/功能的关系产生理解偏差。例如：
- 缺乏对前置条件的明确说明
- 与其他用户故事的依赖关系不清晰
- 系统组件间的依赖模糊
- 数据依赖关系不明确

In [None]:
import pandas as pd
import json
import time
from typing import Dict, List, Tuple
import matplotlib.pyplot as plt
import numpy as np
import re
from openai import OpenAI
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
import os

# Connection settings for the OpenAI-compatible gateway used by call_llm.
# The API key is taken from the OPENAI_API_KEY environment variable instead of
# being committed with the notebook. Any key that was previously hard-coded
# here must be considered leaked and should be revoked.
# If the variable is unset the value is None and client construction fails;
# call_llm reports that failure for every request.
CONFIG = {
    "base_url": "https://api.zhizengzeng.com/v1/",
    "api_key": os.environ.get("OPENAI_API_KEY"),
}

# Identifiers of the models to evaluate.
MODELS = [
    "gpt-3.5-turbo",
    "claude-sonnet-4-20250514",
    "gemini-2.5-flash",
    "grok-3-mini",
    "deepseek-chat",
    "qwen3-coder-plus",
]

In [None]:
# Load the labelled user-story dataset from the 'User_Stories' sheet.
print("Loading user story ambiguity dataset...")
df = pd.read_excel(
    r"data/User Story Ambiguity Dataset_A Comprehensive Research Resource/Cornelius_2025_user_story_ambiguity_dataset.xlsx",
    sheet_name='User_Stories',
)

print(f"数据集形状: {df.shape}")
print(f"依赖歧义统计: {df['DependencyAmbiguity'].value_counts()}")

# First three stories of each class, kept as one-column frames.
dependency_examples = df.loc[df['DependencyAmbiguity'] == True, ['StoryText']].head(3)
non_dependency_examples = df.loc[df['DependencyAmbiguity'] == False, ['StoryText']].head(3)

# Print the ambiguous examples first, then the unambiguous ones.
for heading, examples in (
    ("\n依赖歧义示例:", dependency_examples),
    ("\n无依赖歧义示例:", non_dependency_examples),
):
    print(heading)
    for i, story in enumerate(examples['StoryText'], start=1):
        print(f"{i}. {story}")

In [None]:
def get_dependency_ambiguity_prompt(story_text: str) -> str:
    """
    Build the prompt that asks a model whether a user story has dependency ambiguity.

    The prompt gives a background definition, a role, the task, ten
    characteristics of dependency ambiguity, the story itself, and the JSON
    shape the answer must follow.

    Args:
        story_text: The user story to embed in the prompt.

    Returns:
        The complete prompt text with ``story_text`` inserted.
    """
    # Doubled braces are literal braces in the rendered prompt; the only
    # substitution field is {story_text}.
    template = """
**背景**: 依赖歧义是指用户故事中的依赖关系不明确，导致对功能的实现顺序、前置条件或与其他组件/功能的关系产生理解偏差。

**角色**: 你是一名专业的自然语言处理专家，专门检测软件需求规格中的依赖歧义。

**任务**: 分析以下用户故事，判断其中是否存在依赖歧义。

**依赖歧义的特征包括**:
1. 缺乏对前置条件的明确说明
2. 与其他用户故事的依赖关系不清晰
3. 系统组件间的依赖模糊
4. 数据依赖关系不明确
5. 技术依赖缺乏具体描述
6. 外部系统依赖不明确
7. 资源依赖关系不清晰
8. 时序依赖关系模糊
9. 业务流程依赖不明确
10. 环境依赖条件缺失

**用户故事**: {story_text}

**输出要求**:
请按照以下JSON格式输出你的分析结果：
{{
    "has_dependency_ambiguity": true/false,
    "ambiguity_explanation": "如果存在依赖歧义，请解释歧义的具体内容和依赖关系不明确的地方；如果不存在依赖歧义，请说明为什么用户故事的依赖关系是清晰的",
    "dependency_types": "如果存在歧义，请指出哪些类型的依赖关系不明确（如前置条件、技术依赖、数据依赖等）；如果不存在歧义，请说明已明确的依赖关系",
    "missing_dependencies": "如果存在歧义，请列出缺失或不明确的依赖信息；如果不存在歧义，请填写'依赖关系清晰，无缺失'",
    "suggested_improvement": "如果存在依赖歧义，请提出改进建议；如果不存在依赖歧义，请填写'无依赖歧义，无需改进'"
}}

**注意事项**:
- has_dependency_ambiguity的值只能是true或false
- ambiguity_explanation应详细说明依赖歧义的原因
- dependency_types应明确指出依赖关系的类型
- missing_dependencies应列出缺失的依赖信息
- suggested_improvement应提供具体的改进建议
"""
    return template.format(story_text=story_text)

In [None]:
def _fallback_prediction(reason: str) -> Dict:
    """Return the placeholder prediction used when no usable model answer exists."""
    return {
        "has_dependency_ambiguity": True,
        "ambiguity_explanation": reason,
        "dependency_types": reason,
        "missing_dependencies": reason,
        "suggested_improvement": reason,
    }


def _extract_json_object(content):
    """Return the JSON object embedded in ``content``, or None if none can be decoded."""
    # The API may return no text at all (content is None).
    if not isinstance(content, str):
        return None

    match = re.search(r'\{.*\}', content, re.DOTALL)
    if not match:
        return None

    try:
        parsed = json.loads(match.group())
    except json.JSONDecodeError:
        # The greedy match runs to the LAST '}' in the reply, so trailing text
        # that contains braces breaks decoding. Retry by decoding only the
        # first JSON value that starts at the first '{'.
        try:
            parsed, _ = json.JSONDecoder().raw_decode(content[match.start():])
        except json.JSONDecodeError:
            return None

    return parsed if isinstance(parsed, dict) else None


def call_llm(prompt: str, model: str) -> Dict:
    """
    Send ``prompt`` to ``model`` and return the JSON object found in its reply.

    The request goes through the OpenAI-compatible endpoint configured in
    ``CONFIG``. This function does not raise: request failures and replies
    that cannot be parsed both yield a placeholder prediction.

    Args:
        prompt: The full prompt text, sent as a single user message.
        model: The model identifier passed to the API.

    Returns:
        The decoded JSON object from the reply. If the request fails, a
        placeholder dict whose text fields read "API调用失败: <error>". If the
        reply contains no decodable JSON object, a placeholder dict whose text
        fields read "无法解析模型响应". Both placeholders set
        ``has_dependency_ambiguity`` to True.
    """
    # Only the network round-trip is treated as an "API failure"; parsing
    # problems are reported separately below so they are not mislabelled.
    try:
        client = OpenAI(api_key=CONFIG["api_key"], base_url=CONFIG["base_url"])
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.1
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort boundary: one failed request must not abort
        # the whole evaluation run.
        print(f"API调用失败 ({model}): {str(e)}")
        return _fallback_prediction(f"API调用失败: {str(e)}")

    parsed_result = _extract_json_object(content)
    if parsed_result is None:
        return _fallback_prediction("无法解析模型响应")
    return parsed_result

In [None]:
def evaluate_dependency_detection(y_true: List[bool], y_pred: List[bool]) -> Dict:
    """
    Compute binary-classification metrics for dependency-ambiguity detection.

    Labels are interpreted by truthiness and paired positionally; if the two
    sequences differ in length, the extra items of the longer one are ignored.

    Args:
        y_true: Ground-truth labels (truthy means the story is ambiguous).
        y_pred: Predicted labels, in the same order as ``y_true``.

    Returns:
        A dict with ``precision``, ``recall``, ``f1_score``, ``accuracy`` and
        the confusion-matrix counts ``tp``, ``fp``, ``fn``, ``tn``. Any metric
        whose denominator is zero (including accuracy on empty input) is 0.
    """
    pairs = [(bool(t), bool(p)) for t, p in zip(y_true, y_pred)]
    total = len(pairs)

    tp = sum(1 for t, p in pairs if t and p)
    fp = sum(1 for t, p in pairs if not t and p)
    fn = sum(1 for t, p in pairs if t and not p)
    tn = total - tp - fp - fn

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    # Guarded like the other metrics so that empty input does not raise.
    accuracy = (tp + tn) / total if total > 0 else 0

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score,
        "accuracy": accuracy,
        "tp": tp,
        "fp": fp,
        "fn": fn,
        "tn": tn
    }

In [None]:
def _coerce_prediction_flag(value) -> bool:
    """Interpret a model-supplied ``has_dependency_ambiguity`` value as a bool."""
    if isinstance(value, str):
        # A reply such as "false" is a non-empty string and therefore truthy;
        # map explicit negative spellings to False so they are not counted as
        # positive predictions. Other strings keep their truthy meaning.
        return value.strip().lower() not in {"false", "no", "0", ""}
    return bool(value)


def process_dependency_dataset(df_subset: pd.DataFrame, model: str) -> Dict:
    """
    Run one model over every story in ``df_subset`` and score its predictions.

    For each row a prompt is built from ``StoryText`` and sent to the model
    via ``call_llm``; fields missing from the answer are filled with
    placeholders. The predictions are then scored against the
    ``DependencyAmbiguity`` column.

    Args:
        df_subset: Rows to evaluate. The columns ``StoryID``, ``StoryText``
            and ``DependencyAmbiguity`` are read.
        model: The model identifier passed to ``call_llm``.

    Returns:
        A dict with ``model``, ``predictions`` (one record per story) and
        ``metrics`` (the result of ``evaluate_dependency_detection``).
    """
    results = {
        "model": model,
        "predictions": [],
        "metrics": {}
    }

    print(f"Processing model {model}...")

    # Placeholders for any field the model's answer leaves out.
    missing_field_defaults = {
        "has_dependency_ambiguity": True,
        "ambiguity_explanation": "模型未提供解释",
        "dependency_types": "模型未提供依赖类型信息",
        "missing_dependencies": "模型未提供缺失依赖信息",
        "suggested_improvement": "模型未提供改进建议",
    }

    total = len(df_subset)
    # Count rows by position, not by index label: the index is not guaranteed
    # to be 0..n-1 (or even numeric), so it cannot be used for progress output.
    for position, (_, row) in enumerate(df_subset.iterrows(), start=1):
        story_text = row['StoryText']
        # Stored as a plain bool so the record serialises as a JSON boolean
        # even if the cell holds a NumPy scalar.
        true_has_ambiguity = bool(row['DependencyAmbiguity'])

        prompt = get_dependency_ambiguity_prompt(story_text)
        prediction = call_llm(prompt, model)

        for field, placeholder in missing_field_defaults.items():
            prediction.setdefault(field, placeholder)

        results["predictions"].append({
            "story_id": row['StoryID'],
            "story_text": story_text,
            "true_has_ambiguity": true_has_ambiguity,
            "pred_has_ambiguity": _coerce_prediction_flag(prediction["has_dependency_ambiguity"]),
            "ambiguity_explanation": prediction["ambiguity_explanation"],
            "dependency_types": prediction["dependency_types"],
            "missing_dependencies": prediction["missing_dependencies"],
            "suggested_improvement": prediction["suggested_improvement"]
        })

        # Progress message every ten stories.
        if position % 10 == 0:
            print(f"  Processed {position}/{total} samples")

    # Score the predictions against the ground truth.
    true_labels = [item["true_has_ambiguity"] for item in results["predictions"]]
    pred_labels = [item["pred_has_ambiguity"] for item in results["predictions"]]

    results["metrics"] = evaluate_dependency_detection(true_labels, pred_labels)

    metrics = results["metrics"]
    print(f"  Results - Precision: {metrics['precision']:.3f}, Recall: {metrics['recall']:.3f}, F1: {metrics['f1_score']:.3f}, Accuracy: {metrics['accuracy']:.3f}")

    return results

In [None]:
def create_dependency_visualization(results: List[Dict]):
    """
    Plot a 2x2 comparison figure of the evaluated models.

    The panels are: precision and recall as grouped bars, F1 score, accuracy,
    and a radar chart of all four metrics. The figure is saved to
    'dependency_ambiguity_detection_results.png' and then shown.

    Args:
        results: One dict per model, each with a ``model`` name and a
            ``metrics`` dict holding ``precision``, ``recall``, ``f1_score``
            and ``accuracy``.
    """
    models = [r["model"] for r in results]

    # Per-model metric series, in the same order as ``models``.
    precisions = [r["metrics"]["precision"] for r in results]
    recalls = [r["metrics"]["recall"] for r in results]
    f1_scores = [r["metrics"]["f1_score"] for r in results]
    accuracies = [r["metrics"]["accuracy"] for r in results]

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    x = np.arange(len(models))
    width = 0.35

    # Precision and recall as side-by-side bars.
    ax_pr = axes[0, 0]
    ax_pr.bar(x - width/2, precisions, width, label='Precision', alpha=0.8, color='wheat')
    ax_pr.bar(x + width/2, recalls, width, label='Recall', alpha=0.8, color='lightpink')
    ax_pr.set_xlabel('Model')
    ax_pr.set_ylabel('Score')
    ax_pr.set_title('依赖歧义检测 - 精确率与召回率')
    ax_pr.set_xticks(x)
    ax_pr.set_xticklabels(models, rotation=45, ha='right')
    ax_pr.legend()
    ax_pr.grid(axis='y', linestyle='--', alpha=0.7)
    ax_pr.set_ylim(0, 1)

    # F1 score and accuracy share the same single-series bar layout.
    single_metric_panels = (
        (axes[0, 1], f1_scores, 'F1 Score', '依赖歧义检测 - F1分数', 'peachpuff'),
        (axes[1, 0], accuracies, 'Accuracy', '依赖歧义检测 - 准确率', 'lavender'),
    )
    for ax, values, ylabel, title, color in single_metric_panels:
        ax.bar(models, values, alpha=0.8, color=color)
        ax.set_xlabel('Model')
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(axis='y', linestyle='--', alpha=0.7)
        ax.set_ylim(0, 1)

    # Radar chart: four evenly spaced spokes, with the first angle repeated
    # at the end so each polygon closes.
    angles = np.linspace(0, 2 * np.pi, 4, endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))

    # plt.subplots already created a Cartesian axes in the bottom-right slot.
    # Matplotlib 3.8+ no longer deletes an overlapped axes automatically, so
    # remove the placeholder before adding the polar axes in its place.
    axes[1, 1].remove()
    ax_radar = fig.add_subplot(2, 2, 4, projection='polar')
    colors = plt.cm.Set3(np.linspace(0, 1, len(models)))

    for i, model in enumerate(models):
        values = [precisions[i], recalls[i], f1_scores[i], accuracies[i]]
        values = np.concatenate((values, [values[0]]))
        ax_radar.plot(angles, values, 'o-', linewidth=2, label=model, color=colors[i])
        ax_radar.fill(angles, values, alpha=0.25, color=colors[i])

    ax_radar.set_xticks(angles[:-1])
    ax_radar.set_xticklabels(['Precision', 'Recall', 'F1', 'Accuracy'])
    ax_radar.set_ylim(0, 1)
    ax_radar.set_title('依赖歧义检测 - 综合性能对比')
    ax_radar.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))

    plt.tight_layout()
    plt.savefig('dependency_ambiguity_detection_results.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
def main():
    """
    Entry point: evaluate every configured model on a sampled test set.

    Samples up to 15 ambiguous and up to 15 unambiguous stories from the
    module-level ``df``, runs each model in ``MODELS`` over them, writes all
    results to 'dependency_ambiguity_results.json', draws the comparison
    figure and prints the per-model metrics.
    """
    print("开始依赖歧义检测分析...")

    # Build the test set from a fixed-seed sample of each class.
    ambiguous_rows = df[df['DependencyAmbiguity'] == True]
    unambiguous_rows = df[df['DependencyAmbiguity'] == False]
    ambiguous_sample = ambiguous_rows.sample(n=min(15, ambiguous_rows.shape[0]), random_state=42)
    unambiguous_sample = unambiguous_rows.sample(n=min(15, unambiguous_rows.shape[0]), random_state=42)
    test_df = pd.concat([ambiguous_sample, unambiguous_sample]).reset_index(drop=True)

    print(f"测试数据集大小: {test_df.shape}")
    print(f"依赖歧义样本数: {ambiguous_sample.shape[0]}")
    print(f"无依赖歧义样本数: {unambiguous_sample.shape[0]}")

    # Evaluate the models one after another, pausing between them to stay
    # clear of API rate limits.
    all_results = []
    for model_name in MODELS:
        all_results.append(process_dependency_dataset(test_df, model_name))
        time.sleep(1)

    # Persist everything; values json cannot encode natively are stringified.
    with open('dependency_ambiguity_results.json', 'w', encoding='utf-8') as out_file:
        json.dump(all_results, out_file, ensure_ascii=False, indent=2, default=str)

    print("\n依赖歧义检测分析完成，结果已保存到 dependency_ambiguity_results.json")

    create_dependency_visualization(all_results)

    # Per-model summary.
    print("\n详细评估结果:")
    for entry in all_results:
        m = entry["metrics"]

        print(f"\n模型: {entry['model']}")
        print(f"  精确率: {m['precision']:.3f}")
        print(f"  召回率: {m['recall']:.3f}")
        print(f"  F1分数: {m['f1_score']:.3f}")
        print(f"  准确率: {m['accuracy']:.3f}")
        print(f"  真正例: {m['tp']}, 假正例: {m['fp']}, 假负例: {m['fn']}, 真负例: {m['tn']}")

# Run the analysis when executed as a script (or as a notebook cell).
if __name__ == "__main__":
    main()