In [1]:
# ============================================================================
# Import all required libraries and modules
# ============================================================================

# Core libraries
import pandas as pd
import numpy as np
import json
import warnings
# NOTE(review): with no category/module argument this silences every warning
# raised for the rest of the kernel session; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# API clients and project-local helpers
from ambiguity_detection_utils import AmbiguityDetector
from ollama import Client
from prompts import DISAMBIGUATION_PROMPT_TEMPLATE, PSEUDOCODE_PROMPT_TEMPLATE

# Deep learning models
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer

# Machine learning evaluation metrics
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import (
    roc_auc_score, roc_curve, f1_score, 
    precision_recall_curve, auc,
    confusion_matrix, classification_report, 
    accuracy_score
)

# Confirmation message printed once every import above has succeeded.
print("所有库导入完成！")

所有库导入完成！


In [2]:
# ----------------------------------------------------------------------------
# Build the detector and load the evaluation data
# ----------------------------------------------------------------------------

# Relative path to the user-story ambiguity spreadsheet.
file_path = r"data/User Story Ambiguity Dataset_A Comprehensive Research Resource/Cornelius_2025_user_story_ambiguity_dataset.xlsx"

# Detector configured for the "semantic" ambiguity type.
detector = AmbiguityDetector("semantic")

# Let the detector read the spreadsheet; create_balanced=True requests the
# balanced variant of the test set.
df_test = detector.load_and_prepare_data(
    file_path,
    create_balanced=True,
)

# Report how many user stories ended up in the test set.
print(f"测试数据集: {len(df_test)} 个用户故事")

Semantic歧义统计:
  有semantic歧义: 2831 (22.0%)
  无semantic歧义: 10016 (78.0%)
平衡测试集: 5662 个用户故事
  HasAmbiguity=True: 2831 (50.0%)
  HasAmbiguity=False: 2831 (50.0%)
测试数据集: 5662 个用户故事


In [9]:
# NOTE(review): this rebinds df_test to its first five rows (DataFrame.head()
# defaults to n=5), so any later cell that reads df_test works on this sample
# rather than the balanced set loaded earlier. Presumably a quick smoke-test
# subset -- confirm before running a full evaluation. The full set can only be
# recovered by re-running the loading cell.
df_test = df_test.head()
# Bare expression so the notebook renders the truncated frame.
df_test

Unnamed: 0,StoryID,StoryText,HasAmbiguity
0,US-C3-6636,"As a seller, I would like to track order to fi...",False
1,US-C7-12360,"As a store owner, I want to track order in ord...",False
2,US-C2-3691,"As a patient, I need to access records so that...",False
3,US-C5-9039,"As a investor, I want to handle finances in or...",True
4,US-C1-610,"As a financial advisor, I need to manage accou...",True


In [10]:
# Per-column count of non-null values in the current df_test frame.
df_test.count()

StoryID         5
StoryText       5
HasAmbiguity    5
dtype: int64

In [11]:
# Endpoint of the Ollama server, kept in one named constant so it is easy to
# spot and change. NOTE(review): the address is hard-coded and uses plain HTTP.
OLLAMA_HOST = "http://2yo6159kw603.vicp.fun/"

# Ollama API client bound to that endpoint.
ollama_client = Client(host=OLLAMA_HOST)

In [12]:
# Announce the (potentially slow) model load before it starts.
print("加载Jina代码embedding模型...")

# Identifier of the Jina code-embedding checkpoint to load.
code_model_name = "jinaai/jina-code-embeddings-0.5b"

# The tokenizer is asked to pad on the left side.
tokenizer_options = {"padding_side": "left"}
code_embedding_model = SentenceTransformer(code_model_name, tokenizer_kwargs=tokenizer_options)

# Confirm the load and echo which checkpoint is in use.
print(f"代码embedding模型已加载")
print(f"模型名称: {code_model_name}")

加载Jina代码embedding模型...
代码embedding模型已加载
模型名称: jinaai/jina-code-embeddings-0.5b


In [13]:
# Helper that turns code / pseudocode text into an embedding matrix
def get_code_embedding(code_text, model, prompt_name="nl2code_document"):
    """
    Encode code text into an embedding array via ``model.encode``.

    Args:
        code_text: Code or pseudocode text to embed; forwarded unchanged to
            ``model.encode``.
        model: Embedding model exposing
            ``encode(text, prompt_name=..., convert_to_numpy=True)``
            (a SentenceTransformer in this notebook).
        prompt_name: Name of the model prompt passed to ``model.encode``.
            Defaults to ``"nl2code_document"`` because the text being encoded
            is a code/pseudocode sample rather than a query.

    Returns:
        The array returned by ``model.encode``. A 1-D result is reshaped to
        ``(1, embedding_dim)``; a result that already has two or more
        dimensions is returned unchanged.
    """
    embedding = model.encode(
        code_text,
        prompt_name=prompt_name,
        convert_to_numpy=True,
    )

    # A single text yields a flat vector; promote it to a one-row matrix so
    # callers can always treat the result as 2-D (e.g. for pairwise metrics).
    if embedding.ndim == 1:
        embedding = embedding.reshape(1, -1)

    return embedding

# Confirmation message so the cell produces visible output when it is run.
print("get_code_embedding 函数已定义")

get_code_embedding 函数已定义
