In [15]:
import ast
import os
import json
import inspect
from typing import List, Dict, Any, Optional
from openai import OpenAI
import getpass # 用于安全输入API Key，防止泄露

# --- Configuration ---
# Path of the local Python repository to analyse; replace with your own.
# A forward slash is used on purpose: it is accepted on Windows as well as
# POSIX, whereas ".\my_python_repo" relies on the unrecognized escape "\m"
# and is not a directory separator outside Windows.
CODE_REPO_PATH = "./my_python_repo"
# JSON Lines file that receives one generated Q&A record per line.
OUTPUT_FILE = "scenario1_qa_data_with_llm.jsonl"
# Model name passed to the chat-completions endpoint.
LLM_MODEL_NAME = "qwen-plus"

# --- OpenAI Client Initialization ---
# Prefer the DASHSCOPE_API_KEY environment variable; when it is unset or
# empty, fall back to a hidden interactive prompt.
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY") or getpass.getpass("请输入您的 DASHSCOPE_API_KEY：")

client_settings = {
    "api_key": DASHSCOPE_API_KEY,
    # base_url for the Beijing region
    "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
}

try:
    llm_client = OpenAI(**client_settings)
    print("Qwen-plus LLM 客户端初始化成功。")
except Exception as init_error:
    print(f"错误信息：LLM 客户端初始化失败 - {init_error}")
    print("请参考文档：https://help.aliyun.com/zh/model-studio/developer-reference/error-code")
    # Leave the client as None so later calls can detect it and skip the LLM.
    llm_client = None

# --- Helper Functions (unchanged from previous example) ---
def extract_function_info(file_path: str) -> List[Dict[str, Any]]:
    """Extract every function definition found in one Python source file.

    The file is parsed with ``ast`` and the whole tree is walked, so
    top-level functions, methods and nested functions (``def`` as well as
    ``async def``) each produce one record.

    Args:
        file_path: Path of the Python file to read; it is decoded as UTF-8.

    Returns:
        A list of dicts with the keys ``function_name``, ``docstring``
        (empty string when the function has none), ``code_snippet`` (the
        source lines of the definition), ``start_line`` and ``end_line``
        (1-based, inclusive).

    Raises:
        OSError: If the file cannot be opened.
        SyntaxError: If the file is not valid Python.
    """
    # Read the file exactly once and let the context manager close it; the
    # same lines feed both the parser and the snippet slicing below.
    with open(file_path, "r", encoding="utf-8") as f:
        source_lines = f.readlines()
    tree = ast.parse("".join(source_lines), filename=file_path)

    functions_info = []
    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        docstring = ast.get_docstring(node)
        start_line = node.lineno
        # Fall back to a single-line span when the end position is unavailable.
        end_line = getattr(node, "end_lineno", None) or start_line
        snippet = "".join(source_lines[start_line - 1:end_line])

        functions_info.append({
            "function_name": node.name,
            "docstring": docstring if docstring else "",
            "code_snippet": snippet,
            "start_line": start_line,
            "end_line": end_line
        })
    return functions_info

# --- LLM Integration Function ---
def call_llm_for_qa(function_name: str, docstring: str, code_snippet: str, file_path: str) -> Dict[str, str]:
    """Ask the LLM to explain a function and return its answer and reasoning.

    Args:
        function_name: Name of the function being described.
        docstring: The function's docstring; a placeholder is sent when empty.
        code_snippet: Source code of the function.
        file_path: Path of the file the function comes from (prompt context).

    Returns:
        A dict with the keys ``answer`` and ``inference_trace``. When the
        client is missing, the request fails, or the reply is not valid
        JSON, both keys carry explanatory placeholder text instead.
    """
    # Guard clause: nothing can be requested without a client.
    if not llm_client:
        return {
            "answer": "LLM客户端未初始化，无法生成答案。",
            "inference_trace": "LLM客户端初始化失败。"
        }

    # Assemble the prompts.
    system_prompt = "\n".join([
        "你是一个专业的代码分析助手，能够理解Python代码并解释其功能。",
        "请根据提供的函数信息，首先一步步思考，然后给出函数的主要功能解释和相关的推理过程。",
        "你的回答应包含两部分：'answer' 和 'inference_trace'。",
    ])

    doc_text = docstring if docstring else '无文档字符串'
    user_prompt = (
        "请分析以下Python函数，并回答它的主要功能是什么？\n\n"
        "--- 函数信息 ---\n"
        f"文件路径: {file_path}\n"
        f"函数名: {function_name}\n"
        f"文档字符串:\n```\n{doc_text}\n```\n"
        f"代码片段:\n```python\n{code_snippet}\n```\n\n"
        "--- 输出格式 ---\n"
        "请以JSON格式返回你的回答，其中包含 'answer' (对函数功能的解释) 和 'inference_trace' (你思考并得出答案的步骤)。\n"
        "例如：\n"
        "```json\n"
        "{\n"
        '  "answer": "函数的主要功能是...",\n'
        '  "inference_trace": "1. 首先我识别到...\\n2. 接着我分析了...\\n3. 最终我得出结论..."\n'
        "}\n"
        "```"
    )

    chat_messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt}
    ]

    try:
        completion = llm_client.chat.completions.create(
            model=LLM_MODEL_NAME,
            messages=chat_messages,
            response_format={"type": "json_object"}  # explicitly request a JSON reply
        )
        raw_reply = completion.choices[0].message.content

        # Try to decode the reply as JSON; report the raw text when it is not.
        try:
            payload = json.loads(raw_reply)
        except json.JSONDecodeError:
            print(f"Warning: LLM 返回的不是有效的 JSON 格式。\n原始响应: {raw_reply}")
            return {
                "answer": f"LLM返回无效格式，原始响应：{raw_reply}",
                "inference_trace": "LLM返回无效JSON。"
            }

        return {
            "answer": payload.get("answer", "LLM未提供答案。"),
            "inference_trace": payload.get("inference_trace", "LLM未提供推理trace。")
        }

    except Exception as call_error:
        print(f"LLM 调用失败: {call_error}")
        return {
            "answer": f"LLM调用失败，错误：{call_error}",
            "inference_trace": f"LLM调用失败，错误：{call_error}"
        }

def generate_qa_for_function(
    file_path: str,
    func_info: Dict[str, Any],
    qa_id_counter: List[int]
) -> Optional[Dict[str, Any]]:
    """Build one question/answer training record for a single function.

    Args:
        file_path: Path of the file that contains the function.
        func_info: Record with the keys ``function_name``, ``docstring``,
            ``code_snippet``, ``start_line`` and ``end_line``.
        qa_id_counter: One-element list used as a mutable counter; its
            first item is incremented in place for every record produced.

    Returns:
        The record as a dict, or ``None`` when the code snippet is blank
        (in which case the counter is left untouched).
    """
    # Local import: the timestamp below needs ``datetime``, which this
    # cell's import block does not provide.
    import datetime

    function_name = func_info["function_name"]
    docstring = func_info["docstring"]
    code_snippet = func_info["code_snippet"]
    start_line = func_info["start_line"]
    end_line = func_info["end_line"]

    # Skip functions for which no source text is available.
    if not code_snippet.strip():
        return None

    qa_id_counter[0] += 1
    qa_id = f"qa_py_{qa_id_counter[0]:05d}"

    question = f"函数 `{function_name}` (在 `{os.path.basename(file_path)}` 中) 的主要功能是什么？"

    # Ask the LLM for the answer and its reasoning trace.
    llm_output = call_llm_for_qa(function_name, docstring, code_snippet, file_path)
    answer = llm_output["answer"]
    inference_trace = llm_output["inference_trace"]

    # Simulated business rules derived from the function name only; a real
    # pipeline would take them from project documentation or annotations.
    lowered_name = function_name.lower()
    business_rules = []
    if "save" in lowered_name or "update" in lowered_name:
        business_rules.append("数据持久化操作需考虑事务一致性。")
    if "auth" in lowered_name or "login" in lowered_name:
        business_rules.append("用户认证和授权操作需遵循安全最佳实践。")

    relative_path = os.path.relpath(file_path, CODE_REPO_PATH)
    # Real UTC time in the same "YYYY-MM-DDTHH:MM:SSZ" shape as before,
    # computed in-process instead of shelling out to the `date` command
    # behind a hard-coded date prefix.
    timestamp = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    qa_data = {
        "id": qa_id,
        "question": question,
        "answer": answer,
        "code_context": [
            {
                "file_path": relative_path,
                "line_start": start_line,
                "line_end": end_line,
                "snippet": code_snippet.strip()
            }
        ],
        "business_rules_context": business_rules,
        "inference_trace": inference_trace,
        "metadata": {
            "source_module": os.path.dirname(relative_path),
            "language": "python",
            "difficulty": "llm_generated",  # marks the record as LLM-generated
            "timestamp": timestamp,
            "version_control_hash": "dummy_hash_for_example"
        }
    }
    return qa_data

# --- Main Script ---
def main():
    """Generate Q&A records for every function in the repository.

    When CODE_REPO_PATH does not exist, a small demo repository is created
    there first. Every ``.py`` file under the repository is then scanned,
    one record is produced per extracted function, and all records are
    written to OUTPUT_FILE as JSON Lines.
    """
    if not os.path.exists(CODE_REPO_PATH):
        print(f"错误: 代码仓库路径 `{CODE_REPO_PATH}` 不存在。请修改 CODE_REPO_PATH 为你的本地Python仓库路径。")
        print("尝试创建一个简单的虚拟仓库用于演示...")
        os.makedirs(CODE_REPO_PATH, exist_ok=True)
        demo_source = '''
def calculate_sum(a: int, b: int) -> int:
    """
    计算两个整数的和。
    这个函数接受两个整数作为输入，并返回它们的和。
    """
    return a + b

def process_data(data_list: list):
    # This function processes a list of data without a docstring.
    print(f"Processing {len(data_list)} items.")
    for item in data_list:
        if item % 2 == 0:
            print(f"Even item: {item}")
        else:
            print(f"Odd item: {item}")

class MyManager:
    """
    一个用于管理资源的类。
    提供了资源的创建、读取、更新和删除(CRUD)操作。
    """
    def __init__(self, resource_name: str):
        self.resource_name = resource_name
        self.resources = []

    def create_resource(self, resource_data: dict):
        """
        创建一个新资源并添加到管理器中。
        :param resource_data: 资源的字典数据。
        :return: None
        """
        self.resources.append(resource_data)
        print(f"Resource created: {resource_data}")

    def get_resource(self, resource_id: str) -> Optional[dict]:
        # 从管理器中获取指定ID的资源。
        # 这是一个查找资源的示例方法。
        for res in self.resources:
            if res.get("id") == resource_id:
                return res
        return None
'''
        demo_module_path = os.path.join(CODE_REPO_PATH, "my_module.py")
        with open(demo_module_path, "w", encoding="utf-8") as demo_file:
            demo_file.write(demo_source)
        print(f"已创建虚拟仓库于 `{CODE_REPO_PATH}`。")

    qa_id_counter = [0]  # one-element list so the callee can bump it in place
    all_qa_data = []

    for root, _, files in os.walk(CODE_REPO_PATH):
        for file_name in files:
            # Only Python sources are analysed.
            if not file_name.endswith(".py"):
                continue
            file_path = os.path.join(root, file_name)
            print(f"正在处理文件: {file_path}")
            for func_info in extract_function_info(file_path):
                qa_entry = generate_qa_for_function(file_path, func_info, qa_id_counter)
                if not qa_entry:
                    continue
                all_qa_data.append(qa_entry)

    # One JSON document per line, keeping non-ASCII text readable.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in all_qa_data
        )

    print(f"\n训练数据已生成并保存到 `{OUTPUT_FILE}`。共生成 {len(all_qa_data)} 条记录。")


if __name__ == "__main__":
    main()

Qwen-plus LLM 客户端初始化成功。
正在处理文件: .\my_python_repo\my_module.py

训练数据已生成并保存到 `scenario1_qa_data_with_llm.jsonl`。共生成 5 条记录。


In [21]:
import ast
import os
import json
from typing import List, Dict, Any, Optional
from openai import OpenAI
import getpass
import datetime

# --- Configuration ---
# Path of the local Python repository to analyse; replace with your own.
CODE_REPO_PATH = "./my_python_repo"
OUTPUT_FILE = "scenario2_design_data_with_llm.jsonl"
# Model name passed to the chat-completions endpoint.
LLM_MODEL_NAME = "qwen-plus"
# Demo requirements: each entry pairs the requirement text ("req") with the
# keywords used to pick related files from the repository ("keywords").
REQUIREMENTS = [
    dict(
        req="为现有订单系统增加一个异步的库存扣减服务，以提高订单处理的响应速度，并确保库存数据最终一致性。",
        keywords=["order", "inventory", "async"],
    ),
    dict(
        req="实现一个用户权限管理模块，支持角色-权限分配，并提供API进行权限校验。",
        keywords=["user", "auth", "permission", "role"],
    ),
    dict(
        req="优化支付流程，引入重试机制和幂等性处理，提高支付成功率。",
        keywords=["payment", "retry", "idempotent"],
    ),
    dict(
        req="为系统添加一个统一的错误日志和监控报警机制。",
        keywords=["log", "monitor", "error"],
    ),
    dict(
        req="将用户注册和登录功能从现有用户管理模块中独立出来，形成一个独立的认证服务。",
        keywords=["user", "register", "login", "auth", "service"],
    ),
]

# --- OpenAI Client Initialization ---
# Prefer the DASHSCOPE_API_KEY environment variable; when it is unset or
# empty, fall back to a hidden interactive prompt.
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY") or getpass.getpass("请输入您的 DASHSCOPE_API_KEY：")

client_settings = {
    "api_key": DASHSCOPE_API_KEY,
    # base_url for the Beijing region
    "base_url": "https://dashscope.aliyuncs.com/compatible-mode/v1",
}

try:
    llm_client = OpenAI(**client_settings)
    print("Qwen-plus LLM 客户端初始化成功。")
except Exception as init_error:
    print(f"错误信息：LLM 客户端初始化失败 - {init_error}")
    print("请参考文档：https://help.aliyun.com/zh/model-studio/developer-reference/error-code")
    # Leave the client as None so later calls can detect it and skip the LLM.
    llm_client = None

# --- Helper Functions ---
def get_repo_file_list(repo_path: str, keywords: List[str]) -> List[str]:
    """Pick the Python files of a repository that look related to keywords.

    A file counts as related when any keyword occurs, case-insensitively,
    in its path relative to ``repo_path``; file contents are not inspected.
    This is a simple stand-in for a real retrieval step.

    Args:
        repo_path: Root directory that is walked recursively.
        keywords: Keywords matched against the relative paths.

    Returns:
        The relative paths of the matching ``.py`` files. When nothing
        matches, at most the first five ``.py`` files found are returned
        as generic context; the list is empty when there are none.
    """
    every_py_file = []
    matched = []
    for root, _, files in os.walk(repo_path):
        for file_name in files:
            if not file_name.endswith(".py"):
                continue
            relative_path = os.path.relpath(os.path.join(root, file_name), repo_path)
            every_py_file.append(relative_path)

            # Only the path (directories and file name) is checked.
            if any(keyword.lower() in relative_path.lower() for keyword in keywords):
                matched.append(relative_path)

    if matched:
        return matched
    # No match: fall back to a small sample (empty when no files exist).
    return every_py_file[:5]

def read_file_content(repo_path: str, relative_file_path: str) -> str:
    """Return the UTF-8 text of a file inside the repository.

    Args:
        repo_path: Repository root directory.
        relative_file_path: Path of the file relative to ``repo_path``.

    Returns:
        The file's text, or a ``# Error reading ...`` comment line that
        describes the failure when the file cannot be read.
    """
    target = os.path.join(repo_path, relative_file_path)
    try:
        with open(target, "r", encoding="utf-8") as source_file:
            text = source_file.read()
    except Exception as read_error:
        # Best effort: report the problem as text instead of raising.
        return f"# Error reading {relative_file_path}: {read_error}"
    return text

# --- LLM Integration Function for Scenario 2 ---
def call_llm_for_design_solution(
    requirement: str,
    codebase_context: List[str], # list of related files
    code_contents: Dict[str, str] # contents of the related files
) -> Dict[str, str]:
    """Ask the LLM for an architecture design that satisfies a requirement.

    Args:
        requirement: The requirement text to design for.
        codebase_context: Relative paths of the files considered related.
        code_contents: Mapping from file path to file text; only the first
            500 characters of each listed file are placed in the prompt.

    Returns:
        A dict with the keys ``design_solution``, ``explanation`` and
        ``inference_trace``. When the client is missing, the request
        fails, or the reply is not valid JSON, the keys carry explanatory
        placeholder text instead.
    """
    # Guard clause: nothing can be requested without a client.
    if not llm_client:
        return {
            "design_solution": "LLM客户端未初始化，无法生成设计方案。",
            "explanation": "LLM客户端初始化失败。",
            "inference_trace": "LLM客户端初始化失败。"
        }

    # Assemble the prompts.
    system_prompt = "\n".join([
        "你是一个资深的软件架构师和代码专家，能够根据给定的需求和现有代码仓信息，提供详细、合理且可扩展的架构设计方案。",
        "请先一步步思考，分析需求和现有上下文，然后给出设计方案、解释和推理过程。",
        "你的回答应包含三部分：'design_solution' (设计方案), 'explanation' (设计方案的解释), 和 'inference_trace' (你思考并得出方案的步骤)。",
    ])

    if codebase_context:
        context_parts = ["\n--- 现有代码仓相关文件 ---\n"]
        for position, file_path in enumerate(codebase_context, start=1):
            context_parts.append(f"文件 {position}: {file_path}\n")
            # A real system would supply extracted key facts or summaries
            # rather than raw file text; for the demo a truncated excerpt
            # keeps the prompt short.
            if file_path in code_contents:
                excerpt = code_contents[file_path][:500]
                context_parts.append(f"```python\n{excerpt}...\n```\n")
            context_parts.append("---\n")
    else:
        context_parts = [
            "\n--- 现有代码仓信息 ---\n",
            "未找到与需求直接相关的代码文件，请基于通用设计原则和最佳实践进行设计。\n",
        ]
    context_str = "".join(context_parts)

    user_prompt = (
        "请根据以下需求和提供的现有代码仓上下文，设计一个架构方案。请注意：\n"
        "1. 你的设计应考虑现有Python代码仓的特点和可能的扩展方向。\n"
        "2. 方案应结构化、清晰，并包含必要的解释和推理过程。\n\n"
        "--- 需求 ---\n"
        f"{requirement}\n"
        f"{context_str}\n\n"
        "--- 输出格式 ---\n"
        "请以JSON格式返回你的回答，其中包含 'design_solution' (详细的架构设计方案，使用Markdown格式),\n"
        " 'explanation' (对设计方案的解释) 和 'inference_trace' (你思考并得出方案的步骤)。\n"
        "例如：\n"
        "```json\n"
        "{\n"
        '  "design_solution": "# 方案标题\\n1. ...\\n2. ...",\n'
        '  "explanation": "此方案的优点是...",\n'
        '  "inference_trace": "1. 首先我分析了需求...\\n2. 接着我评估了现有系统...\\n3. 最终我提出了..."\n'
        "}\n"
        "```"
    )

    chat_messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt}
    ]

    try:
        completion = llm_client.chat.completions.create(
            model=LLM_MODEL_NAME,
            messages=chat_messages,
            response_format={"type": "json_object"}
        )
        raw_reply = completion.choices[0].message.content

        # Decode the reply as JSON; report the raw text when it is not.
        try:
            payload = json.loads(raw_reply)
        except json.JSONDecodeError:
            print(f"Warning: LLM 返回的不是有效的 JSON 格式。\n原始响应: {raw_reply}")
            return {
                "design_solution": f"LLM返回无效格式，原始响应：{raw_reply}",
                "explanation": "LLM返回无效JSON。",
                "inference_trace": "LLM返回无效JSON。"
            }

        return {
            "design_solution": payload.get("design_solution", "LLM未提供设计方案。"),
            "explanation": payload.get("explanation", "LLM未提供解释。"),
            "inference_trace": payload.get("inference_trace", "LLM未提供推理trace。")
        }

    except Exception as call_error:
        print(f"LLM 调用失败: {call_error}")
        return {
            "design_solution": f"LLM调用失败，错误：{call_error}",
            "explanation": f"LLM调用失败，错误：{call_error}",
            "inference_trace": f"LLM调用失败，错误：{call_error}"
        }

def generate_design_data(
    design_id_counter: List[int],
    requirement: str,
    repo_path: str,
    # Keywords are supplied by the caller here; a real system would derive
    # them from the requirement itself.
    requirement_keywords: List[str]
) -> Optional[Dict[str, Any]]:
    """Build one architecture-design training record for a requirement.

    Args:
        design_id_counter: One-element list used as a mutable counter; its
            first item is incremented in place for every record produced.
        requirement: Requirement text to design for.
        repo_path: Root directory of the repository providing context.
        requirement_keywords: Keywords used to select related files.

    Returns:
        The record as a dict, or ``None`` when the requirement is blank
        (in which case the counter is left untouched).
    """
    if not requirement.strip():
        return None

    design_id_counter[0] += 1
    design_id = "design_py_{:05d}".format(design_id_counter[0])

    # Simulated context retrieval: files whose path matches the keywords.
    codebase_context_files = get_repo_file_list(repo_path, requirement_keywords)

    # Load the text of each selected file for the prompt.
    code_contents_for_llm = {}
    for relative_path in codebase_context_files:
        code_contents_for_llm[relative_path] = read_file_content(repo_path, relative_path)

    # Ask the LLM for the design.
    llm_output = call_llm_for_design_solution(
        requirement,
        codebase_context_files,
        code_contents_for_llm
    )

    metadata = {
        "source_project": os.path.basename(repo_path),
        "design_type": "feature_extension",  # fixed default for every record
        "language": "python",
        "difficulty": "llm_generated",
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "version_control_hash": "dummy_hash_for_example"  # placeholder, not a real commit hash
    }
    return {
        "id": design_id,
        "requirement": requirement,
        "design_solution": llm_output["design_solution"],
        "explanation": llm_output["explanation"],
        "inference_trace": llm_output["inference_trace"],
        "codebase_context": codebase_context_files,  # the file list shown to the LLM
        "metadata": metadata
    }

# --- Main Script ---
def main():
    """Generate one design record per configured requirement.

    When CODE_REPO_PATH does not exist, a small demo repository is created
    there first. Each entry of REQUIREMENTS is then turned into a design
    record, and all records are written to OUTPUT_FILE as JSON Lines.
    """
    if not os.path.exists(CODE_REPO_PATH):
        print(f"错误: 代码仓库路径 `{CODE_REPO_PATH}` 不存在。请修改 CODE_REPO_PATH 为你的本地Python仓库路径。")
        print("尝试创建一个简单的虚拟仓库用于演示...")
        os.makedirs(CODE_REPO_PATH, exist_ok=True)
        with open(os.path.join(CODE_REPO_PATH, "__init__.py"), "w") as init_file:
            init_file.write("")

        # Demo modules, written in this order.
        demo_modules = (
            ("user_management.py", """
def register_user(username, password, email):
    # This registers a new user
    print(f"Registering {username}")
    # ... database logic
    return {"id": 1, "username": username}

def get_user_profile(user_id):
    # Retrieves user profile from DB
    return {"id": user_id, "username": "test_user"}
"""),
            ("order_service.py", """
def create_order(user_id, items):
    # Creates a new order
    print(f"Creating order for user {user_id}")
    # ... inventory check, payment processing
    return {"order_id": "ORD001", "status": "pending"}

def update_order_status(order_id, new_status):
    # Updates an existing order's status
    print(f"Updating order {order_id} to {new_status}")
    return True
"""),
            ("payment_gateway.py", """
def process_payment(order_id, amount, payment_method):
    # Integrates with external payment provider
    print(f"Processing payment for order {order_id}, amount {amount}")
    return {"success": True, "transaction_id": "TXN123"}
"""),
        )
        for module_name, module_source in demo_modules:
            with open(os.path.join(CODE_REPO_PATH, module_name), "w", encoding="utf-8") as module_file:
                module_file.write(module_source)
        print(f"已创建虚拟仓库于 `{CODE_REPO_PATH}`。")

    design_id_counter = [0]  # one-element list so the callee can bump it in place
    all_design_data = []

    for req_item in REQUIREMENTS:
        requirement_text = req_item["req"]
        print(f"\n正在为需求生成设计方案: {requirement_text}")
        design_entry = generate_design_data(
            design_id_counter,
            requirement_text,
            CODE_REPO_PATH,
            req_item["keywords"]
        )
        if not design_entry:
            continue
        all_design_data.append(design_entry)

    # One JSON document per line, keeping non-ASCII text readable.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in all_design_data
        )

    print(f"\n训练数据已生成并保存到 `{OUTPUT_FILE}`。共生成 {len(all_design_data)} 条记录。")


if __name__ == "__main__":
    main()

Qwen-plus LLM 客户端初始化成功。

正在为需求生成设计方案: 为现有订单系统增加一个异步的库存扣减服务，以提高订单处理的响应速度，并确保库存数据最终一致性。

正在为需求生成设计方案: 实现一个用户权限管理模块，支持角色-权限分配，并提供API进行权限校验。

正在为需求生成设计方案: 优化支付流程，引入重试机制和幂等性处理，提高支付成功率。

正在为需求生成设计方案: 为系统添加一个统一的错误日志和监控报警机制。

正在为需求生成设计方案: 将用户注册和登录功能从现有用户管理模块中独立出来，形成一个独立的认证服务。

训练数据已生成并保存到 `scenario2_design_data_with_llm.jsonl`。共生成 5 条记录。
