In [2]:
import ast
import os
import json
import inspect
from typing import List, Dict, Any, Optional
from openai import OpenAI
import getpass # For securely inputting API Key, preventing exposure

# --- Configuration ---
# Forward slashes are accepted on Windows and POSIX alike. The previous
# backslash literal depended on an invalid escape sequence (backslash + "m")
# and only resolved to the intended directory on Windows.
CODE_REPO_PATH = "./my_python_repo" # Replace with your local Python code repository path
OUTPUT_FILE = "scenario1_qa_data_with_llm_en.jsonl"
LLM_MODEL_NAME = "qwen-plus" # Use your specified model

# --- OpenAI Client Initialization ---
# The key is taken from the DASHSCOPE_API_KEY environment variable when it is set
# and non-empty; otherwise it is requested interactively through getpass.
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY") or getpass.getpass("Please enter your DASHSCOPE_API_KEY: ")

try:
    # Beijing region base_url
    llm_client = OpenAI(
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        api_key=DASHSCOPE_API_KEY,
    )
    print("Qwen-plus LLM client initialized successfully.")
except Exception as init_error:
    print(f"Error: LLM client initialization failed - {init_error}")
    print("Please refer to the documentation: https://help.aliyun.com/zh/model-studio/developer-reference/error-code")
    # A None client makes the later LLM helper functions return placeholder text
    llm_client = None

# --- Helper Functions (unchanged from previous example) ---
def extract_function_info(file_path: str) -> List[Dict[str, Any]]:
    """
    Extracts functions, their docstrings, and code snippets from a single Python file.

    The file is opened exactly once and closed by the ``with`` block; the lines
    read are used both for parsing and for slicing out each function's source.

    :param file_path: Path of the Python source file to analyse.
    :return: One dict per ``def`` / ``async def`` found anywhere in the file
        (including methods and nested functions), with the keys
        ``function_name``, ``docstring`` ("" when absent), ``code_snippet``,
        ``start_line`` and ``end_line`` (1-based, inclusive).
    :raises OSError: If the file cannot be opened.
    :raises UnicodeDecodeError: If the file is not valid UTF-8.
    :raises SyntaxError: If the file content cannot be parsed as Python.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        source_lines = f.readlines()
    tree = ast.parse("".join(source_lines), filename=file_path)

    functions_info = []
    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        start_line = node.lineno
        # end_lineno may be missing or None; fall back to a one-line span so
        # that both bounds are always defined before they are used below.
        end_line = getattr(node, "end_lineno", None) or start_line

        functions_info.append({
            "function_name": node.name,
            "docstring": ast.get_docstring(node) or "",
            # Whole source lines are kept, so a method's indentation is preserved.
            "code_snippet": "".join(source_lines[start_line - 1:end_line]),
            "start_line": start_line,
            "end_line": end_line
        })
    return functions_info

# --- LLM Integration Function ---
def call_llm_for_qa(function_name: str, docstring: str, code_snippet: str, file_path: str) -> Dict[str, str]:
    """
    Asks the LLM what a function does and how it reached that conclusion.

    :param function_name: Name of the function being described.
    :param docstring: The function's docstring; an empty value is shown to the
        model as "No docstring".
    :param code_snippet: Source text of the function.
    :param file_path: Path of the file the function came from (prompt context only).
    :return: A dict with the keys "answer" and "inference_trace". When the client
        is missing, the call fails, or the reply is not valid JSON, both values
        carry a diagnostic message instead of raising.
    """
    # No client means initialization failed earlier; return placeholder text.
    if not llm_client:
        return {
            "answer": "LLM client not initialized, cannot generate answer.",
            "inference_trace": "LLM client initialization failed."
        }

    # --- Prompt assembly ---
    system_prompt = "\n".join([
        "You are a professional code analysis assistant capable of understanding Python code and explaining its functionality.",
        "Based on the provided function information, please first think step-by-step, then provide a main functionality explanation for the function and its relevant inference process.",
        "Your answer should consist of two parts: 'answer' and 'inference_trace'.",
    ])

    docstring_text = docstring or 'No docstring'

    # Static tail of the user prompt: the requested JSON shape plus an example.
    output_format_section = (
        "--- Output Format ---\n"
        "Please return your answer in JSON format, including 'answer' (explanation of function functionality) and 'inference_trace' (steps you thought through to arrive at the answer).\n"
        "Example:\n"
        "```json\n"
        "{\n"
        '  "answer": "The main functionality of the function is...",\n'
        '  "inference_trace": "1. First I identified...\\n2. Then I analyzed...\\n3. Finally I concluded..."\n'
        "}\n"
        "```"
    )

    user_prompt = (
        "Please analyze the following Python function and answer what its main functionality is?\n\n"
        "--- Function Information ---\n"
        f"File Path: {file_path}\n"
        f"Function Name: {function_name}\n"
        f"Docstring:\n```\n{docstring_text}\n```\n"
        f"Code Snippet:\n```python\n{code_snippet}\n```\n\n"
        + output_format_section
    )

    chat_messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]

    try:
        completion = llm_client.chat.completions.create(
            model=LLM_MODEL_NAME,
            messages=chat_messages,
            # Explicitly request a JSON object as the reply format
            response_format={"type": "json_object"},
        )
        llm_response_content = completion.choices[0].message.content

        # Decode the reply; malformed JSON is reported rather than raised.
        try:
            reply = json.loads(llm_response_content)
        except json.JSONDecodeError:
            print(f"Warning: LLM response is not valid JSON format.\nRaw response: {llm_response_content}")
            return {
                "answer": f"LLM returned invalid format, raw response: {llm_response_content}",
                "inference_trace": "LLM returned invalid JSON."
            }

        return {
            "answer": reply.get("answer", "LLM did not provide an answer."),
            "inference_trace": reply.get("inference_trace", "LLM did not provide inference trace.")
        }

    except Exception as e:
        # Any other failure (request error, unexpected reply shape) lands here.
        print(f"LLM call failed: {e}")
        failure_text = f"LLM call failed, error: {e}"
        return {
            "answer": failure_text,
            "inference_trace": failure_text
        }

def generate_qa_for_function(
    file_path: str,
    func_info: Dict[str, Any],
    qa_id_counter: List[int]
) -> Optional[Dict[str, Any]]:
    """
    Generates a simplified QA pair for a single function, integrating LLM calls.

    :param file_path: Path of the source file that contains the function.
    :param func_info: One entry as produced by ``extract_function_info`` (keys
        "function_name", "docstring", "code_snippet", "start_line", "end_line").
    :param qa_id_counter: Single-element list used as a mutable counter; element 0
        is incremented for every record produced.
    :return: The QA record, or ``None`` when the code snippet is blank (in which
        case the counter is left untouched and the LLM is not called).
    """
    # Local import: this cell's import block does not bring in datetime.
    from datetime import datetime, timezone

    function_name = func_info["function_name"]
    docstring = func_info["docstring"]
    code_snippet = func_info["code_snippet"]
    start_line = func_info["start_line"]
    end_line = func_info["end_line"]

    if not code_snippet.strip(): # Skip if code snippet is empty
        return None

    qa_id_counter[0] += 1
    qa_id = f"qa_py_{qa_id_counter[0]:05d}"

    question = f"What is the main functionality of function `{function_name}` (in `{os.path.basename(file_path)}`)?"

    # Call LLM to generate answer and inference trace
    llm_output = call_llm_for_qa(function_name, docstring, code_snippet, file_path)
    answer = llm_output["answer"]
    inference_trace = llm_output["inference_trace"]

    # Simulate business rules (generic here, should be extracted from project documentation or specific comments)
    lowered_name = function_name.lower()
    business_rules = []
    if "save" in lowered_name or "update" in lowered_name:
        business_rules.append("Data persistence operations must consider transaction consistency.")
    if "auth" in lowered_name or "login" in lowered_name:
        business_rules.append("User authentication and authorization operations must follow security best practices.")

    relative_path = os.path.relpath(file_path, CODE_REPO_PATH)

    # The timestamp is taken from the interpreter's clock in UTC. The previous
    # version hard-coded the date and shelled out to the `date` command for the
    # time, which is not portable and froze the date part.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    qa_data = {
        "id": qa_id,
        "question": question,
        "answer": answer,
        "code_context": [
            {
                "file_path": relative_path,
                "line_start": start_line,
                "line_end": end_line,
                "snippet": code_snippet.strip()
            }
        ],
        "business_rules_context": business_rules,
        "inference_trace": inference_trace,
        "metadata": {
            "source_module": os.path.dirname(relative_path),
            "language": "python",
            "difficulty": "llm_generated", # Mark as LLM generated
            "timestamp": timestamp,
            "version_control_hash": "dummy_hash_for_example"
        }
    }
    return qa_data

# --- Main Script ---
def main():
    """
    Walks CODE_REPO_PATH, builds one QA record per function found in every
    ``.py`` file, and writes the records to OUTPUT_FILE as JSON Lines.

    When CODE_REPO_PATH does not exist, a small demonstration repository with a
    single module is created there first. Files that cannot be read or parsed
    are reported and skipped so that one bad file does not abort the run.
    """
    if not os.path.exists(CODE_REPO_PATH):
        print(f"Error: Code repository path `{CODE_REPO_PATH}` does not exist. Please modify CODE_REPO_PATH to your local Python repository path.")
        print("Attempting to create a simple dummy repository for demonstration...")
        os.makedirs(CODE_REPO_PATH, exist_ok=True)
        with open(os.path.join(CODE_REPO_PATH, "my_module.py"), "w", encoding="utf-8") as f:
            # The generated module imports Optional itself; without that import
            # the `-> Optional[dict]` annotation below raises NameError on import.
            f.write("""
from typing import Optional

def calculate_sum(a: int, b: int) -> int:
    \"\"\"
    Calculates the sum of two integers.
    This function takes two integers as input and returns their sum.
    \"\"\"
    return a + b

def process_data(data_list: list):
    # This function processes a list of data without a docstring.
    print(f"Processing {len(data_list)} items.")
    for item in data_list:
        if item % 2 == 0:
            print(f"Even item: {item}")
        else:
            print(f"Odd item: {item}")

class MyManager:
    \"\"\"
    A class for managing resources.
    Provides Create, Read, Update, and Delete (CRUD) operations for resources.
    \"\"\"
    def __init__(self, resource_name: str):
        self.resource_name = resource_name
        self.resources = []

    def create_resource(self, resource_data: dict):
        \"\"\"
        Creates a new resource and adds it to the manager.
        :param resource_data: Dictionary data of the resource.
        :return: None
        \"\"\"
        self.resources.append(resource_data)
        print(f"Resource created: {resource_data}")

    def get_resource(self, resource_id: str) -> Optional[dict]:
        # Retrieves a resource with the specified ID from the manager.
        # This is an example method for finding resources.
        for res in self.resources:
            if res.get("id") == resource_id:
                return res
        return None
""")
        print(f"Dummy repository created at `{CODE_REPO_PATH}`.")

    all_qa_data = []
    qa_id_counter = [0]

    for root, _, files in os.walk(CODE_REPO_PATH):
        for file_name in files:
            if not file_name.endswith(".py"):
                continue
            file_path = os.path.join(root, file_name)
            print(f"Processing file: {file_path}")
            try:
                functions_info = extract_function_info(file_path)
            except (SyntaxError, ValueError, OSError) as e:
                # ValueError also covers UnicodeDecodeError for non-UTF-8 files.
                print(f"Warning: Skipping `{file_path}` because it could not be read or parsed - {e}")
                continue
            for func_info in functions_info:
                qa_entry = generate_qa_for_function(file_path, func_info, qa_id_counter)
                if qa_entry:
                    all_qa_data.append(qa_entry)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for entry in all_qa_data:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"\nTraining data generated and saved to `{OUTPUT_FILE}`. Total {len(all_qa_data)} records generated.")

if __name__ == "__main__":
    main()

Qwen-plus LLM client initialized successfully.
Processing file: .\my_python_repo\my_module.py

Training data generated and saved to `scenario1_qa_data_with_llm_en.jsonl`. Total 5 records generated.


In [3]:
import ast
import os
import json
from typing import List, Dict, Any, Optional
from openai import OpenAI
import getpass
import datetime

# --- Configuration ---
# Local Python code repository to analyse; replace with your own path
CODE_REPO_PATH = "./my_python_repo"
# JSON Lines file that receives the generated design records
OUTPUT_FILE = "scenario2_design_data_with_llm_en.jsonl"
# Model name sent with every chat-completion request; use your specified model
LLM_MODEL_NAME = "qwen-plus"
# Each entry pairs a requirement statement ("req") with the keywords used to
# select context files from the repository ("keywords").
REQUIREMENTS = [
    dict(
        req="Add an asynchronous inventory deduction service to the existing order system to improve order processing response speed and ensure eventual consistency of inventory data.",
        keywords=["order", "inventory", "async"],
    ),
    dict(
        req="Implement a user permission management module that supports role-permission assignment and provides APIs for permission verification.",
        keywords=["user", "auth", "permission", "role"],
    ),
    dict(
        req="Optimize the payment process, introduce a retry mechanism and idempotency handling to improve payment success rate.",
        keywords=["payment", "retry", "idempotent"],
    ),
    dict(
        req="Add a unified error logging and monitoring alert mechanism to the system.",
        keywords=["log", "monitor", "error"],
    ),
    dict(
        req="Decouple user registration and login functionality from the existing user management module to form an independent authentication service.",
        keywords=["user", "register", "login", "auth", "service"],
    ),
]

# --- OpenAI Client Initialization ---
# Use DASHSCOPE_API_KEY from the environment when it is set and non-empty;
# otherwise ask for it interactively through getpass.
DASHSCOPE_API_KEY = os.getenv("DASHSCOPE_API_KEY") or getpass.getpass("Please enter your DASHSCOPE_API_KEY: ")

try:
    # Beijing region base_url
    llm_client = OpenAI(
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        api_key=DASHSCOPE_API_KEY,
    )
    print("Qwen-plus LLM client initialized successfully.")
except Exception as init_error:
    print(f"Error: LLM client initialization failed - {init_error}")
    print("Please refer to the documentation: https://help.aliyun.com/en/model-studio/developer-reference/error-code")
    # A None client makes the later LLM helper functions return placeholder text
    llm_client = None

# --- Helper Functions ---
def get_repo_file_list(repo_path: str, keywords: List[str]) -> List[str]:
    """
    Lists the Python files of a repository that look relevant to some keywords.

    A file counts as relevant when any keyword occurs, case-insensitively, in its
    path relative to ``repo_path``; file contents are not inspected. This is a
    stand-in for a real retrieval step such as a knowledge-graph query or a
    vector search.

    :param repo_path: Root directory that is walked recursively.
    :param keywords: Keywords to look for in the relative paths.
    :return: Relative paths of the matching ``.py`` files. When nothing matches,
        up to the first five ``.py`` files encountered are returned as general
        context; an empty list when the repository has no ``.py`` files.
    """
    every_py_file = []
    keyword_hits = []

    for current_dir, _, names in os.walk(repo_path):
        for name in names:
            if not name.endswith(".py"):
                continue

            rel = os.path.relpath(os.path.join(current_dir, name), repo_path)
            every_py_file.append(rel)

            # Only the path (directories + file name) takes part in the match.
            if any(kw.lower() in rel.lower() for kw in keywords):
                keyword_hits.append(rel)

    # Fall back to a small sample of the repository when no path matched.
    return keyword_hits or every_py_file[:5]

def read_file_content(repo_path: str, relative_file_path: str) -> str:
    """
    Returns the UTF-8 text of a file located inside the repository.

    :param repo_path: Repository root directory.
    :param relative_file_path: Path of the file relative to ``repo_path``.
    :return: The file's text, or a "# Error reading ..." comment line describing
        the failure when the file cannot be opened or decoded.
    """
    location = os.path.join(repo_path, relative_file_path)
    try:
        with open(location, mode="r", encoding="utf-8") as handle:
            text = handle.read()
    except Exception as e:
        # Errors are reported in-band so that callers always receive a string.
        return f"# Error reading {relative_file_path}: {e}"
    return text

# --- LLM Integration Function for Scenario 2 ---
def call_llm_for_design_solution(
    requirement: str,
    codebase_context: List[str], # List of relevant files
    code_contents: Dict[str, str] # Contents of relevant files
) -> Dict[str, str]:
    """
    Asks the LLM for an architectural design that satisfies a requirement.

    :param requirement: The requirement text to design for.
    :param codebase_context: Relative paths of the files presented to the model.
    :param code_contents: Mapping of those paths to file text; only the first 500
        characters of each file are placed in the prompt.
    :return: A dict with the keys "design_solution", "explanation" and
        "inference_trace". When the client is missing, the call fails, or the
        reply is not valid JSON, the values carry a diagnostic message instead
        of raising.
    """
    # No client means initialization failed earlier; return placeholder text.
    if not llm_client:
        return {
            "design_solution": "LLM client not initialized, cannot generate design solution.",
            "explanation": "LLM client initialization failed.",
            "inference_trace": "LLM client initialization failed."
        }

    # --- Prompt assembly ---
    system_prompt = "\n".join([
        "You are a senior software architect and code expert, capable of providing detailed, reasonable, and scalable architectural design solutions based on given requirements and existing codebase information.",
        "Please first think step-by-step, analyze the requirements and existing context, then provide the design solution, explanation, and inference process.",
        "Your answer should consist of three parts: 'design_solution' (design solution), 'explanation' (explanation of the design solution), and 'inference_trace' (steps you thought through to arrive at the solution).",
    ])

    # The context section is collected as fragments and joined once at the end.
    context_parts = []
    if codebase_context:
        context_parts.append("\n--- Relevant Files in Existing Codebase ---\n")
        for position, file_path in enumerate(codebase_context, start=1):
            context_parts.append(f"File {position}: {file_path}\n")
            # A real pipeline would summarise or retrieve key information (e.g. via a
            # knowledge graph or RAG) instead of pasting raw file text; this demo
            # simply shows the beginning of each file.
            if file_path in code_contents:
                excerpt = code_contents[file_path][:500] # Truncated to limit prompt size
                context_parts.append(f"```python\n{excerpt}...\n```\n")
            context_parts.append("---\n")
    else:
        context_parts.append("\n--- Existing Codebase Information ---\n")
        context_parts.append("No code files directly relevant to the requirement found. Please design based on general design principles and best practices.\n")
    context_str = "".join(context_parts)

    # Static tail of the user prompt: the requested JSON shape plus an example.
    output_format_section = (
        "--- Output Format ---\n"
        "Please return your answer in JSON format, including 'design_solution' (detailed architectural design solution, using Markdown format),\n"
        " 'explanation' (explanation of the design solution) and 'inference_trace' (steps you thought through to arrive at the solution).\n"
        "Example:\n"
        "```json\n"
        "{\n"
        '  "design_solution": "# Solution Title\\n1. ...\\n2. ...",\n'
        '  "explanation": "The advantages of this solution are...",\n'
        '  "inference_trace": "1. First I analyzed the requirements...\\n2. Next I evaluated the existing system...\\n3. Finally I proposed..."\n'
        "}\n"
        "```"
    )

    user_prompt = (
        "Please design an architectural solution based on the following requirements and provided existing codebase context. Please note:\n"
        "1. Your design should consider the characteristics of the existing Python codebase and potential directions for expansion.\n"
        "2. The solution should be structured, clear, and include necessary explanations and inference processes.\n\n"
        "--- Requirement ---\n"
        f"{requirement}\n"
        f"{context_str}\n\n"
        + output_format_section
    )

    chat_messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]

    try:
        completion = llm_client.chat.completions.create(
            model=LLM_MODEL_NAME,
            messages=chat_messages,
            response_format={"type": "json_object"},
        )
        llm_response_content = completion.choices[0].message.content

        # Decode the reply; malformed JSON is reported rather than raised.
        try:
            reply = json.loads(llm_response_content)
        except json.JSONDecodeError:
            print(f"Warning: LLM returned invalid JSON format.\nRaw response: {llm_response_content}")
            return {
                "design_solution": f"LLM returned invalid format, raw response: {llm_response_content}",
                "explanation": "LLM returned invalid JSON.",
                "inference_trace": "LLM returned invalid JSON."
            }

        return {
            "design_solution": reply.get("design_solution", "LLM did not provide a design solution."),
            "explanation": reply.get("explanation", "LLM did not provide an explanation."),
            "inference_trace": reply.get("inference_trace", "LLM did not provide inference trace.")
        }

    except Exception as e:
        # Any other failure (request error, unexpected reply shape) lands here.
        print(f"LLM call failed: {e}")
        failure_text = f"LLM call failed, error: {e}"
        return {
            "design_solution": failure_text,
            "explanation": failure_text,
            "inference_trace": failure_text
        }

def generate_design_data(
    design_id_counter: List[int],
    requirement: str,
    repo_path: str,
    # Simulates extracting keywords from requirements; real-world scenarios would be more complex
    requirement_keywords: List[str]
) -> Optional[Dict[str, Any]]:
    """
    Builds one architectural-design record for a requirement.

    :param design_id_counter: Single-element list used as a mutable counter;
        element 0 is incremented for every record produced.
    :param requirement: Requirement text; a blank value yields ``None``.
    :param repo_path: Root of the repository that supplies the context files.
    :param requirement_keywords: Keywords used to select those context files.
    :return: The design record, or ``None`` when the requirement is blank (in
        which case the counter is left untouched and the LLM is not called).
    """
    # Guard clause: nothing to design for an empty requirement.
    if not requirement.strip():
        return None

    design_id_counter[0] += 1
    sequence_number = design_id_counter[0]

    # Context selection is simulated by keyword matching on file paths.
    codebase_context_files = get_repo_file_list(repo_path, requirement_keywords)

    # Full file texts are loaded here; the prompt builder shortens them later.
    code_contents_for_llm = {}
    for context_file in codebase_context_files:
        code_contents_for_llm[context_file] = read_file_content(repo_path, context_file)

    llm_output = call_llm_for_design_solution(
        requirement,
        codebase_context_files,
        code_contents_for_llm
    )

    # The metadata is assembled after the LLM call so the timestamp marks completion.
    metadata = {
        "source_project": os.path.basename(repo_path),
        "design_type": "feature_extension", # Default type, can be determined by LLM or defined based on requirements
        "language": "python",
        "difficulty": "llm_generated",
        "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "version_control_hash": "dummy_hash_for_example" # Should actually retrieve git commit hash
    }

    return {
        "id": f"design_py_{sequence_number:05d}",
        "requirement": requirement,
        "design_solution": llm_output["design_solution"],
        "explanation": llm_output["explanation"],
        "inference_trace": llm_output["inference_trace"],
        "codebase_context": codebase_context_files, # Records the list of relevant files seen by the LLM
        "metadata": metadata
    }

# --- Main Script ---
def main():
    """
    Generates one design record per entry of REQUIREMENTS and writes the records
    to OUTPUT_FILE as JSON Lines.

    When CODE_REPO_PATH does not exist, a small demonstration repository (an
    empty ``__init__.py`` plus three sample modules) is created there first.
    """
    if not os.path.exists(CODE_REPO_PATH):
        print(f"Error: Code repository path `{CODE_REPO_PATH}` does not exist. Please modify CODE_REPO_PATH to your local Python repository path.")
        print("Attempting to create a simple dummy repository for demonstration...")
        os.makedirs(CODE_REPO_PATH, exist_ok=True)

        # File name -> file text, written in this order.
        dummy_sources = {
            "__init__.py": "",
            "user_management.py": """
def register_user(username, password, email):
    # This registers a new user
    print(f"Registering {username}")
    # ... database logic
    return {"id": 1, "username": username}

def get_user_profile(user_id):
    # Retrieves user profile from DB
    return {"id": user_id, "username": "test_user"}
""",
            "order_service.py": """
def create_order(user_id, items):
    # Creates a new order
    print(f"Creating order for user {user_id}")
    # ... inventory check, payment processing
    return {"order_id": "ORD001", "status": "pending"}

def update_order_status(order_id, new_status):
    # Updates an existing order's status
    print(f"Updating order {order_id} to {new_status}")
    return True
""",
            "payment_gateway.py": """
def process_payment(order_id, amount, payment_method):
    # Integrates with external payment provider
    print(f"Processing payment for order {order_id}, amount {amount}")
    return {"success": True, "transaction_id": "TXN123"}
""",
        }
        for dummy_name, dummy_text in dummy_sources.items():
            with open(os.path.join(CODE_REPO_PATH, dummy_name), "w", encoding="utf-8") as handle:
                handle.write(dummy_text)
        print(f"Dummy repository created at `{CODE_REPO_PATH}`.")

    design_id_counter = [0]
    all_design_data = []

    for req_item in REQUIREMENTS:
        print(f"\nGenerating design solution for requirement: {req_item['req']}")
        design_entry = generate_design_data(
            design_id_counter,
            req_item["req"],
            CODE_REPO_PATH,
            req_item["keywords"]
        )
        # Blank requirements produce no record and are simply left out.
        if design_entry:
            all_design_data.append(design_entry)

    # One JSON document per line; non-ASCII text is written unescaped.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in all_design_data
        )

    print(f"\nTraining data generated and saved to `{OUTPUT_FILE}`. Total {len(all_design_data)} records generated.")

if __name__ == "__main__":
    main()

Qwen-plus LLM client initialized successfully.

Generating design solution for requirement: Add an asynchronous inventory deduction service to the existing order system to improve order processing response speed and ensure eventual consistency of inventory data.

Generating design solution for requirement: Implement a user permission management module that supports role-permission assignment and provides APIs for permission verification.

Generating design solution for requirement: Optimize the payment process, introduce a retry mechanism and idempotency handling to improve payment success rate.

Generating design solution for requirement: Add a unified error logging and monitoring alert mechanism to the system.

Generating design solution for requirement: Decouple user registration and login functionality from the existing user management module to form an independent authentication service.

Training data generated and saved to `scenario2_design_data_with_llm_en.jsonl`. Total 5 recor