diff --git a/batch_test_prompts.py b/batch_test_prompts.py new file mode 100644 index 0000000..c10bb71 --- /dev/null +++ b/batch_test_prompts.py @@ -0,0 +1,477 @@ +#!/usr/bin/env python +""" +Batch Prompt Testing Tool for CodeDog + +This script fetches multiple diffs from GitLab and tests code review prompts on them. +It allows comparing different prompts and models on real-world code changes. +""" + +import argparse +import asyncio +import json +import os +import sys +import time +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Any, Optional, Tuple + +# 设置日志记录 +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# 加载环境变量 +from dotenv import load_dotenv +load_dotenv() + +# 导入必要的模块 +import gitlab +from test_prompt import test_prompt + +# 创建输出目录 +def create_output_dirs(base_dir: str) -> Tuple[str, str]: + """创建输出目录结构""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = os.path.join(base_dir, f"prompt_test_{timestamp}") + diffs_dir = os.path.join(output_dir, "diffs") + results_dir = os.path.join(output_dir, "results") + + os.makedirs(output_dir, exist_ok=True) + os.makedirs(diffs_dir, exist_ok=True) + os.makedirs(results_dir, exist_ok=True) + + return diffs_dir, results_dir + +# 从GitLab获取MR的diff +def fetch_gitlab_diffs( + project_id: str, + mr_count: int = 5, + max_files_per_mr: int = 3, + include_extensions: Optional[List[str]] = None, + exclude_extensions: Optional[List[str]] = None, + state: str = "merged" +) -> List[Dict[str, Any]]: + """ + 从GitLab获取MR的diff + + Args: + project_id: GitLab项目ID或路径 + mr_count: 要获取的MR数量 + max_files_per_mr: 每个MR最多获取的文件数 + include_extensions: 包含的文件扩展名列表 + exclude_extensions: 排除的文件扩展名列表 + state: MR状态,可选值为"merged", "opened", "closed" + + Returns: + List[Dict[str, Any]]: MR的diff列表 + """ + # 获取GitLab配置 + gitlab_url = os.environ.get("GITLAB_URL", "https://gitlab.com") + 
gitlab_token = os.environ.get("GITLAB_TOKEN") + + if not gitlab_token: + raise ValueError("GitLab token not found in environment variables. Please set GITLAB_TOKEN.") + + # 连接GitLab + gl = gitlab.Gitlab(gitlab_url, private_token=gitlab_token) + + # 获取项目 + try: + project = gl.projects.get(project_id) + except Exception as e: + logger.error(f"Failed to get project {project_id}: {e}") + raise + + logger.info(f"Successfully connected to GitLab project: {project.name}") + + # 获取MR列表 + mrs = project.mergerequests.list(state=state, order_by="updated_at", sort="desc", per_page=100) + + # 过滤MR + filtered_mrs = [] + for mr in mrs[:mr_count]: + logger.info(f"Processing MR #{mr.iid}: {mr.title}") + + # 获取MR的变更 + changes = mr.changes() + + # 过滤文件 + filtered_files = [] + for change in changes.get("changes", []): + file_path = change.get("new_path") or change.get("old_path") + + # 检查文件扩展名 + _, ext = os.path.splitext(file_path) + if include_extensions and ext.lower() not in include_extensions: + continue + if exclude_extensions and ext.lower() in exclude_extensions: + continue + + # 获取diff + diff = change.get("diff", "") + + # 添加到过滤后的文件列表 + filtered_files.append({ + "file_path": file_path, + "diff": diff + }) + + # 如果达到每个MR的最大文件数,则停止 + if len(filtered_files) >= max_files_per_mr: + break + + # 如果有过滤后的文件,则添加到MR列表 + if filtered_files: + filtered_mrs.append({ + "id": mr.iid, + "title": mr.title, + "description": mr.description, + "author": mr.author["name"], + "created_at": mr.created_at, + "updated_at": mr.updated_at, + "files": filtered_files + }) + + logger.info(f"Fetched {len(filtered_mrs)} MRs with {sum(len(mr['files']) for mr in filtered_mrs)} files") + return filtered_mrs + +# 保存diff到文件 +def save_diffs(mrs: List[Dict[str, Any]], diffs_dir: str) -> Dict[str, str]: + """ + 保存diff到文件 + + Args: + mrs: MR列表 + diffs_dir: diff文件保存目录 + + Returns: + Dict[str, str]: 文件路径到MR信息的映射 + """ + file_to_mr = {} + + for mr in mrs: + mr_id = mr["id"] + mr_title = mr["title"] + + for i, file_info in 
enumerate(mr["files"]): + file_path = file_info["file_path"] + diff = file_info["diff"] + + # 创建安全的文件名 + safe_name = f"mr_{mr_id}_{i}_{os.path.basename(file_path)}.diff" + safe_path = os.path.join(diffs_dir, safe_name) + + # 保存diff + with open(safe_path, "w", encoding="utf-8") as f: + f.write(f"diff --git a/{file_path} b/{file_path}\n") + f.write(diff) + + # 添加到映射 + file_to_mr[safe_path] = { + "mr_id": mr_id, + "mr_title": mr_title, + "file_path": file_path + } + + logger.info(f"Saved diff to {safe_path}") + + return file_to_mr + +# 批量测试提示 +async def batch_test( + diff_files: Dict[str, Dict[str, Any]], + results_dir: str, + model_name: str = "gpt-3.5-turbo", + system_prompt_path: Optional[str] = None, + output_format: str = "json" +) -> Dict[str, Any]: + """ + 批量测试提示 + + Args: + diff_files: 文件路径到MR信息的映射 + results_dir: 结果保存目录 + model_name: 模型名称 + system_prompt_path: 系统提示文件路径 + output_format: 输出格式 + + Returns: + Dict[str, Any]: 测试结果 + """ + # 读取系统提示 + system_prompt = None + if system_prompt_path: + with open(system_prompt_path, "r", encoding="utf-8") as f: + system_prompt = f.read() + + # 创建结果目录 + model_dir = os.path.join(results_dir, model_name.replace("-", "_")) + os.makedirs(model_dir, exist_ok=True) + + # 创建汇总文件 + summary_file = os.path.join(model_dir, "summary.json") + summary = { + "model": model_name, + "system_prompt": system_prompt_path, + "timestamp": datetime.now().isoformat(), + "results": {} + } + + # 批量测试 + for diff_file, mr_info in diff_files.items(): + file_path = mr_info["file_path"] + mr_id = mr_info["mr_id"] + mr_title = mr_info["mr_title"] + + logger.info(f"Testing prompt on {os.path.basename(diff_file)} (MR #{mr_id}: {mr_title})") + + # 读取diff内容 + with open(diff_file, "r", encoding="utf-8") as f: + content = f.read() + + # 测试提示 + try: + result = await test_prompt( + file_path=file_path, + content=content, + model_name=model_name, + system_prompt=system_prompt, + output_format=output_format + ) + + # 保存结果 + result_file = os.path.join(model_dir, 
f"{os.path.basename(diff_file)}.{output_format}") + with open(result_file, "w", encoding="utf-8") as f: + if output_format == "json": + json.dump(result, f, indent=2, ensure_ascii=False) + else: # markdown + f.write(result["markdown"]) + + # 添加到汇总 + summary["results"][os.path.basename(diff_file)] = { + "mr_id": mr_id, + "mr_title": mr_title, + "file_path": file_path, + "result_file": result_file, + "scores": { + "readability": result.get("readability", "N/A"), + "efficiency": result.get("efficiency", "N/A"), + "security": result.get("security", "N/A"), + "structure": result.get("structure", "N/A"), + "error_handling": result.get("error_handling", "N/A"), + "documentation": result.get("documentation", "N/A"), + "code_style": result.get("code_style", "N/A"), + "overall_score": result.get("overall_score", "N/A"), + "effective_code_lines": result.get("effective_code_lines", "N/A"), + "non_effective_code_lines": result.get("non_effective_code_lines", "N/A"), + "estimated_hours": result.get("estimated_hours", "N/A") + } + } + + logger.info(f"Saved result to {result_file}") + + except Exception as e: + logger.error(f"Error testing prompt on {diff_file}: {e}") + # 添加错误信息到汇总 + summary["results"][os.path.basename(diff_file)] = { + "mr_id": mr_id, + "mr_title": mr_title, + "file_path": file_path, + "error": str(e) + } + + # 保存汇总 + with open(summary_file, "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2, ensure_ascii=False) + + logger.info(f"Saved summary to {summary_file}") + + return summary + +# 生成比较报告 +def generate_comparison_report(summaries: List[Dict[str, Any]], results_dir: str) -> str: + """ + 生成比较报告 + + Args: + summaries: 汇总列表 + results_dir: 结果保存目录 + + Returns: + str: 报告文件路径 + """ + # 创建报告文件 + report_file = os.path.join(results_dir, "comparison_report.md") + + # 生成报告 + with open(report_file, "w", encoding="utf-8") as f: + f.write("# Prompt Testing Comparison Report\n\n") + f.write(f"Generated at: {datetime.now().isoformat()}\n\n") + + # 模型信息 + 
f.write("## Models Tested\n\n") + for i, summary in enumerate(summaries): + model = summary["model"] + system_prompt = summary["system_prompt"] or "Default" + f.write(f"{i+1}. **{model}** with prompt: {system_prompt}\n") + f.write("\n") + + # 汇总表格 + f.write("## Score Summary\n\n") + f.write("| File | ") + for summary in summaries: + model = summary["model"] + f.write(f"{model} Overall | ") + f.write("\n") + + f.write("| --- | ") + for _ in summaries: + f.write("---: | ") + f.write("\n") + + # 获取所有文件 + all_files = set() + for summary in summaries: + all_files.update(summary["results"].keys()) + + # 填充表格 + for file in sorted(all_files): + f.write(f"| {file} | ") + for summary in summaries: + result = summary["results"].get(file, {}) + if "error" in result: + f.write("Error | ") + else: + overall_score = result.get("scores", {}).get("overall_score", "N/A") + f.write(f"{overall_score} | ") + f.write("\n") + + # 详细比较 + f.write("\n## Detailed Comparison\n\n") + for file in sorted(all_files): + f.write(f"### {file}\n\n") + + # 创建比较表格 + f.write("| Metric | ") + for summary in summaries: + model = summary["model"] + f.write(f"{model} | ") + f.write("\n") + + f.write("| --- | ") + for _ in summaries: + f.write("---: | ") + f.write("\n") + + # 填充表格 + metrics = ["readability", "efficiency", "security", "structure", + "error_handling", "documentation", "code_style", "overall_score", + "effective_code_lines", "non_effective_code_lines", "estimated_hours"] + + for metric in metrics: + metric_name = metric.replace("_", " ").title() + f.write(f"| {metric_name} | ") + for summary in summaries: + result = summary["results"].get(file, {}) + if "error" in result: + f.write("Error | ") + else: + value = result.get("scores", {}).get(metric, "N/A") + f.write(f"{value} | ") + f.write("\n") + + f.write("\n") + + logger.info(f"Generated comparison report: {report_file}") + return report_file + +def parse_args(): + """解析命令行参数""" + parser = argparse.ArgumentParser(description="Batch test code 
review prompts on GitLab MRs") + + # GitLab选项 + parser.add_argument("--project", required=True, help="GitLab project ID or path") + parser.add_argument("--mr-count", type=int, default=5, help="Number of MRs to fetch (default: 5)") + parser.add_argument("--max-files", type=int, default=3, help="Maximum files per MR (default: 3)") + parser.add_argument("--include", help="Included file extensions, comma separated, e.g. .py,.js") + parser.add_argument("--exclude", help="Excluded file extensions, comma separated, e.g. .md,.txt") + parser.add_argument("--state", choices=["merged", "opened", "closed"], default="merged", + help="MR state to fetch (default: merged)") + + # 模型选项 + parser.add_argument("--models", help="Models to test, comma separated (default: gpt-3.5-turbo)") + + # 系统提示选项 + parser.add_argument("--system-prompts", help="Paths to system prompt files, comma separated") + + # 输出选项 + parser.add_argument("--output-dir", default="prompt_tests", help="Output directory (default: prompt_tests)") + parser.add_argument("--format", choices=["json", "markdown"], default="json", help="Output format (default: json)") + + return parser.parse_args() + +async def main(): + """主函数""" + args = parse_args() + + # 解析包含和排除的文件扩展名 + include_extensions = None + if args.include: + include_extensions = [ext.strip() if ext.strip().startswith(".") else f".{ext.strip()}" + for ext in args.include.split(",")] + + exclude_extensions = None + if args.exclude: + exclude_extensions = [ext.strip() if ext.strip().startswith(".") else f".{ext.strip()}" + for ext in args.exclude.split(",")] + + # 解析模型列表 + models = ["gpt-3.5-turbo"] + if args.models: + models = [model.strip() for model in args.models.split(",")] + + # 解析系统提示列表 + system_prompts = [None] + if args.system_prompts: + system_prompts = [prompt.strip() for prompt in args.system_prompts.split(",")] + + # 创建输出目录 + diffs_dir, results_dir = create_output_dirs(args.output_dir) + + # 从GitLab获取MR的diff + mrs = fetch_gitlab_diffs( + 
project_id=args.project, + mr_count=args.mr_count, + max_files_per_mr=args.max_files, + include_extensions=include_extensions, + exclude_extensions=exclude_extensions, + state=args.state + ) + + # 保存diff到文件 + diff_files = save_diffs(mrs, diffs_dir) + + # 批量测试提示 + summaries = [] + for model in models: + for system_prompt in system_prompts: + logger.info(f"Testing model {model} with prompt {system_prompt or 'Default'}") + summary = await batch_test( + diff_files=diff_files, + results_dir=results_dir, + model_name=model, + system_prompt_path=system_prompt, + output_format=args.format + ) + summaries.append(summary) + + # 生成比较报告 + if len(summaries) > 1: + report_file = generate_comparison_report(summaries, results_dir) + print(f"\nComparison report generated: {report_file}") + + print(f"\nAll tests completed. Results saved to {results_dir}") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/codedog/utils/code_evaluator.py b/codedog/utils/code_evaluator.py index 5f3ea12..92975ab 100644 --- a/codedog/utils/code_evaluator.py +++ b/codedog/utils/code_evaluator.py @@ -385,44 +385,126 @@ def __init__(self, model: BaseChatModel, tokens_per_minute: int = 9000, max_conc os.makedirs("diffs", exist_ok=True) # System prompt - 使用优化的系统提示 - self.system_prompt = """你是一位经验丰富的代码评审专家,擅长评价各种编程语言的代码质量。 -请根据以下几个方面对代码进行评价,并给出1-10分的评分(10分为最高): -1. 可读性:代码是否易于阅读和理解 -2. 效率:代码是否高效,是否有性能问题 -3. 安全性:代码是否存在安全隐患 -4. 结构:代码结构是否合理,是否遵循良好的设计原则 -5. 错误处理:是否有适当的错误处理机制 -6. 文档和注释:注释是否充分,是否有必要的文档 -7. 代码风格:是否遵循一致的代码风格和最佳实践 -8. 总体评分:综合以上各项的总体评价 - -请以JSON格式返回结果,包含以上各项评分和详细评价意见。 - -重要提示: -1. 即使代码不完整或难以理解,也请尽量给出评价,并在评论中说明情况 -2. 如果代码是差异格式(diff),请忽略差异标记(+/-),专注于评价代码本身 -3. 如果无法评估,请返回默认评分5分,并在评论中说明原因 -4. 始终返回有效的JSON格式""" + self.system_prompt = """# ROLE AND OBJECTIVE +You are a senior code reviewer with 15+ years of experience across multiple programming languages and frameworks. 
Your task is to provide a thorough, objective evaluation of code quality and estimate the effort required to implement the changes. + +# EVALUATION DIMENSIONS +Evaluate the code on these dimensions, scoring each from 1-10 (10 being highest): + +1. Readability (1-10): Code clarity, naming conventions, consistent formatting +2. Efficiency (1-10): Algorithmic efficiency, resource usage, performance considerations +3. Security (1-10): Protection against vulnerabilities, input validation, secure coding practices +4. Structure (1-10): Architecture, modularity, separation of concerns, SOLID principles +5. Error Handling (1-10): Robust error handling, edge cases, graceful failure +6. Documentation (1-10): Comments, docstrings, self-documenting code +7. Code Style (1-10): Adherence to language/framework conventions and best practices +8. Overall Score (1-10): Comprehensive evaluation considering all dimensions + +# CODE CHANGE CLASSIFICATION +When evaluating code changes (especially in diff format), carefully distinguish between: + +## Non-Effective Changes (Should NOT count significantly toward working hours) +- Whitespace adjustments (spaces, tabs, line breaks) +- Indentation fixes without logic changes +- Comment additions or modifications without code changes +- Import reordering or reorganization +- Variable/function renaming without behavior changes +- Code reformatting (line wrapping, bracket placement) +- Changing string quotes (single to double quotes) +- Adding/removing trailing commas +- Changing code style to match linter rules +- Removing unused imports or variables + +## Effective Changes (SHOULD count toward working hours) +- Logic modifications that alter program behavior +- Functionality additions or removals +- Algorithm changes or optimizations +- Bug fixes that correct actual issues +- API changes (parameters, return types, etc.) 
+- Data structure modifications +- Performance optimizations +- Security vulnerability fixes +- Error handling improvements +- Complex refactoring that maintains behavior but improves code quality + +# WORKING HOURS ESTIMATION GUIDELINES +When estimating the time an experienced programmer (5-10+ years) would need: + +1. For purely non-effective changes: + - 0.1-0.2 hours for small files + - 0.3-0.5 hours for large files with extensive formatting + +2. For effective changes, consider: + - Complexity of the logic (simple, moderate, complex) + - Domain knowledge required (general, specialized, expert) + - Testing requirements (minimal, moderate, extensive) + - Integration complexity (isolated, moderate dependencies, highly coupled) + +3. Time components to include in your estimate: + - Understanding the existing code + - Designing the solution + - Implementing the changes + - Testing and debugging + - Documentation and code review + +4. Provide a realistic estimate that reflects the actual work required, not just the line count. + +# LANGUAGE-SPECIFIC CONSIDERATIONS +- For Python: Consider PEP 8 compliance, type hints, docstrings +- For JavaScript/TypeScript: Consider ES6+ features, typing, framework conventions +- For Java: Consider OOP principles, exception handling, Java conventions +- For C/C++: Consider memory management, performance, platform considerations +- For other languages: Apply relevant best practices and conventions + +3. When reviewing diff format code: + - Pay attention to both added (+) and removed (-) lines to understand the complete change + - Evaluate the net effect of the changes, not just individual lines + - Consider the context of the entire file when evaluating specific changes + +4. 
If you cannot evaluate the code: + - Assign a default score of 5 for each dimension + - Explain why evaluation wasn't possible + - Estimate minimal working hours (0.25) for changes that cannot be properly evaluated + +Always return valid JSON format with all required fields.""" # 添加JSON输出指令 self.json_output_instruction = """ -请以JSON格式返回评价结果,包含7个评分字段和详细评价意见: +# OUTPUT FORMAT +Return your evaluation in valid JSON format with the following structure: ```json { - "readability": 评分, - "efficiency": 评分, - "security": 评分, - "structure": 评分, - "error_handling": 评分, - "documentation": 评分, - "code_style": 评分, - "overall_score": 总评分, - "comments": "详细评价意见和改进建议" + "readability": score, + "efficiency": score, + "security": score, + "structure": score, + "error_handling": score, + "documentation": score, + "code_style": score, + "overall_score": score, + "effective_code_lines": number, + "non_effective_code_lines": number, + "estimated_hours": number, + "comments": "detailed analysis with specific observations and recommendations" } ``` -总评分计算方式:所有7个指标的加权平均值(取一位小数)。 +## JSON Output Guidelines: +1. All scores MUST be integers or decimals between 1-10 +2. The overall_score should reflect the weighted importance of all dimensions +3. effective_code_lines should count ONLY changes that affect behavior or functionality +4. non_effective_code_lines should count formatting, style, and cosmetic changes +5. estimated_hours should be a realistic estimate for an experienced programmer (5-10+ years) +6. comments should include: + - Specific observations about code quality + - Concrete recommendations for improvement + - Explanation of effective vs. non-effective changes + - Justification for your working hours estimate + - Any security concerns or performance issues + - Suggestions for better practices or patterns + +IMPORTANT: Ensure your response is valid JSON that can be parsed programmatically. Do not include explanatory text outside the JSON structure. 
""" @retry( @@ -657,21 +739,85 @@ async def _evaluate_single_diff(self, diff_content: str) -> Dict[str, Any]: # 清理代码内容,移除异常字符 sanitized_diff = self._sanitize_content(diff_content) - # 使用优化的代码评审prompt - review_prompt = CODE_REVIEW_PROMPT.format( - file_name=file_name, - language=language.lower(), - code_content=sanitized_diff - ) + # 使用自定义代码评审prompt + review_prompt = f"""# Code Review Request + +## File Information +- **File Name**: {file_name} +- **Language**: {language.lower()} + +## Code to Review +```{language.lower()} +{sanitized_diff} +``` + +## Instructions + +Please conduct a comprehensive code review following these steps: + +1. **Initial Analysis**: Begin with a brief overview of the code's purpose and functionality. + +2. **Detailed Evaluation**: Analyze the code across these key dimensions: + + a. **Readability** (1-10): + - Variable and function naming clarity + - Code organization and structure + - Consistent formatting and indentation + - Appropriate use of comments + + b. **Efficiency** (1-10): + - Algorithm efficiency and complexity + - Resource utilization (memory, CPU) + - Optimization opportunities + - Potential bottlenecks + + c. **Security** (1-10): + - Input validation and sanitization + - Authentication and authorization concerns + - Data protection and privacy + - Potential vulnerabilities + + d. **Structure** (1-10): + - Modularity and separation of concerns + - Appropriate design patterns + - Code reusability + - Dependency management + + e. **Error Handling** (1-10): + - Exception handling completeness + - Edge case coverage + - Graceful failure mechanisms + - Informative error messages + + f. **Documentation** (1-10): + - Documentation completeness + - Comment quality and relevance + - API documentation + - Usage examples where appropriate + + g. **Code Style** (1-10): + - Adherence to language conventions + - Consistency with project style + - Readability enhancements + - Modern language feature usage + +3. 
**Code Change Classification**: + - Carefully distinguish between effective and non-effective code changes + - Non-effective changes include: whitespace adjustments, indentation fixes, comment additions, import reordering, variable/function renaming without behavior changes, code reformatting, changing string quotes, etc. + - Effective changes include: logic modifications, functionality additions/removals, algorithm changes, bug fixes, API changes, data structure modifications, performance optimizations, security fixes, etc. + +4. **Working Hours Estimation**: + - Estimate how many effective working hours an experienced programmer (5-10+ years) would need to complete these code changes + - Focus primarily on effective code changes, not formatting or style changes + - Consider code complexity, domain knowledge requirements, and context + - Include time for understanding, implementation, testing, and integration +""" # 添加语言特定的考虑因素 language_key = language.lower() if language_key in LANGUAGE_SPECIFIC_CONSIDERATIONS: review_prompt += "\n\n" + LANGUAGE_SPECIFIC_CONSIDERATIONS[language_key] - # 添加工作时间估计请求 - review_prompt += "\n\nIn addition to the code evaluation, please also estimate how many effective working hours an experienced programmer (5-10+ years) would need to complete these code changes. Include this estimate in your JSON response as 'estimated_hours'." 
- # 添加JSON输出指令 review_prompt += "\n\n" + self.json_output_instruction @@ -680,16 +826,16 @@ async def _evaluate_single_diff(self, diff_content: str) -> Dict[str, Any]: HumanMessage(content=review_prompt) ] - # 调用模型 + # Call the model response = await self.model.agenerate(messages=[messages]) self._last_request_time = time.time() - # 获取响应文本 + # Get response text generated_text = response.generations[0][0].text - # 解析响应 + # Parse response try: - # 提取JSON + # Extract JSON json_str = self._extract_json(generated_text) if not json_str: logger.warning("Failed to extract JSON from response, attempting to fix") @@ -697,17 +843,17 @@ async def _evaluate_single_diff(self, diff_content: str) -> Dict[str, Any]: if not json_str: logger.error("Could not extract valid JSON from the response") - return self._generate_default_scores("JSON解析错误。原始响应: " + str(generated_text)[:500]) + return self._generate_default_scores("JSON parsing error. Original response: " + str(generated_text)[:500]) result = json.loads(json_str) - # 验证分数 + # Validate scores scores = self._validate_scores(result) - # 请求成功,调整速率限制 + # Request successful, adjust rate limits self._adjust_rate_limits(is_rate_limited=False) - # 缓存结果 + # Cache results self.cache[file_hash] = scores return scores @@ -1282,40 +1428,40 @@ def _extract_json(self, text: str) -> str: print(f"DEBUG: Response type: {type(text)}, length: {len(text)}") print(f"DEBUG: First 100 chars: '{text[:100]}'") - # 记录完整响应用于调试 + # Log complete response for debugging logger.debug(f"Complete model response: {text}") - # 检查是否包含无法评估的提示(如Base64编码内容) + # Check for patterns indicating unevaluable content (like Base64 encoded content) unevaluable_patterns = [ - r'Base64编码', - r'无法解码的字符串', + r'Base64', + r'undecodable string', r'ICAgIA==', - r'无法评估', - r'无法对这段代码进行评审', - r'无法进行评价', - r'无法对代码进行评估', - r'代码内容太短', - r'代码为空', - r'没有提供实际的代码', - r'无法理解', - r'无法解析', - r'无法分析', - r'无法读取', - r'无法识别', - r'无法处理', - r'无效的代码', - r'不是有效的代码', - r'不是代码', - r'不包含代码', - 
r'只包含了一个无法解码的字符串' + r'cannot evaluate', + r'cannot review this code', + r'unable to evaluate', + r'unable to assess the code', + r'code is too short', + r'code is empty', + r'no actual code provided', + r'cannot understand', + r'cannot parse', + r'cannot analyze', + r'cannot read', + r'cannot recognize', + r'cannot process', + r'invalid code', + r'not valid code', + r'not code', + r'does not contain code', + r'only contains an undecodable string' ] for pattern in unevaluable_patterns: if re.search(pattern, text, re.IGNORECASE): print(f"DEBUG: Detected response indicating unevaluable content: '{pattern}'") - # 提取评论,如果有的话 + # Extract comments if any comment = text[:200] if len(text) > 200 else text - # 创建一个默认的JSON响应 + # Create a default JSON response default_json = { "readability": 5, "efficiency": 5, @@ -1686,43 +1832,100 @@ async def _evaluate_diff_chunk(self, chunk: str) -> Dict[str, Any]: # 清理代码内容,移除异常字符 sanitized_chunk = self._sanitize_content(chunk) - review_prompt = f"""请评价以下代码: + review_prompt = f"""# Code Review Request -文件名:{file_name} -语言:{language} +## File Information +- **File Name**: {file_name} +- **Language**: {language.lower()} -``` +## Code to Review +```{language.lower()} {sanitized_chunk} ``` -请对这段代码进行全面评价,并给出1-10分的评分(10分为最高)。评价应包括以下几个方面: -1. 可读性 (readability):代码是否易于阅读和理解 -2. 效率 (efficiency):代码是否高效,是否有性能问题 -3. 安全性 (security):代码是否存在安全隐患 -4. 结构 (structure):代码结构是否合理,是否遵循良好的设计原则 -5. 错误处理 (error_handling):是否有适当的错误处理机制 -6. 文档和注释 (documentation):注释是否充分,是否有必要的文档 -7. 代码风格 (code_style):是否遵循一致的代码风格和最佳实践 -8. 总体评分 (overall_score):综合以上各项的总体评价 - -请以JSON格式返回结果,格式如下: +## Instructions + +Please conduct a comprehensive code review following these steps: + +1. **Initial Analysis**: Begin with a brief overview of the code's purpose and functionality. + +2. **Detailed Evaluation**: Analyze the code across these key dimensions: + + a. 
**Readability** (1-10): + - Variable and function naming clarity + - Code organization and structure + - Consistent formatting and indentation + - Appropriate use of comments + + b. **Efficiency** (1-10): + - Algorithm efficiency and complexity + - Resource utilization (memory, CPU) + - Optimization opportunities + - Potential bottlenecks + + c. **Security** (1-10): + - Input validation and sanitization + - Authentication and authorization concerns + - Data protection and privacy + - Potential vulnerabilities + + d. **Structure** (1-10): + - Modularity and separation of concerns + - Appropriate design patterns + - Code reusability + - Dependency management + + e. **Error Handling** (1-10): + - Exception handling completeness + - Edge case coverage + - Graceful failure mechanisms + - Informative error messages + + f. **Documentation** (1-10): + - Documentation completeness + - Comment quality and relevance + - API documentation + - Usage examples where appropriate + + g. **Code Style** (1-10): + - Adherence to language conventions + - Consistency with project style + - Readability enhancements + - Modern language feature usage + +3. **Code Change Classification**: + - Carefully distinguish between effective and non-effective code changes + - Non-effective changes include: whitespace adjustments, indentation fixes, comment additions, import reordering, variable/function renaming without behavior changes, code reformatting, changing string quotes, etc. + - Effective changes include: logic modifications, functionality additions/removals, algorithm changes, bug fixes, API changes, data structure modifications, performance optimizations, security fixes, etc. + +4. 
**Working Hours Estimation**: + - Estimate how many effective working hours an experienced programmer (5-10+ years) would need to complete these code changes + - Focus primarily on effective code changes, not formatting or style changes + - Consider code complexity, domain knowledge requirements, and context + - Include time for understanding, implementation, testing, and integration + +## Response Format + +Please return your evaluation in valid JSON format with the following structure: + ```json {{ - "readability": 评分, - "efficiency": 评分, - "security": 评分, - "structure": 评分, - "error_handling": 评分, - "documentation": 评分, - "code_style": 评分, - "overall_score": 总评分, - "comments": "详细评价意见和改进建议" + "readability": score, + "efficiency": score, + "security": score, + "structure": score, + "error_handling": score, + "documentation": score, + "code_style": score, + "overall_score": score, + "effective_code_lines": number, + "non_effective_code_lines": number, + "estimated_hours": number, + "comments": "detailed analysis with specific observations and recommendations" }} ``` -总评分应该是所有评分的加权平均值,保留一位小数。如果代码很小或者只是配置文件的修改,请根据实际情况给出合理的评分。 - -重要提示:请确保返回有效的JSON格式。如果无法评估代码(例如代码不完整或无法理解),请仍然返回JSON格式,但在comments中说明原因,并给出默认评分5分。""" +IMPORTANT: Ensure your response is valid JSON that can be parsed programmatically. 
If you cannot evaluate the code (e.g., incomplete or incomprehensible code), still return valid JSON with default scores of 5 and explain the reason in the comments field.""" # 打印完整的代码块用于调试 print(f"DEBUG: File name: {file_name}") @@ -1782,11 +1985,11 @@ async def _evaluate_diff_chunk(self, chunk: str) -> Dict[str, Any]: user_message = messages[0].content if len(messages) > 0 else "No user message" log_llm_interaction(user_message, "", interaction_type="diff_chunk_evaluation_prompt") - # 调用模型 + # Call the model response = await self.model.agenerate(messages=[messages]) self._last_request_time = time.time() - # 获取响应文本 + # Get response text generated_text = response.generations[0][0].text # Log the response to LLM_out.log @@ -1832,20 +2035,20 @@ async def _evaluate_diff_chunk(self, chunk: str) -> Dict[str, Any]: # 检查是否是上下文长度限制错误 is_context_length_error = "context length" in error_message.lower() or "maximum context length" in error_message.lower() - # 检查是否是DeepSeek API错误 + # Check if it's a DeepSeek API error is_deepseek_error = "deepseek" in error_message.lower() or "deepseek api" in error_message.lower() if is_context_length_error: - # 如果是上下文长度错误,尝试进一步分割 + # If it's a context length error, try further splitting logger.warning(f"Context length limit error, attempting further content splitting") - smaller_chunks = self._split_diff_content(chunk, max_tokens_per_chunk=4000) # 使用更小的块大小 + smaller_chunks = self._split_diff_content(chunk, max_tokens_per_chunk=4000) # Use smaller chunk size if len(smaller_chunks) > 1: - # 如果成功分割成多个小块,分别评估并合并结果 + # If successfully split into multiple smaller chunks, evaluate each and merge results sub_results = [] for i, sub_chunk in enumerate(smaller_chunks): logger.info(f"Evaluating sub-chunk {i+1}/{len(smaller_chunks)}") - sub_result = await self._evaluate_diff_chunk(sub_chunk) # 递归调用 + sub_result = await self._evaluate_diff_chunk(sub_chunk) # Recursive call sub_results.append(sub_result) return self._merge_chunk_results(sub_results) @@ 
-1862,42 +2065,42 @@ async def _evaluate_diff_chunk(self, chunk: str) -> Dict[str, Any]: logger.warning(f"Rate limit error, retrying in {wait_time}s (attempt {retry_count}/{max_retries})") await asyncio.sleep(wait_time) elif is_deepseek_error: - # 对于DeepSeek API错误,最多重试两次,然后放弃 + # For DeepSeek API errors, retry at most twice, then abandon retry_count += 1 - if retry_count >= 2: # 只重试两次 + if retry_count >= 2: # Only retry twice logger.error(f"DeepSeek API error after 2 retries, abandoning evaluation: {error_message}") logger.error(f"Original error: {e}") logger.error(f"Last response (if any): {generated_text[:500] if generated_text else 'No response'}") - # 创建一个详细的错误消息 - error_detail = f"DeepSeek API错误,放弃评估: {error_message}\n" - error_detail += f"原始错误: {e}\n" - error_detail += f"最后响应: {generated_text[:200] if generated_text else '无响应'}" + # Create a detailed error message + error_detail = f"DeepSeek API error, abandoning evaluation: {error_message}\n" + error_detail += f"Original error: {e}\n" + error_detail += f"Last response: {generated_text[:200] if generated_text else 'No response'}" return self._generate_default_scores(error_detail) - # 使用较短的等待时间 - wait_time = 3 # 固定3秒等待时间 + # Use a shorter wait time + wait_time = 3 # Fixed 3-second wait time logger.warning(f"DeepSeek API error, retrying in {wait_time}s (attempt {retry_count}/2)") logger.warning(f"Error details: {error_message}") await asyncio.sleep(wait_time) else: - # 其他错误直接返回 - return self._generate_default_scores(f"评价过程中出错: {error_message}") + # Return directly for other errors + return self._generate_default_scores(f"Error during evaluation: {error_message}") - # 如果所有重试都失败 - return self._generate_default_scores("达到最大重试次数,评价失败") + # If all retries fail + return self._generate_default_scores("Maximum retry count reached, evaluation failed") def _merge_chunk_results(self, chunk_results: List[Dict[str, Any]]) -> Dict[str, Any]: - """合并多个块的评估结果 + """Merge evaluation results from multiple chunks Args: - 
chunk_results: 多个块的评估结果列表 + chunk_results: List of evaluation results from multiple chunks Returns: - Dict[str, Any]: 合并后的评估结果 + Dict[str, Any]: Merged evaluation result """ if not chunk_results: - return self._generate_default_scores("没有可用的块评估结果") + return self._generate_default_scores("No chunk evaluation results available") if len(chunk_results) == 1: return chunk_results[0] @@ -2843,14 +3046,57 @@ async def evaluate_commit_as_whole( # Create a prompt for evaluating the entire commit language = "multiple" # Since we're evaluating multiple files - # Create a prompt that specifically asks for working hours estimation + # Create a prompt that specifically asks for working hours estimation and distinguishes effective changes prompt = f"""Act as a senior code reviewer with 10+ years of experience. I will provide you with a complete diff of a commit that includes multiple files. Please analyze the entire commit as a whole and provide: 1. A comprehensive evaluation of the code changes -2. An estimate of how many effective working hours an experienced programmer (5-10+ years) would need to complete these code changes -3. Scores for the following aspects (1-10 scale): +2. 
Carefully distinguish between effective code changes and non-effective changes: + - Non-effective changes (should NOT count toward working hours): + * Whitespace adjustments (spaces, tabs, line breaks) + * Indentation fixes without logic changes + * Comment additions or modifications without code changes + * Import reordering or reorganization + * Variable/function renaming without behavior changes + * Code reformatting (e.g., line wrapping, bracket placement) + * Changing string quotes (e.g., single to double quotes) + * Adding/removing trailing commas + * Changing code style to match linter rules + + - Effective changes (SHOULD count toward working hours): + * Logic modifications that alter program behavior + * Functionality additions or removals + * Algorithm changes or optimizations + * Bug fixes that correct actual issues + * API changes (parameters, return types, etc.) + * Data structure modifications + * Performance optimizations + * Security vulnerability fixes + * Error handling improvements + +3. Count the number of effective code lines changed and non-effective code lines changed: + - For each file, analyze line by line to determine if changes are effective or non-effective + - Count both added and removed lines, but categorize them correctly + - For mixed changes (both effective and non-effective in same line), count as effective + - Provide a breakdown of effective vs. non-effective changes by file + +4. 
Estimate how many effective working hours an experienced programmer (5-10+ years) would need: + - Base your estimate primarily on effective code changes, not total changes + - For purely non-effective changes (only formatting/style): + * 0.1-0.2 hours for small files + * 0.3-0.5 hours for large files with extensive formatting + - For effective changes, consider: + * Complexity of the logic being modified + * Domain knowledge required to understand the code + * Testing requirements for the changes + * Integration complexity with other components + - Be realistic - experienced programmers work efficiently but still need time to: + * Understand existing code + * Design appropriate solutions + * Implement changes carefully + * Test and verify correctness +5. Scores for the following aspects (1-10 scale): - Readability - Efficiency - Security @@ -2875,8 +3121,10 @@ async def evaluate_commit_as_whole( - documentation: (score 1-10) - code_style: (score 1-10) - overall_score: (score 1-10) -- estimated_hours: (number of hours) -- comments: (your detailed analysis) +- effective_code_lines: (number of lines with actual logic/functionality changes) +- non_effective_code_lines: (number of lines with formatting, whitespace, comment changes) +- estimated_hours: (number of hours based primarily on effective changes) +- comments: (your detailed analysis including breakdown of effective vs non-effective changes) """ logger.info("Preparing to evaluate combined diff") @@ -2929,6 +3177,18 @@ async def evaluate_commit_as_whole( logger.warning(f"Missing field {field} in evaluation, setting default value") eval_data[field] = 5 + # Add effective and non-effective code lines if not present + if "effective_code_lines" not in eval_data: + # Estimate based on additions and deletions + logger.warning("Missing effective_code_lines in evaluation, estimating") + # Assume 60% of changes are effective by default + eval_data["effective_code_lines"] = int(total_additions * 0.6) + int(total_deletions * 
0.6) + + if "non_effective_code_lines" not in eval_data: + logger.warning("Missing non_effective_code_lines in evaluation, estimating") + # Assume 40% of changes are non-effective by default + eval_data["non_effective_code_lines"] = int(total_additions * 0.4) + int(total_deletions * 0.4) + # If overall_score is not provided, calculate it if "overall_score" not in eval_data or not eval_data["overall_score"]: score_fields = ["readability", "efficiency", "security", "structure", @@ -2941,7 +3201,7 @@ async def evaluate_commit_as_whole( logger.warning("Missing estimated_hours in evaluation, calculating default") eval_data["estimated_hours"] = self._estimate_default_hours(total_additions, total_deletions) - # Log all scores + # Log all scores and code line counts logger.info(f"Whole commit evaluation scores: " + f"readability={eval_data.get('readability', 'N/A')}, " + f"efficiency={eval_data.get('efficiency', 'N/A')}, " + @@ -2951,6 +3211,8 @@ async def evaluate_commit_as_whole( f"documentation={eval_data.get('documentation', 'N/A')}, " + f"code_style={eval_data.get('code_style', 'N/A')}, " + f"overall_score={eval_data.get('overall_score', 'N/A')}, " + + f"effective_code_lines={eval_data.get('effective_code_lines', 'N/A')}, " + + f"non_effective_code_lines={eval_data.get('non_effective_code_lines', 'N/A')}, " + f"estimated_hours={eval_data.get('estimated_hours', 'N/A')}") except Exception as e: @@ -3102,6 +3364,24 @@ def _create_summary_prompt(self, evaluation_results: Dict[str, Any]) -> str: whole_commit_evaluation = "" if "whole_commit_evaluation" in evaluation_results: eval_data = evaluation_results["whole_commit_evaluation"] + + # Include effective and non-effective code lines if available + code_lines_info = "" + if "effective_code_lines" in eval_data or "non_effective_code_lines" in eval_data: + effective_lines = eval_data.get('effective_code_lines', 'N/A') + non_effective_lines = eval_data.get('non_effective_code_lines', 'N/A') + total_lines = (effective_lines if 
isinstance(effective_lines, int) else 0) + (non_effective_lines if isinstance(non_effective_lines, int) else 0) + + if total_lines > 0 and isinstance(effective_lines, int) and isinstance(non_effective_lines, int): + effective_percentage = (effective_lines / total_lines) * 100 if total_lines > 0 else 0 + code_lines_info = f""" +- Effective Code Lines: {effective_lines} ({effective_percentage:.1f}% of total changes) +- Non-Effective Code Lines: {non_effective_lines} ({100 - effective_percentage:.1f}% of total changes)""" + else: + code_lines_info = f""" +- Effective Code Lines: {effective_lines} +- Non-Effective Code Lines: {non_effective_lines}""" + whole_commit_evaluation = f""" Whole Commit Evaluation: - Readability: {eval_data.get('readability', 'N/A')}/10 @@ -3111,7 +3391,7 @@ def _create_summary_prompt(self, evaluation_results: Dict[str, Any]) -> str: - Error Handling: {eval_data.get('error_handling', 'N/A')}/10 - Documentation: {eval_data.get('documentation', 'N/A')}/10 - Code Style: {eval_data.get('code_style', 'N/A')}/10 -- Overall Score: {eval_data.get('overall_score', 'N/A')}/10 +- Overall Score: {eval_data.get('overall_score', 'N/A')}/10{code_lines_info} - Comments: {eval_data.get('comments', 'No comments available.')} """ diff --git a/custom_system_prompt.txt b/custom_system_prompt.txt new file mode 100644 index 0000000..83af370 --- /dev/null +++ b/custom_system_prompt.txt @@ -0,0 +1,93 @@ +# EXPERT CODE REVIEWER ROLE + +You are a world-class code reviewer with expertise in multiple programming languages and frameworks. Your task is to provide detailed, actionable feedback on code changes to help developers improve their code quality and productivity. + +## EVALUATION CRITERIA + +Evaluate the code on these dimensions (1-10 scale): + +1. **Readability** (1-10): How easy is the code to read and understand? + - Variable/function naming clarity + - Code organization + - Consistent formatting + - Appropriate comments + +2. 
**Efficiency** (1-10): How efficiently does the code perform its tasks? + - Algorithm complexity + - Resource usage + - Performance considerations + - Potential bottlenecks + +3. **Security** (1-10): How well does the code handle security concerns? + - Input validation + - Authentication/authorization + - Data protection + - Vulnerability prevention + +4. **Structure** (1-10): How well is the code structured? + - Modularity + - Separation of concerns + - Design patterns + - SOLID principles + +5. **Error Handling** (1-10): How robust is the error handling? + - Exception management + - Edge cases + - Graceful failure + - Informative error messages + +6. **Documentation** (1-10): How well is the code documented? + - Comments quality + - Docstrings + - API documentation + - Usage examples + +7. **Code Style** (1-10): How well does the code adhere to style conventions? + - Language-specific conventions + - Project style consistency + - Modern language features + - Best practices + +## CODE CHANGE ANALYSIS + +When analyzing code changes (especially diffs): + +### Effective Changes (Count toward working hours) +- Logic modifications +- Functionality additions/removals +- Algorithm changes +- Bug fixes +- API changes +- Data structure modifications +- Performance optimizations +- Security fixes +- Error handling improvements + +### Non-Effective Changes (Minimal impact on working hours) +- Whitespace adjustments +- Indentation fixes +- Comment additions without code changes +- Import reordering +- Variable/function renaming without behavior changes +- Code reformatting +- String quote style changes +- Adding/removing trailing commas +- Style changes to match linter rules + +## WORKING HOURS ESTIMATION + +Provide a realistic estimate of how many hours an experienced programmer (5-10+ years) would need to implement these changes: + +- For purely non-effective changes: 0.1-0.5 hours depending on size +- For effective changes, consider: + * Complexity (simple, moderate, 
complex) + * Domain knowledge required + * Testing requirements + * Integration complexity + +Include time for: +- Understanding existing code +- Designing the solution +- Implementation +- Testing and debugging +- Documentation and review diff --git a/prompt_testing_README.md b/prompt_testing_README.md new file mode 100644 index 0000000..f5e8bb5 --- /dev/null +++ b/prompt_testing_README.md @@ -0,0 +1,202 @@ +# CodeDog Prompt Testing Tools + +这个目录包含两个用于测试代码评审提示(prompts)的工具: + +1. `test_prompt.py` - 单个文件或差异的提示测试工具 +2. `batch_test_prompts.py` - 从GitLab批量获取差异并测试提示的工具 + +## 环境设置 + +确保您已经安装了所需的依赖: + +```bash +pip install python-gitlab python-dotenv langchain-openai +``` + +并创建了一个包含必要环境变量的`.env`文件: + +``` +# OpenAI API配置 +OPENAI_API_KEY=your_openai_api_key + +# GitLab配置 +GITLAB_URL=https://gitlab.com # 或您的GitLab实例URL +GITLAB_TOKEN=your_gitlab_token +``` + +## 单个文件测试工具 (test_prompt.py) + +这个工具允许您测试单个文件或差异的代码评审提示。 + +### 基本用法 + +1. **评估文件**: + ```bash + python test_prompt.py --file example.py + ``` + +2. **评估差异文件**: + ```bash + python test_prompt.py --diff example.diff + ``` + +3. **使用特定模型**: + ```bash + python test_prompt.py --diff example.diff --model gpt-4 + ``` + +4. **使用自定义系统提示**: + ```bash + python test_prompt.py --diff example.diff --system-prompt custom_system_prompt.txt + ``` + +5. **输出为Markdown格式**: + ```bash + python test_prompt.py --diff example.diff --format markdown + ``` + +6. 
**保存输出到文件**: + ```bash + python test_prompt.py --diff example.diff --output results.json + ``` + +### 命令行选项 + +``` +usage: test_prompt.py [-h] (--file FILE | --diff DIFF) [--model MODEL] + [--system-prompt SYSTEM_PROMPT] [--output OUTPUT] + [--format {json,markdown}] + +Test code review prompts + +options: + -h, --help show this help message and exit + --file FILE Path to the file to evaluate + --diff DIFF Path to the diff file to evaluate + --model MODEL Model to use for evaluation (default: gpt-3.5-turbo) + --system-prompt SYSTEM_PROMPT + Path to a file containing a custom system prompt + --output OUTPUT Path to save the output (default: stdout) + --format {json,markdown} + Output format (default: json) +``` + +## 批量测试工具 (batch_test_prompts.py) + +这个工具允许您从GitLab获取多个差异并批量测试代码评审提示。 + +### 基本用法 + +1. **从GitLab获取MR并测试**: + ```bash + python batch_test_prompts.py --project your_group/your_project + ``` + +2. **指定文件类型**: + ```bash + python batch_test_prompts.py --project your_group/your_project --include .py,.js --exclude .md,.txt + ``` + +3. **测试多个模型**: + ```bash + python batch_test_prompts.py --project your_group/your_project --models gpt-3.5-turbo,gpt-4 + ``` + +4. **测试多个系统提示**: + ```bash + python batch_test_prompts.py --project your_group/your_project --system-prompts prompt1.txt,prompt2.txt + ``` + +5. 
**自定义输出目录和格式**: + ```bash + python batch_test_prompts.py --project your_group/your_project --output-dir my_tests --format markdown + ``` + +### 命令行选项 + +``` +usage: batch_test_prompts.py [-h] --project PROJECT [--mr-count MR_COUNT] + [--max-files MAX_FILES] [--include INCLUDE] + [--exclude EXCLUDE] + [--state {merged,opened,closed}] [--models MODELS] + [--system-prompts SYSTEM_PROMPTS] + [--output-dir OUTPUT_DIR] + [--format {json,markdown}] + +Batch test code review prompts on GitLab MRs + +options: + -h, --help show this help message and exit + --project PROJECT GitLab project ID or path + --mr-count MR_COUNT Number of MRs to fetch (default: 5) + --max-files MAX_FILES + Maximum files per MR (default: 3) + --include INCLUDE Included file extensions, comma separated, e.g. .py,.js + --exclude EXCLUDE Excluded file extensions, comma separated, e.g. .md,.txt + --state {merged,opened,closed} + MR state to fetch (default: merged) + --models MODELS Models to test, comma separated (default: gpt-3.5-turbo) + --system-prompts SYSTEM_PROMPTS + Paths to system prompt files, comma separated + --output-dir OUTPUT_DIR + Output directory (default: prompt_tests) + --format {json,markdown} + Output format (default: json) +``` + +## 输出结果 + +### 单个文件测试 + +单个文件测试工具的输出是一个JSON或Markdown文件,包含代码评审结果。 + +JSON格式示例: +```json +{ + "readability": 8, + "efficiency": 7, + "security": 6, + "structure": 7, + "error_handling": 5, + "documentation": 9, + "code_style": 8, + "overall_score": 7.1, + "effective_code_lines": 15, + "non_effective_code_lines": 5, + "estimated_hours": 1.5, + "comments": "详细分析..." +} +``` + +### 批量测试 + +批量测试工具的输出是一个目录结构,包含: + +1. `diffs/` - 保存从GitLab获取的差异文件 +2. `results/` - 保存测试结果 + - 每个模型一个子目录 + - 每个子目录包含每个差异文件的评估结果 + - `summary.json` - 包含该模型所有测试的汇总 +3. `comparison_report.md` - 如果测试了多个模型或提示,则生成比较报告 + +## 自定义系统提示 + +您可以创建自定义系统提示文件,用于测试不同的提示效果。系统提示文件是一个纯文本文件,包含您想要使用的系统提示。 + +示例:`custom_system_prompt.txt` + +## 提示优化建议 + +1. **明确角色和目标**:明确定义代码评审员的角色和评审目标。 + +2. 
**详细的评估维度**:为每个评估维度提供详细的评估标准。 + +3. **区分有效和无效代码修改**:明确区分哪些修改是有效的,哪些是无效的。 + +4. **工作时间估算指南**:提供详细的工作时间估算指南。 + +5. **结构化输出格式**:明确定义输出格式,确保一致性。 + +6. **语言特定考虑因素**:为不同的编程语言提供特定的考虑因素。 + +通过使用这些工具,您可以快速测试和优化代码评审提示,找到最适合您需求的提示。 diff --git a/test_codedog_prompt.py b/test_codedog_prompt.py new file mode 100644 index 0000000..6b821c1 --- /dev/null +++ b/test_codedog_prompt.py @@ -0,0 +1,600 @@ +#!/usr/bin/env python +""" +Prompt Testing Tool for CodeDog using the original CodeDog prompts + +This tool allows you to test code review prompts by providing a diff or code snippet +and getting the evaluation results using CodeDog's original prompts. +""" + +import argparse +import asyncio +import json +import os +import sys +import time +from typing import Dict, Any, Optional + +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_openai import ChatOpenAI + +# 设置日志记录 +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# 加载环境变量 +from dotenv import load_dotenv +load_dotenv() + +# 导入必要的模块 +from codedog.utils.langchain_utils import load_model_by_name + +# CodeDog的CODE_SUGGESTION提示模板 +CODE_SUGGESTION = """Act as a senior code review expert with deep knowledge of industry standards and best practices for programming languages. I will give a code diff content. +Perform a comprehensive review of the code changes, conduct static analysis, and provide a detailed evaluation with specific scores based on the detailed criteria below. + +## Review Requirements: +1. Provide a brief summary of the code's intended functionality and primary objectives +2. Conduct a thorough static analysis of code logic, performance, and security +3. Evaluate adherence to language-specific coding standards and best practices +4. Identify specific issues, vulnerabilities, and improvement opportunities +5. Score the code in each dimension using the detailed scoring criteria +6. 
Provide specific, actionable suggestions for improvement + +## Language-Specific Standards: +{language} code should follow these standards: +- Use clear, descriptive variable and function names +- Follow consistent formatting and indentation +- Include appropriate comments and documentation +- Handle errors and edge cases properly +- Optimize for performance and resource usage +- Follow security best practices +- Adhere to language-specific conventions and idioms + +### SCORES: +- Readability: [score] /10 +- Efficiency & Performance: [score] /10 +- Security: [score] /10 +- Structure & Design: [score] /10 +- Error Handling: [score] /10 +- Documentation & Comments: [score] /10 +- Code Style: [score] /10 +- Final Overall Score: [calculated_overall_score] /10 + +Replace [score] with your actual numeric scores (e.g., 8.5). + +Here's the code diff from file {name}: +```{language} +{content} +``` + +In addition to the code evaluation, please also estimate how many effective working hours an experienced programmer (5-10+ years) would need to complete these code changes. Include this estimate in your JSON response as 'estimated_hours'. + +Please also analyze the code changes to determine how many lines are effective code changes (logic, functionality, algorithm changes) versus non-effective code changes (formatting, whitespace, comments, variable renaming without behavior changes). Include these counts in your JSON response as 'effective_code_lines' and 'non_effective_code_lines'. 
+""" + + +def sanitize_content(content: str) -> str: + """清理代码内容,移除异常字符""" + # 移除不可打印字符,但保留换行符和制表符 + sanitized = ''.join(c for c in content if c.isprintable() or c in ['\n', '\t', '\r']) + return sanitized + + +def guess_language(file_path: str) -> str: + """根据文件扩展名猜测编程语言""" + import os + file_ext = os.path.splitext(file_path)[1].lower() + + # 文件扩展名到语言的映射 + ext_to_lang = { + # Python + '.py': 'Python', + '.pyx': 'Python', + '.pyi': 'Python', + '.ipynb': 'Python', + + # JavaScript/TypeScript + '.js': 'JavaScript', + '.jsx': 'JavaScript', + '.ts': 'TypeScript', + '.tsx': 'TypeScript', + '.mjs': 'JavaScript', + + # Java + '.java': 'Java', + '.jar': 'Java', + '.class': 'Java', + + # C/C++ + '.c': 'C', + '.cpp': 'C++', + '.h': 'C', + '.hpp': 'C++', + + # C# + '.cs': 'C#', + + # Go + '.go': 'Go', + + # Ruby + '.rb': 'Ruby', + + # PHP + '.php': 'PHP', + + # Swift + '.swift': 'Swift', + + # Kotlin + '.kt': 'Kotlin', + '.kts': 'Kotlin', + + # Rust + '.rs': 'Rust', + + # HTML/CSS + '.html': 'HTML', + '.htm': 'HTML', + '.css': 'CSS', + '.scss': 'SCSS', + '.sass': 'SASS', + '.less': 'LESS', + + # Shell + '.sh': 'Shell', + '.bash': 'Shell', + '.zsh': 'Shell', + + # SQL + '.sql': 'SQL', + + # Markdown + '.md': 'Markdown', + '.markdown': 'Markdown', + + # JSON + '.json': 'JSON', + + # YAML + '.yml': 'YAML', + '.yaml': 'YAML', + + # XML + '.xml': 'XML', + + # Other + '.txt': 'Text', + '.csv': 'CSV', + } + + return ext_to_lang.get(file_ext, 'Unknown') + + +async def test_codedog_prompt( + file_path: str, + content: str, + model_name: str = "gpt-3.5-turbo", + output_format: str = "json" +) -> Dict[str, Any]: + """ + 使用CodeDog的提示测试代码评审 + + Args: + file_path: 文件路径 + content: 代码内容或差异内容 + model_name: 模型名称 + output_format: 输出格式,可选值为json或markdown + + Returns: + Dict[str, Any]: 评估结果 + """ + # 加载模型 + model = load_model_by_name(model_name) + + # 清理代码内容 + sanitized_content = sanitize_content(content) + + # 猜测语言 + language = guess_language(file_path) + + # 使用CodeDog的CODE_SUGGESTION提示 + prompt 
= CODE_SUGGESTION.format( + language=language, + name=file_path, + content=sanitized_content + ) + + # 创建消息 + messages = [ + HumanMessage(content=prompt) + ] + + # 调用模型 + print(f"Sending request to {model_name}...") + start_time = time.time() + response = await model.agenerate(messages=[messages]) + end_time = time.time() + print(f"Response received in {end_time - start_time:.2f} seconds") + + # 获取响应文本 + generated_text = response.generations[0][0].text + + # 提取JSON + try: + # 尝试直接解析JSON + result = json.loads(generated_text) + print("Successfully parsed JSON response") + except json.JSONDecodeError: + # 如果直接解析失败,尝试提取JSON部分 + import re + json_match = re.search(r'```json\s*(.*?)\s*```', generated_text, re.DOTALL) + if json_match: + try: + result = json.loads(json_match.group(1)) + print("Successfully extracted and parsed JSON from code block") + except json.JSONDecodeError: + print("Failed to parse JSON from code block") + result = { + "error": "Failed to parse JSON response", + "raw_response": generated_text[:1000] + ("..." if len(generated_text) > 1000 else "") + } + else: + # 尝试提取评分部分 + scores_section = re.search(r'### SCORES:\s*\n([\s\S]*?)(?:\n\n|\Z)', generated_text) + if scores_section: + scores_text = scores_section.group(1) + scores_dict = {} + + # 提取各个评分 + for line in scores_text.split('\n'): + match = re.search(r'- ([\w\s&]+):\s*(\d+(\.\d+)?)\s*/10', line) + if match: + key = match.group(1).strip().lower().replace(' & ', '_').replace(' ', '_') + value = float(match.group(2)) + scores_dict[key] = value + + # 提取评论 + comments_match = re.search(r'(?:Analysis|Comments|Suggestions):([\s\S]*?)(?=###|\Z)', generated_text, re.IGNORECASE) + if comments_match: + scores_dict["comments"] = comments_match.group(1).strip() + else: + scores_dict["comments"] = "No detailed comments provided." 
+ + # 提取工作时间估算 + hours_match = re.search(r'(?:estimated_hours|working hours|time estimate).*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if hours_match: + scores_dict["estimated_hours"] = float(hours_match.group(1)) + else: + scores_dict["estimated_hours"] = 0.0 + + # 提取有效代码行数 + effective_match = re.search(r'(?:effective_code_lines|effective lines).*?(\d+)', generated_text, re.IGNORECASE) + if effective_match: + scores_dict["effective_code_lines"] = int(effective_match.group(1)) + else: + scores_dict["effective_code_lines"] = 0 + + # 提取非有效代码行数 + non_effective_match = re.search(r'(?:non_effective_code_lines|non-effective lines).*?(\d+)', generated_text, re.IGNORECASE) + if non_effective_match: + scores_dict["non_effective_code_lines"] = int(non_effective_match.group(1)) + else: + scores_dict["non_effective_code_lines"] = 0 + + result = scores_dict + print("Successfully extracted scores from text") + else: + # 如果没有找到评分部分,尝试直接从文本中提取信息 + result = { + "comments": generated_text, + "summary": "See detailed analysis in comments." 
+ } + + # 尝试提取评分 + readability_match = re.search(r'readability.*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if readability_match: + result["readability"] = float(readability_match.group(1)) + + efficiency_match = re.search(r'efficiency.*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if efficiency_match: + result["efficiency"] = float(efficiency_match.group(1)) + + security_match = re.search(r'security.*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if security_match: + result["security"] = float(security_match.group(1)) + + structure_match = re.search(r'structure.*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if structure_match: + result["structure"] = float(structure_match.group(1)) + + error_handling_match = re.search(r'error handling.*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if error_handling_match: + result["error_handling"] = float(error_handling_match.group(1)) + + documentation_match = re.search(r'documentation.*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if documentation_match: + result["documentation"] = float(documentation_match.group(1)) + + code_style_match = re.search(r'code style.*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if code_style_match: + result["code_style"] = float(code_style_match.group(1)) + + overall_match = re.search(r'overall.*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if overall_match: + result["overall_score"] = float(overall_match.group(1)) + + # 提取工作时间估算 + hours_match = re.search(r'(?:estimated_hours|working hours|time estimate).*?(\d+(\.\d+)?)', generated_text, re.IGNORECASE) + if hours_match: + result["estimated_hours"] = float(hours_match.group(1)) + else: + # 尝试找到任何数字后跟"hours"或"hour"的模式 + hours_match = re.search(r'(\d+(\.\d+)?)\s*(?:hours?|hrs?)', generated_text, re.IGNORECASE) + if hours_match: + result["estimated_hours"] = float(hours_match.group(1)) + else: + result["estimated_hours"] = 0.0 + + # 提取有效代码行数 + effective_match = re.search(r'(?:effective_code_lines|effective 
lines).*?(\d+)', generated_text, re.IGNORECASE) + if effective_match: + result["effective_code_lines"] = int(effective_match.group(1)) + else: + # 尝试找到任何数字后跟"effective"的模式 + effective_match = re.search(r'(\d+)\s*(?:effective)', generated_text, re.IGNORECASE) + if effective_match: + result["effective_code_lines"] = int(effective_match.group(1)) + else: + result["effective_code_lines"] = 0 + + # 提取非有效代码行数 + non_effective_match = re.search(r'(?:non_effective_code_lines|non-effective lines).*?(\d+)', generated_text, re.IGNORECASE) + if non_effective_match: + result["non_effective_code_lines"] = int(non_effective_match.group(1)) + else: + # 尝试找到任何数字后跟"non-effective"的模式 + non_effective_match = re.search(r'(\d+)\s*(?:non-effective)', generated_text, re.IGNORECASE) + if non_effective_match: + result["non_effective_code_lines"] = int(non_effective_match.group(1)) + else: + result["non_effective_code_lines"] = 0 + + print("Extracted information directly from text") + + # 根据输出格式返回结果 + if output_format == "json": + return result + else: # markdown + # 将结果转换为Markdown格式 + markdown = f"# Code Review for {file_path}\n\n" + markdown += f"## Scores\n\n" + + # 检查是否有static_analysis字段(CodeDog格式) + if 'static_analysis' in result: + static_analysis = result['static_analysis'] + markdown += f"- **Readability**: {static_analysis.get('readability', 'N/A')}/10\n" + markdown += f"- **Efficiency & Performance**: {static_analysis.get('efficiency_performance', 'N/A')}/10\n" + markdown += f"- **Security**: {static_analysis.get('security', 'N/A')}/10\n" + markdown += f"- **Structure & Design**: {static_analysis.get('structure_design', 'N/A')}/10\n" + markdown += f"- **Error Handling**: {static_analysis.get('error_handling', 'N/A')}/10\n" + markdown += f"- **Documentation & Comments**: {static_analysis.get('documentation_comments', 'N/A')}/10\n" + markdown += f"- **Code Style**: {static_analysis.get('code_style', 'N/A')}/10\n" + markdown += f"- **Overall Score**: 
{static_analysis.get('overall_score', 'N/A')}/10\n\n" + # 检查是否有scores字段(DeepSeek格式) + elif 'scores' in result: + scores = result['scores'] + markdown += f"- **Readability**: {scores.get('Readability', 'N/A')}/10\n" + markdown += f"- **Efficiency & Performance**: {scores.get('Efficiency_Performance', 'N/A')}/10\n" + markdown += f"- **Security**: {scores.get('Security', 'N/A')}/10\n" + markdown += f"- **Structure & Design**: {scores.get('Structure_Design', 'N/A')}/10\n" + markdown += f"- **Error Handling**: {scores.get('Error_Handling', 'N/A')}/10\n" + markdown += f"- **Documentation & Comments**: {scores.get('Documentation_Comments', 'N/A')}/10\n" + markdown += f"- **Code Style**: {scores.get('Code_Style', 'N/A')}/10\n" + markdown += f"- **Overall Score**: {scores.get('Final_Overall_Score', 'N/A')}/10\n\n" + # 检查是否有code_review.scores字段(DeepSeek格式) + elif 'code_review' in result and 'scores' in result['code_review']: + scores = result['code_review']['scores'] + markdown += f"- **Readability**: {scores.get('Readability', 'N/A')}/10\n" + markdown += f"- **Efficiency & Performance**: {scores.get('Efficiency_Performance', 'N/A')}/10\n" + markdown += f"- **Security**: {scores.get('Security', 'N/A')}/10\n" + markdown += f"- **Structure & Design**: {scores.get('Structure_Design', 'N/A')}/10\n" + markdown += f"- **Error Handling**: {scores.get('Error_Handling', 'N/A')}/10\n" + markdown += f"- **Documentation & Comments**: {scores.get('Documentation_Comments', 'N/A')}/10\n" + markdown += f"- **Code Style**: {scores.get('Code_Style', 'N/A')}/10\n" + markdown += f"- **Overall Score**: {scores.get('Final_Overall_Score', 'N/A')}/10\n\n" + else: + # 直接从结果中获取评分 + markdown += f"- **Readability**: {result.get('readability', 'N/A')}/10\n" + markdown += f"- **Efficiency & Performance**: {result.get('efficiency_&_performance', result.get('efficiency', 'N/A'))}/10\n" + markdown += f"- **Security**: {result.get('security', 'N/A')}/10\n" + markdown += f"- **Structure & Design**: 
{result.get('structure_&_design', result.get('structure', 'N/A'))}/10\n" + markdown += f"- **Error Handling**: {result.get('error_handling', 'N/A')}/10\n" + markdown += f"- **Documentation & Comments**: {result.get('documentation_&_comments', result.get('documentation', 'N/A'))}/10\n" + markdown += f"- **Code Style**: {result.get('code_style', 'N/A')}/10\n" + markdown += f"- **Overall Score**: {result.get('final_overall_score', result.get('overall_score', 'N/A'))}/10\n\n" + + markdown += f"## Code Change Analysis\n\n" + + # 检查是否有time_estimation字段(DeepSeek格式) + if 'time_estimation' in result: + time_estimation = result['time_estimation'] + markdown += f"- **Effective Code Lines**: {time_estimation.get('effective_code_lines', 'N/A')}\n" + markdown += f"- **Non-Effective Code Lines**: {time_estimation.get('non_effective_code_lines', 'N/A')}\n" + markdown += f"- **Estimated Hours**: {time_estimation.get('estimated_hours', 'N/A')}\n\n" + # 检查是否有change_analysis字段(DeepSeek格式) + elif 'change_analysis' in result: + change_analysis = result['change_analysis'] + markdown += f"- **Effective Code Lines**: {change_analysis.get('effective_code_lines', 'N/A')}\n" + markdown += f"- **Non-Effective Code Lines**: {change_analysis.get('non_effective_code_lines', 'N/A')}\n" + markdown += f"- **Estimated Hours**: {change_analysis.get('estimated_hours', 'N/A')}\n\n" + else: + markdown += f"- **Effective Code Lines**: {result.get('effective_code_lines', 'N/A')}\n" + markdown += f"- **Non-Effective Code Lines**: {result.get('non_effective_code_lines', 'N/A')}\n" + markdown += f"- **Estimated Hours**: {result.get('estimated_hours', 'N/A')}\n\n" + + markdown += f"## Detailed Analysis\n\n" + + # 检查是否有DeepSeek格式的字段 + if 'review_summary' in result: + review_summary = result['review_summary'] + markdown += f"**Intended Functionality**: {review_summary.get('intended_functionality', '')}\n\n" + + if 'primary_objectives' in review_summary: + markdown += f"**Primary Objectives**:\n" + for objective 
in review_summary.get('primary_objectives', []): + markdown += f"- {objective}\n" + markdown += "\n" + + # 检查是否有code_review字段(DeepSeek格式) + elif 'code_review' in result: + code_review = result['code_review'] + + if 'summary' in code_review: + markdown += f"**Summary**: {code_review.get('summary', '')}\n\n" + + if 'static_analysis' in code_review: + static_analysis = code_review['static_analysis'] + markdown += f"**Logic**: {static_analysis.get('logic', '')}\n\n" + markdown += f"**Performance**: {static_analysis.get('performance', '')}\n\n" + markdown += f"**Security**: {static_analysis.get('security', '')}\n\n" + + if 'issues_and_improvements' in code_review: + markdown += f"**Issues and Improvements**:\n" + for issue in code_review.get('issues_and_improvements', []): + markdown += f"- {issue}\n" + markdown += "\n" + + if 'actionable_suggestions' in code_review: + markdown += f"**Actionable Suggestions**:\n" + for suggestion in code_review.get('actionable_suggestions', []): + markdown += f"- {suggestion}\n" + markdown += "\n" + + if 'standards_adherence' in code_review: + standards = code_review['standards_adherence'] + markdown += f"**Standards Adherence**:\n" + markdown += f"- **Naming**: {standards.get('naming', '')}\n" + markdown += f"- **Formatting**: {standards.get('formatting', '')}\n" + markdown += f"- **Comments**: {standards.get('comments', '')}\n" + markdown += f"- **Error Handling**: {standards.get('error_handling', '')}\n" + markdown += f"- **Security Practices**: {standards.get('security_practices', '')}\n\n" + + if 'static_analysis' in result: + static_analysis = result['static_analysis'] + markdown += f"**Logic**: {static_analysis.get('logic', '')}\n\n" + markdown += f"**Performance**: {static_analysis.get('performance', '')}\n\n" + markdown += f"**Security**: {static_analysis.get('security', '')}\n\n" + + if 'issues_identified' in static_analysis: + markdown += f"**Issues Identified**:\n" + for issue in static_analysis.get('issues_identified', []): 
+ markdown += f"- {issue}\n" + markdown += "\n" + + if 'improvement_opportunities' in static_analysis: + markdown += f"**Improvement Opportunities**:\n" + for opportunity in static_analysis.get('improvement_opportunities', []): + markdown += f"- {opportunity}\n" + markdown += "\n" + + if 'actionable_suggestions' in result: + markdown += f"**Actionable Suggestions**:\n" + for suggestion in result.get('actionable_suggestions', []): + markdown += f"- {suggestion}\n" + markdown += "\n" + + if 'change_analysis' in result and 'breakdown' in result['change_analysis']: + breakdown = result['change_analysis']['breakdown'] + + if 'functional_changes' in breakdown: + markdown += f"**Functional Changes**:\n" + for change in breakdown.get('functional_changes', []): + markdown += f"- {change}\n" + markdown += "\n" + + if 'non_functional_changes' in breakdown: + markdown += f"**Non-Functional Changes**:\n" + for change in breakdown.get('non_functional_changes', []): + markdown += f"- {change}\n" + markdown += "\n" + + # 标准格式的字段 + if 'summary' in result: + markdown += f"**Summary**: {result.get('summary', '')}\n\n" + + if 'suggestions' in result: + markdown += f"**Suggestions**:\n" + for suggestion in result.get('suggestions', []): + markdown += f"- {suggestion}\n" + markdown += "\n" + + markdown += result.get('comments', '') + + return {"markdown": markdown, "raw_result": result} + + +def parse_args(): + """解析命令行参数""" + parser = argparse.ArgumentParser(description="Test CodeDog's code review prompts") + + # 输入选项 + input_group = parser.add_mutually_exclusive_group(required=True) + input_group.add_argument("--file", help="Path to the file to evaluate") + input_group.add_argument("--diff", help="Path to the diff file to evaluate") + + # 模型选项 + parser.add_argument("--model", default="gpt-3.5-turbo", help="Model to use for evaluation (default: gpt-3.5-turbo)") + + # 输出选项 + parser.add_argument("--output", help="Path to save the output (default: stdout)") + 
parser.add_argument("--format", choices=["json", "markdown"], default="json", help="Output format (default: json)") + + return parser.parse_args() + + +async def main(): + """主函数""" + args = parse_args() + + # 读取输入内容 + if args.file: + file_path = args.file + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + else: # args.diff + diff_path = args.diff + with open(diff_path, "r", encoding="utf-8") as f: + content = f.read() + # 从diff文件名中提取原始文件名 + import os + file_path = os.path.basename(diff_path) + if file_path.endswith(".diff"): + file_path = file_path[:-5] + + # 测试提示 + result = await test_codedog_prompt( + file_path=file_path, + content=content, + model_name=args.model, + output_format=args.format + ) + + # 输出结果 + if args.output: + with open(args.output, "w", encoding="utf-8") as f: + if args.format == "json": + json.dump(result, f, indent=2, ensure_ascii=False) + else: # markdown + f.write(result["markdown"]) + print(f"Output saved to {args.output}") + else: + if args.format == "json": + print(json.dumps(result, indent=2, ensure_ascii=False)) + else: # markdown + print(result["markdown"]) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/test_diffs/README.md b/test_diffs/README.md new file mode 100644 index 0000000..08774e8 --- /dev/null +++ b/test_diffs/README.md @@ -0,0 +1,78 @@ +# 代码评审提示测试用例 + +这个目录包含了用于测试代码评审提示(prompts)效果的diff文件集合。这些文件被分为三类,以便测试提示在不同质量代码上的表现。 + +## 目录结构 + +``` +test_diffs/ +├── high_score/ # 高质量代码变更,应该获得高分 +├── low_score/ # 低质量代码变更,应该获得低分 +└── mixed_score/ # 混合质量代码变更,应该获得中等分数 +``` + +## 高分测试用例 (high_score/) + +这些diff文件展示了高质量的代码变更,包括良好的文档、错误处理、性能优化和安全实践。它们应该在代码评审中获得高分。 + +1. **python_feature_enhancement.diff** - Python文件的功能增强,添加了类型提示、错误处理和新功能 +2. **javascript_bug_fix.diff** - JavaScript文件的bug修复,改进了错误处理和安全性 +3. **java_refactoring.diff** - Java文件的代码重构,改进了架构和设计模式 +4. **css_optimization.diff** - CSS文件的样式优化,使用了CSS变量和现代布局技术 +5. 
**sql_query_optimization.diff** - SQL查询优化,提高了查询性能和增加了分析功能 + +## 低分测试用例 (low_score/) + +这些diff文件展示了低质量的代码变更,包含各种问题,如安全漏洞、性能问题、错误处理不足和可读性差。它们应该在代码评审中获得低分。 + +1. **python_security_issues.diff** - 包含多个安全漏洞的Python代码,如SQL注入、命令注入和硬编码凭据 +2. **javascript_performance_issues.diff** - 包含性能问题的JavaScript代码,如内存泄漏、低效循环和DOM操作 +3. **java_error_handling_issues.diff** - 错误处理不足的Java代码,如异常吞没、资源泄漏和线程不安全 +4. **cpp_readability_issues.diff** - 可读性差的C++代码,包含混乱的格式、神秘的变量名和缺乏注释 +5. **sql_structure_issues.diff** - 结构混乱的SQL代码,包含不一致的格式、低效查询和缺乏注释 + +## 使用方法 + +### 单个文件测试 + +使用`test_prompt.py`工具测试单个diff文件: + +```bash +python test_prompt.py --diff test_diffs/high_score/python_feature_enhancement.diff --model gpt-3.5-turbo +``` + +### 批量测试 + +使用`batch_test_prompts.py`工具批量测试多个diff文件: + +```bash +# 测试所有高分用例 +for diff_file in test_diffs/high_score/*.diff; do + python test_prompt.py --diff "$diff_file" --model gpt-3.5-turbo --output "${diff_file%.diff}_result.json" +done + +# 测试所有低分用例 +for diff_file in test_diffs/low_score/*.diff; do + python test_prompt.py --diff "$diff_file" --model gpt-3.5-turbo --output "${diff_file%.diff}_result.json" +done +``` + +### 比较不同模型 + +比较不同模型在同一组测试用例上的表现: + +```bash +python batch_test_prompts.py --project your_group/your_project --models gpt-3.5-turbo,gpt-4,deepseek +``` + +## 评估标准 + +使用这些测试用例评估代码评审提示时,应关注以下方面: + +1. **分数准确性** - 高分用例应获得高分,低分用例应获得低分 +2. **问题识别** - 是否正确识别代码中的问题和优点 +3. **有效代码识别** - 是否正确区分有效和无效代码修改 +4. **工作时间估算** - 工作时间估算是否合理 +5. 
**建议质量** - 提供的改进建议是否具体、可行 + +通过比较不同提示和模型在这些测试用例上的表现,可以找到最适合您需求的组合。 diff --git a/test_diffs/high_score/css_optimization.diff b/test_diffs/high_score/css_optimization.diff new file mode 100644 index 0000000..568b0d8 --- /dev/null +++ b/test_diffs/high_score/css_optimization.diff @@ -0,0 +1,323 @@ +diff --git a/src/styles/main.css b/src/styles/main.css +index abcdef0..1234567 100644 +--- a/src/styles/main.css ++++ b/src/styles/main.css +@@ -1,72 +1,105 @@ +-/* Main Styles */ +-body { +- font-family: Arial, sans-serif; +- margin: 0; +- padding: 0; +- background-color: #f5f5f5; +- color: #333; +-} ++:root { ++ /* Color variables */ ++ --primary-color: #3498db; ++ --secondary-color: #2ecc71; ++ --accent-color: #e74c3c; ++ --text-color: #333333; ++ --text-light: #666666; ++ --bg-color: #f5f5f5; ++ --bg-dark: #e0e0e0; ++ --white: #ffffff; ++ --black: #000000; ++ ++ /* Spacing variables */ ++ --spacing-xs: 0.25rem; ++ --spacing-sm: 0.5rem; ++ --spacing-md: 1rem; ++ --spacing-lg: 1.5rem; ++ --spacing-xl: 2rem; ++ ++ /* Font variables */ ++ --font-family: 'Roboto', Arial, sans-serif; ++ --font-size-sm: 0.875rem; ++ --font-size-md: 1rem; ++ --font-size-lg: 1.25rem; ++ --font-size-xl: 1.5rem; ++ ++ /* Border variables */ ++ --border-radius-sm: 4px; ++ --border-radius-md: 8px; ++ --border-radius-lg: 12px; ++ ++ /* Shadow variables */ ++ --shadow-sm: 0 1px 3px rgba(0, 0, 0, 0.12); ++ --shadow-md: 0 4px 6px rgba(0, 0, 0, 0.1); ++ --shadow-lg: 0 10px 15px rgba(0, 0, 0, 0.1); ++} ++ ++/* Base styles */ ++* { ++ box-sizing: border-box; ++ margin: 0; ++ padding: 0; ++} ++ ++body { ++ font-family: var(--font-family); ++ background-color: var(--bg-color); ++ color: var(--text-color); ++ line-height: 1.6; ++} + + .container { +- max-width: 1200px; ++ width: 100%; ++ max-width: 1280px; + margin: 0 auto; +- padding: 20px; ++ padding: var(--spacing-md); + } + +-/* Header Styles */ ++/* Header styles */ + header { +- background-color: #3498db; +- color: white; +- padding: 20px 0; +- 
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); ++ background-color: var(--primary-color); ++ color: var(--white); ++ padding: var(--spacing-md) 0; ++ box-shadow: var(--shadow-sm); ++ position: sticky; ++ top: 0; ++ z-index: 100; + } + + .logo { +- font-size: 24px; ++ font-size: var(--font-size-xl); + font-weight: bold; ++ text-transform: uppercase; ++ letter-spacing: 1px; + } + +-/* Navigation Styles */ ++/* Navigation styles */ + nav ul { +- list-style: none; ++ list-style-type: none; + display: flex; +- margin: 0; +- padding: 0; ++ gap: var(--spacing-md); + } + + nav li { +- margin-right: 20px; ++ position: relative; + } + + nav a { +- color: white; ++ color: var(--white); + text-decoration: none; +- font-weight: bold; ++ font-weight: 500; ++ transition: color 0.3s ease; ++ padding: var(--spacing-sm) var(--spacing-md); ++ border-radius: var(--border-radius-sm); + } + + nav a:hover { +- text-decoration: underline; ++ background-color: rgba(255, 255, 255, 0.1); + } + +-/* Button Styles */ +-.btn { +- display: inline-block; +- padding: 10px 20px; +- background-color: #3498db; +- color: white; +- border: none; +- border-radius: 4px; +- cursor: pointer; +- text-decoration: none; +-} +- +-.btn:hover { +- background-color: #2980b9; +-} +- +-.btn-primary { +- background-color: #3498db; +-} +- +-.btn-secondary { +- background-color: #2ecc71; +-} +- +-.btn-danger { +- background-color: #e74c3c; ++nav a.active { ++ font-weight: 700; ++ background-color: rgba(255, 255, 255, 0.2); + } + +-/* Card Styles */ ++/* Card component */ + .card { +- background-color: white; +- border-radius: 4px; +- box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); +- padding: 20px; +- margin-bottom: 20px; ++ background-color: var(--white); ++ border-radius: var(--border-radius-md); ++ box-shadow: var(--shadow-md); ++ padding: var(--spacing-lg); ++ margin-bottom: var(--spacing-lg); ++ transition: transform 0.3s ease, box-shadow 0.3s ease; ++} ++ ++.card:hover { ++ transform: translateY(-5px); ++ box-shadow: 
var(--shadow-lg); + } + + .card-title { +- font-size: 18px; ++ font-size: var(--font-size-lg); + font-weight: bold; +- margin-bottom: 10px; ++ margin-bottom: var(--spacing-sm); ++ color: var(--text-color); + } + + .card-content { +- margin-bottom: 15px; ++ margin-bottom: var(--spacing-md); ++ color: var(--text-light); + } + +-/* Form Styles */ +-.form-group { +- margin-bottom: 15px; ++/* Button component */ ++.btn { ++ display: inline-block; ++ padding: var(--spacing-sm) var(--spacing-lg); ++ background-color: var(--primary-color); ++ color: var(--white); ++ border: none; ++ border-radius: var(--border-radius-sm); ++ cursor: pointer; ++ text-decoration: none; ++ font-weight: 500; ++ text-align: center; ++ transition: background-color 0.3s ease, transform 0.2s ease; ++ user-select: none; + } + +-label { +- display: block; +- margin-bottom: 5px; +- font-weight: bold; ++.btn:hover { ++ background-color: darken(var(--primary-color), 10%); ++ transform: translateY(-2px); + } + +-input[type="text"], +-input[type="email"], +-input[type="password"], +-textarea { +- width: 100%; +- padding: 10px; +- border: 1px solid #ddd; +- border-radius: 4px; ++.btn:active { ++ transform: translateY(0); + } + +-/* Footer Styles */ +-footer { +- background-color: #333; +- color: white; +- padding: 20px 0; +- text-align: center; +- margin-top: 40px; ++.btn-primary { ++ background-color: var(--primary-color); + } + +-/* Responsive Styles */ +-@media (max-width: 768px) { +- nav ul { +- flex-direction: column; +- } +- +- nav li { +- margin-right: 0; +- margin-bottom: 10px; +- } ++.btn-secondary { ++ background-color: var(--secondary-color); ++} ++ ++.btn-accent { ++ background-color: var(--accent-color); ++} ++ ++.btn-outline { ++ background-color: transparent; ++ border: 2px solid var(--primary-color); ++ color: var(--primary-color); ++} ++ ++.btn-outline:hover { ++ background-color: var(--primary-color); ++ color: var(--white); ++} ++ ++/* Utility classes */ ++.text-center { text-align: 
center; } ++.text-right { text-align: right; } ++.text-left { text-align: left; } ++ ++.mt-1 { margin-top: var(--spacing-sm); } ++.mt-2 { margin-top: var(--spacing-md); } ++.mt-3 { margin-top: var(--spacing-lg); } ++.mt-4 { margin-top: var(--spacing-xl); } ++ ++.mb-1 { margin-bottom: var(--spacing-sm); } ++.mb-2 { margin-bottom: var(--spacing-md); } ++.mb-3 { margin-bottom: var(--spacing-lg); } ++.mb-4 { margin-bottom: var(--spacing-xl); } ++ ++.p-1 { padding: var(--spacing-sm); } ++.p-2 { padding: var(--spacing-md); } ++.p-3 { padding: var(--spacing-lg); } ++.p-4 { padding: var(--spacing-xl); } ++ ++/* Media queries */ ++@media screen and (max-width: 768px) { ++ :root { ++ --spacing-lg: 1.25rem; ++ --spacing-xl: 1.75rem; ++ } ++ ++ nav ul { ++ flex-direction: column; ++ gap: var(--spacing-sm); ++ } ++ ++ .container { ++ padding: var(--spacing-sm); ++ } ++} ++ ++@media screen and (max-width: 480px) { ++ :root { ++ --font-size-lg: 1.125rem; ++ --font-size-xl: 1.375rem; ++ } ++ ++ .card { ++ padding: var(--spacing-md); ++ } + } diff --git a/test_diffs/high_score/java_refactoring.diff b/test_diffs/high_score/java_refactoring.diff new file mode 100644 index 0000000..db56ba5 --- /dev/null +++ b/test_diffs/high_score/java_refactoring.diff @@ -0,0 +1,237 @@ +diff --git a/src/main/java/com/example/service/OrderService.java b/src/main/java/com/example/service/OrderService.java +index 1122334..5566778 100644 +--- a/src/main/java/com/example/service/OrderService.java ++++ b/src/main/java/com/example/service/OrderService.java +@@ -1,62 +1,118 @@ + package com.example.service; + +-import java.util.List; +-import java.util.Optional; +- ++import com.example.exception.OrderNotFoundException; ++import com.example.exception.PaymentFailedException; ++import com.example.model.Customer; + import com.example.model.Order; ++import com.example.model.OrderItem; ++import com.example.model.OrderStatus; ++import com.example.model.Payment; ++import com.example.model.Product; + import 
com.example.repository.OrderRepository; ++import com.example.repository.ProductRepository; ++import com.example.service.notification.NotificationService; ++import com.example.service.payment.PaymentService; ++import lombok.RequiredArgsConstructor; ++import lombok.extern.slf4j.Slf4j; + import org.springframework.beans.factory.annotation.Autowired; + import org.springframework.stereotype.Service; ++import org.springframework.transaction.annotation.Transactional; ++ ++import java.math.BigDecimal; ++import java.time.LocalDateTime; ++import java.util.List; ++import java.util.Optional; ++import java.util.stream.Collectors; + + /** +- * Service for handling orders ++ * Service for handling order operations including creation, processing, and management. ++ * This service coordinates between repositories and other services to fulfill order operations. + */ + @Service ++@RequiredArgsConstructor ++@Slf4j + public class OrderService { + +- @Autowired +- private OrderRepository orderRepository; ++ private final OrderRepository orderRepository; ++ private final ProductRepository productRepository; ++ private final PaymentService paymentService; ++ private final NotificationService notificationService; + + /** +- * Get all orders +- * @return List of orders ++ * Retrieves all orders in the system. ++ * ++ * @return List of all orders + */ + public List getAllOrders() { ++ log.debug("Retrieving all orders"); + return orderRepository.findAll(); + } + + /** +- * Get order by ID +- * @param id Order ID +- * @return Optional containing order if found ++ * Retrieves an order by its unique identifier. 
++ * ++ * @param id The order ID to look up ++ * @return The order if found ++ * @throws OrderNotFoundException if the order does not exist + */ +- public Optional getOrderById(Long id) { +- return orderRepository.findById(id); ++ public Order getOrderById(Long id) { ++ log.debug("Retrieving order with ID: {}", id); ++ return orderRepository.findById(id) ++ .orElseThrow(() -> new OrderNotFoundException("Order not found with ID: " + id)); + } + + /** +- * Create a new order +- * @param order Order to create +- * @return Created order ++ * Creates a new order for a customer with the specified items. ++ * Validates product availability and calculates the total price. ++ * ++ * @param customer The customer placing the order ++ * @param items List of order items ++ * @return The created order + */ +- public Order createOrder(Order order) { +- return orderRepository.save(order); ++ @Transactional ++ public Order createOrder(Customer customer, List items) { ++ log.info("Creating new order for customer: {}", customer.getId()); ++ ++ // Validate all products exist and are in stock ++ validateOrderItems(items); ++ ++ // Calculate order total ++ BigDecimal total = calculateOrderTotal(items); ++ ++ // Create new order ++ Order order = Order.builder() ++ .customer(customer) ++ .items(items) ++ .status(OrderStatus.PENDING) ++ .total(total) ++ .createdAt(LocalDateTime.now()) ++ .build(); ++ ++ // Set order reference in items ++ items.forEach(item -> item.setOrder(order)); ++ ++ // Save order ++ Order savedOrder = orderRepository.save(order); ++ ++ // Update product inventory ++ updateProductInventory(items); ++ ++ // Send notification ++ notificationService.sendOrderCreationNotification(customer, savedOrder); ++ ++ log.info("Order created successfully with ID: {}", savedOrder.getId()); ++ return savedOrder; + } + + /** +- * Update an existing order +- * @param id Order ID +- * @param orderDetails Updated order details +- * @return Updated order +- * @throws RuntimeException if 
order not found ++ * Processes payment for an order and updates its status. ++ * ++ * @param orderId The ID of the order to process ++ * @param payment The payment details ++ * @return The updated order with payment information ++ * @throws OrderNotFoundException if the order does not exist ++ * @throws PaymentFailedException if the payment processing fails + */ +- public Order updateOrder(Long id, Order orderDetails) { +- Order order = orderRepository.findById(id) +- .orElseThrow(() -> new RuntimeException("Order not found with id " + id)); +- +- order.setCustomerId(orderDetails.getCustomerId()); +- order.setItems(orderDetails.getItems()); +- order.setTotal(orderDetails.getTotal()); +- order.setStatus(orderDetails.getStatus()); +- +- return orderRepository.save(order); ++ @Transactional ++ public Order processPayment(Long orderId, Payment payment) { ++ log.info("Processing payment for order: {}", orderId); ++ ++ // Get order ++ Order order = getOrderById(orderId); ++ ++ // Validate order status ++ if (order.getStatus() != OrderStatus.PENDING) { ++ log.warn("Cannot process payment for order with status: {}", order.getStatus()); ++ throw new IllegalStateException("Cannot process payment for order with status: " + order.getStatus()); ++ } ++ ++ // Process payment ++ try { ++ Payment processedPayment = paymentService.processPayment(payment, order.getTotal()); ++ ++ // Update order with payment information ++ order.setPayment(processedPayment); ++ order.setStatus(OrderStatus.PAID); ++ order.setUpdatedAt(LocalDateTime.now()); ++ ++ Order updatedOrder = orderRepository.save(order); ++ ++ // Send notification ++ notificationService.sendPaymentConfirmationNotification(order.getCustomer(), updatedOrder); ++ ++ log.info("Payment processed successfully for order: {}", orderId); ++ return updatedOrder; ++ } catch (Exception e) { ++ log.error("Payment processing failed for order: {}", orderId, e); ++ throw new PaymentFailedException("Payment processing failed: " + 
e.getMessage()); ++ } + } + + /** +- * Delete an order +- * @param id Order ID ++ * Validates that all order items reference valid products and that ++ * sufficient inventory is available. ++ * ++ * @param items The order items to validate ++ * @throws IllegalArgumentException if any validation fails + */ +- public void deleteOrder(Long id) { +- orderRepository.deleteById(id); ++ private void validateOrderItems(List items) { ++ for (OrderItem item : items) { ++ Product product = productRepository.findById(item.getProduct().getId()) ++ .orElseThrow(() -> new IllegalArgumentException( ++ "Product not found with ID: " + item.getProduct().getId())); ++ ++ if (product.getStock() < item.getQuantity()) { ++ throw new IllegalArgumentException( ++ "Insufficient stock for product: " + product.getName()); ++ } ++ } ++ } ++ ++ /** ++ * Calculates the total price for all items in the order. ++ * ++ * @param items The order items ++ * @return The calculated total price ++ */ ++ private BigDecimal calculateOrderTotal(List items) { ++ return items.stream() ++ .map(item -> item.getProduct().getPrice().multiply(BigDecimal.valueOf(item.getQuantity()))) ++ .reduce(BigDecimal.ZERO, BigDecimal::add); ++ } ++ ++ /** ++ * Updates product inventory levels based on order items. 
++ * ++ * @param items The order items ++ */ ++ private void updateProductInventory(List items) { ++ items.forEach(item -> { ++ Product product = item.getProduct(); ++ product.setStock(product.getStock() - item.getQuantity()); ++ productRepository.save(product); ++ }); + } + } diff --git a/test_diffs/high_score/javascript_bug_fix.diff b/test_diffs/high_score/javascript_bug_fix.diff new file mode 100644 index 0000000..858b8ed --- /dev/null +++ b/test_diffs/high_score/javascript_bug_fix.diff @@ -0,0 +1,120 @@ +diff --git a/src/utils/authentication.js b/src/utils/authentication.js +index 9876543..fedcba0 100644 +--- a/src/utils/authentication.js ++++ b/src/utils/authentication.js +@@ -1,6 +1,7 @@ + import axios from 'axios'; + import jwt from 'jsonwebtoken'; + import { API_URL } from '../config'; ++import { logError } from './logger'; + + /** + * Authentication utility functions +@@ -12,7 +13,7 @@ const TOKEN_EXPIRY_BUFFER = 300; // 5 minutes in seconds + * @param {string} username - User's username + * @param {string} password - User's password + * @returns {Promise} Authentication result with token +- * @throws {Error} If authentication fails ++ * @throws {Error} If authentication fails or network error occurs + */ + export const login = async (username, password) => { + try { +@@ -22,9 +23,15 @@ export const login = async (username, password) => { + + localStorage.setItem(TOKEN_KEY, response.data.token); + return response.data; +- } catch (error) { +- console.error('Login failed:', error); +- throw new Error('Authentication failed'); ++ } catch (error) { ++ // Log detailed error for debugging ++ logError('Login failed', error); ++ ++ // Provide more specific error messages based on error type ++ if (error.response && error.response.status === 401) { ++ throw new Error('Invalid username or password'); ++ } else if (error.response && error.response.status === 429) { ++ throw new Error('Too many login attempts. 
Please try again later.'); ++ } else if (error.code === 'ECONNABORTED' || !error.response) { ++ throw new Error('Network error. Please check your connection.'); ++ } else { ++ throw new Error('Authentication failed. Please try again.'); ++ } + } + }; + +@@ -35,12 +42,16 @@ export const logout = () => { + }; + + /** +- * Check if user is authenticated ++ * Check if user is authenticated with a valid token + * @returns {boolean} True if authenticated + */ + export const isAuthenticated = () => { + const token = localStorage.getItem(TOKEN_KEY); +- return !!token; ++ if (!token) { ++ return false; ++ } ++ ++ try { ++ // Decode token to check expiration ++ const decoded = jwt.decode(token); ++ if (!decoded || !decoded.exp) { ++ return false; ++ } ++ ++ // Check if token is expired (with buffer time) ++ const currentTime = Math.floor(Date.now() / 1000); ++ return decoded.exp > currentTime + TOKEN_EXPIRY_BUFFER; ++ } catch (error) { ++ logError('Token validation error', error); ++ return false; ++ } + }; + + /** +@@ -48,10 +59,19 @@ export const isAuthenticated = () => { + * @returns {string|null} The authentication token or null + */ + export const getToken = () => { +- return localStorage.getItem(TOKEN_KEY); ++ const token = localStorage.getItem(TOKEN_KEY); ++ ++ // Check if token exists and is valid ++ if (token && isAuthenticated()) { ++ return token; ++ } ++ ++ // If token is invalid, clear it and return null ++ if (token) { ++ logout(); ++ } ++ ++ return null; + }; + + /** +- * Get user info from token +- * @returns {Object|null} User info or null if not authenticated ++ * Get user info from token payload ++ * @returns {Object|null} User info or null if not authenticated or token is invalid + */ + export const getUserInfo = () => { + const token = getToken(); +@@ -59,6 +79,12 @@ export const getUserInfo = () => { + return null; + } + +- const decoded = jwt.decode(token); +- return decoded; ++ try { ++ const decoded = jwt.decode(token); ++ return decoded && 
decoded.user ? decoded.user : null; ++ } catch (error) { ++ logError('Error decoding user info from token', error); ++ logout(); // Clear invalid token ++ return null; ++ } + }; diff --git a/test_diffs/high_score/python_feature_enhancement.diff b/test_diffs/high_score/python_feature_enhancement.diff new file mode 100644 index 0000000..1beddc1 --- /dev/null +++ b/test_diffs/high_score/python_feature_enhancement.diff @@ -0,0 +1,120 @@ +diff --git a/data_processor.py b/data_processor.py +index 1234567..abcdefg 100644 +--- a/data_processor.py ++++ b/data_processor.py +@@ -1,15 +1,42 @@ + import os + import json + import logging ++from typing import Dict, List, Any, Optional, Union ++from datetime import datetime + + logger = logging.getLogger(__name__) + +-def process_data(input_file, output_file): ++def process_data(input_file: str, output_file: str, config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """ +- Process data from input file and save to output file ++ Process data from input file and save to output file. 
++ ++ Args: ++ input_file: Path to the input JSON file ++ output_file: Path to save the processed output ++ config: Optional configuration dictionary with processing options ++ ++ Returns: ++ Dictionary containing processing statistics + """ ++ start_time = datetime.now() ++ ++ # Set default configuration if not provided ++ if config is None: ++ config = { ++ "normalize": True, ++ "remove_duplicates": True, ++ "max_items": None ++ } ++ + logger.info(f"Processing data from {input_file} to {output_file}") + ++ # Validate input file ++ if not os.path.exists(input_file): ++ error_msg = f"Input file not found: {input_file}" ++ logger.error(error_msg) ++ raise FileNotFoundError(error_msg) ++ ++ # Load data + with open(input_file, 'r') as f: + data = json.load(f) + +@@ -17,9 +44,44 @@ def process_data(input_file, output_file): + processed_data = [] + for item in data: + processed_item = { +- 'id': item['id'], +- 'value': item['value'] * 2 ++ 'id': str(item.get('id', '')), ++ 'value': float(item.get('value', 0)) * 2, ++ 'processed_at': datetime.now().isoformat() + } + processed_data.append(processed_item) + +- # Save processed data ++ # Apply normalization if configured ++ if config.get("normalize", False): ++ processed_data = _normalize_data(processed_data) ++ ++ # Remove duplicates if configured ++ if config.get("remove_duplicates", False): ++ processed_data = _remove_duplicates(processed_data) ++ ++ # Apply max items limit if configured ++ max_items = config.get("max_items") ++ if max_items is not None and isinstance(max_items, int): ++ processed_data = processed_data[:max_items] ++ ++ # Save processed data ++ with open(output_file, 'w') as f: ++ json.dump(processed_data, f, indent=2) ++ ++ # Generate statistics ++ end_time = datetime.now() ++ stats = { ++ "input_items": len(data), ++ "output_items": len(processed_data), ++ "processing_time_ms": (end_time - start_time).total_seconds() * 1000, ++ "output_file": output_file ++ } ++ ++ logger.info(f"Processing completed: 
{stats}") ++ return stats ++ ++def _normalize_data(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: ++ """Normalize values in the data to be between 0 and 1""" ++ if not data: ++ return data ++ ++ # Find maximum value for normalization ++ max_value = max(item.get('value', 0) for item in data) ++ if max_value == 0: ++ return data ++ ++ # Normalize values ++ for item in data: ++ if 'value' in item: ++ item['value'] = item['value'] / max_value ++ ++ return data ++ ++def _remove_duplicates(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: ++ """Remove duplicate items based on ID""" ++ seen_ids = set() ++ unique_data = [] ++ ++ for item in data: ++ item_id = item.get('id') ++ if item_id not in seen_ids: ++ seen_ids.add(item_id) ++ unique_data.append(item) ++ ++ return unique_data diff --git a/test_diffs/high_score/sql_query_optimization.diff b/test_diffs/high_score/sql_query_optimization.diff new file mode 100644 index 0000000..c6be7a8 --- /dev/null +++ b/test_diffs/high_score/sql_query_optimization.diff @@ -0,0 +1,168 @@ +diff --git a/database/queries/order_analytics.sql b/database/queries/order_analytics.sql +index 9876543..fedcba0 100644 +--- a/database/queries/order_analytics.sql ++++ b/database/queries/order_analytics.sql +@@ -1,42 +1,87 @@ +--- Order analytics queries ++-- Order analytics queries optimized for performance ++-- These queries are used for generating business reports and dashboards + +--- Get total sales by month +-SELECT +- DATE_FORMAT(order_date, '%Y-%m') AS month, +- SUM(total_amount) AS total_sales +-FROM orders +-WHERE order_status = 'completed' +-GROUP BY DATE_FORMAT(order_date, '%Y-%m') +-ORDER BY month DESC; ++-- Get total sales by month with YoY comparison ++WITH monthly_sales AS ( ++ SELECT ++ DATE_FORMAT(order_date, '%Y-%m') AS month, ++ EXTRACT(YEAR FROM order_date) AS year, ++ EXTRACT(MONTH FROM order_date) AS month_num, ++ SUM(total_amount) AS total_sales, ++ COUNT(DISTINCT order_id) AS order_count ++ FROM orders ++ WHERE 
order_status = 'completed' ++ AND order_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 2 YEAR) ++ GROUP BY ++ DATE_FORMAT(order_date, '%Y-%m'), ++ EXTRACT(YEAR FROM order_date), ++ EXTRACT(MONTH FROM order_date) ++) ++SELECT ++ current_year.month, ++ current_year.total_sales AS current_sales, ++ previous_year.total_sales AS previous_sales, ++ current_year.order_count AS current_order_count, ++ previous_year.order_count AS previous_order_count, ++ ROUND((current_year.total_sales - previous_year.total_sales) / previous_year.total_sales * 100, 2) AS sales_growth_percent ++FROM monthly_sales current_year ++LEFT JOIN monthly_sales previous_year ON ++ previous_year.month_num = current_year.month_num AND ++ previous_year.year = current_year.year - 1 ++WHERE current_year.year = EXTRACT(YEAR FROM CURRENT_DATE()) ++ORDER BY current_year.month_num; + +--- Get top selling products ++-- Get top selling products with inventory status ++-- Added index on order_items(product_id) and products(category_id) + SELECT + p.product_id, + p.product_name, ++ p.category_id, ++ c.category_name, + SUM(oi.quantity) AS total_quantity, +- SUM(oi.quantity * oi.unit_price) AS total_revenue ++ SUM(oi.quantity * oi.unit_price) AS total_revenue, ++ p.stock_quantity AS current_stock, ++ CASE ++ WHEN p.stock_quantity = 0 THEN 'Out of stock' ++ WHEN p.stock_quantity < 10 THEN 'Low stock' ++ ELSE 'In stock' ++ END AS stock_status, ++ ROUND(SUM(oi.quantity * oi.unit_price) / ++ (SELECT SUM(total_amount) FROM orders WHERE order_status = 'completed') * 100, 2) AS revenue_percent + FROM order_items oi +-JOIN orders o ON oi.order_id = o.order_id +-JOIN products p ON oi.product_id = p.product_id +-WHERE o.order_status = 'completed' ++INNER JOIN orders o ON oi.order_id = o.order_id ++INNER JOIN products p ON oi.product_id = p.product_id ++INNER JOIN categories c ON p.category_id = c.category_id ++WHERE o.order_status = 'completed' ++ AND o.order_date >= DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH) + GROUP BY + 
p.product_id, +- p.product_name ++ p.product_name, ++ p.category_id, ++ c.category_name, ++ p.stock_quantity + ORDER BY total_revenue DESC + LIMIT 20; + +--- Get customer purchase history ++-- Get customer purchase history with customer segments ++-- Added index on orders(customer_id, order_date) + SELECT + c.customer_id, + c.first_name, + c.last_name, + c.email, +- COUNT(o.order_id) AS total_orders, +- SUM(o.total_amount) AS total_spent, +- AVG(o.total_amount) AS average_order_value, +- MAX(o.order_date) AS last_order_date ++ COUNT(DISTINCT o.order_id) AS total_orders, ++ ROUND(SUM(o.total_amount), 2) AS total_spent, ++ ROUND(AVG(o.total_amount), 2) AS average_order_value, ++ MAX(o.order_date) AS last_order_date, ++ DATEDIFF(CURRENT_DATE(), MAX(o.order_date)) AS days_since_last_order, ++ CASE ++ WHEN COUNT(o.order_id) > 10 AND SUM(o.total_amount) > 5000 THEN 'VIP' ++ WHEN COUNT(o.order_id) > 5 THEN 'Loyal' ++ WHEN DATEDIFF(CURRENT_DATE(), MAX(o.order_date)) > 365 THEN 'Inactive' ++ WHEN DATEDIFF(CURRENT_DATE(), MIN(o.order_date)) < 30 THEN 'New' ++ ELSE 'Regular' ++ END AS customer_segment + FROM customers c +-JOIN orders o ON c.customer_id = o.customer_id ++LEFT JOIN orders o ON c.customer_id = o.customer_id AND o.order_status = 'completed' + GROUP BY + c.customer_id, + c.first_name, + c.last_name, + c.email +-ORDER BY total_spent DESC; ++HAVING total_orders > 0 ++ORDER BY total_spent DESC ++LIMIT 1000; + +--- Get sales by category ++-- Get sales by category with trend analysis ++-- Added materialized view refresh every 6 hours ++WITH category_sales AS ( ++ SELECT ++ c.category_id, ++ c.category_name, ++ DATE_FORMAT(o.order_date, '%Y-%m') AS month, ++ SUM(oi.quantity * oi.unit_price) AS category_revenue ++ FROM order_items oi ++ INNER JOIN orders o ON oi.order_id = o.order_id ++ INNER JOIN products p ON oi.product_id = p.product_id ++ INNER JOIN categories c ON p.category_id = c.category_id ++ WHERE o.order_status = 'completed' ++ AND o.order_date >= 
DATE_SUB(CURRENT_DATE(), INTERVAL 12 MONTH) ++ GROUP BY ++ c.category_id, ++ c.category_name, ++ DATE_FORMAT(o.order_date, '%Y-%m') ++) + SELECT +- c.category_name, +- SUM(oi.quantity * oi.unit_price) AS category_revenue +-FROM order_items oi +-JOIN orders o ON oi.order_id = o.order_id +-JOIN products p ON oi.product_id = p.product_id +-JOIN categories c ON p.category_id = c.category_id +-WHERE o.order_status = 'completed' +-GROUP BY c.category_name +-ORDER BY category_revenue DESC; ++ category_id, ++ category_name, ++ SUM(category_revenue) AS total_revenue, ++ ROUND(AVG(category_revenue), 2) AS avg_monthly_revenue, ++ MAX(category_revenue) AS best_month_revenue, ++ MIN(category_revenue) AS worst_month_revenue, ++ ROUND( ++ (SUM(CASE WHEN month >= DATE_FORMAT(DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH), '%Y-%m') THEN category_revenue ELSE 0 END) - ++ SUM(CASE WHEN month < DATE_FORMAT(DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH), '%Y-%m') THEN category_revenue ELSE 0 END)) / ++ NULLIF(SUM(CASE WHEN month < DATE_FORMAT(DATE_SUB(CURRENT_DATE(), INTERVAL 6 MONTH), '%Y-%m') THEN category_revenue ELSE 0 END), 0) * 100, ++ 2) AS six_month_growth_percent ++FROM category_sales ++GROUP BY ++ category_id, ++ category_name ++ORDER BY total_revenue DESC; diff --git a/test_diffs/low_score/cpp_readability_issues.diff b/test_diffs/low_score/cpp_readability_issues.diff new file mode 100644 index 0000000..fe4380e --- /dev/null +++ b/test_diffs/low_score/cpp_readability_issues.diff @@ -0,0 +1,224 @@ +diff --git a/src/algorithm/graph_processor.cpp b/src/algorithm/graph_processor.cpp +index 1234567..abcdefg 100644 +--- a/src/algorithm/graph_processor.cpp ++++ b/src/algorithm/graph_processor.cpp +@@ -1,50 +1,123 @@ + #include + #include + #include +-#include +-#include +-#include "graph_processor.h" ++#include ++#include ++#include"graph_processor.h" ++using namespace std; + +-/** +- * Implementation of graph processing algorithms +- */ ++// Global variables ++vector> g; ++vector v; 
++vector d; ++int n, m; + +-namespace graph { ++// No comments or documentation ++class GraphProcessor { ++public: ++ // Unclear variable names ++ unordered_map>> a; ++ ++ // Constructor with no initialization list ++ GraphProcessor() { ++ a = unordered_map>>(); ++ } ++ ++ // No parameter validation ++ void addE(int x, int y, int w) { ++ a[x].push_back(make_pair(y, w)); ++ } ++ ++ // Inconsistent naming convention ++ vector ShortestPath(int s, int t) { ++ // Magic numbers ++ const int INF = 1000000000; ++ ++ // Unnecessary variable declarations ++ int i, j, k, u, v, w; ++ ++ // Inefficient data structure initialization ++ vector dist(a.size() + 1, INF); ++ vector prev(a.size() + 1, -1); ++ vector visited(a.size() + 1, false); ++ ++ // Unclear algorithm implementation ++ dist[s] = 0; ++ for (i = 0; i < a.size(); i++) { ++ int min_dist = INF; ++ int min_vertex = -1; ++ ++ for (j = 0; j < a.size() + 1; j++) { ++ if (!visited[j] && dist[j] < min_dist) { ++ min_dist = dist[j]; ++ min_vertex = j; ++ } ++ } ++ ++ if (min_vertex == -1) break; ++ ++ u = min_vertex; ++ visited[u] = true; ++ ++ if (u == t) break; ++ ++ // Nested loops with poor indentation ++ for (j = 0; j < a[u].size(); j++) { ++ v = a[u][j].first; ++ w = a[u][j].second; ++ ++ if (!visited[v] && dist[u] + w < dist[v]) { ++ dist[v] = dist[u] + w; ++ prev[v] = u; ++ } ++ } ++ } ++ ++ // Convoluted path reconstruction ++ vector path; ++ if (dist[t] == INF) return path; ++ ++ for (u = t; u != -1; u = prev[u]) ++ path.push_back(u); ++ ++ // Unnecessary reverse operation ++ vector result; ++ for (i = path.size() - 1; i >= 0; i--) ++ result.push_back(path[i]); ++ ++ return result; ++ } ++ ++ // Poorly implemented DFS with side effects ++ void dfs(int node) { ++ v[node] = true; ++ for (int i = 0; i < g[node].size(); i++) { ++ int to = g[node][i]; ++ if (!v[to]) { ++ dfs(to); ++ } ++ } ++ } ++ ++ // Function does too many things ++ void processGraph(string filename) { ++ // Hard-coded file path ++ FILE* f = 
fopen(filename.c_str(), "r"); ++ ++ // No error checking ++ fscanf(f, "%d %d", &n, &m); ++ ++ // Resizing global variables ++ g.resize(n + 1); ++ v.resize(n + 1); ++ d.resize(n + 1); ++ ++ // Raw loop with no explanation ++ for (int i = 0; i < m; i++) { ++ int x, y, w; ++ fscanf(f, "%d %d %d", &x, &y, &w); ++ g[x].push_back(y); ++ addE(x, y, w); ++ } ++ ++ fclose(f); ++ } ++ ++ // Cryptic algorithm with no explanation ++ int mst() { ++ // Magic numbers and unclear variable names ++ int res = 0; ++ priority_queue, vector>, greater>> pq; ++ vector vis(n + 1, 0); ++ pq.push({0, 1}); ++ while (!pq.empty()) { ++ auto p = pq.top(); ++ pq.pop(); ++ int w = p.first, u = p.second; ++ if (vis[u]) continue; ++ vis[u] = 1; ++ res += w; ++ for (auto& e : a[u]) { ++ int v = e.first, w = e.second; ++ if (!vis[v]) pq.push({w, v}); ++ } ++ } ++ return res; ++ } ++}; + +- GraphProcessor::GraphProcessor() { +- // Initialize graph processor +- } +- +- void GraphProcessor::addEdge(int source, int target, int weight) { +- adjacencyList[source].push_back(std::make_pair(target, weight)); +- } +- +- std::vector GraphProcessor::shortestPath(int source, int target) { +- const int INF = std::numeric_limits::max(); +- +- // Initialize distances +- std::vector distance(adjacencyList.size() + 1, INF); +- std::vector previous(adjacencyList.size() + 1, -1); +- std::vector visited(adjacencyList.size() + 1, false); +- +- // Dijkstra's algorithm +- distance[source] = 0; +- +- for (size_t i = 0; i < adjacencyList.size(); ++i) { +- // Find vertex with minimum distance +- int minDistance = INF; +- int minVertex = -1; +- +- for (size_t j = 0; j < adjacencyList.size() + 1; ++j) { +- if (!visited[j] && distance[j] < minDistance) { +- minDistance = distance[j]; +- minVertex = j; +- } +- } +- +- if (minVertex == -1) break; +- +- // Mark vertex as visited +- visited[minVertex] = true; +- +- // If target is reached, stop +- if (minVertex == target) break; +- +- // Update distances to neighbors +- for (const 
auto& edge : adjacencyList[minVertex]) { +- int neighbor = edge.first; +- int weight = edge.second; +- +- if (!visited[neighbor] && distance[minVertex] + weight < distance[neighbor]) { +- distance[neighbor] = distance[minVertex] + weight; +- previous[neighbor] = minVertex; +- } +- } +- } +- +- // Reconstruct path +- std::vector path; +- if (distance[target] == INF) return path; +- +- for (int vertex = target; vertex != -1; vertex = previous[vertex]) { +- path.push_back(vertex); +- } +- +- std::reverse(path.begin(), path.end()); +- return path; +- } +- +-} // namespace graph diff --git a/test_diffs/low_score/java_error_handling_issues.diff b/test_diffs/low_score/java_error_handling_issues.diff new file mode 100644 index 0000000..af7d8a3 --- /dev/null +++ b/test_diffs/low_score/java_error_handling_issues.diff @@ -0,0 +1,187 @@ +diff --git a/src/main/java/com/example/service/FileProcessorService.java b/src/main/java/com/example/service/FileProcessorService.java +index 1234567..abcdefg 100644 +--- a/src/main/java/com/example/service/FileProcessorService.java ++++ b/src/main/java/com/example/service/FileProcessorService.java +@@ -1,45 +1,108 @@ + package com.example.service; + +-import java.io.File; +-import java.io.IOException; +-import java.nio.file.Files; +-import java.nio.file.Paths; +-import java.util.List; +-import java.util.stream.Collectors; +- +-import org.springframework.stereotype.Service; +- +-import com.example.exception.FileProcessingException; +-import com.example.model.ProcessedFile; +- +-/** +- * Service for processing files +- */ +-@Service +-public class FileProcessorService { +- +- /** +- * Process a file and return the result +- * +- * @param filePath Path to the file +- * @return ProcessedFile object with results +- * @throws FileProcessingException if processing fails +- */ +- public ProcessedFile processFile(String filePath) throws FileProcessingException { +- try { +- File file = new File(filePath); +- +- if (!file.exists()) { +- throw new 
FileProcessingException("File not found: " + filePath); +- } +- +- List lines = Files.readAllLines(Paths.get(filePath)); +- +- // Process lines +- List processedLines = lines.stream() +- .map(String::toUpperCase) +- .collect(Collectors.toList()); +- +- return new ProcessedFile(file.getName(), processedLines, processedLines.size()); +- } catch (IOException e) { +- throw new FileProcessingException("Error processing file: " + e.getMessage(), e); +- } +- } ++import java.io.*; ++import java.nio.file.*; ++import java.sql.*; ++import java.util.*; ++ ++import org.springframework.stereotype.*; ++ ++import com.example.model.*; ++ ++// Missing proper imports ++ ++@Service ++public class FileProcessorService { ++ ++ // Hardcoded database credentials ++ private static final String DB_URL = "jdbc:mysql://localhost:3306/filedb"; ++ private static final String DB_USER = "root"; ++ private static final String DB_PASS = "password"; ++ ++ // No dependency injection ++ private Connection getConnection() { ++ try { ++ return DriverManager.getConnection(DB_URL, DB_USER, DB_PASS); ++ } catch (SQLException e) { ++ // Swallowing exception ++ System.out.println("Database connection error: " + e.getMessage()); ++ return null; ++ } ++ } ++ ++ // No exception handling ++ public ProcessedFile processFile(String filePath) { ++ // No input validation ++ File file = new File(filePath); ++ ++ // No null checks ++ List lines = readAllLines(filePath); ++ ++ // Process lines without checking for null ++ List processedLines = new ArrayList<>(); ++ for (String line : lines) { ++ processedLines.add(line.toUpperCase()); ++ } ++ ++ // Save to database without transaction ++ saveToDatabase(file.getName(), processedLines); ++ ++ return new ProcessedFile(file.getName(), processedLines, processedLines.size()); ++ } ++ ++ // Method with multiple responsibilities ++ private List readAllLines(String filePath) { ++ try { ++ // Resource leak - not using try-with-resources ++ FileInputStream fis = new 
FileInputStream(filePath); ++ BufferedReader reader = new BufferedReader(new InputStreamReader(fis)); ++ ++ List lines = new ArrayList<>(); ++ String line; ++ ++ while ((line = reader.readLine()) != null) { ++ lines.add(line); ++ } ++ ++ // Resources not closed properly ++ return lines; ++ } catch (Exception e) { ++ // Generic exception catch ++ // Exception details lost ++ System.out.println("Error reading file: " + e.getMessage()); ++ return new ArrayList<>(); // Returning empty list instead of throwing ++ } ++ } ++ ++ // No transaction management ++ private void saveToDatabase(String fileName, List lines) { ++ Connection conn = null; ++ Statement stmt = null; ++ ++ try { ++ conn = getConnection(); ++ stmt = conn.createStatement(); ++ ++ // SQL Injection vulnerability ++ stmt.executeUpdate("DELETE FROM processed_files WHERE file_name = '" + fileName + "'"); ++ ++ // Inefficient - should use batch ++ for (String line : lines) { ++ // SQL Injection vulnerability ++ stmt.executeUpdate( ++ "INSERT INTO processed_lines (file_name, line_content) VALUES ('" + ++ fileName + "', '" + line + "')" ++ ); ++ } ++ } catch (SQLException e) { ++ // Exception swallowed ++ System.out.println("Database error: " + e.getMessage()); ++ } finally { ++ // Nested try-catch in finally block ++ try { ++ if (stmt != null) stmt.close(); ++ if (conn != null) conn.close(); ++ } catch (SQLException e) { ++ // Exception swallowed ++ System.out.println("Error closing resources: " + e.getMessage()); ++ } ++ } ++ } ++ ++ // Dangerous method - allows arbitrary file deletion ++ public boolean deleteFile(String filePath) { ++ // No validation or permission checks ++ File file = new File(filePath); ++ return file.delete(); ++ } ++ ++ // Thread-unsafe implementation ++ public void processDirectory(String directoryPath) { ++ File directory = new File(directoryPath); ++ ++ // No directory existence check ++ File[] files = directory.listFiles(); ++ ++ // No null check ++ for (File file : files) { ++ if 
(file.isFile()) { ++ // Recursive call without depth limit ++ processFile(file.getAbsolutePath()); ++ } else if (file.isDirectory()) { ++ // Recursive call without depth limit ++ processDirectory(file.getAbsolutePath()); ++ } ++ } ++ } + } diff --git a/test_diffs/low_score/javascript_performance_issues.diff b/test_diffs/low_score/javascript_performance_issues.diff new file mode 100644 index 0000000..20ebcfe --- /dev/null +++ b/test_diffs/low_score/javascript_performance_issues.diff @@ -0,0 +1,153 @@ +diff --git a/src/utils/data-processor.js b/src/utils/data-processor.js +index 1234567..abcdefg 100644 +--- a/src/utils/data-processor.js ++++ b/src/utils/data-processor.js +@@ -1,30 +1,89 @@ +-import { fetchData } from './api'; ++// No imports organized ++import {fetchData} from './api' ++import moment from 'moment'; ++import _ from 'lodash'; ++import $ from 'jquery'; + +-/** +- * Process data from API +- * @param {string} endpoint - API endpoint +- * @returns {Promise} Processed data +- */ +-export const processData = async (endpoint) => { +- const data = await fetchData(endpoint); +- +- // Process data +- const processedData = data.map(item => ({ +- id: item.id, +- name: item.name, +- value: item.value * 2 +- })); +- +- return processedData; +-}; ++// Global variables ++var globalData = []; ++var processingComplete = false; ++ ++// Memory leak - event listeners never removed ++$(document).ready(function() { ++ $('#processButton').click(function() { ++ processAllData(); ++ }); ++}); ++ ++// No documentation ++export const processData = async (endpoint) => { ++ try { ++ console.log("Processing data from: " + endpoint); ++ ++ // Inefficient API calls - no caching ++ const data = await fetchData(endpoint); ++ ++ // Inefficient data processing ++ let processedData = []; ++ ++ // Using for loop instead of map ++ for (let i = 0; i < data.length; i++) { ++ const item = data[i]; ++ ++ // Creating new objects in a loop ++ const processed = { ++ id: item.id, ++ name: item.name, 
++ value: item.value * 2, ++ // Using moment unnecessarily for each item ++ timestamp: moment().format('YYYY-MM-DD HH:mm:ss'), ++ // Deep cloning unnecessarily ++ originalData: _.cloneDeep(item) ++ }; ++ ++ // Inefficient array manipulation ++ processedData.push(processed); ++ ++ // DOM manipulation in a loop ++ $('#dataList').append('
  • ' + processed.name + '
  • '); ++ } ++ ++ // Store in global variable ++ globalData = processedData; ++ processingComplete = true; ++ ++ return processedData; ++ } catch (error) { ++ console.log("Error processing data: " + error); ++ return []; ++ } ++}; ++ ++// Inefficient recursive function with no termination check ++export const findItemById = (items, id) => { ++ if (items.length === 0) { ++ return null; ++ } ++ ++ const [first, ...rest] = items; ++ ++ if (first.id === id) { ++ return first; ++ } ++ ++ return findItemById(rest, id); ++}; ++ ++// Function with multiple responsibilities ++export const processAllData = async () => { ++ // Multiple API calls with no batching ++ const users = await processData('/api/users'); ++ const products = await processData('/api/products'); ++ const orders = await processData('/api/orders'); ++ ++ // Inefficient nested loops - O(n³) complexity ++ const result = []; ++ for (let user of users) { ++ for (let product of products) { ++ for (let order of orders) { ++ if (order.userId === user.id && order.productId === product.id) { ++ // Creating new objects in nested loops ++ result.push({ ++ userName: user.name, ++ productName: product.name, ++ orderDate: order.date ++ }); ++ ++ // DOM manipulation in nested loops ++ $('#resultTable').append( ++ '' + user.name + '' + ++ product.name + '' + ++ order.date + '' ++ ); ++ } ++ } ++ } ++ } ++ ++ // Memory intensive operation ++ localStorage.setItem('processedData', JSON.stringify(result)); ++ ++ return result; ++}; ++ ++// Inefficient sorting algorithm - bubble sort ++export const sortItems = (items) => { ++ const result = [...items]; ++ ++ for (let i = 0; i < result.length; i++) { ++ for (let j = 0; j < result.length - i - 1; j++) { ++ if (result[j].value > result[j + 1].value) { ++ // Swap ++ const temp = result[j]; ++ result[j] = result[j + 1]; ++ result[j + 1] = temp; ++ } ++ } ++ } ++ ++ return result; ++}; diff --git a/test_diffs/low_score/python_security_issues.diff 
b/test_diffs/low_score/python_security_issues.diff new file mode 100644 index 0000000..307f3c7 --- /dev/null +++ b/test_diffs/low_score/python_security_issues.diff @@ -0,0 +1,116 @@ +diff --git a/app/api/user_controller.py b/app/api/user_controller.py +index 1234567..abcdefg 100644 +--- a/app/api/user_controller.py ++++ b/app/api/user_controller.py +@@ -1,25 +1,58 @@ + import os + import sqlite3 ++import subprocess + from flask import Flask, request, jsonify + + app = Flask(__name__) + +-# Database connection +-def get_db_connection(): +- conn = sqlite3.connect('database.db') +- conn.row_factory = sqlite3.Row +- return conn ++# Hardcoded credentials ++DB_USER = "admin" ++DB_PASS = "admin123" ++SECRET_KEY = "my_super_secret_key_do_not_share" ++ ++# Insecure database connection ++def get_db_connection(db_name='database.db'): ++ try: ++ conn = sqlite3.connect(db_name) ++ conn.row_factory = sqlite3.Row ++ return conn ++ except Exception as e: ++ print(f"Database connection error: {e}") ++ return None + + @app.route('/api/users', methods=['GET']) + def get_users(): +- conn = get_db_connection() +- users = conn.execute('SELECT * FROM users').fetchall() +- conn.close() ++ # SQL Injection vulnerability ++ search = request.args.get('search', '') ++ ++ conn = get_db_connection() ++ if not conn: ++ return jsonify({"error": "Database connection failed"}), 500 ++ ++ # Vulnerable to SQL injection ++ query = f"SELECT * FROM users WHERE name LIKE '%{search}%'" ++ users = conn.execute(query).fetchall() ++ conn.close() + + return jsonify([dict(user) for user in users]) + + @app.route('/api/users/', methods=['GET']) + def get_user(user_id): ++ conn = get_db_connection() ++ user = conn.execute('SELECT * FROM users WHERE id = ?', (user_id,)).fetchone() ++ conn.close() ++ ++ if user is None: ++ return jsonify({"error": "User not found"}), 404 ++ ++ return jsonify(dict(user)) ++ ++@app.route('/api/execute', methods=['POST']) ++def execute_command(): ++ # Command injection vulnerability 
++ command = request.json.get('command', '') ++ ++ # Extremely dangerous - allows arbitrary command execution ++ result = subprocess.check_output(command, shell=True) ++ ++ return jsonify({"result": result.decode('utf-8')}) ++ ++@app.route('/api/users', methods=['POST']) ++def create_user(): ++ # No input validation ++ user_data = request.json ++ ++ # No password hashing ++ name = user_data.get('name') ++ email = user_data.get('email') ++ password = user_data.get('password') # Storing plain text password ++ + conn = get_db_connection() +- user = conn.execute('SELECT * FROM users WHERE id = ?', (user_id,)).fetchone() ++ conn.execute( ++ 'INSERT INTO users (name, email, password) VALUES (?, ?, ?)', ++ (name, email, password) ++ ) ++ conn.commit() + conn.close() + +- if user is None: +- return jsonify({"error": "User not found"}), 404 +- +- return jsonify(dict(user)) ++ return jsonify({"success": True, "message": "User created"}) ++ ++@app.route('/api/backup', methods=['GET']) ++def backup_database(): ++ # Path traversal vulnerability ++ filename = request.args.get('filename', 'backup.db') ++ ++ # Vulnerable to path traversal ++ backup_path = os.path.join('/tmp', filename) ++ ++ # Copy database to specified path ++ conn = get_db_connection() ++ conn.close() ++ ++ os.system(f"cp database.db {backup_path}") ++ ++ return jsonify({"success": True, "backup_path": backup_path}) ++ ++if __name__ == '__main__': ++ # Running in debug mode in production ++ app.run(debug=True, host='0.0.0.0') diff --git a/test_diffs/low_score/sql_structure_issues.diff b/test_diffs/low_score/sql_structure_issues.diff new file mode 100644 index 0000000..9cea269 --- /dev/null +++ b/test_diffs/low_score/sql_structure_issues.diff @@ -0,0 +1,101 @@ +diff --git a/database/reports/customer_analysis.sql b/database/reports/customer_analysis.sql +index 1234567..abcdefg 100644 +--- a/database/reports/customer_analysis.sql ++++ b/database/reports/customer_analysis.sql +@@ -1,45 +1,97 @@ +--- Customer 
analysis queries ++-- no comments explaining purpose + +--- Get customer purchase summary +-SELECT +- c.customer_id, +- c.first_name, +- c.last_name, +- COUNT(o.order_id) AS total_orders, +- SUM(o.total_amount) AS total_spent, +- AVG(o.total_amount) AS average_order_value, +- MAX(o.order_date) AS last_order_date +-FROM customers c +-LEFT JOIN orders o ON c.customer_id = o.customer_id +-GROUP BY +- c.customer_id, +- c.first_name, +- c.last_name +-ORDER BY total_spent DESC; ++-- inconsistent formatting ++SELECT c.customer_id,c.first_name,c.last_name, ++COUNT(o.order_id) AS total_orders,SUM(o.total_amount) AS total_spent, ++AVG(o.total_amount) AS average_order_value,MAX(o.order_date) AS last_order_date ++FROM customers c LEFT JOIN orders o ON c.customer_id = o.customer_id GROUP BY c.customer_id, ++c.first_name,c.last_name ORDER BY total_spent DESC; + +--- Get customer segments +-SELECT +- customer_id, +- CASE +- WHEN total_spent > 1000 THEN 'High Value' +- WHEN total_spent > 500 THEN 'Medium Value' +- ELSE 'Low Value' +- END AS customer_segment, +- total_spent +-FROM ( +- SELECT +- c.customer_id, +- SUM(o.total_amount) AS total_spent +- FROM customers c +- LEFT JOIN orders o ON c.customer_id = o.customer_id +- GROUP BY c.customer_id +-) AS customer_totals +-ORDER BY total_spent DESC; ++-- inefficient query with cartesian product ++select * from customers, orders, order_items, products ++where customers.customer_id = orders.customer_id ++and orders.order_id = order_items.order_id ++and order_items.product_id = products.product_id; + +--- Get customer retention rate +-WITH first_orders AS ( +- SELECT +- customer_id, +- MIN(DATE_TRUNC('month', order_date)) AS first_order_month +- FROM orders +- GROUP BY customer_id +-), +-monthly_activity AS ( +- SELECT +- DATE_TRUNC('month', order_date) AS order_month, +- COUNT(DISTINCT customer_id) AS active_customers +- FROM orders +- GROUP BY DATE_TRUNC('month', order_date) +-), +-customer_monthly_activity AS ( +- SELECT +- 
fo.customer_id, +- fo.first_order_month, +- DATE_TRUNC('month', o.order_date) AS order_month +- FROM first_orders fo +- JOIN orders o ON fo.customer_id = o.customer_id +- GROUP BY +- fo.customer_id, +- fo.first_order_month, +- DATE_TRUNC('month', o.order_date) +-) +-SELECT +- cma.first_order_month, +- COUNT(DISTINCT cma.customer_id) AS cohort_size, +- cma.order_month, +- COUNT(DISTINCT cma.customer_id) AS active_customers, +- ROUND( +- COUNT(DISTINCT cma.customer_id)::NUMERIC / +- FIRST_VALUE(COUNT(DISTINCT cma.customer_id)) OVER ( +- PARTITION BY cma.first_order_month +- ORDER BY cma.order_month +- ) * 100, +- 2) AS retention_rate +-FROM customer_monthly_activity cma +-GROUP BY +- cma.first_order_month, +- cma.order_month +-ORDER BY +- cma.first_order_month, +- cma.order_month; diff --git a/test_prompt.py b/test_prompt.py new file mode 100644 index 0000000..b2fc541 --- /dev/null +++ b/test_prompt.py @@ -0,0 +1,471 @@ +#!/usr/bin/env python +""" +Prompt Testing Tool for CodeDog + +This tool allows you to test code review prompts by providing a diff or code snippet +and getting the evaluation results without running the full code review process. 
+""" + +import argparse +import asyncio +import json +import os +import sys +import time +from typing import Dict, Any, Optional + +from langchain_core.messages import HumanMessage, SystemMessage +from langchain_openai import ChatOpenAI + +# 设置日志记录 +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# 加载环境变量 +from dotenv import load_dotenv +load_dotenv() + +# 导入必要的模块 +from codedog.utils.code_evaluator import DiffEvaluator +from codedog.utils.langchain_utils import load_model_by_name + + +def sanitize_content(content: str) -> str: + """清理代码内容,移除异常字符""" + # 移除不可打印字符,但保留换行符和制表符 + sanitized = ''.join(c for c in content if c.isprintable() or c in ['\n', '\t', '\r']) + return sanitized + + +def guess_language(file_path: str) -> str: + """根据文件扩展名猜测编程语言""" + import os + file_ext = os.path.splitext(file_path)[1].lower() + + # 文件扩展名到语言的映射 + ext_to_lang = { + # Python + '.py': 'Python', + '.pyx': 'Python', + '.pyi': 'Python', + '.ipynb': 'Python', + + # JavaScript/TypeScript + '.js': 'JavaScript', + '.jsx': 'JavaScript', + '.ts': 'TypeScript', + '.tsx': 'TypeScript', + '.mjs': 'JavaScript', + + # Java + '.java': 'Java', + '.jar': 'Java', + '.class': 'Java', + + # C/C++ + '.c': 'C', + '.cpp': 'C++', + '.h': 'C', + '.hpp': 'C++', + + # C# + '.cs': 'C#', + + # Go + '.go': 'Go', + + # Ruby + '.rb': 'Ruby', + + # PHP + '.php': 'PHP', + + # Swift + '.swift': 'Swift', + + # Kotlin + '.kt': 'Kotlin', + '.kts': 'Kotlin', + + # Rust + '.rs': 'Rust', + + # HTML/CSS + '.html': 'HTML', + '.htm': 'HTML', + '.css': 'CSS', + '.scss': 'SCSS', + '.sass': 'SASS', + '.less': 'LESS', + + # Shell + '.sh': 'Shell', + '.bash': 'Shell', + '.zsh': 'Shell', + + # SQL + '.sql': 'SQL', + + # Markdown + '.md': 'Markdown', + '.markdown': 'Markdown', + + # JSON + '.json': 'JSON', + + # YAML + '.yml': 'YAML', + '.yaml': 'YAML', + + # XML + '.xml': 'XML', + + # Other + '.txt': 'Text', + '.csv': 'CSV', 
+ } + + return ext_to_lang.get(file_ext, 'Unknown') + + +async def test_prompt( + file_path: str, + content: str, + model_name: str = "gpt-3.5-turbo", + system_prompt: Optional[str] = None, + output_format: str = "json" +) -> Dict[str, Any]: + """ + 测试代码评审提示 + + Args: + file_path: 文件路径 + content: 代码内容或差异内容 + model_name: 模型名称 + system_prompt: 系统提示,如果为None则使用默认系统提示 + output_format: 输出格式,可选值为json或markdown + + Returns: + Dict[str, Any]: 评估结果 + """ + # 加载模型 + model = load_model_by_name(model_name) + + # 清理代码内容 + sanitized_content = sanitize_content(content) + + # 猜测语言 + language = guess_language(file_path) + + # 使用默认系统提示或自定义系统提示 + if system_prompt is None: + system_prompt = """# ROLE AND OBJECTIVE +You are a senior code reviewer with 15+ years of experience across multiple programming languages and frameworks. Your task is to provide a thorough, objective evaluation of code quality and estimate the effort required to implement the changes. + +# EVALUATION DIMENSIONS +Evaluate the code on these dimensions, scoring each from 1-10 (10 being highest): + +1. Readability (1-10): Code clarity, naming conventions, consistent formatting +2. Efficiency (1-10): Algorithmic efficiency, resource usage, performance considerations +3. Security (1-10): Protection against vulnerabilities, input validation, secure coding practices +4. Structure (1-10): Architecture, modularity, separation of concerns, SOLID principles +5. Error Handling (1-10): Robust error handling, edge cases, graceful failure +6. Documentation (1-10): Comments, docstrings, self-documenting code +7. Code Style (1-10): Adherence to language-specific conventions and best practices +8. 
Overall Score (1-10): Comprehensive evaluation considering all dimensions + +# CODE CHANGE CLASSIFICATION +When evaluating code changes (especially in diff format), carefully distinguish between: + +## Non-Effective Changes (Should NOT count significantly toward working hours) +- Whitespace adjustments (spaces, tabs, line breaks) +- Indentation fixes without logic changes +- Comment additions or modifications without code changes +- Import reordering or reorganization +- Variable/function renaming without behavior changes +- Code reformatting (line wrapping, bracket placement) +- Changing string quotes (single to double quotes) +- Adding/removing trailing commas +- Changing code style to match linter rules +- Removing unused imports or variables + +## Effective Changes (SHOULD count toward working hours) +- Logic modifications that alter program behavior +- Functionality additions or removals +- Algorithm changes or optimizations +- Bug fixes that correct actual issues +- API changes (parameters, return types, etc.) +- Data structure modifications +- Performance optimizations +- Security vulnerability fixes +- Error handling improvements +- Complex refactoring that maintains behavior but improves code quality + +# WORKING HOURS ESTIMATION GUIDELINES +When estimating the time an experienced programmer (5-10+ years) would need: + +1. For purely non-effective changes: + - 0.1-0.2 hours for small files + - 0.3-0.5 hours for large files with extensive formatting + +2. For effective changes, consider: + - Complexity of the logic (simple, moderate, complex) + - Domain knowledge required (general, specialized, expert) + - Testing requirements (minimal, moderate, extensive) + - Integration complexity (isolated, moderate dependencies, highly coupled) + +3. Time components to include in your estimate: + - Understanding the existing code + - Designing the solution + - Implementing the changes + - Testing and debugging + - Documentation and code review + +4. 
Provide a realistic estimate that reflects the actual work required, not just the line count. + +# LANGUAGE-SPECIFIC CONSIDERATIONS +- For Python: Consider PEP 8 compliance, type hints, docstrings +- For JavaScript/TypeScript: Consider ES6+ features, typing, framework conventions +- For Java: Consider OOP principles, exception handling, Java conventions +- For C/C++: Consider memory management, performance, platform considerations +- For other languages: Apply relevant best practices and conventions""" + + # 创建用户提示 + user_prompt = f"""# Code Review Request + +## File Information +- **File Name**: {file_path} +- **Language**: {language.lower()} + +## Code to Review +```{language.lower()} +{sanitized_content} +``` + +## Instructions + +Please conduct a comprehensive code review following these steps: + +1. **Initial Analysis**: Begin with a brief overview of the code's purpose and functionality. + +2. **Detailed Evaluation**: Analyze the code across these key dimensions: + + a. **Readability** (1-10): + - Variable and function naming clarity + - Code organization and structure + - Consistent formatting and indentation + - Appropriate use of comments + + b. **Efficiency** (1-10): + - Algorithm efficiency and complexity + - Resource utilization (memory, CPU) + - Optimization opportunities + - Potential bottlenecks + + c. **Security** (1-10): + - Input validation and sanitization + - Authentication and authorization concerns + - Data protection and privacy + - Potential vulnerabilities + + d. **Structure** (1-10): + - Modularity and separation of concerns + - Appropriate design patterns + - Code reusability + - Dependency management + + e. **Error Handling** (1-10): + - Exception handling completeness + - Edge case coverage + - Graceful failure mechanisms + - Informative error messages + + f. **Documentation** (1-10): + - Documentation completeness + - Comment quality and relevance + - API documentation + - Usage examples where appropriate + + g. 
**Code Style** (1-10): + - Adherence to language conventions + - Consistency with project style + - Readability enhancements + - Modern language feature usage + +3. **Code Change Classification**: + - Carefully distinguish between effective and non-effective code changes + - Non-effective changes include: whitespace adjustments, indentation fixes, comment additions, import reordering, variable/function renaming without behavior changes, code reformatting, changing string quotes, etc. + - Effective changes include: logic modifications, functionality additions/removals, algorithm changes, bug fixes, API changes, data structure modifications, performance optimizations, security fixes, etc. + +4. **Working Hours Estimation**: + - Estimate how many effective working hours an experienced programmer (5-10+ years) would need to complete these code changes + - Focus primarily on effective code changes, not formatting or style changes + - Consider code complexity, domain knowledge requirements, and context + - Include time for understanding, implementation, testing, and integration + +## Response Format + +Please return your evaluation in valid JSON format with the following structure: + +```json +{{ + "readability": score, + "efficiency": score, + "security": score, + "structure": score, + "error_handling": score, + "documentation": score, + "code_style": score, + "overall_score": score, + "effective_code_lines": number, + "non_effective_code_lines": number, + "estimated_hours": number, + "comments": "detailed analysis with specific observations and recommendations" +}} +``` + +IMPORTANT: Ensure your response is valid JSON that can be parsed programmatically. 
If you cannot evaluate the code (e.g., incomplete or incomprehensible code), still return valid JSON with default scores of 5 and explain the reason in the comments field.""" + + # 创建消息 + messages = [ + SystemMessage(content=system_prompt), + HumanMessage(content=user_prompt) + ] + + # 调用模型 + print(f"Sending request to {model_name}...") + start_time = time.time() + response = await model.agenerate(messages=[messages]) + end_time = time.time() + print(f"Response received in {end_time - start_time:.2f} seconds") + + # 获取响应文本 + generated_text = response.generations[0][0].text + + # 提取JSON + try: + # 尝试直接解析JSON + result = json.loads(generated_text) + print("Successfully parsed JSON response") + except json.JSONDecodeError: + # 如果直接解析失败,尝试提取JSON部分 + import re + json_match = re.search(r'```json\s*(.*?)\s*```', generated_text, re.DOTALL) + if json_match: + try: + result = json.loads(json_match.group(1)) + print("Successfully extracted and parsed JSON from code block") + except json.JSONDecodeError: + print("Failed to parse JSON from code block") + result = { + "error": "Failed to parse JSON response", + "raw_response": generated_text[:1000] + ("..." if len(generated_text) > 1000 else "") + } + else: + print("No JSON code block found in response") + result = { + "error": "No JSON found in response", + "raw_response": generated_text[:1000] + ("..." 
def parse_args():
    """Build and parse the command-line arguments for the prompt tester.

    Returns:
        argparse.Namespace: parsed options — ``file``, ``diff``, ``model``,
        ``system_prompt``, ``output`` and ``format``.
    """
    parser = argparse.ArgumentParser(description="Test code review prompts")

    # Exactly one input source must be supplied: a plain source file
    # or a pre-computed diff file.
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--file", help="Path to the file to evaluate")
    source.add_argument("--diff", help="Path to the diff file to evaluate")

    # Which model performs the evaluation.
    parser.add_argument(
        "--model",
        default="gpt-3.5-turbo",
        help="Model to use for evaluation (default: gpt-3.5-turbo)",
    )

    # Optional custom system prompt, loaded from a file by the caller.
    parser.add_argument(
        "--system-prompt",
        help="Path to a file containing a custom system prompt",
    )

    # Where and how to emit the result.
    parser.add_argument(
        "--output",
        help="Path to save the output (default: stdout)",
    )
    parser.add_argument(
        "--format",
        choices=["json", "markdown"],
        default="json",
        help="Output format (default: json)",
    )

    return parser.parse_args()
async def main():
    """CLI entry point: load the input, run the prompt test, emit the result.

    Reads either a source file (``--file``) or a diff file (``--diff``),
    optionally a custom system prompt, then runs ``test_prompt`` and writes
    the result as JSON or Markdown to ``--output`` (or stdout).
    """
    args = parse_args()

    # --- Load the content to evaluate -----------------------------------
    if args.file:
        file_path = args.file
        with open(file_path, "r", encoding="utf-8") as f:
            content = f.read()
    else:  # args.diff
        diff_path = args.diff
        with open(diff_path, "r", encoding="utf-8") as f:
            content = f.read()
        # Recover the original file name from the diff file name.
        import os
        file_path = os.path.basename(diff_path)
        if file_path.endswith(".diff"):
            # Drop the trailing ".diff" suffix (5 characters).
            file_path = file_path[:-5]

    # --- Optional custom system prompt ----------------------------------
    system_prompt = None
    if args.system_prompt:
        with open(args.system_prompt, "r", encoding="utf-8") as f:
            system_prompt = f.read()

    # --- Run the evaluation ---------------------------------------------
    result = await test_prompt(
        file_path=file_path,
        content=content,
        model_name=args.model,
        system_prompt=system_prompt,
        output_format=args.format,
    )

    # --- Emit the result ------------------------------------------------
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            if args.format == "json":
                json.dump(result, f, indent=2, ensure_ascii=False)
            else:  # markdown
                f.write(result["markdown"])
        print(f"Output saved to {args.output}")
    elif args.format == "json":
        print(json.dumps(result, indent=2, ensure_ascii=False))
    else:  # markdown
        print(result["markdown"])


if __name__ == "__main__":
    asyncio.run(main())