From fa629538c0a1061f714195423d7c73380848dd7a Mon Sep 17 00:00:00 2001 From: iohub Date: Fri, 8 May 2026 14:01:27 +0800 Subject: [PATCH 01/12] feat(llm): add prompt cache token tracking and TUI display Track cache creation and read tokens from LLM responses (OpenAI's prompt_tokens_details and Anthropic-compatible raw JSON), propagate through agent metadata, and display in the TUI token dashboard. --- go.sum | 2 -- internal/agents/conductor.go | 8 +++++--- internal/agents/executor.go | 8 +++++--- internal/agents/meta.go | 8 +++++--- internal/llm/engine.go | 8 +++++--- internal/llm/engine_openai.go | 36 +++++++++++++++++++++++++++++++++++ internal/tui/tui_model.go | 14 +++++++++----- internal/tui/tui_update.go | 34 +++++++++++++++++++++++++++++++-- internal/tui/tui_view.go | 6 ++++++ 9 files changed, 103 insertions(+), 21 deletions(-) diff --git a/go.sum b/go.sum index 5e303d0..66f6780 100644 --- a/go.sum +++ b/go.sum @@ -12,8 +12,6 @@ github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z github.com/atotto/clipboard v0.1.4/go.mod h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= -github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8= -github.com/aymanbagabas/go-udiff v0.2.0/go.mod h1:RE4Ex0qsGkTAJoQdQQCA0uG+nAzJO/pI/QwceO5fgrA= github.com/aymanbagabas/go-udiff v0.4.1 h1:OEIrQ8maEeDBXQDoGCbbTTXYJMYRCRO1fnodZ12Gv5o= github.com/aymanbagabas/go-udiff v0.4.1/go.mod h1:0L9PGwj20lrtmEMeyw4WKJ/TMyDtvAoK9bf2u/mNo3w= github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= diff --git a/internal/agents/conductor.go b/internal/agents/conductor.go index adb2f6b..d736f98 100644 --- a/internal/agents/conductor.go +++ b/internal/agents/conductor.go @@ -754,9 +754,11 @@ func (a *ConductorAgent) Run(ctx 
context.Context, input string, mem *memory.Conv metadata := map[string]interface{}{} if resp.Usage != nil { metadata["usage"] = map[string]interface{}{ - "prompt_tokens": resp.Usage.PromptTokens, - "completion_tokens": resp.Usage.CompletionTokens, - "total_tokens": resp.Usage.TotalTokens, + "prompt_tokens": resp.Usage.PromptTokens, + "completion_tokens": resp.Usage.CompletionTokens, + "total_tokens": resp.Usage.TotalTokens, + "cache_creation_input_tokens": resp.Usage.CacheCreationInputTokens, + "cache_read_input_tokens": resp.Usage.CacheReadInputTokens, } } if len(metadata) > 0 { diff --git a/internal/agents/executor.go b/internal/agents/executor.go index d5d219d..2742480 100644 --- a/internal/agents/executor.go +++ b/internal/agents/executor.go @@ -75,9 +75,11 @@ func RunAgentLoop(ctx context.Context, cfg ExecutorConfig) (string, error) { metadata := map[string]interface{}{} if resp.Usage != nil { metadata["usage"] = map[string]interface{}{ - "prompt_tokens": resp.Usage.PromptTokens, - "completion_tokens": resp.Usage.CompletionTokens, - "total_tokens": resp.Usage.TotalTokens, + "prompt_tokens": resp.Usage.PromptTokens, + "completion_tokens": resp.Usage.CompletionTokens, + "total_tokens": resp.Usage.TotalTokens, + "cache_creation_input_tokens": resp.Usage.CacheCreationInputTokens, + "cache_read_input_tokens": resp.Usage.CacheReadInputTokens, } } if len(metadata) > 0 { diff --git a/internal/agents/meta.go b/internal/agents/meta.go index 4aca676..b554760 100644 --- a/internal/agents/meta.go +++ b/internal/agents/meta.go @@ -71,9 +71,11 @@ func (a *MetaAgent) Run(ctx context.Context, input string) (string, error) { metadata := map[string]interface{}{} if resp.Usage != nil { metadata["usage"] = map[string]interface{}{ - "prompt_tokens": resp.Usage.PromptTokens, - "completion_tokens": resp.Usage.CompletionTokens, - "total_tokens": resp.Usage.TotalTokens, + "prompt_tokens": resp.Usage.PromptTokens, + "completion_tokens": resp.Usage.CompletionTokens, + "total_tokens": 
resp.Usage.TotalTokens, + "cache_creation_input_tokens": resp.Usage.CacheCreationInputTokens, + "cache_read_input_tokens": resp.Usage.CacheReadInputTokens, } } if len(metadata) > 0 { diff --git a/internal/llm/engine.go b/internal/llm/engine.go index cf31383..5a59930 100644 --- a/internal/llm/engine.go +++ b/internal/llm/engine.go @@ -52,9 +52,11 @@ type FunctionCall struct { // TokenUsage contains token usage information returned by the LLM API. type TokenUsage struct { - PromptTokens int64 `json:"prompt_tokens"` - CompletionTokens int64 `json:"completion_tokens"` - TotalTokens int64 `json:"total_tokens"` + PromptTokens int64 `json:"prompt_tokens"` + CompletionTokens int64 `json:"completion_tokens"` + TotalTokens int64 `json:"total_tokens"` + CacheCreationInputTokens int64 `json:"cache_creation_input_tokens,omitempty"` + CacheReadInputTokens int64 `json:"cache_read_input_tokens,omitempty"` } // Response represents the LLM's response to a GenerateContent call. diff --git a/internal/llm/engine_openai.go b/internal/llm/engine_openai.go index 32e241d..289daf5 100644 --- a/internal/llm/engine_openai.go +++ b/internal/llm/engine_openai.go @@ -137,6 +137,24 @@ func (e *OpenAIEngine) generateStreaming(ctx context.Context, params openai.Chat CompletionTokens: acc.Usage.CompletionTokens, TotalTokens: acc.Usage.TotalTokens, } + + // Extract cache-related tokens from prompt_tokens_details.cached_tokens (OpenAI format) + if acc.Usage.PromptTokensDetails.CachedTokens > 0 { + usage.CacheReadInputTokens = acc.Usage.PromptTokensDetails.CachedTokens + } + + // Also try to extract cache fields from raw JSON (for Anthropic-compatible APIs) + if raw := acc.Usage.RawJSON(); raw != "" { + var rawUsage map[string]any + if err := json.Unmarshal([]byte(raw), &rawUsage); err == nil { + if cacheRead, ok := rawUsage["cache_read_input_tokens"].(float64); ok && cacheRead > 0 { + usage.CacheReadInputTokens = int64(cacheRead) + } + if cacheCreate, ok := 
rawUsage["cache_creation_input_tokens"].(float64); ok && cacheCreate > 0 { + usage.CacheCreationInputTokens = int64(cacheCreate) + } + } + } } return &Response{ @@ -279,6 +297,24 @@ func (e *OpenAIEngine) toResponse(completion *openai.ChatCompletion) *Response { CompletionTokens: completion.Usage.CompletionTokens, TotalTokens: completion.Usage.TotalTokens, } + + // Extract cache-related tokens from prompt_tokens_details.cached_tokens (OpenAI format) + if completion.Usage.PromptTokensDetails.CachedTokens > 0 { + resp.Usage.CacheReadInputTokens = completion.Usage.PromptTokensDetails.CachedTokens + } + + // Also try to extract cache fields from raw JSON (for Anthropic-compatible APIs) + if raw := completion.Usage.RawJSON(); raw != "" { + var rawUsage map[string]any + if err := json.Unmarshal([]byte(raw), &rawUsage); err == nil { + if cacheRead, ok := rawUsage["cache_read_input_tokens"].(float64); ok && cacheRead > 0 { + resp.Usage.CacheReadInputTokens = int64(cacheRead) + } + if cacheCreate, ok := rawUsage["cache_creation_input_tokens"].(float64); ok && cacheCreate > 0 { + resp.Usage.CacheCreationInputTokens = int64(cacheCreate) + } + } + } } return resp diff --git a/internal/tui/tui_model.go b/internal/tui/tui_model.go index 7207413..55407e3 100644 --- a/internal/tui/tui_model.go +++ b/internal/tui/tui_model.go @@ -147,9 +147,11 @@ func (c *tuiEventConsumer) Consume(event *messaging.MessageEvent) error { // AgentTokenUsage tracks token consumption for a single agent. 
type AgentTokenUsage struct { - AgentName string - InputTokens int64 - OutputTokens int64 + AgentName string + InputTokens int64 + OutputTokens int64 + CacheCreationInputTokens int64 + CacheReadInputTokens int64 } // TUI Model @@ -220,8 +222,10 @@ type model struct { currentModel string // Token consumption tracking - inputTokens int64 // accumulated input tokens - outputTokens int64 // accumulated output tokens + inputTokens int64 // accumulated input tokens + outputTokens int64 // accumulated output tokens + cacheCreationInputTokens int64 // accumulated cache creation input tokens + cacheReadInputTokens int64 // accumulated cache read (hit) tokens // Per-agent token tracking tokenUsagePerAgent map[string]*AgentTokenUsage diff --git a/internal/tui/tui_update.go b/internal/tui/tui_update.go index ec40634..7fa33e9 100644 --- a/internal/tui/tui_update.go +++ b/internal/tui/tui_update.go @@ -840,8 +840,9 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { case int: completionVal = int64(v) } - m.outputTokens += completionVal } + m.outputTokens += completionVal + // Also track input tokens from API (PromptTokens) var promptVal int64 if promptTokens, ok := usageMap["prompt_tokens"]; ok { @@ -853,8 +854,35 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { case int: promptVal = int64(v) } - m.inputTokens += promptVal } + m.inputTokens += promptVal + + // Parse cache tokens + var cacheCreationVal int64 + if cacheCreationTokens, ok := usageMap["cache_creation_input_tokens"]; ok { + switch v := cacheCreationTokens.(type) { + case float64: + cacheCreationVal = int64(v) + case int64: + cacheCreationVal = v + case int: + cacheCreationVal = int64(v) + } + } + m.cacheCreationInputTokens += cacheCreationVal + + var cacheReadVal int64 + if cacheReadTokens, ok := usageMap["cache_read_input_tokens"]; ok { + switch v := cacheReadTokens.(type) { + case float64: + cacheReadVal = int64(v) + case int64: + cacheReadVal = v + case int: + cacheReadVal = int64(v) + } + } + 
m.cacheReadInputTokens += cacheReadVal // Per-agent token tracking agentName := msg.event.From @@ -868,6 +896,8 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { } agentUsage.InputTokens += promptVal agentUsage.OutputTokens += completionVal + agentUsage.CacheCreationInputTokens += cacheCreationVal + agentUsage.CacheReadInputTokens += cacheReadVal } } else { // Fallback: estimate tokens from content string diff --git a/internal/tui/tui_view.go b/internal/tui/tui_view.go index d96c6cd..61838ca 100644 --- a/internal/tui/tui_view.go +++ b/internal/tui/tui_view.go @@ -282,6 +282,9 @@ func (m model) renderTokenDashboard() string { totalLine := fmt.Sprintf("Total: ") totalLine += inputStyle.Render(fmt.Sprintf("In: %s ", inStr)) totalLine += outputStyle.Render(fmt.Sprintf("Out: %s ", outStr)) + if m.cacheReadInputTokens > 0 { + totalLine += fmt.Sprintf("Cache: %s ", formatToken(m.cacheReadInputTokens)) + } totalLine += sumStyle.Render(fmt.Sprintf("Σ %s", sumStr)) // Separator @@ -326,6 +329,9 @@ func (m model) renderTokenDashboard() string { agentLine := agentStyle.Render(paddedName) agentLine += " " + agentInStyle.Render(fmt.Sprintf("In: %s ", agentIn)) agentLine += agentOutStyle.Render(fmt.Sprintf("Out: %s", agentOut)) + if au.CacheReadInputTokens > 0 { + agentLine += " " + agentInStyle.Render(fmt.Sprintf("Cache: %s", formatToken(au.CacheReadInputTokens))) + } lines = append(lines, agentLine) } From 84b671a2fffe8033db5bedae96c4d18206d9373c Mon Sep 17 00:00:00 2001 From: iohub Date: Fri, 8 May 2026 14:47:00 +0800 Subject: [PATCH 02/12] feat(deepthinking): integrate deepthinking tool across all agents Register deepthinking tool in Coding, Conductor, and Repo agents with full JSON schema, prompt-level usage instructions, LLM configuration override support, and global context initialization. 
--- internal/agents/coding.go | 2 + internal/agents/coding.prompt.md | 5 ++ internal/agents/conductor.go | 4 + internal/agents/conductor.prompt.md | 8 ++ internal/agents/repo.go | 2 + internal/agents/repo.prompt.md | 5 +- internal/agents/tools.json | 18 ++++ internal/app/app.go | 27 +++--- internal/config/config.go | 11 ++- internal/globalctx/global_context.go | 23 ++--- internal/tools/deepthinking.go | 126 +++++++++++++++++++++++++++ 11 files changed, 205 insertions(+), 26 deletions(-) create mode 100644 internal/tools/deepthinking.go diff --git a/internal/agents/coding.go b/internal/agents/coding.go index 70fe418..6dd3fec 100644 --- a/internal/agents/coding.go +++ b/internal/agents/coding.go @@ -70,6 +70,8 @@ func NewCodingAgent(globalCtx *globalctx.GlobalCtx, llm llm.Engine, maxSteps int } case "micro_agent": fn = globalCtx.MicroAgentTool.Execute + case "deepthinking": + fn = globalCtx.DeepThinkingTool.Execute case "agent_exit": fn = globalCtx.FlowOps.ExecuteAgentExit case "ask_user_for_help": diff --git a/internal/agents/coding.prompt.md b/internal/agents/coding.prompt.md index d41781b..5decdce 100644 --- a/internal/agents/coding.prompt.md +++ b/internal/agents/coding.prompt.md @@ -34,6 +34,8 @@ You have access to the following tools. You must use them to interact with the s * **Thinking & Debugging**: * Use the `thinking` tool to analyze complex problems, plan multi-step tasks, or debug errors. * *Trigger*: If a tool execution fails (e.g., test failed, compilation error), you **MUST** use the `thinking` tool to analyze the error before retrying. **Analyze -> Plan -> Fix**. + * The `micro_agent` tool can delegate focused subtasks to a specialized micro-agent. + * The `deepthinking` tool is an extremely expensive, last-resort analysis tool — see constraints below. # Workflow 1. **Analyze**: Understand the user's intent. If ambiguous, use the `thinking` tool or ask clarifying questions (only if necessary). @@ -58,3 +60,6 @@ You have access to the following tools. 
You must use them to interact with the s * **Be Proactive**: Don't wait for the user to drive every step. Take initiative. * **Be Thorough**: Verify your work. Don't leave broken code. * **Be Safe**: Protect the user's environment. + +### DeepThinking Tool (Last Resort) +- **`deepthinking`**: An extremely expensive, isolated deep analysis tool. ONLY use when conventional methods (thinking tool, micro_agent, code analysis) have been exhausted and the problem requires systematic multi-dimensional analysis. Input: `context` (full problem context including errors, background, what failed) and `goal` (specific objective). This tool is VERY expensive — do NOT use for simple issues. diff --git a/internal/agents/conductor.go b/internal/agents/conductor.go index d736f98..a79ae70 100644 --- a/internal/agents/conductor.go +++ b/internal/agents/conductor.go @@ -294,6 +294,8 @@ func NewConductorAgent(globalCtx *globalctx.GlobalCtx, engine llm.Engine, repo * fn = globalCtx.FileOps.ExecuteReadFile case "print_dir_tree": fn = globalCtx.FileOps.ExecutePrintDirTree + case "deepthinking": + fn = globalCtx.DeepThinkingTool.Execute default: continue } @@ -383,6 +385,8 @@ func (a *ConductorAgent) getToolFunc(name string) tools.ToolFunc { } case "micro_agent": return a.GlobalCtx.MicroAgentTool.Execute + case "deepthinking": + return a.GlobalCtx.DeepThinkingTool.Execute case "agent_exit": return a.GlobalCtx.FlowOps.ExecuteAgentExit case "ask_user_for_help": diff --git a/internal/agents/conductor.prompt.md b/internal/agents/conductor.prompt.md index 93cc2b0..193a688 100644 --- a/internal/agents/conductor.prompt.md +++ b/internal/agents/conductor.prompt.md @@ -54,6 +54,9 @@ You have access to the following specialized sub-agents. You must delegate to th * **Decision Rule**: Before using Meta-Agent, first consider whether a combination of existing agents can solve the task. Only delegate to Meta-Agent when the task genuinely requires a novel agent design. 
Once a custom agent is registered, prefer reusing it for similar tasks rather than invoking Meta-Agent again. * **Already Registered Agents**: Check the **Custom Agents** section in the system prompt to see which custom agents have already been created and are available for delegation. +### Special Tools +- **`deepthinking`**: An extremely expensive deep analysis tool. ONLY use as a last resort when all other approaches have failed. It performs exhaustive system analysis and produces comprehensive solution designs. Input: `context` (full problem context including errors, background, what was tried) and `goal` (specific objective). + ### Workflow Strategy Your core decision loop: **Analyze → Design (if needed) → Execute → Review → Iterate**. @@ -101,6 +104,11 @@ Working agents that produce final output are: **Coding-Agent**, **Chat-Agent**, 4. **No Long-Running Processes**: Do not instruct agents to start development servers or applications (e.g., `npm run dev`). Verification should be done via unit tests, syntax checks, or compilation. 5. **Delegate Repo Analysis**: The Conductor's own `read_file`, `search_by_regex`, `list_dir`, `print_dir_tree` are **LOW-PRIORITY fallbacks** for repository understanding. You MUST delegate all codebase exploration to Repo-Agent via `delegate_repo` — it has codebase semantic tools (`semantic_search`, `query_code_skeleton`, `query_code_snippet`) that are far more effective than raw file operations. Only use your own file tools as a last resort when Repo-Agent is unavailable or its result is clearly insufficient. 6. **Enforce Parallelism**: When delegating read-only or exploration tasks, explicitly require the sub-agent to use parallel tool calls. +7. **Use DeepThinking Sparingly**: You have access to a `deepthinking` tool for extreme cases. This tool is VERY expensive (high token cost, high latency). 
ONLY use `deepthinking` when ALL of the following are true: + - Conventional methods have been exhausted (thinking tool, micro_agent, repo analysis, delegation to sub-agents) + - The problem involves complex multi-system interactions, deep architectural questions, or requires systematic analysis beyond normal reasoning + - Multiple attempts using standard approaches have failed + **Never** use `deepthinking` for simple issues, quick fixes, syntax errors, or straightforward coding tasks. ### Output Format You must structure your textual response (before the tool call) using the following markdown `Thought Process` block: diff --git a/internal/agents/repo.go b/internal/agents/repo.go index 63b3aa8..8229b58 100644 --- a/internal/agents/repo.go +++ b/internal/agents/repo.go @@ -79,6 +79,8 @@ func NewRepoAgent(globalCtx *globalctx.GlobalCtx, llm llm.Engine, publisher *mes fn = globalCtx.RepoOps.ExecuteQueryCodeSkeleton case "query_code_snippet": fn = globalCtx.RepoOps.ExecuteQueryCodeSnippet + case "deepthinking": + fn = globalCtx.DeepThinkingTool.Execute default: continue } diff --git a/internal/agents/repo.prompt.md b/internal/agents/repo.prompt.md index c9210bd..91fbc94 100644 --- a/internal/agents/repo.prompt.md +++ b/internal/agents/repo.prompt.md @@ -48,4 +48,7 @@ When you need to explore or investigate the repository beyond the provided repor Output a clear, structured summary that gives a developer a solid "mental map" of the codebase. **Language Compliance**: -The output summary MUST be in the language specified in **Language Instructions**. \ No newline at end of file +The output summary MUST be in the language specified in **Language Instructions**. + +### DeepThinking Tool (Last Resort) +- **`deepthinking`**: An extremely expensive deep analysis tool. ONLY use as a last resort when all other analysis methods have failed. Input: `context` (full problem context) and `goal` (specific objective). 
This tool is VERY expensive — do NOT use for simple code exploration tasks. \ No newline at end of file diff --git a/internal/agents/tools.json b/internal/agents/tools.json index 2f903f2..64666cb 100644 --- a/internal/agents/tools.json +++ b/internal/agents/tools.json @@ -154,6 +154,24 @@ "required": ["system_prompt", "task"] } }, + { + "name": "deepthinking", + "description": "A deep system analysis and design tool for complex, difficult problems. This tool performs exhaustive multi-dimensional analysis — including root cause analysis, system-level impact assessment, constraint analysis, and solution architecture design — using an isolated, expensive LLM call with a specialized system prompt. **IMPORTANT: This tool is VERY EXPENSIVE (high token cost and latency). Only use it when:** 1) Conventional methods (thinking tool, micro_agent, repo analysis, delegation to sub-agents) have been tried and failed to produce a satisfactory solution, 2) The problem is inherently complex and requires systematic design, or 3) Multiple failed attempts indicate a fundamental misunderstanding that needs deep analysis. **Do NOT use this tool** for simple issues, quick fixes, or routine tasks. The input must include the full problem context (execution environment feedback, key errors, problem background, constraints) and the specific goal to achieve.", + "parameters": { + "type": "object", + "properties": { + "context": { + "type": "string", + "description": "The complete problem context including: execution environment feedback, key errors encountered, problem background, constraints, what has been tried and failed, and any other relevant information needed for deep analysis." + }, + "goal": { + "type": "string", + "description": "The specific objective to achieve. What should the deep analysis produce? A solution design, a debugging strategy, an architectural decision, or a comprehensive plan." 
+ } + }, + "required": ["context", "goal"] + } + }, { "name": "delete_file", "description": "You can use this tool to delete files, you can delete multi files in one toolcall, and you MUST make sure the files is exist before deleting.", diff --git a/internal/app/app.go b/internal/app/app.go index 157340a..8e35bf9 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -65,6 +65,12 @@ func (ca *CodingAssistant) Init(engine llm.Engine, workDir string) { microAgentEngine = ca.client.GetToolEngine("micro_agent") } + // Resolve tool-specific engine for deepthinking + deepthinkingEngine := engine + if ca.client != nil { + deepthinkingEngine = ca.client.GetToolEngine("deepthinking") + } + gctx := globalctx.GlobalCtx{ SpeakLang: ca.config.Agent.SpeakLang, ProjectPath: workDir, @@ -75,16 +81,17 @@ func (ca *CodingAssistant) Init(engine llm.Engine, workDir string) { CodebaseURL: fmt.Sprintf("http://127.0.0.1:%d", ca.CodebasePort), // Tools - FileOps: tools.NewFileOperationsTool(workDir), - SearchOps: tools.NewSearchOperationsTool(workDir), - SysOps: tools.NewSystemOperationsTool(workDir), - ReplaceTool: tools.NewReplaceBlockTool(workDir), - ThinkingTool: tools.NewThinkingTool(), - MicroAgentTool: tools.NewMicroAgentTool(microAgentEngine), - ImplPlanTool: tools.NewImplPlanTool(), - FlowOps: tools.NewFlowControlTool(workDir), - RepoOps: tools.NewRepoOperationsTool(fmt.Sprintf("http://127.0.0.1:%d", ca.CodebasePort), workDir), - UserConfirmMgr: userConfirmMgr, + FileOps: tools.NewFileOperationsTool(workDir), + SearchOps: tools.NewSearchOperationsTool(workDir), + SysOps: tools.NewSystemOperationsTool(workDir), + ReplaceTool: tools.NewReplaceBlockTool(workDir), + ThinkingTool: tools.NewThinkingTool(), + MicroAgentTool: tools.NewMicroAgentTool(microAgentEngine), + ImplPlanTool: tools.NewImplPlanTool(), + FlowOps: tools.NewFlowControlTool(workDir), + RepoOps: tools.NewRepoOperationsTool(fmt.Sprintf("http://127.0.0.1:%d", ca.CodebasePort), workDir), + UserConfirmMgr: 
userConfirmMgr, + DeepThinkingTool: tools.NewDeepThinkingTool(deepthinkingEngine), } ca.globalCtx = &gctx diff --git a/internal/config/config.go b/internal/config/config.go index 94b2e76..3013212 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -75,10 +75,11 @@ type ToolLLMOverride struct { // ToolsLLMConfig holds per-tool LLM overrides. // Priority: per-tool > tools.default > agent > global. type ToolsLLMConfig struct { - UseProvider string `toml:"use_provider"` // default for all tools - MicroAgent *ToolLLMOverride `toml:"micro_agent,omitempty"` - Thinking *ToolLLMOverride `toml:"thinking,omitempty"` - ImplPlan *ToolLLMOverride `toml:"impl_plan,omitempty"` + UseProvider string `toml:"use_provider"` // default for all tools + MicroAgent *ToolLLMOverride `toml:"micro_agent,omitempty"` + Thinking *ToolLLMOverride `toml:"thinking,omitempty"` + ImplPlan *ToolLLMOverride `toml:"impl_plan,omitempty"` + DeepThinking *ToolLLMOverride `toml:"deepthinking,omitempty"` } // TopLevelConfig groups the [global] section. 
@@ -178,6 +179,8 @@ func (c *Config) getToolOverride(toolName string) *ToolLLMOverride { return c.Tools.LLM.Thinking case "impl_plan": return c.Tools.LLM.ImplPlan + case "deepthinking": + return c.Tools.LLM.DeepThinking default: return nil } diff --git a/internal/globalctx/global_context.go b/internal/globalctx/global_context.go index 4a6eb94..44a778e 100644 --- a/internal/globalctx/global_context.go +++ b/internal/globalctx/global_context.go @@ -23,17 +23,18 @@ type GlobalCtx struct { MaxContextTokens int // Tools - FileOps *tools.FileOperationsTool - SearchOps *tools.SearchOperationsTool - SysOps *tools.SystemOperationsTool - ReplaceTool *tools.ReplaceBlockTool - ThinkingTool *tools.ThinkingTool - MicroAgentTool *tools.MicroAgentTool - ImplPlanTool *tools.ImplPlanTool - FlowOps *tools.FlowControlTool - RepoOps *tools.RepoOperationsTool - UserConfirmMgr *tools.UserConfirmManager - Guard *tools.WorkspaceGuard + FileOps *tools.FileOperationsTool + SearchOps *tools.SearchOperationsTool + SysOps *tools.SystemOperationsTool + ReplaceTool *tools.ReplaceBlockTool + ThinkingTool *tools.ThinkingTool + MicroAgentTool *tools.MicroAgentTool + ImplPlanTool *tools.ImplPlanTool + FlowOps *tools.FlowControlTool + RepoOps *tools.RepoOperationsTool + UserConfirmMgr *tools.UserConfirmManager + Guard *tools.WorkspaceGuard + DeepThinkingTool *tools.DeepThinkingTool } func (g *GlobalCtx) FormatPrompt(prompt string) string { diff --git a/internal/tools/deepthinking.go b/internal/tools/deepthinking.go new file mode 100644 index 0000000..d840551 --- /dev/null +++ b/internal/tools/deepthinking.go @@ -0,0 +1,126 @@ +package tools + +import ( + "context" + "fmt" + + "codeactor/internal/llm" +) + +// DeepThinkingTool provides system-level analysis and design capabilities. +// It uses an isolated LLM call with a specialized system prompt for deep, +// structured analysis of complex problems. This tool is EXPENSIVE and should +// only be used after conventional methods have been exhausted. 
+type DeepThinkingTool struct {
+	LLM llm.Engine
+}
+
+// NewDeepThinkingTool creates a DeepThinkingTool backed by the given engine.
+func NewDeepThinkingTool(engine llm.Engine) *DeepThinkingTool {
+	return &DeepThinkingTool{LLM: engine}
+}
+
+// Execute performs deep system analysis using an isolated LLM call.
+//
+// params must contain two non-empty strings:
+//   - "context": the full problem context (environment feedback, key errors,
+//     background, constraints).
+//   - "goal": the specific objective to achieve.
+//
+// It returns the first choice's content ("" when the response has no choices),
+// or an error for a missing parameter, a missing engine, or a failed LLM call.
+func (t *DeepThinkingTool) Execute(ctx context.Context, params map[string]interface{}) (interface{}, error) {
+	// Guard against a tool built without an engine: calling through a nil
+	// engine would panic instead of surfacing an ordinary tool error.
+	if t == nil || t.LLM == nil {
+		return nil, fmt.Errorf("deepthinking tool has no LLM engine configured")
+	}
+
+	problemContext, ok := params["context"].(string)
+	if !ok || problemContext == "" {
+		return nil, fmt.Errorf("context parameter is required and must be a non-empty string")
+	}
+
+	goal, ok := params["goal"].(string)
+	if !ok || goal == "" {
+		return nil, fmt.Errorf("goal parameter is required and must be a non-empty string")
+	}
+
+	task := fmt.Sprintf(
+		"# Problem Context\n\n%s\n\n---\n\n# Goal\n\n%s\n\n---\n\nPlease perform a thorough system analysis and provide a comprehensive solution following the structure defined in your system prompt.",
+		problemContext,
+		goal,
+	)
+
+	// Only the system prompt and the task are sent; both trailing arguments stay nil.
+	messages := []llm.Message{
+		{Role: llm.RoleSystem, Content: getDeepThinkingSystemPrompt()},
+		{Role: llm.RoleUser, Content: task},
+	}
+
+	resp, err := t.LLM.GenerateContent(ctx, messages, nil, nil)
+	if err != nil {
+		return nil, fmt.Errorf("deepthinking LLM call failed: %w", err)
+	}
+
+	// A nil response is treated like one without choices (empty result, no error).
+	if resp == nil || len(resp.Choices) == 0 {
+		return "", nil
+	}
+	return resp.Choices[0].Content, nil
+}
+
+// getDeepThinkingSystemPrompt returns the specialized system prompt for deep analysis.
+func getDeepThinkingSystemPrompt() string { + return `# Role +You are a **Deep Thinking Engine** — an elite system analyst and solution architect. You are activated only for the most challenging problems that cannot be solved by conventional methods. + +# Your Mission +Perform exhaustive, multi-dimensional analysis of the given problem context and produce a comprehensive, actionable solution. + +# Analysis Framework +You MUST structure your response using the following framework: + +## 1. Root Cause Analysis +- Identify the fundamental problem, not just symptoms +- Trace the causal chain from observed failures back to root causes +- Distinguish between primary and secondary issues + +## 2. System-Level Impact Assessment +- Map all affected components and their interdependencies +- Evaluate the blast radius: what else could break? +- Identify cascading effects and hidden risks + +## 3. Constraint Analysis +- Technical constraints (language, framework, platform, performance) +- Resource constraints (time, compute, memory, budget) +- Organizational constraints (team capabilities, existing architecture, policies) +- Risk constraints (security, reliability, data integrity) + +## 4. Solution Design +- Propose 2-3 candidate solutions with clear pros/cons for each +- For each solution, provide: + * Approach overview + * Estimated effort (Low/Medium/High) + * Risk level (Low/Medium/High) + * Trade-offs and implications + +## 5. Recommended Solution — Detailed Plan +- Select the best solution and justify the choice +- Provide a step-by-step implementation plan +- For each step: what to do, which files/components to modify, what to verify +- Include specific code patterns, architectural diagrams (in text), or pseudocode where helpful + +## 6. 
Verification Strategy +- How to verify the solution works correctly +- Test cases and edge cases to consider +- Rollback plan if the solution fails + +# Critical Rules +- Be thorough and systematic — this is an expensive call, make it count +- Ground ALL analysis in the provided context — do not hallucinate +- Provide concrete, actionable recommendations — not vague advice +- Consider long-term maintainability and scalability +- Flag any assumptions you are making explicitly +` +} From de3fb1928d4d0a75c59359bfcf71de7138f2280b Mon Sep 17 00:00:00 2001 From: iohub Date: Fri, 8 May 2026 14:54:43 +0800 Subject: [PATCH 03/12] docs --- docs/Prompt_cache.md | 58 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 docs/Prompt_cache.md diff --git a/docs/Prompt_cache.md b/docs/Prompt_cache.md new file mode 100644 index 0000000..139d89f --- /dev/null +++ b/docs/Prompt_cache.md @@ -0,0 +1,58 @@ + + + +--- + +### 一、 提示词结构设计:严格分离静态与动态 + +LLM 缓存的底层机制是**前缀匹配(Prefix Matching)**:即缓存必须从第一个 Token 开始完全一致,一旦中途有任何一个字符不同,该字符之后的所有缓存将全部失效。 + +* **黄金法则:静态在前,动态在后** + * **最前部(Static Prefix)**:系统角色(System Prompt)、行为准则、全量工具定义(Tool Descriptions)、知识库文档。这些内容在 Agent 运行周期内几乎不变,占据了绝大多数 Token,应在此部分末尾打上缓存断点(Cache Breakpoint)。 + * **最后部(Dynamic Suffix)**:用户的当前提问、实时的环境变量、最新的观察结果(Observation)和步骤输出。 +* **反模式:严禁在系统提示词头部注入动态变量** + * 很多开发者习惯在 System Prompt 开头加上当前时间戳(`Current Time: xxx`)、请求 ID、或当前的任务进度状态。这会导致整个系统提示词前缀每一轮都在变化,使得数万 Token 的缓存 100% 彻底失效。时间戳或状态应作为独立的 System Message **追加**到对话末尾。 +* **确定性序列化(Deterministic Serialization)** + * Agent 经常需要将 JSON、字典对象或代码树序列化后放入上下文。必须确保每次序列化时的**字段顺序是固定且一致的**(例如总是按 Key 的字母排序)。由于库的随机性导致的键位倒置,会在不经意间破坏前缀一致性。 + +### 二、 对话与状态管理:遵守“只追加(Append-Only)”原则 + +在传统的软件工程中,我们习惯修改变量来更新状态;但在面向 LLM 缓存编程时,**上下文不是可编辑的变量,而是只追加的日志**。 + +* **绝对避免修改历史记录** + * **❌ 错误做法:滑动窗口截断(Sliding Window)**。当历史记录过长时,直接删除最前面的几轮对话。这会改变动态部分的开头,导致整个对话历史的缓存全毁。 + * **❌ 错误做法:修改历史消息**。如修改此前某一步的中间思考过程或纠正历史错误。 + * **✅ 正确做法:只追加内容**。以新的消息告知模型之前的错误,或将状态变化作为新的 user/system message 
附在最后。 +* **使用工具调用代替模式切换** + * 不要通过修改 System Prompt 来让 Agent 切换工作模式(如从“规划模式”切换到“执行模式”)。应将所有模式对应的逻辑写死在静态工具列表中,让 Agent 通过触发特定工具来切换状态。统一的工具前缀命名(如 `browser_x`, `shell_y`)也能增加命中率的稳定性。 + +### 三、 系统架构与路由调度设计 + +即使提示词设计完美,如果系统调度不当,同样无法命中缓存。 + +* **路由粘性(Routing Stickiness)** + * **现象**:由于分布式集群中有多台推理服务器,如果同一 Agent 会话的不同步骤被负载均衡分配到了不同的机器,也会导致 Cache Miss。 + * **对策**:如果使用第三方 API(如 OpenAI),可以在 API 请求中带上 `prompt_cache_key` 参数或 `Session ID` 参数,确保具有相同前缀的请求倾向于被路由到同一台已缓存该 KV 状态的服务器上。自托管模型(如 vLLM 架构)也需配合 Session 路由打通 Prefix Cache。 +* **多智能体架构(Multi-Agent Swarm)优化** + * 不要试图构建一个包含 100 个工具、具有超级庞大 System Prompt 的“万能 Agent”。 + * 应使用多智能体架构,每个专精子 Agent 拥有高度稳定的、固定的工具集和提示词(例如专职代码审计的 Agent 只加载代码审计的 Prompt)。这样单个子 Agent 被反复调用时,其头部缓存命中率会极高。 +* **注意缓存生命周期(TTL 悬崖)** + * 多数 API 厂商(如 Anthropic)的提示词缓存默认存活时间(TTL)只有 5 分钟。 + * **对策**:如果 Agent 有异步任务或长时间等待用户反馈(超过 5 分钟),缓存会失效。设计时应尽量让 Agent 密集执行任务;或对于超高价值的共享上下文,通过低频率的定时“Ping”来预热或维持缓存。 + +### 四、 应用层缓存补充(Exact / Semantic Caching) + +除了底层的 KV 缓存,Agent 系统自身也应该设计应用级缓存,拦截对 LLM 的不必要调用。 + +* **精准匹配缓存(Exact Match Caching)** + * 对于高度重复的 Agent 宏动作或工具调用,如果前置依赖和状态(如针对同一网页的相同查询)一致,可直接从 Redis 等存储中返回上一次的工具解析或摘要结果。 +* **语义缓存(Semantic Caching)** + * Agent 经常会遇到“表述不同但意图相同”的用户指令。通过引入语义缓存组件(如 GPTCache)并配合轻量级的 Embedding 模型,计算当前请求与历史请求的向量相似度。如果相似度极高,Agent 可以直接复用之前的规划路线(Plan)或输出,从而完全避免一次全量 LLM 推理。 + +### 总结:Agent 缓存优化的核心清单 + +1. **静态/动态拆分**:系统设定和工具说明放在最前,会话历史次之,当前任务和动态参数压轴。 +2. **清理系统提示词的“脏数据”**:移除一切时间戳、UUID 等动态变量。 +3. **遵循 Append-Only**:绝不随意修改、删除对话历史中的中间项。 +4. **固定输出格式**:强制业务系统的序列化(如 JSON)具有确定性的键值排序。 +5. 
**设计路由亲和性**:保障同一 Agent 任务的后续请求发往同样的缓存节点。 From ded610de609ad5538a4ad3d59a1044a10935cd02 Mon Sep 17 00:00:00 2001 From: iohub Date: Fri, 8 May 2026 15:11:32 +0800 Subject: [PATCH 04/12] docs --- docs/Prompt_Cache_Optimization_Plan.md | 224 +++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 docs/Prompt_Cache_Optimization_Plan.md diff --git a/docs/Prompt_Cache_Optimization_Plan.md b/docs/Prompt_Cache_Optimization_Plan.md new file mode 100644 index 0000000..a2f96e9 --- /dev/null +++ b/docs/Prompt_Cache_Optimization_Plan.md @@ -0,0 +1,224 @@ +# Prompt 缓存优化方案 + +> 审计日期:2025-07-16 +> 审计依据:`docs/Prompt_cache.md` 最佳实践文档 +> 决策方法:经过 `deepthinking` 深度分析后确定 + +--- + +## 一、审计背景 + +对照 LLM Prompt Cache 最佳实践文档的五项核心检查清单,对项目中 7 个 Agent(Conductor、Coding、Repo、Chat、DevOps、Meta、ImplPlan)的 prompt 构建方式进行了全面审计。 + +LLM 缓存采用**严格前缀匹配**(Prefix Matching)机制:从第一个 Token 开始必须完全一致,一旦中途有任何字符不同,该字符之后的所有缓存全部失效。 + +## 二、审计结论总览 + +| 编号 | 严重程度 | 问题 | 决策 | +|------|---------|------|------| +| **A** | 🔴 P0 | Conductor 动态项目上下文放在静态 prompt 之前 | **必须修复** | +| **B** | 🟡 P0 | RepoAgent 动态数据插入顺序不当 | **必须修复** | +| **C** | 🟢 P1 | `FunctionDef.Parameters` 使用 `map[string]any` | **建议优化**(仅加防御注释) | +| **D** | 🟡 P0 | `FormatPrompt` 中 Environment 字段条件拼接 | **必须修复** | +| **E** | ⚪ P2 | 未实现路由亲和性 | **暂不修复**(集群部署时再处理) | + +--- + +## 三、P0 必须修复项 + +### 问题 A:Conductor 动态项目上下文放在静态 prompt 之前 + +**文件**:`internal/agents/conductor.go`(约第 688 行) + +**当前代码**: +```go +// ❌ 动态上下文被 prepend 到静态 prompt 之前 +systemPrompt = fmt.Sprintf("### Project Workspace Context\n%s\n\n", loadResult.Content) + systemPrompt +``` + +**问题分析**: +- `loadProjectContext()` 加载的 `CODEACTOR.md`/`CLAUDE.md`/`AGENTS.md` 每个项目内容不同 +- 它被放在 `conductor.prompt.md`(139 行静态 prompt)的**最前面** +- 切换项目时,第一个 token 就不同 → 整个 System Prompt 缓存 100% 失效 +- Conductor 的 prompt 是系统中最大、最复杂的,缓存失效代价极高 + +**修复方案**:将 Project Context 移到 system prompt **末尾** + +```go +// ✅ 正确:动态上下文放在末尾 +systemPrompt := a.GlobalCtx.FormatPrompt(conductorPrompt) +// ... 
追加 Custom Agents 注册信息 ... +if shouldLoadProjectContext { + systemPrompt += "\n\n### Project Workspace Context\n" + loadResult.Content + "\n" +} +``` + +**预期收益**: +- 139 行静态模板 + Environment + Language Instructions 成为固定前缀 +- 跨项目、跨会话共享缓存 +- 缓存命中率预计提升 60%~80% + +--- + +### 问题 B:RepoAgent 动态数据插入顺序不当 + +**文件**:`internal/agents/repo.go`(约第 195-217 行) + +**当前代码**: +```go +// ❌ 动态 investigation 数据插在静态 prompt 和环境信息之间 +systemPrompt := repoPrompt // 54 行静态 +systemPrompt += info // ← 动态数据插在中间 +systemPrompt = a.GlobalCtx.FormatPrompt(systemPrompt) // 追加 Environment +``` + +**问题分析**: +- `doPreInvestigate()` 返回的 Directory Tree、Core Functions、File Skeletons 每个项目不同,甚至同一项目代码变化后也不同 +- 动态数据后的 Environment + Language Instructions 缓存连带失效 +- RepoAgent 是高频调用 Agent + +**修复方案**:先 `FormatPrompt`(静态 + 环境),最后追加动态调查数据 + +```go +// ✅ 正确:静态在前,动态在最后 +systemPrompt := a.GlobalCtx.FormatPrompt(repoPrompt) +systemPrompt += info // investigation 数据放在最后 +``` + +**预期收益**: +- 54 行静态指令完全固化于前缀 +- 动态数据放尾部符合 LLM 注意力机制的"近因效应" + +--- + +### 问题 D:`FormatPrompt` 中 Environment 字段条件拼接 + +**文件**:`internal/globalctx/global_context.go`(`FormatPrompt` 方法) + +**当前代码**: +```go +// ❌ 条件判断导致同环境下前缀不一致 +if g.ProjectPath != "" { + sb.WriteString(fmt.Sprintf("- **Project Path**: %s\n", g.ProjectPath)) +} +if g.OS != "" { + sb.WriteString(fmt.Sprintf("- **Operating System**: %s\n", g.OS)) +} +if g.Arch != "" { + sb.WriteString(fmt.Sprintf("- **Architecture**: %s\n", g.Arch)) +} +``` + +**问题分析**: +- 条件分支导致相同环境下的请求前缀长度/内容不同 +- 若某字段为空被跳过,Environment 块的结构发生变化 +- 虽然 Environment 在最末尾,不会破坏前面的静态缓存,但会影响完整前缀一致性 + +**修复方案**:移除条件判断,始终输出完整字段结构 + +```go +// ✅ 正确:始终输出完整字段,空值用占位符 +projectPath := g.ProjectPath +if projectPath == "" { + projectPath = "[NOT SET]" +} +os := g.OS +if os == "" { + os = "[NOT SET]" +} +arch := g.Arch +if arch == "" { + arch = "[NOT SET]" +} + +sb.WriteString("\n\n### Environment\n") +sb.WriteString(fmt.Sprintf("- **Project Path**: %s\n", projectPath)) +sb.WriteString(fmt.Sprintf("- **Operating System**: %s\n", os)) 
+sb.WriteString(fmt.Sprintf("- **Architecture**: %s\n", arch)) +``` + +**预期收益**: +- 保证了 Environment 块的结构和前缀长度绝对一致 +- 最大化缓存命中率 + +--- + +## 四、P1 建议优化项 + +### 问题 C:`FunctionDef.Parameters` 使用 `map[string]any` + +**文件**:`internal/llm/engine.go`(`FunctionDef` 结构体) + +**现状**: +```go +type FunctionDef struct { + Name string `json:"name"` + Description string `json:"description,omitempty"` + Parameters map[string]any `json:"parameters,omitempty"` +} +``` + +**分析**: +- Go 标准库 `encoding/json` 对 `map` 序列化时按 key **字母排序**,行为是确定性的 ✅ +- 但如果未来切换 JSON 库(如 `sonic`、`jsoniter`),需确保启用 `SortMapKeys` 配置 +- 当前改为 struct 的工程成本高、收益低,不建议重构 + +**建议方案**:仅添加防御性注释 + +```go +// ⚠️ IMPORTANT: The current implementation relies on encoding/json's deterministic +// sorting of map keys (alphabetical order). If migrating to sonic, jsoniter, or +// another JSON library in the future, ensure SortMapKeys is enabled to maintain +// deterministic key ordering and prevent prompt cache fragmentation. +type FunctionDef struct { + Name string `json:"name"` + Description string `json:"description,omitempty"` + Parameters map[string]any `json:"parameters,omitempty"` +} +``` + +--- + +## 五、P2 暂不修复项 + +### 问题 E:未实现路由亲和性 + +**分析**: +- 当前开发环境为单节点运行,路由亲和性无实际影响 +- 引入 `prompt_cache_key` 或 Session Router 需改造 LLM Client 层,增加状态管理复杂度 + +**规划**: +- 集群部署时再引入一致性哈希路由或共享 Redis 缓存层 +- 可在 `llm.CallOptions` 中预留 `PromptCacheKey` 字段供未来使用 + +--- + +## 六、其他发现:可接受的架构代价 + +### Compact 压缩导致的缓存 Miss + +压缩引擎的 L3(丢弃早期消息)和 L2(截断工具输出)会改变消息结构,导致后续 LLM 调用的前缀变化。这是**可接受的架构代价**:缓存 Miss 是换取 Token 超限安全的必要手段,不应为保缓存而限制压缩。 + +### 动态 Agent 注册导致的工具定义变化 + +Meta-Agent 运行时注册自定义 Agent 会改变 `tool_defs` 列表。由于 `tool_defs` 作为 API 请求参数参与前缀匹配,动态注册天然导致 Cache Miss。这属于业务特性,无法避免。 + +--- + +## 七、实施优先级 + +| 优先级 | 问题 | 预计改动量 | 风险 | +|--------|------|-----------|------| +| **1** | D — FormatPrompt 条件拼接 | ~10 行 | 极低 | +| **2** | B — RepoAgent 顺序调整 | ~3 行 | 低 | +| **3** | A — Conductor 上下文移至末尾 | ~5 行 | 低(需验证 LLM 指令遵循度) | +| **4** | C — 添加防御注释 | ~5 行 | 零风险 | + +--- + +## 
八、验证策略 + +1. **单元测试**:验证 `FormatPrompt` 在不同参数下输出前缀一致 +2. **集成测试**:使用 Mock LLM 拦截请求,统计前缀命中率 +3. **LLM 行为回归**:选取典型编码任务,验证修复后指令遵循率、工具调用准确率无退化 +4. **回滚方案**:所有变更通过独立 Git Commit 隔离,异常时一键 Revert \ No newline at end of file From 4dc43b8b1da8d0da7227969097f3d2aa16fdaac2 Mon Sep 17 00:00:00 2001 From: iohub Date: Fri, 8 May 2026 15:20:47 +0800 Subject: [PATCH 05/12] perf(agents): reorder system prompt components for better cache reuse Move static agent descriptions before dynamic project context in the system prompt to improve LLM prompt cache hit rate. The fixed prefix portion can now be cached while the variable project context at the end avoids cache invalidation. Also include: - fix variable declaration scope in repo agent (info used before decl) - add explicit default values in global context env construction - document JSON key ordering requirement for cache determinism --- internal/agents/conductor.go | 27 ++++++++++++++++++--------- internal/agents/repo.go | 5 +++-- internal/globalctx/global_context.go | 21 ++++++++++++++------- internal/llm/engine.go | 5 +++++ 4 files changed, 40 insertions(+), 18 deletions(-) diff --git a/internal/agents/conductor.go b/internal/agents/conductor.go index a79ae70..3e24f03 100644 --- a/internal/agents/conductor.go +++ b/internal/agents/conductor.go @@ -658,14 +658,7 @@ func (a *ConductorAgent) Run(ctx context.Context, input string, mem *memory.Conv // Always start with System Prompt (with any registered custom agents appended) systemPrompt := a.GlobalCtx.FormatPrompt(conductorPrompt) - if len(a.customAgents) > 0 { - systemPrompt += "\n\n### Custom Agents\nThe following specialized agents have been designed by Meta-Agent and are permanently available for delegation:\n\n" - for _, ca := range a.customAgents { - systemPrompt += fmt.Sprintf("- **%s** (`delegate_%s`): %s\n", ca.DisplayName, ca.Name, ca.Description) - } - systemPrompt += "\nUse these agents via their delegate tools for tasks matching their specializations.\n" - } - + var 
projectContext string // 只在首次对话时加载项目上下文文件(CODEACTOR.md、CLAUDE.md、AGENTS.md), // 同一会话的后续追问无需重复注入,避免浪费 token。 // memory 中不存储 system 消息,因此 len(mem.GetMessages()) == 0 即可判断是否为首次对话。 @@ -675,8 +668,24 @@ func (a *ConductorAgent) Run(ctx context.Context, input string, mem *memory.Conv if a.Publisher != nil { a.Publisher.Publish("context_loaded", loadResult, a.Name()) } - systemPrompt = fmt.Sprintf("### Project Workspace Context\n%s\n\n", loadResult.Content) + systemPrompt + // 延迟追加:先构建完整的 system prompt(静态前缀 + 环境信息 + 自定义 Agent), + // 最后才追加项目上下文,确保静态前缀可被 LLM Prompt Cache 复用 + projectContext = fmt.Sprintf("\n\n### Project Workspace Context\n%s\n", loadResult.Content) + } + } + + // 自定义 Agent 描述 + if len(a.customAgents) > 0 { + systemPrompt += "\n\n### Custom Agents\nThe following specialized agents have been designed by Meta-Agent and are permanently available for delegation:\n\n" + for _, ca := range a.customAgents { + systemPrompt += fmt.Sprintf("- **%s** (`delegate_%s`): %s\n", ca.DisplayName, ca.Name, ca.Description) } + systemPrompt += "\nUse these agents via their delegate tools for tasks matching their specializations.\n" + } + + // 追加项目上下文(放在所有静态内容之后,确保缓存命中率) + if projectContext != "" { + systemPrompt += projectContext } messages = append(messages, llm.Message{ diff --git a/internal/agents/repo.go b/internal/agents/repo.go index 8229b58..bbd61ee 100644 --- a/internal/agents/repo.go +++ b/internal/agents/repo.go @@ -175,11 +175,12 @@ func (a *RepoAgent) Run(ctx context.Context, input string) (string, error) { slog.Info("RepoAgent performing pre-investigation", "project_dir", a.GlobalCtx.ProjectPath) investigation, err := a.doPreInvestigate(a.GlobalCtx.ProjectPath) + var info string if err != nil { slog.Warn("RepoAgent pre-investigation failed", "error", err) } else { // Add investigation results to system prompt - info := "\n\nRepository Information:\n" + info = "\n\nRepository Information:\n" info += "\nDirectory Tree:\n" + investigation.Data.DirectoryTree + "\n" 
info += "\nCore Functions:\n" for _, fn := range investigation.Data.CoreFunctions { @@ -211,10 +212,10 @@ func (a *RepoAgent) Run(ctx context.Context, input string) (string, error) { info += fmt.Sprintf("File: %s\n```%s\n%s\n```\n", sk.Filepath, sk.Language, sk.SkeletonText) } - systemPrompt += info } systemPrompt = a.GlobalCtx.FormatPrompt(systemPrompt) + systemPrompt += info cfg := ExecutorConfig{ SystemPrompt: systemPrompt, diff --git a/internal/globalctx/global_context.go b/internal/globalctx/global_context.go index 44a778e..2065da1 100644 --- a/internal/globalctx/global_context.go +++ b/internal/globalctx/global_context.go @@ -42,17 +42,24 @@ func (g *GlobalCtx) FormatPrompt(prompt string) string { sb.WriteString(prompt) // Environment context - sb.WriteString("\n\n### Environment\n") - if g.ProjectPath != "" { - sb.WriteString(fmt.Sprintf("- **Project Path**: %s\n", g.ProjectPath)) + projectPath := g.ProjectPath + if projectPath == "" { + projectPath = "[NOT SET]" } - if g.OS != "" { - sb.WriteString(fmt.Sprintf("- **Operating System**: %s\n", g.OS)) + os := g.OS + if os == "" { + os = "[NOT SET]" } - if g.Arch != "" { - sb.WriteString(fmt.Sprintf("- **Architecture**: %s\n", g.Arch)) + arch := g.Arch + if arch == "" { + arch = "[NOT SET]" } + sb.WriteString("\n\n### Environment\n") + sb.WriteString(fmt.Sprintf("- **Project Path**: %s\n", projectPath)) + sb.WriteString(fmt.Sprintf("- **Operating System**: %s\n", os)) + sb.WriteString(fmt.Sprintf("- **Architecture**: %s\n", arch)) + // Language if g.SpeakLang != "" { sb.WriteString(fmt.Sprintf("\n### Language Instructions\nYou MUST use **%s** for ALL output, including your internal 'Thought Process', 'Thinking Tool' usage, reasoning steps, and final responses.\n", g.SpeakLang)) diff --git a/internal/llm/engine.go b/internal/llm/engine.go index 5a59930..83e5583 100644 --- a/internal/llm/engine.go +++ b/internal/llm/engine.go @@ -31,6 +31,11 @@ type ToolDef struct { } // FunctionDef defines a function tool's 
signature. +// +// ⚠️ IMPORTANT: The current implementation relies on encoding/json's deterministic +// sorting of map keys (alphabetical order). If migrating to sonic, jsoniter, or +// another JSON library in the future, ensure SortMapKeys is enabled to maintain +// deterministic key ordering and prevent prompt cache fragmentation. type FunctionDef struct { Name string `json:"name"` Description string `json:"description,omitempty"` From cebeda5cf8002227e8f12c2f7224038504abf2b5 Mon Sep 17 00:00:00 2001 From: iohub Date: Fri, 8 May 2026 20:28:02 +0800 Subject: [PATCH 06/12] feat(tui): dynamic input area sizing and color harmonization Replace fixed input dimensions with content-aware dynamic sizing: height now grows from 3 to 12 lines based on content, and width adapts to terminal size without a hard cap. Unify editor colors with the 256-color palette (236/237 backgrounds). Fix key handling so non-empty input routes cursor keys to the textarea for line navigation. Update footer height estimate from 7 to 18 to match the larger input layout. 
--- internal/tui/tui_helpers.go | 44 ++++++++++++++++++++++++++++++++----- internal/tui/tui_model.go | 28 +++++++++++++---------- internal/tui/tui_render.go | 3 ++- internal/tui/tui_update.go | 8 +++---- internal/tui/tui_view.go | 1 + 5 files changed, 62 insertions(+), 22 deletions(-) diff --git a/internal/tui/tui_helpers.go b/internal/tui/tui_helpers.go index 9b04ed1..d3dc988 100644 --- a/internal/tui/tui_helpers.go +++ b/internal/tui/tui_helpers.go @@ -59,20 +59,52 @@ func StartTUI(taskFilePath string, ca *app.CodingAssistant, tm *http.TaskManager } func (m model) computeFieldWidth() int { const minField = 38 - const maxField = 90 + const margin = 4 // small padding from terminal edges if m.termWidth <= 0 { - return 60 + return 80 } - avail := m.termWidth - 8 + avail := m.termWidth - margin if avail < minField { return minField } - if avail > maxField { - return maxField - } return avail } +// computeInputHeight calculates the textarea height based on content lines +// and available terminal space. Height grows with content but is capped. +func (m model) computeInputHeight() int { + const minHeight = 3 + const maxHeight = 12 + + // Count lines in current value (at least 1 for empty input) + lines := strings.Count(m.input.Value(), "\n") + 1 + desired := lines + 1 // +1 line for comfortable editing headroom + + if desired < minHeight { + desired = minHeight + } + + // Cap to at most ~1/3 of terminal height so viewport remains usable + if m.termHeight > 0 { + termMax := (m.termHeight - 8) / 2 // 8 lines reserved for separator + status + token dashboard + if termMax < minHeight { + termMax = minHeight + } + if termMax > maxHeight { + termMax = maxHeight + } + if desired > termMax { + desired = termMax + } + } else { + if desired > maxHeight { + desired = maxHeight + } + } + + return desired +} + // getToolCallIDFromEventContent extracts tool_call_id from event content. 
func getToolCallIDFromEventContent(content interface{}) string { if m, ok := content.(map[string]interface{}); ok { diff --git a/internal/tui/tui_model.go b/internal/tui/tui_model.go index 55407e3..0454988 100644 --- a/internal/tui/tui_model.go +++ b/internal/tui/tui_model.go @@ -59,7 +59,7 @@ var ( toolDoneStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("114")) // green — success toolErrorStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("167")) // red — error - // Mode-specific styles (vim-like edit / command modes) + // Mode-specific styles (vim-like edit / command modes) — harmonized with TUI 256-color palette commandPrefixStyle = lipgloss.NewStyle().Foreground(lipgloss.Color("214")).Bold(true) // orange ":" commandModeBarStyle = lipgloss.NewStyle(). Background(lipgloss.Color("214")). @@ -242,29 +242,35 @@ type model struct { func initialModel(preloadedTaskContent string, ca *app.CodingAssistant, tm *http.TaskManager, dm *datamanager.DataManager, useDarkStyle bool) model { ti := textarea.New() + + // ── Editor input styles (harmonized with TUI 256-color palette) ── + // Accent: 39 (blue, matches NameNormal/tool names) + // Text: 252 (light gray, matches Body/AIResStyle) + // Muted: 245 (gray, matches ContentLine/ParamKey) + // Subtle bg: 236 (dark gray, barely visible on dark terminals) + // Cursor line: 237 (matches SeparatorStyle) + ti.Cursor.Style = lipgloss.NewStyle().Foreground(lipgloss.Color("39")) ti.Placeholder = langManager.GetText("TaskDescPlaceholder") ti.Focus() ti.CharLimit = 0 ti.SetWidth(60) - ti.SetHeight(2) + ti.SetHeight(3) ti.ShowLineNumbers = false - // Text style for both focused and blurred states textStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("252")) ti.FocusedStyle.Text = textStyle ti.BlurredStyle.Text = textStyle - // Edit mode base style: dark background shadow - editBaseStyle := lipgloss.NewStyle().Background(lipgloss.Color("235")) + editBaseStyle := lipgloss.NewStyle().Background(lipgloss.Color("236")) 
ti.FocusedStyle.Base = editBaseStyle ti.BlurredStyle.Base = editBaseStyle - ti.FocusedStyle.Prompt = lipgloss.NewStyle().Foreground(lipgloss.Color("39")).Bold(true).Background(lipgloss.Color("235")) - ti.BlurredStyle.Prompt = lipgloss.NewStyle().Foreground(lipgloss.Color("244")).Background(lipgloss.Color("235")) - ti.FocusedStyle.CursorLine = lipgloss.NewStyle().Background(lipgloss.Color("235")) - ti.BlurredStyle.CursorLine = lipgloss.NewStyle().Background(lipgloss.Color("235")) - ti.FocusedStyle.Placeholder = lipgloss.NewStyle().Foreground(lipgloss.Color("245")).Background(lipgloss.Color("235")) - ti.BlurredStyle.Placeholder = lipgloss.NewStyle().Foreground(lipgloss.Color("245")).Background(lipgloss.Color("235")) + ti.FocusedStyle.Prompt = lipgloss.NewStyle().Foreground(lipgloss.Color("39")).Bold(true).Background(lipgloss.Color("236")) + ti.BlurredStyle.Prompt = lipgloss.NewStyle().Foreground(lipgloss.Color("244")).Background(lipgloss.Color("236")) + ti.FocusedStyle.CursorLine = lipgloss.NewStyle().Background(lipgloss.Color("237")) + ti.BlurredStyle.CursorLine = lipgloss.NewStyle().Background(lipgloss.Color("237")) + ti.FocusedStyle.Placeholder = lipgloss.NewStyle().Foreground(lipgloss.Color("245")).Background(lipgloss.Color("236")) + ti.BlurredStyle.Placeholder = lipgloss.NewStyle().Foreground(lipgloss.Color("245")).Background(lipgloss.Color("236")) // Dynamic prompt: "❯ " on first line, " " on continuation lines ti.SetPromptFunc(2, func(line int) string { diff --git a/internal/tui/tui_render.go b/internal/tui/tui_render.go index 3b52b66..5c934c7 100644 --- a/internal/tui/tui_render.go +++ b/internal/tui/tui_render.go @@ -12,7 +12,8 @@ import ( ) func (m *model) resizeViewport() { - footerHeight := 7 + // footer estimate: separator(1) + input(max 12) + token_dashboard(3) + status(1) + padding(1) = 18 + footerHeight := 18 if m.errMsg != "" { footerHeight++ } diff --git a/internal/tui/tui_update.go b/internal/tui/tui_update.go index 7fa33e9..bd019ce 100644 --- 
a/internal/tui/tui_update.go +++ b/internal/tui/tui_update.go @@ -798,10 +798,10 @@ func (m model) Update(msg tea.Msg) (tea.Model, tea.Cmd) { m.handleTaskHistoryCycle(msg.String()) return m, nil } - // Non-empty: pass to viewport for scrolling - var vpCmd tea.Cmd - m.viewport, vpCmd = m.viewport.Update(msg) - return m, vpCmd + // Input has content: pass to textarea for line navigation + var cmd tea.Cmd + m.input, cmd = m.input.Update(msg) + return m, cmd default: // Reset history cursor when user starts typing diff --git a/internal/tui/tui_view.go b/internal/tui/tui_view.go index 61838ca..9ff6b45 100644 --- a/internal/tui/tui_view.go +++ b/internal/tui/tui_view.go @@ -79,6 +79,7 @@ func (m model) View() string { } else { // ── Edit mode: textarea with dark background (via Base style), no bar ── m.input.SetWidth(m.computeFieldWidth()) + m.input.SetHeight(m.computeInputHeight()) inputLine := m.input.View() footer.WriteString(lipgloss.NewStyle().Render(inputLine)) footer.WriteString("\n") From fdbbd699fd7ac34e5b0ebcc87cf8eec378433f93 Mon Sep 17 00:00:00 2001 From: iohub Date: Fri, 8 May 2026 22:43:52 +0800 Subject: [PATCH 07/12] refactor(tui): consolidate token dashboard header and formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Merge the token statistics summary (In/Out/Cache/Σ) into the dashboard header line, eliminating the separate totalLine. Also simplify fmt.Sprintf width formatting from string concatenation to the idiomatic %-*s format specifier. --- internal/tui/tui_view.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/internal/tui/tui_view.go b/internal/tui/tui_view.go index 9ff6b45..98f408b 100644 --- a/internal/tui/tui_view.go +++ b/internal/tui/tui_view.go @@ -263,7 +263,7 @@ func (m model) renderTokenDashboard() string { headerStyle := lipgloss.NewStyle(). Bold(true). 
Foreground(lipgloss.Color("240")) - header := headerStyle.Render("─ Token 消耗 ") + var header string // Total line — highlighted inputStyle := lipgloss.NewStyle(). @@ -280,13 +280,14 @@ func (m model) renderTokenDashboard() string { outStr := formatToken(m.outputTokens) sumStr := formatToken(totalTokens) - totalLine := fmt.Sprintf("Total: ") - totalLine += inputStyle.Render(fmt.Sprintf("In: %s ", inStr)) - totalLine += outputStyle.Render(fmt.Sprintf("Out: %s ", outStr)) + // Combined header with token summary + header = headerStyle.Render("─ ") + + inputStyle.Render(fmt.Sprintf("In: %s ", inStr)) + + outputStyle.Render(fmt.Sprintf("Out: %s ", outStr)) if m.cacheReadInputTokens > 0 { - totalLine += fmt.Sprintf("Cache: %s ", formatToken(m.cacheReadInputTokens)) + header += fmt.Sprintf("Cache: %s ", formatToken(m.cacheReadInputTokens)) } - totalLine += sumStyle.Render(fmt.Sprintf("Σ %s", sumStr)) + header += sumStyle.Render(fmt.Sprintf("Σ %s", sumStr)) // Separator sepStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("236")) @@ -308,7 +309,6 @@ func (m model) renderTokenDashboard() string { var lines []string lines = append(lines, header) - lines = append(lines, totalLine) lines = append(lines, sepStyle.Render(strings.Repeat("─", 48))) for _, au := range agents { @@ -317,7 +317,7 @@ func (m model) renderTokenDashboard() string { agentLabel = agentLabel[:maxAgentNameWidth-1] + "…" } // Pad agent name to fixed width - paddedName := fmt.Sprintf("%-"+fmt.Sprintf("%ds", maxAgentNameWidth)+"s", agentLabel) + paddedName := fmt.Sprintf("%-*s", maxAgentNameWidth, agentLabel) agentIn := formatToken(au.InputTokens) agentOut := formatToken(au.OutputTokens) From 0ffc463753a2e4106b5b00bb7cfacbad80a1381a Mon Sep 17 00:00:00 2001 From: iohub Date: Fri, 8 May 2026 23:39:51 +0800 Subject: [PATCH 08/12] refactor(tui): align token dashboard header with agent name column --- internal/tui/tui_view.go | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff 
--git a/internal/tui/tui_view.go b/internal/tui/tui_view.go index 98f408b..b9c23ba 100644 --- a/internal/tui/tui_view.go +++ b/internal/tui/tui_view.go @@ -259,12 +259,19 @@ func (m model) renderTokenDashboard() string { Padding(0, 1). Width(m.termWidth - 2) // account for viewport padding - // Header + // Header — left-aligned "Total" label + token summary headerStyle := lipgloss.NewStyle(). Bold(true). Foreground(lipgloss.Color("240")) var header string + // Token column width for alignment + const maxAgentNameWidth = 10 + + inStr := formatToken(m.inputTokens) + outStr := formatToken(m.outputTokens) + sumStr := formatToken(totalTokens) + // Total line — highlighted inputStyle := lipgloss.NewStyle(). Bold(true). @@ -276,12 +283,8 @@ func (m model) renderTokenDashboard() string { Bold(true). Foreground(lipgloss.Color("243")) // medium gray for sum - inStr := formatToken(m.inputTokens) - outStr := formatToken(m.outputTokens) - sumStr := formatToken(totalTokens) - // Combined header with token summary - header = headerStyle.Render("─ ") + + header = headerStyle.Render(fmt.Sprintf("%-*s", maxAgentNameWidth, "Total")) + " " + inputStyle.Render(fmt.Sprintf("In: %s ", inStr)) + outputStyle.Render(fmt.Sprintf("Out: %s ", outStr)) if m.cacheReadInputTokens > 0 { @@ -300,13 +303,9 @@ func (m model) renderTokenDashboard() string { } } sort.Slice(agents, func(i, j int) bool { - return (agents[i].InputTokens+agents[i].OutputTokens) > (agents[j].InputTokens+agents[j].OutputTokens) + return (agents[i].InputTokens + agents[i].OutputTokens) > (agents[j].InputTokens + agents[j].OutputTokens) }) - // Calculate column widths for alignment - const maxAgentNameWidth = 18 - const colFormat = "In: %-6s Out: %-6s" - var lines []string lines = append(lines, header) lines = append(lines, sepStyle.Render(strings.Repeat("─", 48))) From b4aa051c2dfee6c24a2e4b8777d47e73e433edd3 Mon Sep 17 00:00:00 2001 From: iohub Date: Fri, 8 May 2026 23:43:46 +0800 Subject: [PATCH 09/12] misc --- 
.codeactor/skills/commit.md | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/.codeactor/skills/commit.md b/.codeactor/skills/commit.md index a419141..b5064cf 100644 --- a/.codeactor/skills/commit.md +++ b/.codeactor/skills/commit.md @@ -29,18 +29,14 @@ 1. 运行 `git status --short` 获取所有变更文件列表。 2. **过滤排除以下文件**: - | 类别 | 排除规则 | - |------|----------| - | **数据文件** | 扩展名:`.csv`, `.tsv`, `.xlsx`, `.xls`, `.parquet`, `.arrow`, `.feather`, `.h5`, `.hdf5`, `.npz`, `.npy`, `.pkl`, `.joblib`, `.sqlite`, `.sqlite3`, `.db`, `.dta`, `.sav`, `.rds`, `.rda` | - | **二进制/编译产物** | 扩展名:`.exe`, `.dll`, `.so`, `.a`, `.o`, `.obj`, `.bin`, `.pt`, `.pth`, `.onnx`, `.safetensors`, `.gguf`, `.wasm`, `.pyc`, `.pyo`, `.class`, `.jar`, `.war`, `.apk`, `.ipa`, `.whl`, `.egg` | - | **媒体文件** | 扩展名:`.png`, `.jpg`, `.jpeg`, `.gif`, `.bmp`, `.tif`, `.tiff`, `.webp`, `.svg`(非图标/UI资源时排除),`.mp3`, `.wav`, `.flac`, `.ogg`, `.mp4`, `.avi`, `.mov`, `.mkv`, `.webm` | - | **压缩包** | 扩展名:`.zip`, `.tar`, `.gz`, `.bz2`, `.7z`, `.rar`, `.xz`, `.zst`, `.tgz`, `.tar.gz`, `.tar.bz2` | - | **测试数据/夹具** | 路径包含:`test/data/`、`tests/data/`、`test/fixtures/`、`tests/fixtures/`、`testdata/`、`__test_data__/`、`sample_data/`、`*.testdata.*` | - | **大文件提醒** | 单个文件超过 **5MB** 时,跳过并提醒用户手动处理 | + | **数据文件** | + | **二进制/编译产物** | + | **媒体文件** | + | **压缩包** | + | **测试数据/夹具** | 3. 对过滤后的代码文件执行 `git add ...`。 4. 执行 `git commit -m ""` 提交。**无需用户确认,直接提交。** -5. 
如果过滤后没有任何文件可提交,告知用户「没有需要提交的代码文件(数据文件、二进制文件已自动排除)」,然后结束。 ## 步骤 4:展示提交结果 提交完成后,运行 `git log --oneline -3` 展示最近3条提交记录,让用户确认 commit 内容是否正确。 @@ -53,4 +49,3 @@ **注意事项**: - 如果仓库没有变更,直接告知用户 "没有需要提交的变更" 并结束 - 所有 git 命令使用 `run_bash` 工具执行 -- 保持交互自然,不要过度自动化 From cbe268c4ca55cdcf0680cac8cca6edcd84d3276c Mon Sep 17 00:00:00 2001 From: iohub Date: Sat, 9 May 2026 00:05:01 +0800 Subject: [PATCH 10/12] refactor(tui): compute footer height dynamically based on render state Replace the hardcoded footerHeight constant with a computeFooterHeight() method that calculates the actual height from current UI state including command mode, input area, autocomplete suggestions, error messages, and token dashboard. --- internal/tui/tui_render.go | 46 ++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/internal/tui/tui_render.go b/internal/tui/tui_render.go index 5c934c7..fe01a96 100644 --- a/internal/tui/tui_render.go +++ b/internal/tui/tui_render.go @@ -11,12 +11,50 @@ import ( "github.com/charmbracelet/lipgloss" ) -func (m *model) resizeViewport() { - // footer estimate: separator(1) + input(max 12) + token_dashboard(3) + status(1) + padding(1) = 18 - footerHeight := 18 +// computeFooterHeight calculates the actual footer height based on current state. +// This must match the row count produced by model.View() footer rendering. 
+func (m *model) computeFooterHeight() int { + height := 1 // separator line + + // Input area + if m.commandMode { + height += 1 // command mode line + } else { + height += m.computeInputHeight() + // Skill autocomplete suggestions + if m.skillAutoComplete && len(m.skillSuggestions) > 0 { + height += len(m.skillSuggestions) + 1 // suggestion lines + hint line + } + } + + // Error message if m.errMsg != "" { - footerHeight++ + height += 1 } + + // Token dashboard + totalTokens := m.inputTokens + m.outputTokens + if totalTokens == 0 { + // Single line: "In: 0 | Out: 0" + height += 1 + } else { + // Dashboard with border: 2 (borders) + 1 (header) + 1 (separator) + agent rows + height += 4 // 2 borders + 1 header + 1 separator + for _, au := range m.tokenUsagePerAgent { + if au.InputTokens+au.OutputTokens > 0 { + height++ + } + } + } + + // Two blank lines + status line (see View() — footer.WriteString("\n") twice + statusLine) + height += 3 + + return height +} + +func (m *model) resizeViewport() { + footerHeight := m.computeFooterHeight() vpHeight := m.termHeight - footerHeight if vpHeight < 3 { vpHeight = 3 From 760092ff7642d15741c74bec702bfb165cedb000 Mon Sep 17 00:00:00 2001 From: iohub Date: Sat, 9 May 2026 00:20:12 +0800 Subject: [PATCH 11/12] refactor(tui): reposition running indicator to left of status line Move the running badge in front of the status line instead of appending it after, improving visual hierarchy in the footer bar. The variable is also renamed from taskIndicator to runningBadge for clarity. 
--- internal/tui/tui_view.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/internal/tui/tui_view.go b/internal/tui/tui_view.go index b9c23ba..2a5f295 100644 --- a/internal/tui/tui_view.go +++ b/internal/tui/tui_view.go @@ -125,13 +125,13 @@ func (m model) View() string { footer.WriteString(m.renderTokenDashboard()) footer.WriteString("\n") - // Status line: mode indicator + task indicator + model name - taskIndicator := "" + // Status line: Running indicator (leftmost) + mode indicator + var runningBadge string if m.taskRunning { if m.currentModel != "" { - taskIndicator = logStatusStyle.Render(fmt.Sprintf(" ◷ Running [%s]...", m.currentModel)) + runningBadge = logStatusStyle.Render(fmt.Sprintf(" ◷ Running [%s]...", m.currentModel)) } else { - taskIndicator = logStatusStyle.Render(" ◷ Running...") + runningBadge = logStatusStyle.Render(" ◷ Running...") } } footer.WriteString("\n") @@ -145,7 +145,11 @@ func (m model) View() string { } else { statusLine = footerStyle.Render(langManager.GetText("EditModeTips")) } - footer.WriteString(lipgloss.NewStyle().MarginLeft(2).Render(statusLine + taskIndicator)) + if runningBadge != "" { + footer.WriteString(lipgloss.NewStyle().MarginLeft(2).Render(runningBadge + " " + statusLine)) + } else { + footer.WriteString(lipgloss.NewStyle().MarginLeft(2).Render(statusLine)) + } b.WriteString(footer.String()) From d82b42ca06a06b3a0d2e55cfdd5933006e98a691 Mon Sep 17 00:00:00 2001 From: iohub Date: Sat, 9 May 2026 08:28:57 +0800 Subject: [PATCH 12/12] docs --- docs/Browser_Agent_Design.md | 902 +++++++++++++++++++++++++++++++++++ 1 file changed, 902 insertions(+) create mode 100644 docs/Browser_Agent_Design.md diff --git a/docs/Browser_Agent_Design.md b/docs/Browser_Agent_Design.md new file mode 100644 index 0000000..eee6242 --- /dev/null +++ b/docs/Browser_Agent_Design.md @@ -0,0 +1,902 @@ +# Browser-Agent 设计方案:go-rod 浏览器自动化集成 + +> **状态**: 设计阶段 +> **版本**: v1.0 +> **日期**: 2025-07 +> **依赖**: 
[go-rod/rod](https://github.com/go-rod/rod) + +--- + +## 目录 + +1. [概述](#1-概述) +2. [根因分析](#2-根因分析) +3. [候选架构对比](#3-候选架构对比) +4. [推荐架构设计](#4-推荐架构设计) +5. [分阶段实施路线图](#5-分阶段实施路线图) +6. [文件结构规划](#6-文件结构规划) +7. [配置设计](#7-配置设计) +8. [浏览器工具定义](#8-浏览器工具定义) +9. [安全设计](#9-安全设计) +10. [数据流设计](#10-数据流设计) +11. [关键代码模式](#11-关键代码模式) +12. [TUI 与 HTTP/WebSocket 集成](#12-tui-与-httpwebsocket-集成) +13. [生命周期与韧性](#13-生命周期与韧性) +14. [测试策略](#14-测试策略) +15. [回滚与监控](#15-回滚与监控) + +--- + +## 1. 概述 + +### 1.1 背景 + +**CodeActor Agent** 是一个基于 Hub-and-Spoke 多 Agent 架构的 AI 驱动自主编程助手,使用 Go 语言构建。当前系统包含以下 Agent: + +| Agent | 职责 | +|-------|------| +| **Conductor** | 中央调度器:任务分类、计划制定、Agent 委派、结果审核 | +| **Repo-Agent** | 代码分析:语义搜索、代码骨架、函数片段 | +| **Coding-Agent** | 代码编写、文件修改、Shell 执行、测试、自调试 | +| **Chat-Agent** | 技术解释、通用问答 | +| **DevOps-Agent** | 系统管理、Shell 命令、日志检查、进程管理 | +| **Meta-Agent** | 运行时创建自定义专用 Agent | + +### 1.2 目标 + +集成 **go-rod** 浏览器自动化库,创建 **Browser-Agent**,使系统具备以下能力: + +- 🌐 **网页自动化**:导航、点击、表单填写 +- 📸 **视觉捕获**:截图、PDF 生成 +- 📊 **数据提取**:文本、HTML、结构化数据抓取 +- 🔧 **JS 执行**:页面内脚本执行(需用户确认) +- 🍪 **会话管理**:Cookie 读取/设置 +- 📈 **监控检测**:网站健康检查、内容变更监控 + +--- + +## 2. 根因分析 + +| 维度 | 结论 | +|------|------| +| **核心问题** | 当前系统缺乏浏览器交互能力,所有 Agent 仅限于 OS/文件/代码层操作 | +| **影响范围** | 无法执行网页自动化、数据抓取、表单填写、视觉测试、网站监控等任务 | +| **根本原因** | 项目初始范围聚焦于代码分析,未纳入浏览器自动化需求 | +| **关键洞察** | 问题不仅是添加库依赖,而是需要按照 Hub-and-Spoke 模式设计完整的 Agent,同时保持安全性、资源管理和工具化交互的一致性 | + +### 系统影响评估 + +| 组件 | 影响程度 | 说明 | +|------|----------|------| +| `internal/agents/` | **高** | 新增 `BrowserAgent` 结构体、注册、任务执行循环 | +| `internal/tools/` | **高** | 新增 15 个浏览器专用工具 | +| `internal/llm/` | **无** | 复用现有 LLM 抽象层 | +| `pkg/messaging/` | **中** | 新增浏览器任务/结果消息主题 | +| `internal/config/` | **中** | 新增 `[browser]` 配置段 | +| `internal/http/` | **低** | 可能需要新增 workspace 文件服务端点 | +| `internal/tui/` | **低** | 浏览器输出文本/文件路径在聊天 UI 中显示 | +| `main.go` | **中** | Agent 注册、依赖注入、生命周期钩子 | +| `go.mod` | **低** | 添加 `github.com/go-rod/rod` | +| `codebase/` (Rust) | **无** | 不受影响 | + +--- + +## 3. 
候选架构对比 + +### 候选 1:完全嵌入 Agent(紧耦合) + +``` +Browser-Agent 直接实例化 go-rod +``` + +| 优点 | 缺点 | +|------|------| +| 实现简单、开销低 | 浏览器实例无法跨任务共享 | +| Agent 完全控制 | 每个任务需冷启动浏览器 | + +### 候选 2:浏览器服务 + Agent 前端(解耦) + +``` +BrowserService(独立服务) ←→ Browser-Agent(命令翻译) +``` + +| 优点 | 缺点 | +|------|------| +| 关注点分离清晰 | 额外的通信层 | +| 浏览器生命周期可跨 Agent 复用 | 并发控制更复杂 | + +### 候选 3:混合模式 — Agent 持有 rod,生命周期由单例提供者管理 ⭐ + +``` +Browser-Agent ←→ BrowserManager(单例)←→ Chrome 进程 + ←→ Page 上下文(每任务独立) +``` + +| 优点 | 缺点 | +|------|------| +| 最优资源使用(单 Chromium 进程) | 需仔细同步控制 | +| Agent 代码简洁 | 浏览器崩溃影响所有待处理任务 | +| 避免冷启动成本 | 需实现健康检查和恢复机制 | +| 通过信号量控制并发 | — | + +--- + +## 4. 推荐架构设计 + +### 4.1 选择:候选 3 — 混合模式 + +**理由**:遵循"浏览器进程为单一事实来源"原则,通过 `BrowserManager` 单例管理整个应用生命周期的浏览器实例。与现有 Hub-and-Spoke 模式高度契合。 + +### 4.2 架构图 + +``` + ┌─────────────────┐ + │ Conductor │ ← 中央协调器 + │ (Orchestrator) │ + └────────┬────────┘ + │ + │ delegate_browser 工具调用 + │ 发布 BrowserTask 消息 + ▼ + ┌─────────────────┐ + │ Browser-Agent │ ← LLM 推理 + BrowserToolSet + │ (浏览器专家) │ + └────────┬────────┘ + │ + │ 工具执行调用 + ▼ + ┌─────────────────┐ + │ BrowserManager │ ← 单例,浏览器生命周期管理 + │ (浏览器管理器) │ + └────────┬────────┘ + │ + │ go-rod (Chrome DevTools Protocol) + ▼ + ┌─────────────────┐ + │ Chrome Browser │ ← Headless Chrome 进程 + │ (无头浏览器) │ + └────────┬────────┘ + │ + │ 为每个任务创建独立页面 + ▼ + ┌─────────────────┐ + │ Page Context │ ← 每个任务独立的标签页 + │ (页面上下文) │ + └─────────────────┘ +``` + +### 4.3 核心组件职责 + +| 组件 | 职责 | +|------|------| +| **Conductor** | 分类浏览器任务 → `delegate_browser` 调用 | +| **Browser-Agent** | 接收任务 → LLM 推理 → 工具调用 → 返回结果 | +| **BrowserManager** | 浏览器启动/关闭、页面获取/释放、健康检查、安全策略 | +| **BrowserToolSet** | 17 个浏览器操作工具,供 LLM 调用 | +| **WorkspaceGuard** | 复用现有机制,确保文件写入工作区内 | + +--- + +## 5. 
分阶段实施路线图 + +### Phase 1:基础设施 — 依赖、配置、BrowserManager + +| 步骤 | 内容 | 优先级 | +|------|------|--------| +| 1.1 | `go.mod` 添加 `github.com/go-rod/rod` 依赖 | P0 | +| 1.2 | `internal/config/config.go` 扩展 `[browser]` 配置段 | P0 | +| 1.3 | 创建 `internal/browser/manager.go` — 浏览器生命周期管理 | P0 | +| 1.4 | 创建 `internal/browser/security.go` — 安全策略实现 | P0 | +| 1.5 | 创建 `internal/browser/config.go` — Chrome 命令行标志生成 | P1 | + +### Phase 2:浏览器工具定义 + +| 步骤 | 内容 | 优先级 | +|------|------|--------| +| 2.1 | 创建 `internal/tools/browser/` 包目录 | P0 | +| 2.2 | 实现导航类工具:`navigate`, `go_back`, `go_forward`, `reload`, `get_current_url` | P0 | +| 2.3 | 实现交互类工具:`click`, `input`, `scroll`, `wait_element`, `wait` | P0 | +| 2.4 | 实现提取类工具:`extract_text`, `extract_html` | P1 | +| 2.5 | 实现输出类工具:`screenshot`, `pdf` | P1 | +| 2.6 | 实现高级工具:`evaluate_js`, `get_cookies`, `set_cookies` | P2 | +| 2.7 | 创建 `registry.go` — `BrowserTools()` 注册函数 | P0 | + +### Phase 3:Browser-Agent 实现 + +| 步骤 | 内容 | 优先级 | +|------|------|--------| +| 3.1 | 创建 `internal/agents/browser_agent.go` | P0 | +| 3.2 | 实现 LLM 推理循环(接收任务 → 获取页面 → 工具调用 → 返回结果) | P0 | +| 3.3 | 编写浏览器 Agent 系统提示词 | P0 | +| 3.4 | 实现 Agent 工厂函数和注册逻辑 | P1 | + +### Phase 4:Conductor 集成 + +| 步骤 | 内容 | 优先级 | +|------|------|--------| +| 4.1 | 在 `conductor.go` 中新增 `delegate_browser` 工具 | P0 | +| 4.2 | 扩展 Conductor 任务分类规则(识别浏览器类意图) | P1 | +| 4.3 | 通过消息系统实现 Conductor ↔ Browser-Agent 通信 | P0 | + +### Phase 5:TUI 与 HTTP/WebSocket 支持 + +| 步骤 | 内容 | 优先级 | +|------|------|--------| +| 5.1 | 确保 Browser-Agent 消息流入 TUI 显示 | P1 | +| 5.2 | HTTP 服务器添加 workspace 文件服务端点 `GET /workspace/{filepath}` | P1 | +| 5.3 | WebSocket 消息中包含文件路径引用 | P2 | + +### Phase 6:生命周期与韧性 + +| 步骤 | 内容 | 优先级 | +|------|------|--------| +| 6.1 | 实现 `BrowserManager.HealthCheck()` 健康检查 | P1 | +| 6.2 | 浏览器崩溃自动重启机制 | P1 | +| 6.3 | 空闲超时自动关闭浏览器释放资源 | P2 | +| 6.4 | `main.go` 优雅关闭时清理浏览器进程 | P1 | + +### Phase 7:测试与验证 + +| 步骤 | 内容 | 优先级 | +|------|------|--------| +| 7.1 | 浏览器工具单元测试(使用 `httptest` 本地服务器) | P1 | +| 7.2 | 
Browser-Agent 集成测试(模拟 LLM 输出) | P2 | +| 7.3 | 安全测试(`file://` 拦截、工作区外下载阻止) | P1 | +| 7.4 | 并发压力测试(信号量正确序列化) | P2 | +| 7.5 | 端到端系统测试 | P2 | + +--- + +## 6. 文件结构规划 + +``` +codeactor-agent/ +├── main.go # 🔄 修改:初始化BrowserManager,注册BrowserAgent +├── go.mod # 🔄 修改:+ go-rod 依赖 +├── config.example.toml # 🔄 修改:更新 [browser] 配置段 +│ +├── internal/ +│ ├── agents/ +│ │ ├── browser_agent.go # 🆕 Browser-Agent 实现 +│ │ ├── conductor.go # 🔄 修改:delegate_browser 工具 +│ │ └── agent_registry.go # 🔄 修改:BrowserAgent 注册 +│ │ +│ ├── tools/ +│ │ └── browser/ # 🆕 浏览器工具包 +│ │ ├── navigate.go # 导航到指定 URL +│ │ ├── click.go # 元素点击 +│ │ ├── input.go # 表单输入 +│ │ ├── extract.go # 文本/HTML 提取 +│ │ ├── screenshot.go # 截图 +│ │ ├── pdf.go # PDF 生成 +│ │ ├── evaluate_js.go # JavaScript 执行 +│ │ ├── wait_element.go # 等待元素出现 +│ │ ├── cookies.go # Cookie 管理 +│ │ ├── scroll.go # 页面滚动 +│ │ ├── history.go # 浏览器历史导航 +│ │ └── registry.go # BrowserTools() 注册函数 +│ │ +│ ├── browser/ # 🆕 浏览器管理包 +│ │ ├── manager.go # 浏览器生命周期、页面获取/释放 +│ │ ├── config.go # 配置结构和 Chrome flags 生成 +│ │ ├── security.go # 域名过滤、file:// 拦截、下载边界 +│ │ └── manager_test.go # BrowserManager 测试 +│ │ +│ ├── config/ +│ │ └── config.go # 🔄 修改:BrowserConfig 结构体 +│ │ +│ └── http/ +│ └── fileserver.go # 🆕 可选:workspace 文件服务端点 +``` + +> 🆕 = 新增文件 | 🔄 = 修改现有文件 + +--- + +## 7. 
配置设计 + +### 7.1 配置结构体 + +```go +// BrowserConfig 浏览器配置 +type BrowserConfig struct { + Headless bool `toml:"headless"` // 无头模式 + BrowserPath string `toml:"browser_path"` // 浏览器路径(空=自动下载) + UserDataDir string `toml:"user_data_dir"` // 用户数据目录(空=临时目录) + ViewportWidth int `toml:"viewport_width"` // 视口宽度 + ViewportHeight int `toml:"viewport_height"` // 视口高度 + AllowedDomains []string `toml:"allowed_domains"` // 允许域名列表(空=全部允许) + BlockedDomains []string `toml:"blocked_domains"` // 阻止域名列表 + TimeoutSeconds int `toml:"timeout_seconds"` // 单个操作超时 + MaxConcurrentPages int `toml:"max_concurrent_pages"` // 最大并发页面数 + AutoLaunch bool `toml:"auto_launch"` // 首次请求时自动启动 + IdleTimeout string `toml:"idle_timeout"` // 空闲超时(如 "5m") + AllowNoSandbox bool `toml:"allow_no_sandbox"` // 允许--no-sandbox(Docker中需要) + ExtraArgs []string `toml:"extra_args"` // 额外的Chrome命令行参数 +} +``` + +### 7.2 配置示例 + +```toml +[browser] +headless = true +browser_path = "" +user_data_dir = "" +viewport_width = 1280 +viewport_height = 720 +allowed_domains = ["example.com", "api.example.com"] +blocked_domains = ["malware.test"] +timeout_seconds = 30 +max_concurrent_pages = 4 +auto_launch = true +idle_timeout = "5m" +allow_no_sandbox = false # Docker 中设为 true +extra_args = [] +``` + +### 7.3 Chrome 启动标志(安全强化) + +```go +var secureChromeFlags = []string{ + "--headless=new", // 新版无头模式 + "--disable-gpu", // 禁用GPU + "--no-first-run", // 跳过首次运行向导 + "--disable-default-apps", // 禁用默认应用 + "--disable-extensions", // 禁用扩展 + "--disable-background-networking", // 禁用后台网络 + "--disable-sync", // 禁用同步 + "--disable-translate", // 禁用翻译 + "--hide-scrollbars", // 隐藏滚动条 + "--metrics-recording-only", // 仅记录指标 + "--mute-audio", // 静音 + "--disable-dev-shm-usage", // 使用 /tmp 而非 /dev/shm + // "--no-sandbox" 不放入默认列表:仅当 allow_no_sandbox = true(容器环境)时由启动逻辑追加 +} +``` + +--- + +## 8. 浏览器工具定义 + +### 8.1 工具总览 + +| # | 工具名 | 功能 | 参数 | 返回值 | +|---|--------|------|------|--------| +| 1 | `navigate` | 页面导航 | `url` (string), `timeout_seconds` (int?) 
| title, url, status | +| 2 | `click` | 元素点击 | `selector` (string), `button` (string?) | 操作结果 | +| 3 | `input` | 表单输入 | `selector` (string), `text` (string) | 操作结果 | +| 4 | `extract_text` | 提取文本 | `selector` (string), `max_chars` (int?) | 文本内容 | +| 5 | `extract_html` | 提取HTML | `selector` (string) | outerHTML(截断) | +| 6 | `screenshot` | 截图 | `selector` (string?), `whole_page` (bool), `output_file` (string?) | 文件路径 | +| 7 | `pdf` | 生成PDF | `output_file` (string?) | 文件路径 | +| 8 | `evaluate_js` | 执行JS | `code` (string) | 执行结果 JSON | +| 9 | `wait_element` | 等待元素 | `selector` (string), `timeout_seconds` (int?) | 是否出现 | +| 10 | `get_cookies` | 获取Cookie | 无 | Cookie列表 | +| 11 | `set_cookies` | 设置Cookie | `cookies` ([]map) | 操作结果 | +| 12 | `scroll` | 页面滚动 | `x` (int), `y` (int) | 操作结果 | +| 13 | `go_back` | 后退 | 无 | 操作结果 | +| 14 | `go_forward` | 前进 | 无 | 操作结果 | +| 15 | `reload` | 刷新 | 无 | 操作结果 | +| 16 | `get_current_url` | 获取当前URL | 无 | URL字符串 | +| 17 | `wait` | 等待毫秒 | `milliseconds` (int) | 操作结果 | + +### 8.2 工具安全分类 + +| 安全等级 | 工具 | 限制 | +|----------|------|------| +| 🟢 安全 | `navigate`, `click`, `input`, `extract_text`, `extract_html`, `screenshot`, `pdf`, `wait_element`, `scroll`, `go_back`, `go_forward`, `reload`, `get_current_url`, `wait`, `get_cookies`, `set_cookies` | URL 验证 + 工作区边界 | +| 🟡 需确认 | `evaluate_js` | 触发 `ask_user_for_help` 用户确认流程 | + +--- + +## 9. 安全设计 + +### 9.1 多层安全防护 + +``` +┌─────────────────────────────────────────────────────┐ +│ 第1层:URL 验证 │ +│ 仅允许 http:// 和 https:// scheme │ +│ 拦截 file://、data:、javascript:、chrome:// 等 │ +├─────────────────────────────────────────────────────┤ +│ 第2层:域名过滤 │ +│ HijackRequests 实现允许/阻止列表 │ +│ 支持域名模式匹配(如 *.example.com) │ +├─────────────────────────────────────────────────────┤ +│ 第3层:Chrome 沙箱 │ +│ --headless=new, --disable-extensions, etc. 
│ +│ --no-sandbox 仅在容器环境下启用 │ +├─────────────────────────────────────────────────────┤ +│ 第4层:工作区边界 │ +│ 截图/PDF/下载 强制写入 workspace 目录 │ +│ 复用现有 WorkspaceGuard 进行路径检查 │ +├─────────────────────────────────────────────────────┤ +│ 第5层:用户确认 │ +│ evaluate_js 等高风险操作需用户明确批准 │ +│ 通过 ask_user_for_help 机制实现 │ +├─────────────────────────────────────────────────────┤ +│ 第6层:资源限制 │ +│ --js-flags="--max-old-space-size=256" │ +│ --renderer-process-limit=4 │ +│ 操作超时 30s(可配置) │ +└─────────────────────────────────────────────────────┘ +``` + +### 9.2 BrowserManager 安全职责 + +```go +// BrowserManager 安全相关方法 +type BrowserManager struct { + // ... + securityPolicy *SecurityPolicy + workspaceGuard *WorkspaceGuard +} + +type SecurityPolicy struct { + AllowedDomains []string // 允许访问的域名 + BlockedDomains []string // 禁止访问的域名 + AllowFileAccess bool // 是否允许 file://(默认 false) + AllowDataURL bool // 是否允许 data: URL(默认 false) +} + +// ValidateURL 验证 URL 安全性 +func (m *Manager) ValidateURL(rawURL string) error + +// SetupPageSecurity 为页面设置安全路由器 +func (m *Manager) SetupPageSecurity(page *rod.Page) error + +// ValidateFilePath 验证文件路径在工作区内 +func (m *Manager) ValidateFilePath(path string) error +``` + +--- + +## 10. 数据流设计 + +### 10.1 完整请求流 + +``` +用户输入: "请截图 https://example.com 首页" + │ + ▼ +┌──────────────────────────────────────────────────────────┐ +│ Conductor │ +│ 1. 任务分类 → 识别为浏览器任务 │ +│ 2. 调用 delegate_browser(task="截图 example.com 首页") │ +│ 3. 发布 BrowserTask 消息到消息总线 │ +└──────────────┬───────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────┐ +│ Browser-Agent │ +│ 1. 从消息总线接收 BrowserTask │ +│ 2. 创建任务上下文(含超时) │ +│ 3. 从 BrowserManager 获取页面 │ +│ 4. 
开始 LLM 推理循环: │ +│ ┌─────────────────────────────────────────┐ │ +│ │ LLM: 调用 navigate("https://example.com")│ │ +│ │ Tool: 验证URL → 导航 → 等待加载 │ │ +│ │ Result: {title:"Example Domain", url:...}│ │ +│ │ │ │ +│ │ LLM: 调用 screenshot(whole_page=true) │ │ +│ │ Tool: 截图 → 保存到 workspace/browser/ │ │ +│ │ Result: {path:"browser/screenshots/x.png"}│ │ +│ │ │ │ +│ │ LLM: 调用 finish("截图已完成") │ │ +│ └─────────────────────────────────────────┘ │ +│ 5. 释放页面 │ +│ 6. 发送 BrowserResult 消息 │ +└──────────────┬───────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────┐ +│ Conductor │ +│ 1. 接收 BrowserResult │ +│ 2. 汇总结果返回给用户 │ +└──────────────────────────────────────────────────────────┘ +``` + +### 10.2 消息格式 + +**BrowserTask(Conductor → Browser-Agent)**: +```json +{ + "task_id": "uuid-xxx", + "prompt": "截图 https://example.com 首页", + "context": "用户需要整页截图", + "timeout_seconds": 60 +} +``` + +**BrowserResult(Browser-Agent → Conductor)**: +```json +{ + "task_id": "uuid-xxx", + "status": "success", + "summary": "截图已完成", + "files": [ + { + "type": "screenshot", + "path": "browser/screenshots/abc123.png", + "description": "example.com 首页截图" + } + ] +} +``` + +--- + +## 11. 
关键代码模式 + +### 11.1 BrowserManager 页面获取 + +```go +// AcquirePage 获取一个浏览器页面(受信号量控制) +func (m *Manager) AcquirePage(ctx context.Context) (*rod.Page, func(), error) { + // 信号量控制并发 + select { + case m.sem <- struct{}{}: + case <-ctx.Done(): + return nil, nil, ctx.Err() + } + + m.mu.Lock() + defer m.mu.Unlock() + + // 懒启动浏览器 + if m.browser == nil { + if err := m.launch(); err != nil { + <-m.sem + return nil, nil, fmt.Errorf("启动浏览器失败: %w", err) + } + } + + // 创建新页面 + page, err := m.browser.Page(proto.TargetCreateTarget{URL: "about:blank"}) + if err != nil { + <-m.sem + return nil, nil, fmt.Errorf("创建页面失败: %w", err) + } + + // 设置安全策略 + m.setupPageSecurity(page) + + // 释放函数 + release := func() { + page.Close() + <-m.sem + m.lastUsed = time.Now() + } + + return page, release, nil +} +``` + +### 11.2 浏览器工具示例(navigate) + +```go +// NavigateTool 页面导航工具 +type NavigateTool struct { + workspaceGuard *WorkspaceGuard +} + +func (t *NavigateTool) Execute(ctx context.Context, params map[string]interface{}) (interface{}, error) { + rawURL, ok := params["url"].(string) + if !ok { + return nil, fmt.Errorf("参数 'url' 必须为字符串") + } + + // URL 安全验证 + uri, err := url.Parse(rawURL) + if err != nil || (uri.Scheme != "http" && uri.Scheme != "https") { + return nil, fmt.Errorf("仅允许 http/https URL,收到: %s", rawURL) + } + + // 获取页面上下文 + page, ok := ctx.Value(PageCtxKey).(*rod.Page) + if !ok { + return nil, errors.New("页面上下文不可用") + } + + // 超时控制 + timeout := 30 * time.Second + ctx, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + // 导航 + if err := page.Timeout(timeout).Navigate(rawURL); err != nil { + return nil, fmt.Errorf("导航失败: %w", err) + } + + page.WaitLoad() + + info, _ := page.Info() + return map[string]string{ + "title": info.Title, + "url": info.URL, + "status": "success", + }, nil +} +``` + +### 11.3 Browser-Agent LLM 推理循环 + +```go +func (ag *BrowserAgent) handleTask(taskMsg BrowserTask) { + ctx, cancel := context.WithTimeout(context.Background(), ag.timeout) + defer cancel() + + // 
获取页面 
+ page, release, err := ag.browserMgr.AcquirePage(ctx) + if err != nil { + ag.reportError(taskMsg, err) + return + } + defer release() + + // 将页面注入上下文 + ctx = context.WithValue(ctx, PageCtxKey, page) + + // 构建对话 + msgs := []llm.Message{ + {Role: "system", Content: browserSystemPrompt}, + {Role: "user", Content: taskMsg.Prompt}, + } + + // LLM 推理循环 + for { + response, err := ag.llm.Chat(ctx, msgs, ag.tools) + if err != nil { + ag.reportError(taskMsg, err) + return + } + + // 处理工具调用 + for _, call := range response.ToolCalls { + result := ag.executeTool(ctx, call) + msgs = append(msgs, llm.Message{ + Role: "tool", + Content: result, + }) + } + + if response.FinishReason == "stop" { + ag.sendResult(taskMsg, response.Content) + return + } + } +} +``` + +--- + +## 12. TUI 与 HTTP/WebSocket 集成 + +### 12.1 TUI 模式 + +- 浏览器 Agent 的文本输出通过消息系统流入 TUI,与现有 Agent 消息显示方式一致 +- 截图/PDF 文件路径以文本形式显示(如 `📸 截图已保存: browser/screenshots/abc.png`) +- 无需额外 TUI 组件修改 + +### 12.2 HTTP/WebSocket 模式 + +- **文件服务端点**:`GET /workspace/{filepath}` → 流式返回二进制文件 +- **WebSocket 消息**:Browser-Agent 结果消息中包含文件引用 +- **前端行为**:收到文件路径后通过 HTTP 端点获取文件内容渲染 + +```json +// WebSocket 消息示例 +{ + "type": "browser_result", + "task_id": "uuid-xxx", + "summary": "截图已完成", + "attachments": [ + { + "type": "image/png", + "url": "/workspace/browser/screenshots/abc123.png", + "description": "example.com 首页截图" + } + ] +} +``` + +### 12.3 工作区文件服务安全 + +```go +// 文件服务端点必须验证路径在工作区内 +func (s *Server) ServeWorkspaceFile(c *gin.Context) { + relPath := c.Param("filepath") + fullPath := filepath.Join(s.workspaceDir, relPath) + + // WorkspaceGuard 验证 + if !s.workspaceGuard.IsPathInWorkspace(fullPath) { + c.Status(http.StatusForbidden) + return + } + + c.File(fullPath) +} +``` + +--- + +## 13. 
生命周期与韧性 + +### 13.1 状态机 + +``` + ┌─────────┐ 首次 AcquirePage ┌──────────┐ + │ Idle │ ──────────────────────→ │ Active │ + │ (空闲) │ │ (活跃) │ + └────┬────┘ └─────┬─────┘ + │ │ + │ 空闲超时 │ 页面全部释放 + │ (idle_timeout) │ + 无等待者 + │ │ + ▼ ▼ + ┌─────────┐ 崩溃/健康检查失败 ┌──────────┐ + │ Closed │ ←──────────────────────── │ Crashed │ + │ (已关闭) │ │ (已崩溃) │ + └─────────┘ └─────┬─────┘ + │ + │ 自动重启 + │ + ▼ + ┌──────────┐ + │ Active │ + │ (重新活跃) │ + └──────────┘ +``` + +### 13.2 健康检查 + +```go +// HealthCheck 通过 CDP ping 检查浏览器是否存活 +func (m *Manager) HealthCheck() error { + if m.browser == nil { + return nil // 尚未启动 + } + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + return m.browser.Ping(ctx) +} +``` + +### 13.3 优雅关闭 + +```go +// Close 优雅关闭浏览器管理器 +func (m *Manager) Close() error { + m.mu.Lock() + defer m.mu.Unlock() + + if m.browser != nil { + if err := m.browser.Close(); err != nil { + return fmt.Errorf("关闭浏览器失败: %w", err) + } + m.browser = nil + } + + // 清理临时 user-data-dir + if m.tempDir != "" { + os.RemoveAll(m.tempDir) + } + + return nil +} +``` + +--- + +## 14. 
测试策略 + +### 14.1 测试层次 + +``` +┌──────────────────────────────────────────┐ +│ E2E 系统测试 │ +│ 真实用户请求 → Conductor → Browser-Agent │ +│ → Chrome → 真实网站 │ +├──────────────────────────────────────────┤ +│ 集成测试 │ +│ Browser-Agent + 模拟 LLM + 真实 Chrome │ +├──────────────────────────────────────────┤ +│ 单元测试 │ +│ 工具级测试(httptest 本地服务器) │ +├──────────────────────────────────────────┤ +│ 安全测试 │ +│ URL拦截、文件边界、JS确认流程 │ +└──────────────────────────────────────────┘ +``` + +### 14.2 测试用例清单 + +| 类别 | 测试用例 | 预期结果 | +|------|----------|----------| +| **工具测试** | `navigate` 正常URL | 返回页面标题和URL | +| | `navigate` file:// URL | 返回错误 | +| | `click` 有效选择器 | 点击成功 | +| | `click` 无效选择器 | 返回超时错误 | +| | `screenshot` 默认路径 | 文件保存到 workspace | +| | `screenshot` 自定义路径(工作区外) | 返回权限错误 | +| | `extract_text` 大页面 | 按 max_chars 截断 | +| **Agent测试** | 模拟LLM工具调用序列 | 正确执行并返回结果 | +| | 页面获取超时 | 返回超时错误 | +| **安全测试** | `file:///etc/passwd` 导航 | 被URL验证拦截 | +| | `evaluate_js` 未确认 | 等待确认/超时 | +| | 下载到工作区外 | WorkspaceGuard拦截 | +| **并发测试** | 超过 max_concurrent_pages 的请求 | 第5个请求阻塞直到释放 | +| **生命周期测试** | 空闲超时后新请求 | 浏览器重启并正常服务 | + +--- + +## 15. 回滚与监控 + +### 15.1 Feature Flag + +```toml +# 紧急回滚开关 +[agents] +enable_browser_agent = true # 设为 false 可禁用 Browser-Agent +``` + +- 设为 `false` 时:`BrowserManager` 不初始化,`delegate_browser` 工具返回"不可用" +- 不影响现有 Agent 的任何功能 +- `go-rod` 依赖可保留在 `go.mod` 中(不影响编译产物大小) + +### 15.2 监控指标 + +| 指标 | 来源 | 说明 | +|------|------|------| +| 浏览器存活状态 | `GET /health/browser` | `{"status":"ok","pages":2}` | +| 活跃页面数 | BrowserManager | 当前使用的标签页数 | +| 工具调用计数 | 日志 | 每种工具的调用频率 | +| 错误率 | 日志 | 工具执行失败/超时比例 | +| 浏览器崩溃次数 | 日志 | 自动恢复计数 | +| 内存使用 | Chrome进程监控 | 防止内存泄漏 | + +### 15.3 日志记录 + +```go +// 关键事件日志 +log.Info("浏览器启动", "flags", flags) +log.Info("页面获取", "available_pages", available) +log.Info("页面释放", "remaining_pages", remaining) +log.Warn("浏览器崩溃", "error", err, "restart_attempt", attempt) +log.Error("导航失败", "url", sanitizedURL, "error", err) +log.Info("浏览器空闲超时,自动关闭") +``` + +--- + +## 附录 + +### A. 
依赖项 + +``` +github.com/go-rod/rod # 浏览器自动化核心库 +``` + +### B. 参考资源 + +- [go-rod 官方文档](https://go-rod.github.io/) +- [Chrome DevTools Protocol](https://chromedevtools.github.io/devtools-protocol/) +- [CodeActor Agent 架构文档](./ARCHITECTURE.md) + +### C. 术语表 + +| 术语 | 说明 | +|------|------| +| CDP | Chrome DevTools Protocol — Chrome 调试协议 | +| Hub-and-Spoke | 中心辐射型架构 — Conductor 为 Hub,Agent 为 Spoke | +| BrowserManager | 浏览器管理器 — 管理浏览器实例的单例 | +| WorkspaceGuard | 工作区守卫 — 确保文件操作在安全边界内 | +| Semaphore | 信号量 — 控制并发页面数的同步原语 | +| HijackRequests | Rod 的请求拦截功能 — 用于域名过滤和安全策略 | + +--- + +> **文档维护者**: CodeActor Team +> **最后更新**: 2025-07 +> **状态**: 待审查 → 待实施