diff --git a/agent.yaml b/agent.yaml index e43f2e5..e7188d2 100644 --- a/agent.yaml +++ b/agent.yaml @@ -317,6 +317,57 @@ spec: inject: - logger - playwright + - id: scroll + name: scroll + description: Scroll the page or element to a specific position or into view + tags: + - scroll + - navigation + - positioning + - playwright + schema: + type: object + properties: + target: + type: string + description: "What to scroll: 'page', 'element', or 'coordinates'" + enum: [page, element, coordinates] + selector: + type: string + description: "Element selector (required if target=element)" + behavior: + type: string + description: "Scroll behavior: 'smooth' or 'instant'" + enum: [smooth, instant] + default: smooth + block: + type: string + description: "Vertical alignment: 'start', 'center', 'end', 'nearest'" + enum: [start, center, end, nearest] + default: start + inline: + type: string + description: "Horizontal alignment: 'start', 'center', 'end', 'nearest'" + enum: [start, center, end, nearest] + default: nearest + x: + type: integer + description: "X coordinate for scrolling (if target=coordinates)" + y: + type: integer + description: "Y coordinate for scrolling (if target=coordinates)" + direction: + type: string + description: "Direction to scroll: 'up', 'down', 'left', 'right', 'top', 'bottom'" + enum: [up, down, left, right, top, bottom] + amount: + type: integer + description: "Amount to scroll in pixels (for directional scrolling)" + required: + - target + inject: + - logger + - playwright agent: provider: "" model: "" diff --git a/internal/playwright/mocks/browser_automation.go b/internal/playwright/mocks/browser_automation.go index e18de96..60fe91c 100644 --- a/internal/playwright/mocks/browser_automation.go +++ b/internal/playwright/mocks/browser_automation.go @@ -246,6 +246,27 @@ type FakeBrowserAutomation struct { waitForConditionReturnsOnCall map[int]struct { result1 error } + ScrollStub func(context.Context, string, string, string, string, string, string, string, int, int, int) error + scrollMutex sync.RWMutex + scrollArgsForCall []struct { + arg1 context.Context + arg2 string + arg3 string + arg4 string + arg5 string + arg6 string + arg7 string + arg8 string + arg9 int + arg10 int + arg11 int + } + scrollReturns struct { + result1 error + } + scrollReturnsOnCall map[int]struct { + result1 error + } invocations map[string][][]interface{} invocationsMutex sync.RWMutex } @@ -1386,6 +1407,8 @@ func (fake *FakeBrowserAutomation) Invocations() map[string][][]interface{} { defer fake.takeScreenshotMutex.RUnlock() fake.waitForConditionMutex.RLock() defer fake.waitForConditionMutex.RUnlock() + fake.scrollMutex.RLock() + defer fake.scrollMutex.RUnlock() copiedInvocations := map[string][][]interface{}{} for key, value := range fake.invocations { copiedInvocations[key] = value @@ -1405,4 +1428,75 @@ func (fake *FakeBrowserAutomation) recordInvocation(key string, args []interface fake.invocations[key] = append(fake.invocations[key], args) } +func (fake *FakeBrowserAutomation) Scroll(arg1 context.Context, arg2 string, arg3 string, arg4 string, arg5 string, arg6 string, arg7 string, arg8 string, arg9 int, arg10 int, arg11 int) error { + fake.scrollMutex.Lock() + ret, specificReturn := fake.scrollReturnsOnCall[len(fake.scrollArgsForCall)] + fake.scrollArgsForCall = append(fake.scrollArgsForCall, struct { + arg1 context.Context + arg2 string + arg3 string + arg4 string + arg5 string + arg6 string + arg7 string + arg8 string + arg9 int + arg10 int + arg11 int + }{arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}) + stub := fake.ScrollStub + fakeReturns := fake.scrollReturns + fake.recordInvocation("Scroll", []interface{}{arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11}) + fake.scrollMutex.Unlock() + if stub != nil { + return stub(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11) + } + if specificReturn { + return ret.result1 + } + return fakeReturns.result1 +} + +func (fake *FakeBrowserAutomation) ScrollCallCount() int { + fake.scrollMutex.RLock() + defer fake.scrollMutex.RUnlock() + return len(fake.scrollArgsForCall) +} + +func (fake *FakeBrowserAutomation) ScrollCalls(stub func(context.Context, string, string, string, string, string, string, string, int, int, int) error) { + fake.scrollMutex.Lock() + defer fake.scrollMutex.Unlock() + fake.ScrollStub = stub +} + +func (fake *FakeBrowserAutomation) ScrollArgsForCall(i int) (context.Context, string, string, string, string, string, string, string, int, int, int) { + fake.scrollMutex.RLock() + defer fake.scrollMutex.RUnlock() + argsForCall := fake.scrollArgsForCall[i] + return argsForCall.arg1, argsForCall.arg2, argsForCall.arg3, argsForCall.arg4, argsForCall.arg5, argsForCall.arg6, argsForCall.arg7, argsForCall.arg8, argsForCall.arg9, argsForCall.arg10, argsForCall.arg11 +} + +func (fake *FakeBrowserAutomation) ScrollReturns(result1 error) { + fake.scrollMutex.Lock() + defer fake.scrollMutex.Unlock() + fake.ScrollStub = nil + fake.scrollReturns = struct { + result1 error + }{result1} +} + +func (fake *FakeBrowserAutomation) ScrollReturnsOnCall(i int, result1 error) { + fake.scrollMutex.Lock() + defer fake.scrollMutex.Unlock() + fake.ScrollStub = nil + if fake.scrollReturnsOnCall == nil { + fake.scrollReturnsOnCall = make(map[int]struct { + result1 error + }) + } + fake.scrollReturnsOnCall[i] = struct { + result1 error + }{result1} +} + var _ playwright.BrowserAutomation = new(FakeBrowserAutomation) diff --git a/internal/playwright/playwright.go b/internal/playwright/playwright.go index 6188a8e..7ff190b 100644 --- a/internal/playwright/playwright.go +++ b/internal/playwright/playwright.go @@ -143,6 +143,7 @@ type BrowserAutomation interface { ExecuteScript(ctx context.Context, sessionID, script string, args []any) (any, error) WaitForCondition(ctx context.Context, sessionID, condition, selector, state string, timeout time.Duration, customFunction string) error HandleAuthentication(ctx context.Context, sessionID, authType, username, password, loginURL string, selectors map[string]string) error + Scroll(ctx context.Context, sessionID, target, selector, behavior, block, inline, direction string, amount, x, y int) error // Service management GetHealth(ctx context.Context) error @@ -899,6 +900,129 @@ func (p *playwrightImpl) HandleAuthentication(ctx context.Context, sessionID, au } } +// Scroll implements scrolling functionality for page, element, or coordinates +func (p *playwrightImpl) Scroll(ctx context.Context, sessionID, target, selector, behavior, block, inline, direction string, amount, x, y int) error { + session, err := p.GetSession(sessionID) + if err != nil { + return err + } + + p.logger.Info("executing scroll", + zap.String("sessionID", sessionID), + zap.String("target", target), + zap.String("selector", selector), + zap.String("behavior", behavior), + zap.String("direction", direction), + zap.Int("amount", amount), + zap.Int("x", x), + zap.Int("y", y)) + + switch target { + case "page": + return p.scrollPage(session, direction, amount, x, y, behavior) + case "element": + return p.scrollElement(session, selector, behavior, block, inline) + case "coordinates": + return p.scrollToCoordinates(session, x, y, behavior) + default: + return fmt.Errorf("unsupported scroll target: %s", target) + } +} + +// scrollPage scrolls the page in various ways +func (p *playwrightImpl) scrollPage(session *BrowserSession, direction string, amount, x, y int, behavior string) error { + var script string + var args []any + + smoothStr := "true" + if behavior == "instant" { + smoothStr = "false" + } + + switch direction { + case "top": + script = fmt.Sprintf("window.scrollTo({ top: 0, left: 0, behavior: '%s' })", + map[bool]string{true: "smooth", false: "auto"}[smoothStr == "true"]) + case "bottom": + script = fmt.Sprintf("window.scrollTo({ top: document.body.scrollHeight, left: 0, behavior: '%s' })", + map[bool]string{true: "smooth", false: "auto"}[smoothStr == "true"]) + case "up": + if amount == 0 { + amount = 500 // default scroll amount + } + script = fmt.Sprintf("window.scrollBy({ top: -%d, left: 0, behavior: '%s' })", + amount, map[bool]string{true: "smooth", false: "auto"}[smoothStr == "true"]) + case "down": + if amount == 0 { + amount = 500 // default scroll amount + } + script = fmt.Sprintf("window.scrollBy({ top: %d, left: 0, behavior: '%s' })", + amount, map[bool]string{true: "smooth", false: "auto"}[smoothStr == "true"]) + case "left": + if amount == 0 { + amount = 500 // default scroll amount + } + script = fmt.Sprintf("window.scrollBy({ top: 0, left: -%d, behavior: '%s' })", + amount, map[bool]string{true: "smooth", false: "auto"}[smoothStr == "true"]) + case "right": + if amount == 0 { + amount = 500 // default scroll amount + } + script = fmt.Sprintf("window.scrollBy({ top: 0, left: %d, behavior: '%s' })", + amount, map[bool]string{true: "smooth", false: "auto"}[smoothStr == "true"]) + default: + // If no direction specified, treat x/y as coordinates + script = fmt.Sprintf("window.scrollTo({ top: %d, left: %d, behavior: '%s' })", + y, x, map[bool]string{true: "smooth", false: "auto"}[smoothStr == "true"]) + } + + _, err := session.Page.Evaluate(script, args...) + return err +} + +// scrollElement scrolls an element into view +func (p *playwrightImpl) scrollElement(session *BrowserSession, selector, behavior, block, inline string) error { + locator := session.Page.Locator(selector) + + // Wait for element to be attached first + timeoutMs := float64(30000) + err := locator.WaitFor(playwright.LocatorWaitForOptions{ + State: playwright.WaitForSelectorStateAttached, + Timeout: &timeoutMs, + }) + if err != nil { + return fmt.Errorf("element not found or not attached: %w", err) + } + + // Build scroll into view options + script := fmt.Sprintf(` + const element = document.querySelector('%s'); + if (element) { + element.scrollIntoView({ + behavior: '%s', + block: '%s', + inline: '%s' + }); + } else { + throw new Error('Element not found: %s'); + } + `, selector, + map[string]string{"smooth": "smooth", "instant": "auto"}[behavior], + block, inline, selector) + + _, err = session.Page.Evaluate(script) + return err +} + +// scrollToCoordinates scrolls to specific x,y coordinates +func (p *playwrightImpl) scrollToCoordinates(session *BrowserSession, x, y int, behavior string) error { + script := fmt.Sprintf("window.scrollTo({ top: %d, left: %d, behavior: '%s' })", + y, x, map[string]string{"smooth": "smooth", "instant": "auto"}[behavior]) + + _, err := session.Page.Evaluate(script) + return err +} + // GetHealth checks the health of the service func (p *playwrightImpl) GetHealth(ctx context.Context) error { if p.pw == nil { diff --git a/skills/scroll.go b/skills/scroll.go new file mode 100644 index 0000000..0683d6a --- /dev/null +++ b/skills/scroll.go @@ -0,0 +1,263 @@ +package skills + +import ( + "context" + "encoding/json" + "fmt" + + server "github.com/inference-gateway/adk/server" + playwright "github.com/inference-gateway/browser-agent/internal/playwright" + zap "go.uber.org/zap" +) + +// ScrollSkill struct holds the skill with dependencies +type ScrollSkill struct { + logger *zap.Logger + playwright playwright.BrowserAutomation +} + +// NewScrollSkill creates a new scroll skill +func NewScrollSkill(logger *zap.Logger, playwright playwright.BrowserAutomation) server.Tool { + skill := &ScrollSkill{ + logger: logger, + playwright: playwright, + } + return server.NewBasicTool( + "scroll", + "Scroll the page or element to a specific position or into view", + map[string]any{ + "type": "object", + "properties": map[string]any{ + "target": map[string]any{ + "type": "string", + "description": "What to scroll: 'page', 'element', or 'coordinates'", + "enum": []string{"page", "element", "coordinates"}, + }, + "selector": map[string]any{ + "type": "string", + "description": "Element selector (required if target=element)", + }, + "behavior": map[string]any{ + "type": "string", + "description": "Scroll behavior: 'smooth' or 'instant'", + "enum": []string{"smooth", "instant"}, + "default": "smooth", + }, + "block": map[string]any{ + "type": "string", + "description": "Vertical alignment: 'start', 'center', 'end', 'nearest'", + "enum": []string{"start", "center", "end", "nearest"}, + "default": "start", + }, + "inline": map[string]any{ + "type": "string", + "description": "Horizontal alignment: 'start', 'center', 'end', 'nearest'", + "enum": []string{"start", "center", "end", "nearest"}, + "default": "nearest", + }, + "x": map[string]any{ + "type": "integer", + "description": "X coordinate for scrolling (if target=coordinates)", + }, + "y": map[string]any{ + "type": "integer", + "description": "Y coordinate for scrolling (if target=coordinates)", + }, + "direction": map[string]any{ + "type": "string", + "description": "Direction to scroll: 'up', 'down', 'left', 'right', 'top', 'bottom'", + "enum": []string{"up", "down", "left", "right", "top", "bottom"}, + }, + "amount": map[string]any{ + "type": "integer", + "description": "Amount to scroll in pixels (for directional scrolling)", + }, + }, + "required": []string{"target"}, + }, + skill.ScrollHandler, + ) +} + +// ScrollHandler handles the scroll skill execution +func (s *ScrollSkill) ScrollHandler(ctx context.Context, args map[string]any) (string, error) { + target, ok := args["target"].(string) + if !ok || target == "" { + return "", fmt.Errorf("target parameter is required and must be a non-empty string") + } + + // Validate target value + if !s.isValidTarget(target) { + return "", fmt.Errorf("invalid target value: %s. Must be one of: page, element, coordinates", target) + } + + // Extract optional parameters with defaults + behavior := "smooth" + if b, ok := args["behavior"].(string); ok && b != "" { + if !s.isValidBehavior(b) { + return "", fmt.Errorf("invalid behavior value: %s. Must be one of: smooth, instant", b) + } + behavior = b + } + + block := "start" + if bl, ok := args["block"].(string); ok && bl != "" { + if !s.isValidAlignment(bl) { + return "", fmt.Errorf("invalid block value: %s. Must be one of: start, center, end, nearest", bl) + } + block = bl + } + + inline := "nearest" + if in, ok := args["inline"].(string); ok && in != "" { + if !s.isValidAlignment(in) { + return "", fmt.Errorf("invalid inline value: %s. Must be one of: start, center, end, nearest", in) + } + inline = in + } + + selector := "" + if sel, ok := args["selector"].(string); ok { + selector = sel + } + + direction := "" + if dir, ok := args["direction"].(string); ok { + direction = dir + } + + amount := 0 + if amt, ok := args["amount"].(int); ok { + amount = amt + } else if amtf, ok := args["amount"].(float64); ok { + amount = int(amtf) + } + + x := 0 + if xVal, ok := args["x"].(int); ok { + x = xVal + } else if xf, ok := args["x"].(float64); ok { + x = int(xf) + } + + y := 0 + if yVal, ok := args["y"].(int); ok { + y = yVal + } else if yf, ok := args["y"].(float64); ok { + y = int(yf) + } + + // Validate based on target + switch target { + case "element": + if selector == "" { + return "", fmt.Errorf("selector is required when target is 'element'") + } + case "coordinates": + // x and y are optional, but at least one should be provided for coordinates + if x == 0 && y == 0 { + s.logger.Warn("both x and y are 0 for coordinates scrolling") + } + case "page": + if direction != "" && !s.isValidDirection(direction) { + return "", fmt.Errorf("invalid direction value: %s. Must be one of: up, down, left, right, top, bottom", direction) + } + } + + s.logger.Info("executing scroll", + zap.String("target", target), + zap.String("selector", selector), + zap.String("behavior", behavior), + zap.String("block", block), + zap.String("inline", inline), + zap.String("direction", direction), + zap.Int("amount", amount), + zap.Int("x", x), + zap.Int("y", y)) + + session, err := s.playwright.GetOrCreateTaskSession(ctx) + if err != nil { + s.logger.Error("failed to get browser session", zap.Error(err)) + return "", fmt.Errorf("failed to get browser session: %w", err) + } + + // Call the new scroll method on playwright service + err = s.playwright.Scroll(ctx, session.ID, target, selector, behavior, block, inline, direction, amount, x, y) + if err != nil { + s.logger.Error("scroll failed", + zap.String("target", target), + zap.String("sessionID", session.ID), + zap.Error(err)) + return "", fmt.Errorf("scroll failed: %w", err) + } + + s.logger.Info("scroll completed successfully", + zap.String("target", target), + zap.String("sessionID", session.ID)) + + response := map[string]any{ + "success": true, + "target": target, + "selector": selector, + "behavior": behavior, + "block": block, + "inline": inline, + "direction": direction, + "amount": amount, + "x": x, + "y": y, + "session_id": session.ID, + "message": "Scroll completed successfully", + } + + responseJSON, err := json.Marshal(response) + if err != nil { + return "", fmt.Errorf("failed to marshal response: %w", err) + } + + return string(responseJSON), nil +} + +// isValidTarget validates the target parameter +func (s *ScrollSkill) isValidTarget(target string) bool { + validTargets := []string{"page", "element", "coordinates"} + for _, valid := range validTargets { + if target == valid { + return true + } + } + return false +} + +// isValidBehavior validates the behavior parameter +func (s *ScrollSkill) isValidBehavior(behavior string) bool { + validBehaviors := []string{"smooth", "instant"} + for _, valid := range validBehaviors { + if behavior == valid { + return true + } + } + return false +} + +// isValidAlignment validates block and inline parameters +func (s *ScrollSkill) isValidAlignment(alignment string) bool { + validAlignments := []string{"start", "center", "end", "nearest"} + for _, valid := range validAlignments { + if alignment == valid { + return true + } + } + return false +} + +// isValidDirection validates the direction parameter +func (s *ScrollSkill) isValidDirection(direction string) bool { + validDirections := []string{"up", "down", "left", "right", "top", "bottom"} + for _, valid := range validDirections { + if direction == valid { + return true + } + } + return false +} diff --git a/skills/scroll_test.go b/skills/scroll_test.go new file mode 100644 index 0000000..8cce4b3 --- /dev/null +++ b/skills/scroll_test.go @@ -0,0 +1,244 @@ +package skills + +import ( + "context" + "encoding/json" + "errors" + "testing" + + server "github.com/inference-gateway/adk/server" + types "github.com/inference-gateway/adk/types" + "github.com/inference-gateway/browser-agent/internal/playwright" + "github.com/inference-gateway/browser-agent/internal/playwright/mocks" + zap "go.uber.org/zap" +) + +func TestNewScrollSkill(t *testing.T) { + logger := zap.NewNop() + mockPlaywright := &mocks.FakeBrowserAutomation{} + + skill := NewScrollSkill(logger, mockPlaywright) + + if skill == nil { + t.Errorf("Expected skill to be created, got nil") + } +} + +func TestScrollHandler_ValidPageScroll(t *testing.T) { + logger := zap.NewNop() + mockPlaywright := &mocks.FakeBrowserAutomation{} + skill := &ScrollSkill{ + logger: logger, + playwright: mockPlaywright, + } + + ctx := context.WithValue(context.Background(), server.TaskContextKey, &types.Task{ID: "test-task"}) + + session := &playwright.BrowserSession{ID: "test-session"} + mockPlaywright.GetOrCreateTaskSessionReturns(session, nil) + mockPlaywright.ScrollReturns(nil) + + args := map[string]any{ + "target": "page", + "direction": "top", + } + + result, err := skill.ScrollHandler(ctx, args) + + if err != nil { + t.Errorf("Expected no error, got %v", err) + } + if result == "" { + t.Errorf("Expected non-empty result") + } + + var response map[string]any + err = json.Unmarshal([]byte(result), &response) + if err != nil { + t.Errorf("Expected no unmarshal error, got %v", err) + } + if !response["success"].(bool) { + t.Errorf("Expected success to be true") + } + if response["target"] != "page" { + t.Errorf("Expected target to be 'page', got %v", response["target"]) + } + if response["direction"] != "top" { + t.Errorf("Expected direction to be 'top', got %v", response["direction"]) + } + + if mockPlaywright.GetOrCreateTaskSessionCallCount() != 1 { + t.Errorf("Expected GetOrCreateTaskSession to be called once") + } + if mockPlaywright.ScrollCallCount() != 1 { + t.Errorf("Expected Scroll to be called once") + } +} + +func TestScrollHandler_MissingTarget(t *testing.T) { + logger := zap.NewNop() + mockPlaywright := &mocks.FakeBrowserAutomation{} + skill := &ScrollSkill{ + logger: logger, + playwright: mockPlaywright, + } + + ctx := context.Background() + args := map[string]any{} + + result, err := skill.ScrollHandler(ctx, args) + + if err == nil { + t.Errorf("Expected error for missing target") + } + if result != "" { + t.Errorf("Expected empty result on error") + } + if err.Error() != "target parameter is required and must be a non-empty string" { + t.Errorf("Expected specific error message, got %v", err.Error()) + } +} + +func TestScrollHandler_InvalidTarget(t *testing.T) { + logger := zap.NewNop() + mockPlaywright := &mocks.FakeBrowserAutomation{} + skill := &ScrollSkill{ + logger: logger, + playwright: mockPlaywright, + } + + ctx := context.Background() + args := map[string]any{ + "target": "invalid", + } + + result, err := skill.ScrollHandler(ctx, args) + + if err == nil { + t.Errorf("Expected error for invalid target") + } + if result != "" { + t.Errorf("Expected empty result on error") + } +} + +func TestScrollHandler_ElementTargetMissingSelector(t *testing.T) { + logger := zap.NewNop() + mockPlaywright := &mocks.FakeBrowserAutomation{} + skill := &ScrollSkill{ + logger: logger, + playwright: mockPlaywright, + } + + ctx := context.Background() + args := map[string]any{ + "target": "element", + } + + result, err := skill.ScrollHandler(ctx, args) + + if err == nil { + t.Errorf("Expected error for missing selector") + } + if result != "" { + t.Errorf("Expected empty result on error") + } +} + +func TestScrollHandler_SessionError(t *testing.T) { + logger := zap.NewNop() + mockPlaywright := &mocks.FakeBrowserAutomation{} + skill := &ScrollSkill{ + logger: logger, + playwright: mockPlaywright, + } + + ctx := context.WithValue(context.Background(), server.TaskContextKey, &types.Task{ID: "test-task"}) + + mockPlaywright.GetOrCreateTaskSessionReturns(nil, errors.New("session error")) + + args := map[string]any{ + "target": "page", + } + + result, err := skill.ScrollHandler(ctx, args) + + if err == nil { + t.Errorf("Expected error for session failure") + } + if result != "" { + t.Errorf("Expected empty result on error") + } + + if mockPlaywright.GetOrCreateTaskSessionCallCount() != 1 { + t.Errorf("Expected GetOrCreateTaskSession to be called once") + } +} + +func TestIsValidTarget(t *testing.T) { + skill := &ScrollSkill{} + + tests := []struct { + target string + expected bool + }{ + {"page", true}, + {"element", true}, + {"coordinates", true}, + {"invalid", false}, + {"", false}, + } + + for _, test := range tests { + result := skill.isValidTarget(test.target) + if result != test.expected { + t.Errorf("For target '%s', expected %v, got %v", test.target, test.expected, result) + } + } +} + +func TestIsValidBehavior(t *testing.T) { + skill := &ScrollSkill{} + + tests := []struct { + behavior string + expected bool + }{ + {"smooth", true}, + {"instant", true}, + {"invalid", false}, + {"", false}, + } + + for _, test := range tests { + result := skill.isValidBehavior(test.behavior) + if result != test.expected { + t.Errorf("For behavior '%s', expected %v, got %v", test.behavior, test.expected, result) + } + } +} + +func TestIsValidDirection(t *testing.T) { + skill := &ScrollSkill{} + + tests := []struct { + direction string + expected bool + }{ + {"up", true}, + {"down", true}, + {"left", true}, + {"right", true}, + {"top", true}, + {"bottom", true}, + {"invalid", false}, + {"", false}, + } + + for _, test := range tests { + result := skill.isValidDirection(test.direction) + if result != test.expected { + t.Errorf("For direction '%s', expected %v, got %v", test.direction, test.expected, result) + } + } +}