Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 137 additions & 0 deletions tools/cmd/readability_extract/readability_extract.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
package main

import (
"bufio"
"encoding/json"
"errors"
"fmt"
"net/url"
"os"
"path/filepath"
"strings"
"time"

readability "github.com/go-shiori/go-readability"
)

// input is the JSON request object the tool decodes from stdin.
type input struct {
	// HTML is the document markup to extract from. run rejects a value that
	// is blank after trimming or longer than maxHTMLBytes.
	HTML string `json:"html"`

	// BaseURL is parsed by run and passed to go-readability; it must carry
	// both a scheme and a host.
	BaseURL string `json:"base_url"`
}

// output is the JSON object written to stdout after a successful extraction.
// Each field is copied by run from the article returned by go-readability.
type output struct {
	// Title is the article's Title.
	Title string `json:"title"`

	// Byline is the article's Byline; the key is omitted when it is empty.
	Byline string `json:"byline,omitempty"`

	// Text is the article's TextContent.
	Text string `json:"text"`

	// ContentHTML is the article's Content.
	ContentHTML string `json:"content_html"`

	// Length is the article's Length.
	Length int `json:"length"`
}

// maxHTMLBytes is the largest html payload, in bytes, that run will accept.
const maxHTMLBytes = 5 * 1024 * 1024 // 5 MiB

// main runs the tool. On failure it writes a single-line JSON object of the
// form {"error":"..."} to stderr and exits with status 1; on success it
// returns normally.
func main() {
	err := run()
	if err == nil {
		return
	}

	// Collapse newlines so the message text keeps the shape it always had.
	msg := strings.ReplaceAll(err.Error(), "\n", " ")

	// Encode with encoding/json rather than fmt's %q: %q produces Go string
	// literal escapes (\a, \v, \x00, ...) that JSON parsers reject.
	// HTML escaping is disabled to match the encoder used for stdout.
	enc := json.NewEncoder(os.Stderr)
	enc.SetEscapeHTML(false)

	// Best effort: if stderr itself cannot be written there is nowhere left
	// to report the problem. Encode terminates the object with a newline.
	_ = enc.Encode(struct {
		Error string `json:"error"`
	}{Error: msg})

	os.Exit(1)
}

// run performs one extraction: it decodes the request from stdin, validates
// it, runs go-readability over the HTML, writes the result to stdout as one
// JSON object, and finally appends an audit record.
//
// It returns an error when the request cannot be decoded or fails validation,
// when extraction fails, or when the result cannot be encoded.
func run() error {
	req, err := decodeInput()
	if err != nil {
		return err
	}

	// The checks run in a fixed order; the first failing one is reported.
	switch {
	case strings.TrimSpace(req.HTML) == "":
		return errors.New("html is required")
	case strings.TrimSpace(req.BaseURL) == "":
		return errors.New("base_url is required")
	}

	// go-readability expects the base URL in parsed form; only absolute URLs
	// (scheme and host both present) are accepted.
	base, parseErr := url.Parse(req.BaseURL)
	isAbsolute := parseErr == nil && base.Scheme != "" && base.Host != ""
	if !isAbsolute {
		return errors.New("base_url must be an absolute URL")
	}

	if len(req.HTML) > maxHTMLBytes {
		return fmt.Errorf("html too large: limit %d bytes", maxHTMLBytes)
	}

	began := time.Now()
	article, err := readability.FromReader(strings.NewReader(req.HTML), base)
	if err != nil {
		return fmt.Errorf("readability extract: %w", err)
	}

	var result output
	result.Title = article.Title
	result.Byline = article.Byline
	result.Text = article.TextContent
	result.ContentHTML = article.Content
	result.Length = article.Length

	// HTML escaping is turned off so markup in the result is emitted as-is.
	stdout := json.NewEncoder(os.Stdout)
	stdout.SetEscapeHTML(false)
	if encErr := stdout.Encode(result); encErr != nil {
		return fmt.Errorf("encode json: %w", encErr)
	}

	// Auditing is best effort: a failure to record must not fail a run whose
	// result has already been written.
	auditEntry := map[string]any{
		"ts":     time.Now().UTC().Format(time.RFC3339Nano),
		"tool":   "readability_extract",
		"length": article.Length,
		"ms":     time.Since(began).Milliseconds(),
	}
	_ = appendAudit(auditEntry) //nolint:errcheck
	return nil
}

// decodeInput reads one JSON value from stdin into an input.
// On a decode failure it returns the error wrapped with a "parse json: "
// prefix, together with whatever had been decoded up to that point.
func decodeInput() (input, error) {
	var req input
	stdin := bufio.NewReader(os.Stdin)
	if err := json.NewDecoder(stdin).Decode(&req); err != nil {
		return req, fmt.Errorf("parse json: %w", err)
	}
	return req, nil
}

// appendAudit appends entry as one NDJSON line to
// .goagent/audit/YYYYMMDD.log (date in UTC) beneath the directory returned by
// moduleRoot, creating the directory and the file when they do not exist.
//
// It returns an error if entry cannot be marshalled, or if creating the
// directory, opening, writing, or closing the log file fails.
func appendAudit(entry any) error {
	line, err := json.Marshal(entry)
	if err != nil {
		return fmt.Errorf("marshal audit entry: %w", err)
	}

	dir := filepath.Join(moduleRoot(), ".goagent", "audit")
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("create audit dir: %w", err)
	}

	path := filepath.Join(dir, time.Now().UTC().Format("20060102")+".log")
	f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
	if err != nil {
		return fmt.Errorf("open audit log: %w", err)
	}

	// The record and its trailing newline are handed over in a single Write.
	if _, err := f.Write(append(line, '\n')); err != nil {
		_ = f.Close() // the write failure is the error worth reporting
		return fmt.Errorf("write audit log: %w", err)
	}

	// Close explicitly instead of deferring: a failure that only surfaces at
	// close time must not be reported as success.
	if err := f.Close(); err != nil {
		return fmt.Errorf("close audit log: %w", err)
	}
	return nil
}

// moduleRoot returns the nearest directory at or above the current working
// directory that contains a go.mod entry. If none is found before reaching
// the filesystem root it returns the working directory itself, and if the
// working directory cannot be determined it returns ".".
func moduleRoot() string {
	start, err := os.Getwd()
	if err != nil || start == "" {
		return "."
	}
	for cur := start; ; {
		if _, statErr := os.Stat(filepath.Join(cur, "go.mod")); statErr == nil {
			return cur
		}
		up := filepath.Dir(cur)
		if up == cur {
			// Dir is a fixed point only at the root: nothing left to search.
			return start
		}
		cur = up
	}
}
69 changes: 69 additions & 0 deletions tools/cmd/readability_extract/readability_extract_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package main_test

import (
"bytes"
"encoding/json"
"os/exec"
"strings"
"testing"

testutil "github.com/hyperifyio/goagent/tools/testutil"
)

// runTool executes the binary at bin with no arguments, feeding it input
// encoded as JSON on stdin. It returns the process's stdout and stderr, each
// with surrounding whitespace trimmed, and the error reported by Run.
// A failure to marshal input aborts the calling test.
func runTool(t *testing.T, bin string, input any) (string, string, error) {
	t.Helper()

	payload, err := json.Marshal(input)
	if err != nil {
		t.Fatalf("marshal: %v", err)
	}

	var outBuf, errBuf bytes.Buffer
	cmd := exec.Command(bin)
	cmd.Stdin, cmd.Stdout, cmd.Stderr = bytes.NewReader(payload), &outBuf, &errBuf

	runErr := cmd.Run()
	return strings.TrimSpace(outBuf.String()), strings.TrimSpace(errBuf.String()), runErr
}

// TestReadabilityExtract_Simple builds the tool, feeds it a small page with
// an <article> element, and checks that the run succeeds and that the output
// contains a "title" key and the word "Hello".
func TestReadabilityExtract_Simple(t *testing.T) {
	const page = `<!doctype html><html><head><title>Example</title></head><body><nav>Links</nav><article><h1>My Title</h1><p>Hello <b>world</b>.</p></article></body></html>`

	tool := testutil.BuildTool(t, "readability_extract")
	request := map[string]any{"html": page, "base_url": "https://example.org/page"}

	stdout, stderr, err := runTool(t, tool, request)
	if err != nil {
		t.Fatalf("run error: %v, stderr=%s", err, stderr)
	}
	switch {
	case !strings.Contains(stdout, "\"title\":"):
		t.Fatalf("missing title in output: %s", stdout)
	case !strings.Contains(stdout, "Hello"):
		t.Fatalf("expected extracted text to include 'Hello': %s", stdout)
	}
}

// TestReadabilityExtract_NavHeavy feeds the tool a page made of a navigation
// div and a content div, and checks that the run succeeds and that the
// heading and paragraph text of the content div appear in the output.
func TestReadabilityExtract_NavHeavy(t *testing.T) {
	const page = `<!doctype html><html><body><div id="nav">home | about | contact</div><div id="content"><h1>Article Heading</h1><p>Core content here.</p></div></body></html>`

	tool := testutil.BuildTool(t, "readability_extract")
	request := map[string]any{"html": page, "base_url": "https://example.org/x"}

	stdout, stderr, err := runTool(t, tool, request)
	if err != nil {
		t.Fatalf("run error: %v, stderr=%s", err, stderr)
	}
	switch {
	case !strings.Contains(stdout, "Article Heading"):
		t.Fatalf("expected heading present: %s", stdout)
	case !strings.Contains(stdout, "Core content here"):
		t.Fatalf("expected article text present: %s", stdout)
	}
}

// TestReadabilityExtract_LargeRejected sends an html payload one byte over
// 5 MiB and checks that the tool exits with an error whose stderr mentions
// "html too large".
func TestReadabilityExtract_LargeRejected(t *testing.T) {
	tool := testutil.BuildTool(t, "readability_extract")

	oversized := strings.Repeat("A", (5<<20)+1)
	request := map[string]any{"html": oversized, "base_url": "https://e/x"}

	stdout, stderr, err := runTool(t, tool, request)
	if err == nil {
		t.Fatalf("expected error for oversized html, got ok: %s", stdout)
	}
	if !strings.Contains(stderr, "html too large") {
		t.Fatalf("expected size error, got: %s", stderr)
	}
}