From b80fc6091291adec3dab595431a2d2f8f76e0bb1 Mon Sep 17 00:00:00 2001
From: aibuddy
Date: Tue, 19 Aug 2025 16:34:41 +0000
Subject: [PATCH] tool(metadata_extract): add metadata extract tool with tests

---
Review notes (ignored by git am):
  * Rebuilt from a whitespace-collapsed, tag-stripped copy of the patch. The
    author e-mail was lost in that copy, the HTML fixture in the test is a
    reconstruction, and the blob id on the first index line predates this
    revision.
  * Offsets are now computed on an ASCII-lowered copy so they stay valid for
    the original HTML, tags and attributes are matched as whole names,
    ">" inside quoted values no longer truncates a tag, "jsonld" encodes as
    [] when empty, the stderr error object is always valid JSON, and the
    audit record carries the real elapsed time.

 .../cmd/metadata_extract/metadata_extract.go  | 337 ++++++++++++++++++
 .../metadata_extract/metadata_extract_test.go |  60 +++
 2 files changed, 397 insertions(+)
 create mode 100644 tools/cmd/metadata_extract/metadata_extract.go
 create mode 100644 tools/cmd/metadata_extract/metadata_extract_test.go

diff --git a/tools/cmd/metadata_extract/metadata_extract.go b/tools/cmd/metadata_extract/metadata_extract.go
new file mode 100644
index 0000000..4fbe9d3
--- /dev/null
+++ b/tools/cmd/metadata_extract/metadata_extract.go
@@ -0,0 +1,337 @@
+// Command metadata_extract reads a JSON request ({"html", "base_url"}) from
+// stdin, extracts OpenGraph, Twitter Card and JSON-LD metadata from the HTML
+// and writes the result to stdout as a single JSON object.
+package main
+
+import (
+	"bufio"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+// input is the JSON request read from stdin.
+type input struct {
+	HTML    string `json:"html"`
+	BaseURL string `json:"base_url"`
+}
+
+// output is the JSON response written to stdout.
+type output struct {
+	OpenGraph map[string]any `json:"opengraph"`
+	Twitter   map[string]any `json:"twitter"`
+	JSONLD    []any          `json:"jsonld"`
+}
+
+// main runs the tool. On failure it prints a one-line JSON error object to
+// stderr and exits with status 1.
+func main() {
+	if err := run(); err != nil {
+		msg := strings.ReplaceAll(err.Error(), "\n", " ")
+		// Marshal rather than %q: Go quoting can emit escapes (\x00, \U...)
+		// that are not valid JSON.
+		b, merr := json.Marshal(map[string]string{"error": msg})
+		if merr != nil {
+			b = []byte(`{"error":"internal error"}`)
+		}
+		fmt.Fprintf(os.Stderr, "%s\n", b)
+		os.Exit(1)
+	}
+}
+
+// run decodes and validates the request, extracts the metadata, writes the
+// JSON response to stdout and appends a best-effort audit record.
+func run() error {
+	start := time.Now()
+	in, err := decodeInput()
+	if err != nil {
+		return err
+	}
+	if strings.TrimSpace(in.HTML) == "" {
+		return errors.New("html is required")
+	}
+	if strings.TrimSpace(in.BaseURL) == "" {
+		return errors.New("base_url is required")
+	}
+	if u, perr := url.Parse(in.BaseURL); perr != nil || u.Scheme == "" || u.Host == "" {
+		return errors.New("base_url must be an absolute URL")
+	}
+
+	og, tw, ld := extractMetadata(in.HTML)
+
+	out := output{OpenGraph: og, Twitter: tw, JSONLD: ld}
+	enc := json.NewEncoder(os.Stdout)
+	enc.SetEscapeHTML(false)
+	if err := enc.Encode(out); err != nil {
+		return fmt.Errorf("encode json: %w", err)
+	}
+	// Auditing is best-effort: the response is already written, so a failed
+	// audit write must not turn the run into an error.
+	_ = appendAudit(map[string]any{ //nolint:errcheck
+		"ts":   time.Now().UTC().Format(time.RFC3339Nano),
+		"tool": "metadata_extract",
+		"ms":   time.Since(start).Milliseconds(),
+	})
+	return nil
+}
+
+// decodeInput reads one JSON object from stdin into an input value.
+func decodeInput() (input, error) {
+	var in input
+	dec := json.NewDecoder(bufio.NewReader(os.Stdin))
+	if err := dec.Decode(&in); err != nil {
+		return in, fmt.Errorf("parse json: %w", err)
+	}
+	return in, nil
+}
+
+// extractMetadata scans html for <meta> tags and JSON-LD <script> blocks.
+// It returns the OpenGraph properties (property="og:..."), the Twitter Card
+// properties (name="twitter:...") and the decoded JSON-LD documents, with
+// top-level JSON arrays flattened. Keys keep their original casing and a
+// later duplicate overwrites an earlier one. All three results are non-nil,
+// so they encode as {} or [] rather than null.
+func extractMetadata(html string) (map[string]any, map[string]any, []any) {
+	og := map[string]any{}
+	tw := map[string]any{}
+	ld := []any{}
+
+	// Lower-case ASCII only: strings.ToLower can change the byte length of
+	// non-ASCII text, which would misalign the offsets shared with html.
+	lower := asciiLower(html)
+
+	// Pass 1: <meta> tags.
+	for idx := 0; ; {
+		start, end := findTag(html, lower, "<meta", idx)
+		if start < 0 {
+			break
+		}
+		idx = end + 1
+		tag := html[start:idx]
+		c := attrValue(tag, "content")
+		if c == "" {
+			continue
+		}
+		if p := attrValue(tag, "property"); strings.HasPrefix(strings.ToLower(p), "og:") {
+			og[p] = c
+		}
+		if n := attrValue(tag, "name"); strings.HasPrefix(strings.ToLower(n), "twitter:") {
+			tw[n] = c
+		}
+	}
+
+	// Pass 2: <script type="application/ld+json"> blocks.
+	for idx := 0; ; {
+		start, end := findTag(html, lower, "<script", idx)
+		if start < 0 {
+			break
+		}
+		idx = end + 1
+		t := attrValue(html[start:idx], "type")
+		if !strings.EqualFold(strings.TrimSpace(t), "application/ld+json") {
+			continue
+		}
+		rel := strings.Index(lower[idx:], "</script")
+		if rel < 0 {
+			continue // unterminated script: keep scanning after the open tag
+		}
+		payload := strings.TrimSpace(html[idx : idx+rel])
+		idx += rel + len("</script")
+		var v any
+		if err := json.Unmarshal([]byte(payload), &v); err != nil {
+			continue // malformed JSON-LD is skipped, not fatal
+		}
+		if arr, ok := v.([]any); ok {
+			ld = append(ld, arr...)
+		} else {
+			ld = append(ld, v)
+		}
+	}
+
+	return og, tw, ld
+}
+
+// findTag locates the next opening tag that starts with open (for example
+// "<meta") at or after offset from. lower must be asciiLower(html). It
+// returns the offsets of the tag's "<" and of its closing ">", or -1, -1
+// when no complete tag remains.
+func findTag(html, lower, open string, from int) (int, int) {
+	for from < len(lower) {
+		rel := strings.Index(lower[from:], open)
+		if rel < 0 {
+			return -1, -1
+		}
+		start := from + rel
+		after := start + len(open)
+		// Require a delimiter so "<meta" does not match "<metadata".
+		if after < len(lower) && !isSpace(lower[after]) && lower[after] != '>' && lower[after] != '/' {
+			from = after
+			continue
+		}
+		end := tagEnd(html[start:])
+		if end < 0 {
+			return -1, -1
+		}
+		return start, start + end
+	}
+	return -1, -1
+}
+
+// tagEnd returns the offset in s of the ">" that closes the tag starting at
+// s[0], or -1 if there is none. A ">" inside a quoted attribute value is
+// skipped; a quote only opens a value directly after "=". If a quote is
+// never closed, tagEnd falls back to the first ">" so that one malformed tag
+// cannot hide the rest of the document.
+func tagEnd(s string) int {
+	var quote, prev byte
+	for i := 0; i < len(s); i++ {
+		c := s[i]
+		switch {
+		case quote != 0:
+			if c == quote {
+				quote = 0
+			}
+		case (c == '"' || c == '\'') && prev == '=':
+			quote = c
+		case c == '>':
+			return i
+		}
+		if !isSpace(c) {
+			prev = c
+		}
+	}
+	if quote != 0 {
+		return strings.IndexByte(s, '>')
+	}
+	return -1
+}
+
+// attrValue returns the value of attribute key in tag, an opening tag such
+// as `<meta name="x" content="y">`, or "" when the attribute is absent or
+// has no value. Names are matched case-insensitively and as whole names, so
+// key "name" never matches "data-name" or text inside another attribute's
+// value. Double-quoted, single-quoted and unquoted values are accepted, with
+// optional whitespace around "=".
+func attrValue(tag string, key string) string {
+	i, n := 0, len(tag)
+	// Skip "<" and the element name.
+	if i < n && tag[i] == '<' {
+		for i < n && !isSpace(tag[i]) && tag[i] != '>' && tag[i] != '/' {
+			i++
+		}
+	}
+	for i < n {
+		for i < n && (isSpace(tag[i]) || tag[i] == '/') {
+			i++
+		}
+		if i >= n || tag[i] == '>' {
+			break
+		}
+		nameStart := i
+		for i < n && !isSpace(tag[i]) && tag[i] != '=' && tag[i] != '>' && tag[i] != '/' {
+			i++
+		}
+		name := tag[nameStart:i]
+		for i < n && isSpace(tag[i]) {
+			i++
+		}
+		if i >= n || tag[i] != '=' {
+			continue // attribute without a value
+		}
+		i++ // consume "="
+		for i < n && isSpace(tag[i]) {
+			i++
+		}
+		var val string
+		if i < n && (tag[i] == '"' || tag[i] == '\'') {
+			quote := tag[i]
+			i++
+			end := strings.IndexByte(tag[i:], quote)
+			if end < 0 {
+				return "" // unterminated quoted value
+			}
+			val = tag[i : i+end]
+			i += end + 1
+		} else {
+			valStart := i
+			for i < n && !isSpace(tag[i]) && tag[i] != '>' {
+				i++
+			}
+			val = tag[valStart:i]
+		}
+		if strings.EqualFold(name, key) {
+			return val
+		}
+	}
+	return ""
+}
+
+// asciiLower lower-cases the ASCII letters of s and leaves every other byte
+// untouched, so the result always has the same byte length as s.
+func asciiLower(s string) string {
+	b := []byte(s)
+	for i, c := range b {
+		if 'A' <= c && c <= 'Z' {
+			b[i] = c + ('a' - 'A')
+		}
+	}
+	return string(b)
+}
+
+// isSpace reports whether c is an HTML whitespace byte.
+func isSpace(c byte) bool {
+	return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f'
+}
+
+// appendAudit writes entry as one NDJSON line to
+// .goagent/audit/YYYYMMDD.log (UTC date) under the module root, creating
+// the directory and file as needed.
+func appendAudit(entry any) error {
+	b, err := json.Marshal(entry)
+	if err != nil {
+		return err
+	}
+	root := moduleRoot()
+	dir := filepath.Join(root, ".goagent", "audit")
+	if err := os.MkdirAll(dir, 0o755); err != nil {
+		return err
+	}
+	fname := time.Now().UTC().Format("20060102") + ".log"
+	path := filepath.Join(dir, fname)
+	f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
+	if err != nil {
+		return err
+	}
+	if _, err := f.Write(append(b, '\n')); err != nil {
+		_ = f.Close() // the write error is the one worth reporting
+		return err
+	}
+	// Close can surface a deferred write failure, so report it.
+	return f.Close()
+}
+
+// moduleRoot walks upward from the working directory to the first directory
+// containing go.mod. It falls back to the working directory when none is
+// found, and to "." when the working directory cannot be determined.
+func moduleRoot() string {
+	cwd, err := os.Getwd()
+	if err != nil || cwd == "" {
+		return "."
+	}
+	dir := cwd
+	for {
+		if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil {
+			return dir
+		}
+		parent := filepath.Dir(dir)
+		if parent == dir {
+			return cwd
+		}
+		dir = parent
+	}
+}
diff --git a/tools/cmd/metadata_extract/metadata_extract_test.go b/tools/cmd/metadata_extract/metadata_extract_test.go
new file mode 100644
index 0000000..48d92a1
--- /dev/null
+++ b/tools/cmd/metadata_extract/metadata_extract_test.go
@@ -0,0 +1,60 @@
+package main_test
+
+import (
+	"bytes"
+	"encoding/json"
+	"os/exec"
+	"strings"
+	"testing"
+
+	testutil "github.com/hyperifyio/goagent/tools/testutil"
+)
+
+func runTool(t *testing.T, bin string, input any) (string, string, error) {
+	t.Helper()
+	data, err := json.Marshal(input)
+	if err != nil {
+		t.Fatalf("marshal: %v", err)
+	}
+	cmd := exec.Command(bin)
+	cmd.Stdin = bytes.NewReader(data)
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+	err = cmd.Run()
+	return strings.TrimSpace(stdout.String()), strings.TrimSpace(stderr.String()), err
+}
+
+func TestMetadataExtract_ParsesOGTwitterJSONLD(t *testing.T) {
+	bin := testutil.BuildTool(t, "metadata_extract")
+	html := `<!doctype html><html><head>
+<meta property="og:title" content="Hello OG">
+<meta name="twitter:card" content="summary">
+<script type="application/ld+json">{"@context":"https://schema.org","@type":"Article","headline":"Hello"}</script>
+</head><body>hi</body></html>`
+	in := map[string]any{"html": html, "base_url": "https://example.org/page"}
+	outStr, errStr, err := runTool(t, bin, in)
+	if err != nil {
+		t.Fatalf("run error: %v, stderr=%s", err, errStr)
+	}
+	if !strings.Contains(outStr, "\"opengraph\"") {
+		t.Fatalf("expected opengraph in output: %s", outStr)
+	}
+	if !strings.Contains(outStr, "\"twitter\"") {
+		t.Fatalf("expected twitter in output: %s", outStr)
+	}
+	if !strings.Contains(outStr, "\"jsonld\"") {
+		t.Fatalf("expected jsonld in output: %s", outStr)
+	}
+}
+
+func TestMetadataExtract_RequiresInputs(t *testing.T) {
+	bin := testutil.BuildTool(t, "metadata_extract")
+	_, errStr, err := runTool(t, bin, map[string]any{"html": "", "base_url": ""})
+	if err == nil {
+		t.Fatalf("expected error for missing inputs")
+	}
+	if !strings.Contains(errStr, "required") {
+		t.Fatalf("expected required error, got: %s", errStr)
+	}
+}