internal/frontend/markdown.go

// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package frontend

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"strings"

	"github.com/google/safehtml/template"
	"github.com/khulnasoft-lab/godep/internal"
	"github.com/khulnasoft-lab/godep/internal/derrors"
	"github.com/khulnasoft-lab/godep/internal/log"
	"github.com/khulnasoft-lab/godep/internal/source"
	"rsc.io/markdown"
)

// ProcessReadme processes the README of unit u, if it has one.
// Processing includes rendering and sanitizing the HTML or Markdown,
// and extracting headings and links.
//
// Headings are prefixed with "readme-" and heading levels are adjusted to start
// at h3 in order to nest them properly within the rest of the page. The
// readme's original styling is preserved in the html by giving headings a css
// class styled identical to their original heading level.
//
// The extracted links are for display outside of the readme contents.
//
// This function is exported for use by external tools.
func ProcessReadme(ctx context.Context, u *internal.Unit) (_ *Readme, err error) {
	defer derrors.WrapAndReport(&err, "ProcessReadme(%q, %q, %q)", u.Path, u.ModulePath, u.Version)
	return processReadme(ctx, u.Readme, u.SourceInfo)
}

func processReadme(ctx context.Context, readme *internal.Readme, info *source.Info) (frontendReadme *Readme, err error) {
	if readme == nil || readme.Contents == "" {
		return &Readme{}, nil
	}
	if !isMarkdown(readme.Filepath) {
		t := template.Must(template.New("").Parse(`<pre class="readme">{{.}}</pre>`))
		h, err := t.ExecuteToHTML(readme.Contents)
		if err != nil {
			return nil, err
		}
		return &Readme{HTML: h}, nil
	}

	p := markdown.Parser{
		HeadingIDs:    true,
		Strikethrough: true,
		TaskListItems: true,
		AutoLinkText:  true,
		Table:         true,
		Emoji:         true,
	}
	doc := p.Parse(readme.Contents)
	(&linkRewriter{info, readme}).rewriteLinks(doc)
	rewriteImgSrc(doc, info, readme)
	rewriteHeadingIDs(doc) // rewrite heading ids before extractTOC extracts them
	et := &extractTOC{ctx: ctx, removeTitle: true}
	et.extract(doc)
	el := &extractLinks{ctx: ctx}
	el.extract(doc)
	transformHeadingsToHTML(doc)
	var buf bytes.Buffer
	doc.PrintHTML(&buf)
	return &Readme{
		HTML:    sanitizeHTML(&buf),
		Outline: et.Headings,
		Links:   el.links,
	}, nil
}

// rewriteImgSrc rewrites the HTML in the markdown document to replace img
// src keys with a value that properly represents the source of the image
// from the repo.
func rewriteImgSrc(doc *markdown.Document, info *source.Info, readme *internal.Readme) {
	walkBlocks(doc.Blocks, func(b markdown.Block) error {
		switch x := b.(type) {
		case *markdown.HTMLBlock:
			htmlBlock := x
			for i := range htmlBlock.Text {
				translated, err := translateHTML([]byte(htmlBlock.Text[i]), info, readme)
				if err != nil {
					continue
				}
				htmlBlock.Text[i] = string(translated)
			}
		case *markdown.Text:
			rewriteHtmlInline(x.Inline, info, readme)
		}
		return nil
	})
}

func rewriteHtmlInline(inlines []markdown.Inline, info *source.Info, readme *internal.Readme) {
	for _, inl := range inlines {
		if htmlTag, ok := inl.(*markdown.HTMLTag); ok {
			translated, err := translateHTML([]byte(htmlTag.Text), info, readme)
			if err != nil {
				continue
			}
			htmlTag.Text = string(translated)
		}
	}
}

var errSkipChildren = errors.New("skip children")

// walkBlocks calls walkFunc on all the blocks in the markdown document. If the
// walkFunc returns the errSkipChildren error the children of that block will be skipped.
func walkBlocks(blocks []markdown.Block, walkFunc func(b markdown.Block) error) error {
	for _, b := range blocks {
		err := walkFunc(b)
		if err == errSkipChildren {
			continue
		} else if err != nil {
			return err
		}

		err = nil
		switch x := b.(type) {
		case *markdown.Document:
			err = walkBlocks(x.Blocks, walkFunc)
		case *markdown.Text:
		case *markdown.Paragraph:
			err = walkBlocks([]markdown.Block{x.Text}, walkFunc)
		case *markdown.Heading:
			err = walkBlocks([]markdown.Block{x.Text}, walkFunc)
		case *markdown.List:
			err = walkBlocks(x.Items, walkFunc)
		case *markdown.Item:
			err = walkBlocks(x.Blocks, walkFunc)
		case *markdown.Quote:
			err = walkBlocks(x.Blocks, walkFunc)
		case *markdown.HTMLBlock:
		case *markdown.CodeBlock:
		case *markdown.Empty:
		case *markdown.Table:
			for _, t := range x.Header {
				walkBlocks([]markdown.Block{t}, walkFunc)
			}
			for _, r := range x.Rows {
				for _, t := range r {
					walkBlocks([]markdown.Block{t}, walkFunc)
				}
			}
		case *markdown.ThematicBreak:
		default:
			return fmt.Errorf("unhandled block type %T", x)
		}
		if err != nil {
			return err
		}
	}
	return nil
}

type extractTOC struct {
	ctx         context.Context
	Headings    []*Heading
	removeTitle bool // omit title from TOC
}

// extract collects the headings from a readme into an outline
// of the document. It nests the headings based on the h-level hierarchy.
// See tests for heading levels in TestReadme for behavior.
func (e *extractTOC) extract(doc *markdown.Document) {
	var headings []*Heading
	err := walkBlocks(doc.Blocks, func(b markdown.Block) error {
		if heading, ok := b.(*markdown.Heading); ok {
			var textbuf bytes.Buffer
			for _, t := range heading.Text.Inline {
				t.PrintText(&textbuf)
			}
			section := &Heading{
				Level: heading.Level,
				Text:  textbuf.String(),
			}
			section.ID = heading.ID
			headings = append(headings, section)
			return errSkipChildren
		}
		return nil
	})
	if err != nil {
		log.Errorf(e.ctx, "extractTOC.extract: %v", err)
	}

	// We nest the headings by walking through the list we extracted and
	// establishing parent child relationships based on heading levels.
	var nested []*Heading
	for i, h := range headings {
		if i == 0 {
			nested = append(nested, h)
			continue
		}
		parent := headings[i-1]
		for parent != nil && parent.Level >= h.Level {
			parent = parent.parent
		}
		if parent == nil {
			nested = append(nested, h)
		} else {
			h.parent = parent
			parent.Children = append(parent.Children, h)
		}
	}
	if e.removeTitle {
		// If there is only one top tevel heading with 1 or more children we
		// assume it is the title of the document and remove it from the TOC.
		if len(nested) == 1 && len(nested[0].Children) > 0 {
			nested = nested[0].Children
		}
	}
	e.Headings = nested
}

type extractLinks struct {
	ctx            context.Context
	inLinksHeading bool
	links          []link
}

// The name of the heading from which we extract links.
const linkHeadingText = "Links"

var linkHeadingBytes = []byte(linkHeadingText) // for faster comparison to node contents

// extract extracts links from the "Links" section of a README.
func (e *extractLinks) extract(doc *markdown.Document) {
	var seenLinksHeading bool
	err := walkBlocks(doc.Blocks, func(b markdown.Block) error {
		switch x := b.(type) {
		case *markdown.Heading:
			// We are in the links heading from the point we see a heading with
			// linkHeadingText until the point we see the next heading.
			if e.inLinksHeading {
				e.inLinksHeading = false
			}
			var headingText bytes.Buffer
			for _, t := range x.Text.Inline {
				t.PrintText(&headingText)
			}
			if !seenLinksHeading && bytes.Equal(headingText.Bytes(), linkHeadingBytes) {
				seenLinksHeading = true
				e.inLinksHeading = true
			}
		case *markdown.Item:
			// When in the links heading, extract links from list items.
			if !e.inLinksHeading {
				return errSkipChildren
			}
			// We expect the pattern: ListItem -> TextBlock -> Link, with no
			// other children.
			if len(x.Blocks) == 0 {
				return errSkipChildren
			}
			if tb, ok := x.Blocks[0].(*markdown.Text); ok {
				if len(tb.Inline) != 1 {
					return errSkipChildren
				}
				if l, ok := tb.Inline[0].(*markdown.Link); ok {
					// Record the link.
					var linkText bytes.Buffer
					for _, t := range l.Inner {
						t.PrintText(&linkText)
					}
					e.links = append(e.links, link{
						Href: l.URL,
						Body: linkText.String(),
					})
				}
			}
			return errSkipChildren
		}
		return nil
	})
	if err != nil {
		log.Errorf(e.ctx, "extractLinks.extract: %v", err)
	}
}

// linkRewriter rewrites links and image targets in a markdown document
// using translateLink.
type linkRewriter struct {
	info   *source.Info
	readme *internal.Readme
}

func (g *linkRewriter) rewriteLinks(doc *markdown.Document) {
	walkBlocks(doc.Blocks, func(b markdown.Block) error {
		if text, ok := b.(*markdown.Text); ok {
			g.rewriteLinksInline(text.Inline)
		}
		return nil
	})
}

func (g *linkRewriter) rewriteLinksInline(inlines []markdown.Inline) {
	for _, inl := range inlines {
		switch x := inl.(type) {
		case *markdown.Link:
			g.rewriteLinksInline(x.Inner)
			if d := translateLink(x.URL, g.info, false, g.readme); d != "" {
				x.URL = d
			}
		case *markdown.Image:
			g.rewriteLinksInline(x.Inner)
			if d := translateLink(x.URL, g.info, true, g.readme); d != "" {
				x.URL = d
			}
		case *markdown.Emph:
			g.rewriteLinksInline(x.Inner)
		case *markdown.Strong:
			g.rewriteLinksInline(x.Inner)
		}

	}
}

// transformHeadingsToHTML replaces heading blocks with rendered html
// blocks for the heading. It converts heading levels above 6 to divs
// with the h[level] class set on them.
func transformHeadingsToHTML(doc *markdown.Document) {
	firstHeading := true
	offset := 0
	var rewriteHeadingsBlocks func([]markdown.Block)
	rewriteHeadingsBlocks = func(blocks []markdown.Block) {
		for i, b := range blocks {
			switch x := b.(type) {
			case *markdown.Text:
			case *markdown.HTMLBlock:
			case *markdown.Table:
			case *markdown.Empty:
			case *markdown.CodeBlock:
			case *markdown.ThematicBreak:
			case *markdown.Paragraph:
				rewriteHeadingsBlocks([]markdown.Block{x.Text})
			case *markdown.List:
				rewriteHeadingsBlocks(x.Items)
			case *markdown.Item:
				rewriteHeadingsBlocks(x.Blocks)
			case *markdown.Quote:
				rewriteHeadingsBlocks(x.Blocks)
			case *markdown.Heading:
				heading := x
				if firstHeading {
					// The offset ensures the first heading is always an <h3>.
					offset = 3 - heading.Level
					firstHeading = false
				}
				newLevel := heading.Level + offset

				htmltag := &markdown.HTMLBlock{}
				var buf bytes.Buffer
				// TODO(matloob): Do we want the div and h elements to have analogous classes?
				// Currently we're using newLevel for the div's class but n.Level for the h element's
				// class.
				if newLevel > 6 {
					fmt.Fprintf(&buf, `<div class="h%d" role="heading" aria-level="%d"`, newLevel, heading.Level)
				} else {
					fmt.Fprintf(&buf, `<h%d class="h%d"`, newLevel, heading.Level)
				}
				if heading.ID != "" {
					fmt.Fprintf(&buf, ` id="%s"`, htmlQuoteEscaper.Replace(heading.ID))
				}
				buf.WriteByte('>')
				heading.Text.PrintHTML(&buf)
				if newLevel > 6 {
					_, _ = buf.WriteString("</div>")
				} else {
					fmt.Fprintf(&buf, "</h%d>", newLevel)
				}
				htmltag.Text = append(htmltag.Text, buf.String())
				blocks[i] = htmltag
			}
		}
	}
	rewriteHeadingsBlocks(doc.Blocks)
}

var htmlQuoteEscaper = strings.NewReplacer(
	"\"", "&quot;",
	"&", "&amp;",
	"<", "&lt;",
	">", "&gt;",
)

// rewriteHeadingIDs generates ids based on the body of the heading.
// The ASCII letters and numbers from the text are used to generate
// each of the ids. Finally, all heading ids
// are prefixed with "readme-" to avoid name collisions with other ids on the
// unit page. Duplicated heading ids are given an incremental suffix. See
// readme_test.go for examples.
func rewriteHeadingIDs(doc *markdown.Document) {
	ids := map[string]bool{}

	generateID := func(heading *markdown.Heading) string {
		var buf bytes.Buffer
		for _, inl := range heading.Text.Inline {
			inl.PrintText(&buf)
		}
		f := func(c rune) bool {
			return !('a' <= c && c <= 'z') && !('A' <= c && c <= 'Z') && !('0' <= c && c <= '9')
		}
		str := strings.Join(strings.FieldsFunc(buf.String(), f), "-")
		str = strings.ToLower(str)
		if len(str) == 0 {
			str = "heading"
		}
		key := str
		for i := 1; ; i++ {
			if _, ok := ids[key]; !ok {
				ids[key] = true
				break
			}
			key = fmt.Sprintf("%s-%d", str, i)
		}
		return "readme-" + key
	}

	walkBlocks(doc.Blocks, func(b markdown.Block) error {
		if heading, ok := b.(*markdown.Heading); ok {
			id := generateID(heading)
			heading.ID = string(id)
		}
		return nil
	})
}