graph/formats/rdf/equi_canonical.go

// Copyright ©2021 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package rdf

import (
	"errors"
	"sort"
)

// Throughout, the comments refer to doi:10.1145/3068333 which should be
// understood as a synonym for http://aidanhogan.com/docs/rdf-canonicalisation.pdf
// although there are differences between the two, see http://aidanhogan.com/#errataH17.
// Where there are differences, the document at http://aidanhogan.com/ is the
// canonical truth. The DOI reference is referred to for persistence.

// Lean returns an RDF core of g that entails g. If any statement in g carries
// a non-empty graph label, Lean returns a non-nil error together with a core
// of g computed as if no graph labels were present.
//
// See http://aidanhogan.com/docs/rdf-canonicalisation.pdf for details of
// the algorithm.
func Lean(g []*Statement) ([]*Statement, error) {
	// BUG(kortschak): Graph leaning does not take into account graph label terms
	// since the formal semantics for a multiple graph data model have not been
	// defined. See https://www.w3.org/TR/rdf11-datasets/#declaring.

	var (
		sawBlank bool
		labelErr error
	)
	for _, stmt := range g {
		if !sawBlank {
			sawBlank = isBlank(stmt.Subject.Value) || isBlank(stmt.Object.Value)
		}
		if labelErr == nil && stmt.Label.Value != "" {
			labelErr = errors.New("rdf: data-set contains graph names")
		}
		if sawBlank && labelErr != nil {
			// Both facts are established, so the remaining statements
			// cannot change the outcome.
			break
		}
	}
	if !sawBlank {
		// Only graphs with blank nodes are handed to lean.
		return g, labelErr
	}
	return lean(&dfs{}, g), labelErr
}

// removeRedundantBnodes removes blank nodes whose edges are a subset of
// another term in the RDF graph. The input slice is copied before any
// statement is removed, so the caller's slice is left untouched. Statements
// are removed by swapping with the last element, so the order of the
// returned statements may differ from the order in g.
//
// This is algorithm 4 in doi:10.1145/3068333.
func removeRedundantBnodes(g []*Statement) []*Statement {
	g = append(g[:0:0], g...)
	for {
		// Gather, for every subject and object term, the set of edges
		// leaving ("+") and entering ("-") that term.
		edges := make(map[string]map[triple]bool)
		link := func(term string, e triple) {
			set := edges[term]
			if set == nil {
				set = make(map[triple]bool)
				edges[term] = set
			}
			set[e] = true
		}
		for _, s := range g {
			sub, prd, obj := s.Subject.Value, s.Predicate.Value, s.Object.Value
			link(sub, triple{prd, obj, "+"})
			link(obj, triple{prd, sub, "-"})
		}

		// Partition the terms of the graph. Non-blank terms start out as
		// seen; blank nodes are marked seen as they are examined below.
		seen := make(map[string]bool)
		blanks := make(map[string]bool)
		terms := make(map[string]bool)
		for _, s := range g {
			for _, t := range [...]string{s.Subject.Value, s.Predicate.Value, s.Object.Value} {
				terms[t] = true
				if isBlank(t) {
					blanks[t] = true
					continue
				}
				seen[t] = true
			}
		}

		// A blank node is redundant when its edges are a proper subset of
		// another term's edges, or equal to the edges of a term that has
		// already been seen. The latter condition keeps the first visited
		// of a group of blank nodes with identical edges from being
		// removed on account of the others.
		redundant := make(map[string]bool)
		for x := range blanks {
			ex := edges[x]
			for xp := range terms {
				exp := edges[xp]
				if isProperSubset(ex, exp) || (seen[xp] && isEqualEdges(ex, exp)) {
					redundant[x] = true
					break
				}
			}
			seen[x] = true
		}

		// Drop every statement that mentions a redundant blank node.
		before := len(g)
		i := 0
		for i < len(g) {
			s := g[i]
			if redundant[s.Subject.Value] || redundant[s.Object.Value] {
				last := len(g) - 1
				g[i] = g[last]
				g = g[:last]
				continue
			}
			i++
		}
		if len(g) == before {
			// Nothing was removed in this pass; a fixed point is reached.
			return g
		}
	}
}

// triple is an ordered tuple of three strings. It is used as a map key, both
// for (predicate, neighbour, direction) edge descriptors and for
// (subject, predicate, object) statement patterns.
type triple [3]string

// isProperSubset returns whether a is a proper subset of b: b must hold
// more entries than a, and every key of a must be present in b with a
// true value.
func isProperSubset(a, b map[triple]bool) bool {
	if len(a) >= len(b) {
		return false
	}
	for edge := range a {
		if !b[edge] {
			return false
		}
	}
	return true
}

// isEqualEdges returns whether a and b hold the same number of entries and
// every key of a is present in b with a true value.
func isEqualEdges(a, b map[triple]bool) bool {
	same := len(a) == len(b)
	if same {
		for edge := range a {
			if !b[edge] {
				same = false
				break
			}
		}
	}
	return same
}

// findCandidates finds candidates for blank nodes and blank nodes that are fixed.
//
// The graph is first reduced with removeRedundantBnodes. The returned values
// are, in order: the reduced graph; the set of fixed blank nodes, which is nil
// when there are none; the candidate terms for each blank node of the reduced
// graph; and whether every blank node of the reduced graph is fixed.
//
// This is algorithm 5 in doi:10.1145/3068333.
func findCandidates(g []*Statement) ([]*Statement, map[string]bool, map[string]map[string]bool, bool) {
	g = removeRedundantBnodes(g)

	// edges indexes every statement of g, along with wildcard forms in which
	// "*" stands in for the subject or for the object. terms collects every
	// term used in g. f records for each blank node whether it is initially
	// fixed; a blank node is not fixed if it occurs in a statement in which
	// both the subject and the object are blank.
	edges := make(map[triple]bool)
	terms := make(map[string]bool)
	f := make(map[string]bool)
	for _, s := range g {
		sub := s.Subject.Value
		prd := s.Predicate.Value
		obj := s.Object.Value

		terms[sub] = true
		terms[prd] = true
		terms[obj] = true

		edges[triple{sub, prd, obj}] = true
		edges[triple{sub, prd, "*"}] = true
		edges[triple{"*", prd, obj}] = true
		switch {
		case isBlank(sub) && isBlank(obj):
			f[sub] = false
			f[obj] = false
		case isBlank(sub):
			if _, ok := f[sub]; !ok {
				f[sub] = true
			}
		case isBlank(obj):
			if _, ok := f[obj]; !ok {
				f[obj] = true
			}
		}
	}
	// Retain only the blank nodes that are fixed.
	for k, v := range f {
		if !v {
			delete(f, k)
		}
	}
	if len(f) == 0 {
		f = nil
	}

	// A fixed blank node may only map to itself. Every other blank node
	// starts with all terms of the graph as candidates. Each unfixed blank
	// node gets its own copy of the term set since the sets are pruned
	// independently below. The term set is collected once above rather than
	// being rebuilt from g for every blank node occurrence.
	cands := make(map[string]map[string]bool)
	bnodes := make(map[string]bool)
	for _, s := range g {
		for _, b := range []string{
			s.Subject.Value,
			s.Object.Value,
		} {
			if !isBlank(b) || bnodes[b] {
				continue
			}
			bnodes[b] = true
			if f[b] {
				cands[b] = map[string]bool{b: true}
				continue
			}
			c := make(map[string]bool, len(terms))
			for t := range terms {
				c[t] = true
			}
			cands[b] = c
		}
	}
	if isEqualTerms(f, bnodes) {
		return g, f, cands, true
	}

	for {
		// bb is the set of blank nodes that are not yet fixed.
		bb := make(map[string]bool)
		for b := range bnodes {
			if !f[b] {
				bb[b] = true
			}
		}
		for b := range bb {
			for x := range cands[b] {
				if x == b {
					continue
				}
				// x cannot stand in for b if b has an outgoing edge to
				// an IRI, literal or fixed blank node that x lacks, or
				// an outgoing edge to an unfixed blank node with a
				// predicate for which x has no outgoing edge.
				for _, s := range g {
					if s.Subject.Value != b {
						continue
					}
					prd := s.Predicate.Value
					obj := s.Object.Value
					if (inILF(obj, f) && !edges[triple{x, prd, obj}]) || (bb[obj] && !edges[triple{x, prd, "*"}]) {
						delete(cands[b], x)
						break
					}
				}
				if !cands[b][x] {
					continue
				}
				// The same test is applied to the incoming edges of b.
				for _, s := range g {
					if s.Object.Value != b {
						continue
					}
					sub := s.Subject.Value
					prd := s.Predicate.Value
					if (inIF(sub, f) && !edges[triple{sub, prd, x}]) || (bb[sub] && !edges[triple{"*", prd, x}]) {
						delete(cands[b], x)
						break
					}
				}
			}
		}

		// Build the new fixed set from the previous one so that the two
		// can be compared to detect a fixed point.
		fp := f
		f = make(map[string]bool)
		for b := range fp {
			f[b] = true
		}
		for b := range bb { // Mark newly fixed blank nodes.
			if len(cands[b]) == 1 && cands[b][b] {
				f[b] = true
			}
		}
		allFixed := isEqualTerms(f, bnodes)
		if isEqualTerms(fp, f) || allFixed {
			if len(f) == 0 {
				f = nil
			}
			return g, f, cands, allFixed
		}
	}
}

// inILF returns whether t is an IRI, a literal, or a member of the fixed
// set f.
func inILF(t string, f map[string]bool) bool {
	switch {
	case isIRI(t), isLiteral(t):
		return true
	default:
		return f[t]
	}
}

// inIF returns whether t is an IRI or a member of the fixed set f.
func inIF(t string, f map[string]bool) bool {
	if isIRI(t) {
		return true
	}
	return f[t]
}

// dfs is a depth-first search strategy. It carries no state; the search is
// implemented by its evaluate, search and join methods.
type dfs struct{}

// lean returns a core of the RDF graph g using the given strategy. The graph
// is returned unchanged if it has no blank nodes. If all blank nodes are
// fixed after candidate selection, or no statement joins two blank nodes,
// the graph reduced by findCandidates is returned without a search.
//
// This is lines 1-9 of algorithm 6 in doi:10.1145/3068333.
func lean(strategy *dfs, g []*Statement) []*Statement {
	hasBnode := false
	for _, s := range g {
		if isBlank(s.Subject.Value) || isBlank(s.Object.Value) {
			hasBnode = true
			break
		}
	}
	if !hasBnode {
		return g
	}

	g, fixed, cands, allFixed := findCandidates(g)
	if allFixed {
		return g
	}

	// A search is only needed when a statement joins two blank nodes.
	joined := false
	for _, s := range g {
		if isBlank(s.Subject.Value) && isBlank(s.Object.Value) {
			joined = true
			break
		}
	}
	if !joined {
		return g
	}

	// Seed the mapping with the identity on the fixed blank nodes.
	mu := make(map[string]string, len(fixed))
	for b := range fixed {
		mu[b] = b
	}
	return applyMu(g, findCoreEndomorphism(strategy, g, cands, mu))
}

// findCoreEndomorphism returns a core solution using the given strategy.
// The query handed to the strategy consists of the distinct statements of g
// that join two blank nodes, ordered from most to least selective.
//
// This is lines 10-14 of algorithm 6 in doi:10.1145/3068333.
func findCoreEndomorphism(strategy *dfs, g []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string {
	var patterns []*Statement
	predCount := make(map[string]int)
	queued := make(map[triple]bool)
	for _, s := range g {
		predCount[s.Predicate.Value]++
		if !isBlank(s.Subject.Value) || !isBlank(s.Object.Value) {
			continue
		}
		key := triple{s.Subject.Value, s.Predicate.Value, s.Object.Value}
		if queued[key] {
			// Statements differing only in their label are queried once.
			continue
		}
		queued[key] = true
		patterns = append(patterns, s)
	}
	sort.Slice(patterns, func(i, j int) bool {
		si := selectivity(patterns[i], cands, predCount)
		sj := selectivity(patterns[j], cands, predCount)
		return si < sj
	})
	return strategy.evaluate(g, patterns, cands, mu)
}

// selectivity returns the selectivity heuristic score for s. Lower scores
// are more selective. The score is the smaller of the number of candidate
// pairs for the subject and object of s, and the number of statements that
// use the predicate of s as counted in preds.
func selectivity(s *Statement, cands map[string]map[string]bool, preds map[string]int) int {
	pairs := len(cands[s.Subject.Value]) * len(cands[s.Object.Value])
	uses := preds[s.Predicate.Value]
	return min(pairs, uses)
}

// evaluate returns an endomorphism using a DFS strategy. While the mapping
// found sends more than one term to the same target, the search is repeated
// on the graph obtained by applying the mapping, and the mappings are merged.
//
// This is lines 25-32 of algorithm 6 in doi:10.1145/3068333.
func (st *dfs) evaluate(g, q []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string {
	mu = st.search(g, q, cands, mu)
	for {
		if len(codom(mu)) == len(mu) {
			// Distinct terms have distinct targets; nothing more to reduce.
			return mu
		}
		seed := fixedFrom(cands)
		next := findCoreEndomorphism(st, applyMu(g, mu), cands, seed)
		if isAutomorphism(next) {
			// The reduced graph admits no further reduction.
			return mu
		}
		// Carry over the entries of mu for terms that next does not map.
		for b, x := range mu {
			if _, mapped := next[b]; mapped {
				continue
			}
			next[b] = x
		}
		mu = next
	}
}

// fixedFrom returns the identity mapping over the terms of cands whose only
// candidate is the term itself.
func fixedFrom(cands map[string]map[string]bool) map[string]string {
	identity := make(map[string]string)
	for b, set := range cands {
		if len(set) != 1 || !set[b] {
			continue
		}
		identity[b] = b
	}
	return identity
}

// applyMu applies mu to g returning the result. The subject and object of
// each statement are translated through mu, while the predicate and label
// values are copied. Statements that become identical are emitted once, in
// order of first occurrence. The statements of g are not modified.
func applyMu(g []*Statement, mu map[string]string) []*Statement {
	// store is allocated with full capacity so that it is never grown and
	// the element pointers held by out remain valid.
	store := make([]Statement, 0, len(g))
	out := make([]*Statement, 0, len(g))
	emitted := make(map[Statement]bool)
	for _, s := range g {
		var n Statement
		n.Subject.Value = translate(s.Subject.Value, mu)
		n.Predicate.Value = s.Predicate.Value
		n.Object.Value = translate(s.Object.Value, mu)
		n.Label.Value = s.Label.Value
		if emitted[n] {
			continue
		}
		emitted[n] = true
		store = append(store, n)
		out = append(out, &store[len(store)-1])
	}
	return out
}

// search returns a minimum endomorphism using a DFS strategy.
//
// The first pattern of q is joined against g to extend mu, and the
// extensions are tried in the order given by sortByCodom against the
// remaining patterns. The first result that is not an automorphism is
// returned. If there is none, the last extension tried is returned. A nil
// mapping is returned if the first pattern has no match. If q is empty, mu
// is returned unchanged.
//
// This is lines 33-46 of algorithm 6 in doi:10.1145/3068333.
func (st *dfs) search(g, q []*Statement, cands map[string]map[string]bool, mu map[string]string) map[string]string {
	if len(q) == 0 {
		// No patterns remain to be matched, so mu is already complete.
		// Without this guard, indexing q below panics when the graph
		// handed to findCoreEndomorphism has no statement joining two
		// blank nodes; evaluate can produce such a graph by applying a
		// mapping that sends those blank nodes to ground terms.
		return mu
	}

	m := st.join(q[0], g, cands, mu)
	if len(m) == 0 {
		// Early exit if no mapping found.
		return nil
	}
	sortByCodom(m)

	rest := q[1:]
	if len(rest) == 0 {
		return m[0]
	}
	var last map[string]string
	for _, cand := range m {
		last = cand
		if mup := st.search(g, rest, cands, cand); !isAutomorphism(mup) {
			return mup
		}
	}
	return last
}

// isAutomorphism returns whether mu is an automorphism, this is equivalent to
// dom(mu) == codom(mu).
func isAutomorphism(mu map[string]string) bool {
	return isEqualTerms(dom(mu), codom(mu))
}

// dom returns the domain of mu, the set of its keys. The result is never nil.
func dom(mu map[string]string) map[string]bool {
	keys := make(map[string]bool, len(mu))
	for from := range mu {
		keys[from] = true
	}
	return keys
}

// codom returns the codomain of mu, the set of its values. The result is
// never nil.
func codom(mu map[string]string) map[string]bool {
	values := make(map[string]bool, len(mu))
	for _, to := range mu {
		values[to] = true
	}
	return values
}

// isEqualTerms returns whether a and b are identical: they must hold the
// same number of entries and every key of a must be present in b with a
// true value.
func isEqualTerms(a, b map[string]bool) bool {
	same := len(a) == len(b)
	if same {
		for term := range a {
			if !b[term] {
				same = false
				break
			}
		}
	}
	return same
}

// sortByCodom performs a sort of maps ordered by fewest blank nodes in
// codomain, then fewest self mappings.
func sortByCodom(maps []map[string]string) {
	m := orderedByCodom{
		maps:  maps,
		attrs: make([]attrs, len(maps)),
	}
	for i, mu := range maps {
		m.attrs[i].blanks = make(map[string]bool)
		for x, y := range mu {
			if isBlank(y) {
				m.attrs[i].blanks[y] = true
			}
			if x == y {
				m.attrs[i].selfs++
			}
		}
	}
	sort.Sort(m)
}

// orderedByCodom implements sort.Interface over a collection of mappings,
// ordering them by fewest blank nodes in the codomain and then by fewest
// self mappings. attrs[i] holds the sort attributes of maps[i].
type orderedByCodom struct {
	maps  []map[string]string
	attrs []attrs
}

// attrs holds the sort attributes of a single mapping.
type attrs struct {
	// blanks is the set of blank nodes in the codomain of the mapping.
	blanks map[string]bool
	// selfs is the number of terms the mapping sends to themselves.
	selfs int
}

// Len returns the number of mappings.
func (m orderedByCodom) Len() int { return len(m.maps) }

// Less returns whether mapping i sorts before mapping j.
func (m orderedByCodom) Less(i, j int) bool {
	ai, aj := m.attrs[i], m.attrs[j]
	if bi, bj := len(ai.blanks), len(aj.blanks); bi != bj {
		return bi < bj
	}
	return ai.selfs < aj.selfs
}

// Swap exchanges mappings i and j together with their attributes.
func (m orderedByCodom) Swap(i, j int) {
	mi, ai := m.maps[i], m.attrs[i]
	m.maps[i], m.attrs[i] = m.maps[j], m.attrs[j]
	m.maps[j], m.attrs[j] = mi, ai
}

// join evaluates the given pattern, q, joining with solutions in m.
// This takes only a single mapping and so only works for the DFS strategy.
//
// Each statement of g that matches q and is compatible with m yields one
// mapping in the result, holding the entries of m extended with the
// bindings for the subject and object of q. The result is nil if no
// statement matches. m is not modified.
//
// This is lines 47-51 of algorithm 6 in doi:10.1145/3068333.
func (st *dfs) join(q *Statement, g []*Statement, cands map[string]map[string]bool, m map[string]string) []map[string]string {
	qs, qp, qo := q.Subject.Value, q.Predicate.Value, q.Object.Value
	selfLoop := qs == qo
	subCands, objCands := cands[qs], cands[qo]
	boundSub, hasSub := m[qs]
	boundObj, hasObj := m[qo]

	var out []map[string]string
	for _, s := range g {
		ss, so := s.Subject.Value, s.Object.Value
		switch {
		case s.Predicate.Value != qp:
			// µ(q) ∈ G ↔ (µ(q_s),q_p,µ(q_o)) ∈ G, so the predicates
			// must agree.
			continue
		case selfLoop && ss != so:
			// q_s = q_o requires µ(q_s) = µ(q_o).
			continue
		case !subCands[ss] || !objCands[so]:
			// For all b ∈ bnodes({q}), µ(b) ∈ cands[b].
			continue
		case hasSub && boundSub != ss:
			// The subject binding is incompatible with m.
			continue
		case !selfLoop && hasObj && boundObj != so:
			// The object binding is incompatible with m.
			continue
		}

		// μ₁ ∪ μ₂: the bindings for q together with the entries of m.
		mu := make(map[string]string, len(m)+2)
		mu[qs] = ss
		if !selfLoop {
			mu[qo] = so
		}
		for b, mb := range m {
			mu[b] = mb
		}
		out = append(out, mu)
	}
	return out
}