Add new fuzzy quote matching implementation

Implement a `matchQuote` function which will be used to replace `dom-anchor-text-quote` for finding the best match for annotation quotes in the document text. The new implementation is based on the `approx-string-match` library and provides several improvements over the existing one:

- Better performance when there are many differences between the quote and the closest document text
- It will be easier for us to tune the degree of mismatch allowed between the quote and document text, and how candidate matches are ranked
hypothesis · Dec 10, 2020 · 0920964 · 0920964
1 parent dd3fd83
commit 0920964
Show file tree

Hide file tree

Showing 4 changed files with 361 additions and 0 deletions.
diff --git a/package.json b/package.json
@@ -12,6 +12,7 @@
     "@babel/preset-react": "^7.0.0",
     "@octokit/rest": "^18.0.0",
     "@sentry/browser": "^5.6.2",
+    "approx-string-match": "^1.1.0",
     "autoprefixer": "^10.0.1",
     "aws-sdk": "^2.345.0",
     "axe-core": "^4.0.0",

diff --git a/src/annotator/anchoring/match-quote.js b/src/annotator/anchoring/match-quote.js
@@ -0,0 +1,157 @@
+import approxSearch from 'approx-string-match';
+
+/**
+ * @typedef {import('approx-string-match').Match} StringMatch
+ */
+
+/**
+ * @typedef Match
+ * @prop {number} start - Start offset of match in text
+ * @prop {number} end - End offset of match in text
+ * @prop {number} score -
+ *   Score for the match between 0 and 1.0, where 1.0 indicates a perfect match
+ *   for the quote and context.
+ */
+
+/**
+ * Find the best approximate matches for `str` in `text` allowing up to
+ * `maxErrors` errors.
+ *
+ * Exact occurrences (including overlapping ones) are looked for first with
+ * `String.prototype.indexOf`. Only if there are none is the more expensive
+ * approximate search from `approx-string-match` run.
+ *
+ * @param {string} text - Text to search in
+ * @param {string} str - Pattern to search for
+ * @param {number} maxErrors - Maximum number of errors allowed in an approximate match
+ * @return {StringMatch[]} -
+ *   Matches found. Exact matches are reported with `errors: 0`. Empty if
+ *   `str` is empty.
+ */
+function search(text, str, maxErrors) {
+  // An empty pattern is found by `indexOf` at every offset, and
+  // `indexOf('', pos)` clamps to `text.length` instead of returning -1, so the
+  // exact-match loop below would never terminate. Report no matches instead.
+  if (str.length === 0) {
+    return [];
+  }
+
+  // Do a fast search for exact matches. The `approx-string-match` library
+  // doesn't currently incorporate this optimization itself.
+  const exactMatches = [];
+  let matchPos = text.indexOf(str);
+  while (matchPos !== -1) {
+    exactMatches.push({
+      start: matchPos,
+      end: matchPos + str.length,
+      errors: 0,
+    });
+    // Resume one character later so that overlapping occurrences are found.
+    matchPos = text.indexOf(str, matchPos + 1);
+  }
+  if (exactMatches.length > 0) {
+    return exactMatches;
+  }
+
+  // If there are no exact matches, do a more expensive search for matches
+  // with errors.
+  return approxSearch(text, str, maxErrors);
+}
+
+/**
+ * Compute a score between 0 and 1.0 for the similarity between `text` and `str`.
+ *
+ * The score is `1 - errors / str.length`, where `errors` is taken from the
+ * first match that `search` returns for `str` in `text`.
+ *
+ * @param {string} text - Text that was actually found
+ * @param {string} str - Text that was expected
+ * @return {number} Similarity score; 0 if either argument is empty
+ */
+function textMatchScore(text, str) {
+  /* istanbul ignore next - `scoreMatch` will never pass an empty string */
+  if (str.length === 0) {
+    return 0.0;
+  }
+
+  // `text` is empty when the context window falls outside the document, eg.
+  // the text after a match at the very end. There is nothing to compare
+  // against, so score it as a complete mismatch instead of searching.
+  if (text.length === 0) {
+    return 0.0;
+  }
+
+  const matches = search(text, str, str.length);
+
+  // Don't rely on the search always yielding a candidate: reading
+  // `matches[0].errors` from an empty result would throw.
+  if (matches.length === 0) {
+    return 0.0;
+  }
+  return 1 - matches[0].errors / str.length;
+}
+
+/**
+ * Find the best approximate match for `quote` in `text`.
+ *
+ * Candidate matches are found based on edit distance to `quote`. They are then
+ * ranked by a weighted combination of quote similarity, similarity of the
+ * surrounding text to `context.prefix` and `context.suffix`, and proximity of
+ * the match to `context.hint`.
+ *
+ * Returns `null` if `quote` is empty or if no match exceeding the minimum
+ * quality threshold was found.
+ *
+ * @param {string} text - Document text to search
+ * @param {string} quote - String to find within `text`
+ * @param {Object} context -
+ *   Context in which the quote originally appeared. This is used to choose the
+ *   best match.
+ *   @param {string} [context.prefix] - Expected text before the quote
+ *   @param {string} [context.suffix] - Expected text after the quote
+ *   @param {number} [context.hint] - Expected offset of match within text
+ * @return {Match|null}
+ */
+export function matchQuote(text, quote, context = {}) {
+  if (quote.length === 0) {
+    return null;
+  }
+
+  // Choose the maximum number of errors to allow for the initial search.
+  // This choice involves a tradeoff between:
+  //
+  //  - Recall (proportion of "good" matches found)
+  //  - Precision (proportion of matches found which are "good")
+  //  - Cost of the initial search and of processing the candidate matches [1]
+  //
+  // [1] Specifically, the expected-time complexity of the initial search is
+  //     `O((maxErrors / 32) * text.length)`. See `approx-string-match` docs.
+  const maxErrors = Math.min(256, quote.length / 2);
+
+  // Find closest matches for `quote` in `text` based on edit distance.
+  const matches = search(text, quote, maxErrors);
+
+  if (matches.length === 0) {
+    // All matches had more than `maxErrors` errors.
+    return null;
+  }
+
+  /**
+   * Compute a score between 0 and 1.0 for a match candidate.
+   *
+   * @param {StringMatch} match
+   */
+  const scoreMatch = match => {
+    const quoteWeight = 50; // Similarity of matched text to quote.
+    const prefixWeight = 20; // Similarity of text before matched text to `context.prefix`.
+    const suffixWeight = 20; // Similarity of text after matched text to `context.suffix`.
+    const posWeight = 2; // Proximity to expected location. Used as a tie-breaker.
+
+    const quoteScore = 1 - match.errors / quote.length;
+
+    // Clamp the start of the prefix window to 0. A negative `start` argument
+    // to `slice` counts back from the end of `text`, so without the clamp a
+    // match closer to the start of the text than `context.prefix.length`
+    // would be compared against the wrong (typically empty) text.
+    const prefixScore = context.prefix
+      ? textMatchScore(
+          text.slice(
+            Math.max(0, match.start - context.prefix.length),
+            match.start
+          ),
+          context.prefix
+        )
+      : 1.0;
+
+    // No clamp is needed here: `slice` already stops at the end of `text`.
+    const suffixScore = context.suffix
+      ? textMatchScore(
+          text.slice(match.end, match.end + context.suffix.length),
+          context.suffix
+        )
+      : 1.0;
+
+    // Without a hint every candidate gets the full position score.
+    let posScore = 1.0;
+    if (typeof context.hint === 'number') {
+      const offset = Math.abs(match.start - context.hint);
+      posScore = 1.0 - offset / text.length;
+    }
+
+    const rawScore =
+      quoteWeight * quoteScore +
+      prefixWeight * prefixScore +
+      suffixWeight * suffixScore +
+      posWeight * posScore;
+    const maxScore = quoteWeight + prefixWeight + suffixWeight + posWeight;
+    const normalizedScore = rawScore / maxScore;
+
+    return normalizedScore;
+  };
+
+  // Rank matches based on similarity of actual and expected surrounding text
+  // and actual/expected offset in the document text.
+  const scoredMatches = matches.map(m => ({
+    start: m.start,
+    end: m.end,
+    score: scoreMatch(m),
+  }));
+
+  // Choose match with highest score. `Array.prototype.sort` is stable in
+  // ES2019+ engines, so among equal scores the candidate that came first in
+  // `matches` is kept.
+  scoredMatches.sort((a, b) => b.score - a.score);
+  return scoredMatches[0];
+}
diff --git a/src/annotator/anchoring/test/match-quote-test.js b/src/annotator/anchoring/test/match-quote-test.js
@@ -0,0 +1,198 @@
+import { matchQuote } from '../match-quote';
+
+// Sample passages used as the document text in the tests below. They are
+// written as multi-line template literals, so the raw values contain line
+// breaks and indentation.
+const fixtures = {
+  solitude: `Many years later, as he faced the firing squad,
+    Colonel Aureliano Buendía was to remember that distant afternoon
+    when his father took him to discover ice`,
+
+  twoCities: `It was the best of times, it was the worst of times,
+    it was the age of wisdom, it was the age of foolishness, it was the epoch of belief,
+    it was the epoch of incredulity, it was the season of Light, it was the
+    season of Darkness, it was the spring of hope, it was the winter of despair, we had
+    everything before us, we had nothing before us, we were all going direct to Heaven,
+    we were all going direct the other way.`,
+};
+
+/**
+ * Collapse every run of whitespace in `str` into a single space.
+ *
+ * @param {string} str
+ * @return {string}
+ */
+function normalize(str) {
+  const whitespaceRun = /\s+/g;
+  const collapsed = str.replace(whitespaceRun, ' ');
+  return collapsed;
+}
+
+// Replace each fixture with its whitespace-normalized form, so the line breaks
+// and indentation from the template literals become single spaces.
+Object.keys(fixtures).forEach(k => (fixtures[k] = normalize(fixtures[k])));
+
+// Tests for `matchQuote`, using the `fixtures` passages as the document text.
+describe('matchQuote', () => {
+  it('finds exact match', () => {
+    const match = matchQuote(fixtures.solitude, 'discover ice');
+    assert.equal(match.score, 1.0);
+    assert.equal(
+      fixtures.solitude.slice(match.start, match.end),
+      'discover ice'
+    );
+  });
+
+  it('finds best approximate match if there is no exact match', () => {
+    const match = matchQuote(fixtures.solitude, 'some years later');
+    assert.isTrue(match.score > 0);
+    assert.isTrue(match.score < 1);
+    assert.equal(
+      fixtures.solitude.slice(match.start, match.end),
+      'Many years later'
+    );
+  });
+
+  it('scores matches based on quote similarity', () => {
+    // List of quotes in descending order of similarity to the text.
+    const quotes = [
+      'Many years later',
+      'Many yers later',
+      'Some years later',
+      'Some years after',
+    ];
+
+    const scores = quotes.map(q => matchQuote(fixtures.solitude, q).score);
+
+    // Each score must be strictly lower than the one before it.
+    for (let i = 1; i < scores.length; i++) {
+      assert.isBelow(scores[i], scores[i - 1]);
+    }
+  });
+
+  it('scores matches based on prefix similarity', () => {
+    // List of prefixes in descending order of similarity to the actual prefix
+    // of the quote.
+    const prefixes = [
+      'Many years later',
+      'Many yers later',
+      'Some years later',
+      'Some years after',
+    ];
+
+    const quote = ', as he faced the firing squad';
+    const scores = prefixes.map(
+      p => matchQuote(fixtures.solitude, quote, { prefix: p }).score
+    );
+
+    // Each score must be strictly lower than the one before it.
+    for (let i = 1; i < scores.length; i++) {
+      assert.isBelow(scores[i], scores[i - 1]);
+    }
+  });
+
+  it('scores matches based on suffix similarity', () => {
+    // List of suffixes in descending order of similarity to the actual suffix
+    // of the quote.
+    const suffixes = [
+      ', as he faced the firing squad',
+      ', as she faced the firing squad',
+      ', as he awaited the firing squad',
+      ', as he awaited his death',
+    ];
+
+    const quote = 'Many years later';
+    const scores = suffixes.map(
+      s => matchQuote(fixtures.solitude, quote, { suffix: s }).score
+    );
+
+    // Each score must be strictly lower than the one before it.
+    for (let i = 1; i < scores.length; i++) {
+      assert.isBelow(scores[i], scores[i - 1]);
+    }
+  });
+
+  it('returns `null` if there is no acceptable approximate match', () => {
+    // Search one fixture passage for the whole of the other.
+    const match = matchQuote(fixtures.twoCities, fixtures.solitude);
+    assert.equal(match, null);
+  });
+
+  it('returns `null` if quote is empty', () => {
+    assert.equal(matchQuote('foobar', ''), null);
+  });
+
+  it('returns `null` if text is empty', () => {
+    assert.equal(matchQuote('', 'foobar'), null);
+  });
+
+  // Table-driven cases. Each `quote` occurs more than once in
+  // `fixtures.twoCities`; the `prefix` or `suffix` identifies which occurrence
+  // should be chosen. `expected` is a longer string that begins at the start
+  // of that occurrence, and is used to look up its offset.
+  [
+    // Exact prefix matches.
+    {
+      quote: 'before us',
+      prefix: 'we had everything',
+      expected: 'before us, we had nothing',
+    },
+    {
+      quote: 'before us',
+      prefix: 'we had nothing',
+      expected: 'before us, we were all going',
+    },
+
+    // Approximate prefix matches.
+    {
+      quote: 'before us',
+      prefix: 'we had every-thing',
+      expected: 'before us, we had nothing',
+    },
+    {
+      quote: 'before us',
+      prefix: 'we had nout',
+      expected: 'before us, we were all going',
+    },
+
+    // Exact suffix matches.
+    {
+      quote: 'we had',
+      suffix: 'everything',
+      expected: 'we had everything',
+    },
+    {
+      quote: 'we had',
+      suffix: 'nothing',
+      expected: 'we had nothing',
+    },
+
+    // Approximate suffix matches.
+    {
+      quote: 'we had',
+      suffix: 'ever ting',
+      expected: 'we had everything',
+    },
+    {
+      quote: 'we had',
+      suffix: 'nutting',
+      expected: 'we had nothing',
+    },
+  ].forEach(({ quote, prefix, suffix, expected }, i) => {
+    it(`finds match with best context match (${i})`, () => {
+      const text = fixtures.twoCities;
+      const match = matchQuote(text, quote, {
+        prefix,
+        suffix,
+      });
+      assert.ok(match);
+      // The matched text must be the quote itself, at the expected occurrence.
+      assert.equal(text.slice(match.start, match.end), quote);
+      assert.equal(match.start, text.indexOf(expected));
+    });
+  });
+
+  it('uses `hint` as a tie-breaker to choose between matches with close scores', () => {
+    const text = fixtures.twoCities;
+    // Offsets of the two occurrences of "before us" in the text.
+    const posA = text.indexOf('everything before us') + 'everything '.length;
+    const posB = text.indexOf('nothing before us') + 'nothing '.length;
+
+    // Search for a quote that appears multiple times in the text. Since no
+    // context is provided, there will be several matches with equal scores to
+    // choose between.
+    const matchHintA = matchQuote(text, 'befor us', { hint: posA });
+    const matchHintB = matchQuote(text, 'befor us', { hint: posB });
+    const matchNoHint = matchQuote(text, 'befor us');
+
+    // When a hint is provided, `matchQuote` should choose between otherwise
+    // equal matches based on how close the match start is to `hint`.
+    assert.ok(matchHintA);
+    assert.equal(matchHintA.start, posA, 'Wrong match for hint `posA`');
+
+    assert.ok(matchHintB);
+    assert.equal(matchHintB.start, posB, 'Wrong match for hint `posB`');
+
+    // When no hint is provided, the first match (ie. lowest `match.start`)
+    // should be chosen.
+    assert.ok(matchNoHint);
+    assert.equal(matchNoHint.start, posA, 'Wrong match with no hint');
+  });
+});
diff --git a/yarn.lock b/yarn.lock
@@ -1340,6 +1340,11 @@ append-buffer@^1.0.2:
   dependencies:
     buffer-equal "^1.0.0"
 
+approx-string-match@^1.1.0:
+  version "1.1.0"
+  resolved "https://registry.yarnpkg.com/approx-string-match/-/approx-string-match-1.1.0.tgz#2fb8e1d6dcb640acc1c0d1ae9f0895348d06f4c0"
+  integrity sha512-j1yQB9XhfGWsvTfHEuNsR/SrUT4XQDkAc0PEjMifyi97931LmNQyLsO6HbuvZ3HeMx+3Dvk8m8XGkUF+8lCeqw==
+
 archy@^1.0.0:
   version "1.0.0"
   resolved "https://registry.yarnpkg.com/archy/-/archy-1.0.0.tgz#f9c8c13757cc1dd7bc379ac77b2c62a5c2868c40"