Skip to content

Commit

Permalink
Add new fuzzy quote matching implementation
Browse files Browse the repository at this point in the history
Implement a `matchQuote` function which will be used to replace
`dom-anchor-text-quote` for finding the best match for annotation quotes
in the document text.

The new implementation is based on the `approx-string-match` library and
provides several improvements over the existing one:

 - Better performance when there are many differences between the quote
   and the closest document text

 - It will be easier for us to tune the degree of mismatch allowed
   between the quote and document text and how candidate matches are
   ranked
  • Loading branch information
robertknight committed Dec 10, 2020
1 parent dd3fd83 commit 0920964
Show file tree
Hide file tree
Showing 4 changed files with 361 additions and 0 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"@babel/preset-react": "^7.0.0",
"@octokit/rest": "^18.0.0",
"@sentry/browser": "^5.6.2",
"approx-string-match": "^1.1.0",
"autoprefixer": "^10.0.1",
"aws-sdk": "^2.345.0",
"axe-core": "^4.0.0",
Expand Down
157 changes: 157 additions & 0 deletions src/annotator/anchoring/match-quote.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import approxSearch from 'approx-string-match';

/**
* @typedef {import('approx-string-match').Match} StringMatch
*/

/**
* @typedef Match
* @prop {number} start - Start offset of match in text
* @prop {number} end - End offset of match in text
* @prop {number} score -
* Score for the match between 0 and 1.0, where 1.0 indicates a perfect match
* for the quote and context.
*/

/**
 * Find the best approximate matches for `str` in `text` allowing up to
 * `maxErrors` errors.
 *
 * Exact occurrences are looked up first with `String.prototype.indexOf`. If
 * any exist they are returned (each with `errors: 0`) and the approximate
 * search is skipped entirely. Otherwise the result of `approxSearch` is
 * returned unchanged.
 *
 * @param {string} text - Text to search in
 * @param {string} str - Pattern to look for
 * @param {number} maxErrors - Maximum errors passed on to `approxSearch`
 * @return {StringMatch[]} Matches found; empty if `str` is empty
 */
function search(text, str, maxErrors) {
  // An empty pattern cannot produce a useful match. It must also never reach
  // the scan below: `text.indexOf('', pos)` clamps to `text.length` instead of
  // returning -1, so the loop would never terminate.
  if (str.length === 0) {
    return [];
  }

  // Do a fast search for exact matches. The `approx-string-match` library
  // doesn't currently incorporate this optimization itself.
  const exactMatches = [];
  let matchPos = text.indexOf(str);
  while (matchPos !== -1) {
    exactMatches.push({
      start: matchPos,
      end: matchPos + str.length,
      errors: 0,
    });
    // Advance by one character rather than `str.length` so that overlapping
    // occurrences are reported as well.
    matchPos = text.indexOf(str, matchPos + 1);
  }
  if (exactMatches.length > 0) {
    return exactMatches;
  }

  // If there are no exact matches, do a more expensive search for matches
  // with errors.
  return approxSearch(text, str, maxErrors);
}

/**
 * Compute a score between 0 and 1.0 for the similarity between `text` and `str`.
 *
 * The score is `1 - errors / str.length`, where `errors` is the error count
 * of the first match that `search` returns when allowed up to `str.length`
 * errors.
 *
 * @param {string} text - Text to compare against
 * @param {string} str - Expected string
 * @return {number} Similarity score; 0 if either argument is empty
 */
function textMatchScore(text, str) {
  // Nothing can be matched when either side is empty. An empty `text` occurs
  // when a quote match sits at the very start or end of the document, so
  // there is no surrounding text to compare with the expected context.
  // Without this guard `matches[0]` below could be read from an empty result
  // list and throw.
  if (str.length === 0 || text.length === 0) {
    return 0.0;
  }

  // Allowing up to `str.length` errors is intended to guarantee at least one
  // match for a non-empty text.
  const matches = search(text, str, str.length);
  return 1 - matches[0].errors / str.length;
}

/**
 * Find the best approximate match for `quote` in `text`.
 *
 * Candidate matches are found by edit distance and then ranked by a weighted
 * combination of: similarity to the quote, similarity of the surrounding
 * text to `context.prefix` / `context.suffix`, and proximity to
 * `context.hint`.
 *
 * Returns `null` if no match exceeding the minimum quality threshold was found.
 *
 * @param {string} text - Document text to search
 * @param {string} quote - String to find within `text`
 * @param {Object} context -
 *   Context in which the quote originally appeared. This is used to choose the
 *   best match.
 *   @param {string} [context.prefix] - Expected text before the quote
 *   @param {string} [context.suffix] - Expected text after the quote
 *   @param {number} [context.hint] - Expected offset of match within text
 * @return {Match|null}
 */
export function matchQuote(text, quote, context = {}) {
  if (quote.length === 0) {
    return null;
  }

  // Choose the maximum number of errors to allow for the initial search.
  // This choice involves a tradeoff between:
  //
  //  - Recall (proportion of "good" matches found)
  //  - Precision (proportion of matches found which are "good")
  //  - Cost of the initial search and of processing the candidate matches [1]
  //
  // [1] Specifically, the expected-time complexity of the initial search is
  //     `O((maxErrors / 32) * text.length)`. See `approx-string-match` docs.
  const maxErrors = Math.min(256, quote.length / 2);

  // Find closest matches for `quote` in `text` based on edit distance.
  const matches = search(text, quote, maxErrors);

  if (matches.length === 0) {
    // All matches had more than `maxErrors` errors.
    return null;
  }

  /**
   * Compute a score between 0 and 1.0 for a match candidate.
   *
   * @param {StringMatch} match
   */
  const scoreMatch = match => {
    const quoteWeight = 50; // Similarity of matched text to quote.
    const prefixWeight = 20; // Similarity of text before matched text to `context.prefix`.
    const suffixWeight = 20; // Similarity of text after matched text to `context.suffix`.
    const posWeight = 2; // Proximity to expected location. Used as a tie-breaker.

    const quoteScore = 1 - match.errors / quote.length;

    // Clamp the slice start at 0. When the match is closer to the start of
    // the document than the prefix is long, an unclamped start would be
    // negative and `slice` would count it from the END of the text, which
    // discards the leading text that is actually available.
    const prefixScore = context.prefix
      ? textMatchScore(
          text.slice(
            Math.max(0, match.start - context.prefix.length),
            match.start
          ),
          context.prefix
        )
      : 1.0;

    // No clamping is needed here: `slice` already caps an end index that is
    // past the end of the text.
    const suffixScore = context.suffix
      ? textMatchScore(
          text.slice(match.end, match.end + context.suffix.length),
          context.suffix
        )
      : 1.0;

    let posScore = 1.0;
    if (typeof context.hint === 'number') {
      const offset = Math.abs(match.start - context.hint);
      posScore = 1.0 - offset / text.length;
    }

    const rawScore =
      quoteWeight * quoteScore +
      prefixWeight * prefixScore +
      suffixWeight * suffixScore +
      posWeight * posScore;
    const maxScore = quoteWeight + prefixWeight + suffixWeight + posWeight;
    const normalizedScore = rawScore / maxScore;

    return normalizedScore;
  };

  // Rank matches based on similarity of actual and expected surrounding text
  // and actual/expected offset in the document text.
  const scoredMatches = matches.map(m => ({
    start: m.start,
    end: m.end,
    score: scoreMatch(m),
  }));

  // Choose match with highest score. The sort is stable, so among equally
  // scored candidates the one that `search` returned first wins.
  scoredMatches.sort((a, b) => b.score - a.score);
  return scoredMatches[0];
}
198 changes: 198 additions & 0 deletions src/annotator/anchoring/test/match-quote-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import { matchQuote } from '../match-quote';

// Sample document texts used as search targets by the tests in this file.
// The line breaks inside the template literals are part of the string values.
const fixtures = {
solitude: `Many years later, as he faced the firing squad,
Colonel Aureliano Buendía was to remember that distant afternoon
when his father took him to discover ice`,

twoCities: `It was the best of times, it was the worst of times,
it was the age of wisdom, it was the age of foolishness, it was the epoch of belief,
it was the epoch of incredulity, it was the season of Light, it was the
season of Darkness, it was the spring of hope, it was the winter of despair, we had
everything before us, we had nothing before us, we were all going direct to Heaven,
we were all going direct the other way.`,
};

/**
 * Collapse every run of whitespace in `str` into a single space character.
 *
 * Leading and trailing whitespace runs are collapsed too, not removed.
 *
 * @param {string} str
 * @return {string}
 */
function normalize(str) {
  const whitespaceRun = /\s+/;
  const pieces = str.split(whitespaceRun);
  return pieces.join(' ');
}

// Replace each fixture text with its whitespace-normalized form.
for (const fixtureName of Object.keys(fixtures)) {
  fixtures[fixtureName] = normalize(fixtures[fixtureName]);
}

describe('matchQuote', () => {
  /**
   * Assert that every entry of `values` is strictly below the entry before it.
   *
   * @param {number[]} values
   */
  const assertStrictlyDescending = values => {
    values.slice(1).forEach((value, idx) => {
      assert.isBelow(value, values[idx]);
    });
  };

  it('finds exact match', () => {
    const text = fixtures.solitude;
    const result = matchQuote(text, 'discover ice');
    assert.equal(result.score, 1.0);
    assert.equal(text.slice(result.start, result.end), 'discover ice');
  });

  it('finds best approximate match if there is no exact match', () => {
    const text = fixtures.solitude;
    const result = matchQuote(text, 'some years later');
    assert.isTrue(result.score > 0);
    assert.isTrue(result.score < 1);
    assert.equal(text.slice(result.start, result.end), 'Many years later');
  });

  it('scores matches based on quote similarity', () => {
    // Quotes ordered from most to least similar to the document text.
    const rankedQuotes = [
      'Many years later',
      'Many yers later',
      'Some years later',
      'Some years after',
    ];

    const scores = rankedQuotes.map(
      candidate => matchQuote(fixtures.solitude, candidate).score
    );

    assertStrictlyDescending(scores);
  });

  it('scores matches based on prefix similarity', () => {
    // Prefixes ordered from most to least similar to the text that actually
    // precedes the quote.
    const rankedPrefixes = [
      'Many years later',
      'Many yers later',
      'Some years later',
      'Some years after',
    ];
    const quote = ', as he faced the firing squad';

    const scores = rankedPrefixes.map(candidate => {
      const result = matchQuote(fixtures.solitude, quote, {
        prefix: candidate,
      });
      return result.score;
    });

    assertStrictlyDescending(scores);
  });

  it('scores matches based on suffix similarity', () => {
    // Suffixes ordered from most to least similar to the text that actually
    // follows the quote.
    const rankedSuffixes = [
      ', as he faced the firing squad',
      ', as she faced the firing squad',
      ', as he awaited the firing squad',
      ', as he awaited his death',
    ];
    const quote = 'Many years later';

    const scores = rankedSuffixes.map(candidate => {
      const result = matchQuote(fixtures.solitude, quote, {
        suffix: candidate,
      });
      return result.score;
    });

    assertStrictlyDescending(scores);
  });

  it('returns `null` if there is no acceptable approximate match', () => {
    const result = matchQuote(fixtures.twoCities, fixtures.solitude);
    assert.equal(result, null);
  });

  it('returns `null` if quote is empty', () => {
    const result = matchQuote('foobar', '');
    assert.equal(result, null);
  });

  it('returns `null` if text is empty', () => {
    const result = matchQuote('', 'foobar');
    assert.equal(result, null);
  });

  // Each case searches `fixtures.twoCities` for `quote`, which occurs more
  // than once, and expects the occurrence beginning at the start of
  // `expected` to be chosen thanks to the supplied prefix or suffix.
  const contextCases = [
    // Exact prefix matches.
    {
      quote: 'before us',
      prefix: 'we had everything',
      expected: 'before us, we had nothing',
    },
    {
      quote: 'before us',
      prefix: 'we had nothing',
      expected: 'before us, we were all going',
    },

    // Approximate prefix matches.
    {
      quote: 'before us',
      prefix: 'we had every-thing',
      expected: 'before us, we had nothing',
    },
    {
      quote: 'before us',
      prefix: 'we had nout',
      expected: 'before us, we were all going',
    },

    // Exact suffix matches.
    {
      quote: 'we had',
      suffix: 'everything',
      expected: 'we had everything',
    },
    {
      quote: 'we had',
      suffix: 'nothing',
      expected: 'we had nothing',
    },

    // Approximate suffix matches.
    {
      quote: 'we had',
      suffix: 'ever ting',
      expected: 'we had everything',
    },
    {
      quote: 'we had',
      suffix: 'nutting',
      expected: 'we had nothing',
    },
  ];

  contextCases.forEach((testCase, index) => {
    it(`finds match with best context match (${index})`, () => {
      const { quote, prefix, suffix, expected } = testCase;
      const text = fixtures.twoCities;

      const result = matchQuote(text, quote, { prefix, suffix });

      assert.ok(result);
      assert.equal(text.slice(result.start, result.end), quote);
      assert.equal(result.start, text.indexOf(expected));
    });
  });

  it('uses `hint` as a tie-breaker to choose between matches with close scores', () => {
    const text = fixtures.twoCities;
    const posA = text.indexOf('everything before us') + 'everything '.length;
    const posB = text.indexOf('nothing before us') + 'nothing '.length;

    // The quote below has several equally good candidate locations in the
    // text. No prefix or suffix is supplied, so those candidates receive
    // equal scores unless a hint separates them.
    const quote = 'befor us';
    const resultHintA = matchQuote(text, quote, { hint: posA });
    const resultHintB = matchQuote(text, quote, { hint: posB });
    const resultNoHint = matchQuote(text, quote);

    // With a hint, the candidate whose start is nearest to `hint` should win.
    assert.ok(resultHintA);
    assert.equal(resultHintA.start, posA, 'Wrong match for hint `posA`');

    assert.ok(resultHintB);
    assert.equal(resultHintB.start, posB, 'Wrong match for hint `posB`');

    // Without a hint, the first candidate (ie. lowest `match.start`) should
    // be chosen.
    assert.ok(resultNoHint);
    assert.equal(resultNoHint.start, posA, 'Wrong match with no hint');
  });
});
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,11 @@ append-buffer@^1.0.2:
dependencies:
buffer-equal "^1.0.0"

approx-string-match@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/approx-string-match/-/approx-string-match-1.1.0.tgz#2fb8e1d6dcb640acc1c0d1ae9f0895348d06f4c0"
integrity sha512-j1yQB9XhfGWsvTfHEuNsR/SrUT4XQDkAc0PEjMifyi97931LmNQyLsO6HbuvZ3HeMx+3Dvk8m8XGkUF+8lCeqw==

archy@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/archy/-/archy-1.0.0.tgz#f9c8c13757cc1dd7bc379ac77b2c62a5c2868c40"
Expand Down

0 comments on commit 0920964

Please sign in to comment.