Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new fuzzy quote matching implementation #2814

Merged
merged 1 commit into from
Dec 11, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"@babel/preset-react": "^7.0.0",
"@octokit/rest": "^18.0.0",
"@sentry/browser": "^5.6.2",
"approx-string-match": "^1.1.0",
"autoprefixer": "^10.0.1",
"aws-sdk": "^2.345.0",
"axe-core": "^4.0.0",
Expand Down
158 changes: 158 additions & 0 deletions src/annotator/anchoring/match-quote.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import approxSearch from 'approx-string-match';

/**
* @typedef {import('approx-string-match').Match} StringMatch
*/

/**
* @typedef Match
* @prop {number} start - Start offset of match in text
* @prop {number} end - End offset of match in text
* @prop {number} score -
* Score for the match between 0 and 1.0, where 1.0 indicates a perfect match
* for the quote and context.
*/

/**
 * Find the best approximate matches for `str` in `text` allowing up to
 * `maxErrors` errors.
 *
 * Exact occurrences are looked up first with `String.prototype.indexOf`. If
 * at least one exists, only the exact occurrences (including overlapping
 * ones) are returned, each with `errors: 0`. Otherwise the search is delegated
 * to `approx-string-match`.
 *
 * @param {string} text - Text to search in
 * @param {string} str - Pattern to look for
 * @param {number} maxErrors - Maximum number of errors passed to the
 *   approximate search
 * @return {StringMatch[]} Matches found; empty if `str` is empty
 */
function search(text, str, maxErrors) {
  // An empty pattern has nothing to locate. Without this guard the exact-match
  // loop below would never terminate: `indexOf('', pos)` clamps `pos` to
  // `text.length` and keeps reporting a "match" there.
  if (str.length === 0) {
    return [];
  }

  // Do a fast search for exact matches. The `approx-string-match` library
  // doesn't currently incorporate this optimization itself.
  const exactMatches = [];
  let matchPos = text.indexOf(str);
  while (matchPos !== -1) {
    exactMatches.push({
      start: matchPos,
      end: matchPos + str.length,
      errors: 0,
    });
    // Advance by a single character so that overlapping occurrences are
    // reported too.
    matchPos = text.indexOf(str, matchPos + 1);
  }
  if (exactMatches.length > 0) {
    return exactMatches;
  }

  // If there are no exact matches, do a more expensive search for matches
  // with errors.
  return approxSearch(text, str, maxErrors);
}

/**
 * Compute a score between 0 and 1.0 for the similarity between `text` and `str`.
 *
 * The score is `1 - errors / str.length`, where `errors` is the error count of
 * the first match returned by `search` when up to `str.length` errors are
 * allowed.
 *
 * @param {string} text - Text actually found in the document
 * @param {string} str - Text that was expected
 * @return {number} Similarity score; 0 if either argument is empty
 */
function textMatchScore(text, str) {
  // With an empty `text` there is nothing for `str` to match against, and an
  // empty `str` would make the score a division by zero. An empty `text` is
  // reachable in practice: the context window sliced out by the caller is
  // empty when the candidate match sits at the very start or end of the
  // document. Bail out here rather than indexing into an empty result below.
  if (str.length === 0 || text.length === 0) {
    return 0.0;
  }
  const matches = search(text, str, str.length);

  // prettier-ignore
  return 1 - (matches[0].errors / str.length);
}

/**
 * Find the best approximate match for `quote` in `text`.
 *
 * Candidate matches are found by edit distance and then ranked by a weighted
 * combination of: similarity to the quote, similarity of the surrounding text
 * to `context.prefix` / `context.suffix`, and proximity to `context.hint`.
 *
 * Returns `null` if `quote` is empty or if the search yields no candidate
 * within the allowed number of errors.
 *
 * @param {string} text - Document text to search
 * @param {string} quote - String to find within `text`
 * @param {Object} context -
 *   Context in which the quote originally appeared. This is used to choose the
 *   best match.
 *   @param {string} [context.prefix] - Expected text before the quote
 *   @param {string} [context.suffix] - Expected text after the quote
 *   @param {number} [context.hint] - Expected offset of match within text
 * @return {Match|null}
 */
export function matchQuote(text, quote, context = {}) {
  if (quote.length === 0) {
    return null;
  }

  // Choose the maximum number of errors to allow for the initial search.
  // This choice involves a tradeoff between:
  //
  //  - Recall (proportion of "good" matches found)
  //  - Precision (proportion of matches found which are "good")
  //  - Cost of the initial search and of processing the candidate matches [1]
  //
  // [1] Specifically, the expected-time complexity of the initial search is
  //     `O((maxErrors / 32) * text.length)`. See `approx-string-match` docs.
  const maxErrors = Math.min(256, quote.length / 2);

  // Find closest matches for `quote` in `text` based on edit distance.
  const matches = search(text, quote, maxErrors);

  if (matches.length === 0) {
    return null;
  }

  /**
   * Compute a score between 0 and 1.0 for a match candidate.
   *
   * @param {StringMatch} match
   */
  const scoreMatch = match => {
    const quoteWeight = 50; // Similarity of matched text to quote.
    const prefixWeight = 20; // Similarity of text before matched text to `context.prefix`.
    const suffixWeight = 20; // Similarity of text after matched text to `context.suffix`.
    const posWeight = 2; // Proximity to expected location. Used as a tie-breaker.

    const quoteScore = 1 - match.errors / quote.length;

    // The window of text preceding the match is clamped at the start of the
    // document. A negative start index would make `slice` count from the END
    // of `text`, yielding an unrelated (usually empty) window whenever the
    // match is closer to the start than `context.prefix.length`.
    const prefixScore = context.prefix
      ? textMatchScore(
          text.slice(
            Math.max(0, match.start - context.prefix.length),
            match.start
          ),
          context.prefix
        )
      : 1.0;

    // No clamping is needed at the other end: `slice` already truncates an
    // end index that runs past `text.length`.
    const suffixScore = context.suffix
      ? textMatchScore(
          text.slice(match.end, match.end + context.suffix.length),
          context.suffix
        )
      : 1.0;

    let posScore = 1.0;
    if (typeof context.hint === 'number') {
      const offset = Math.abs(match.start - context.hint);
      posScore = 1.0 - offset / text.length;
    }

    const rawScore =
      quoteWeight * quoteScore +
      prefixWeight * prefixScore +
      suffixWeight * suffixScore +
      posWeight * posScore;
    const maxScore = quoteWeight + prefixWeight + suffixWeight + posWeight;
    const normalizedScore = rawScore / maxScore;

    return normalizedScore;
  };

  // Rank matches based on similarity of actual and expected surrounding text
  // and actual/expected offset in the document text.
  const scoredMatches = matches.map(m => ({
    start: m.start,
    end: m.end,
    score: scoreMatch(m),
  }));

  // Choose match with highest score.
  scoredMatches.sort((a, b) => b.score - a.score);
  return scoredMatches[0];
}
198 changes: 198 additions & 0 deletions src/annotator/anchoring/test/match-quote-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import { matchQuote } from '../match-quote';

// Sample passages used as the document text in the tests below. The literals
// span several lines; runs of whitespace (including the line breaks) are
// collapsed to single spaces by the normalization step that follows.
const fixtures = {
solitude: `Many years later, as he faced the firing squad,
Colonel Aureliano Buendía was to remember that distant afternoon
when his father took him to discover ice`,

twoCities: `It was the best of times, it was the worst of times,
it was the age of wisdom, it was the age of foolishness, it was the epoch of belief,
it was the epoch of incredulity, it was the season of Light, it was the
season of Darkness, it was the spring of hope, it was the winter of despair, we had
everything before us, we had nothing before us, we were all going direct to Heaven,
we were all going direct the other way.`,
};

/**
 * Collapse every run of whitespace in `str` into a single space.
 *
 * @param {string} str
 * @return {string}
 */
function normalize(str) {
  const pieces = str.split(/\s+/);
  return pieces.join(' ');
}

// Replace each fixture, in place, with its whitespace-normalized form.
for (const key of Object.keys(fixtures)) {
  fixtures[key] = normalize(fixtures[key]);
}

describe('matchQuote', () => {
  const { solitude, twoCities } = fixtures;

  // Portion of `text` covered by `match`.
  const matchedText = (text, match) => text.slice(match.start, match.end);

  // Assert that each score is strictly lower than the one before it.
  const assertStrictlyDescending = scores => {
    scores.slice(1).forEach((score, idx) => {
      assert.isBelow(score, scores[idx]);
    });
  };

  it('finds exact match', () => {
    const result = matchQuote(solitude, 'discover ice');
    assert.equal(result.score, 1.0);
    assert.equal(matchedText(solitude, result), 'discover ice');
  });

  it('finds best approximate match if there is no exact match', () => {
    const result = matchQuote(solitude, 'some years later');
    assert.isTrue(result.score > 0);
    assert.isTrue(result.score < 1);
    assert.equal(matchedText(solitude, result), 'Many years later');
  });

  it('scores matches based on quote similarity', () => {
    // Quotes listed from most to least similar to the text.
    const quotes = [
      'Many years later',
      'Many yers later',
      'Some years later',
      'Some years after',
    ];

    const scores = quotes.map(quote => matchQuote(solitude, quote).score);

    assertStrictlyDescending(scores);
  });

  it('scores matches based on prefix similarity', () => {
    // Prefixes listed from most to least similar to the text that actually
    // precedes the quote.
    const prefixes = [
      'Many years later',
      'Many yers later',
      'Some years later',
      'Some years after',
    ];
    const quote = ', as he faced the firing squad';

    const scores = prefixes.map(
      prefix => matchQuote(solitude, quote, { prefix }).score
    );

    assertStrictlyDescending(scores);
  });

  it('scores matches based on suffix similarity', () => {
    // Suffixes listed from most to least similar to the text that actually
    // follows the quote.
    const suffixes = [
      ', as he faced the firing squad',
      ', as she faced the firing squad',
      ', as he awaited the firing squad',
      ', as he awaited his death',
    ];
    const quote = 'Many years later';

    const scores = suffixes.map(
      suffix => matchQuote(solitude, quote, { suffix }).score
    );

    assertStrictlyDescending(scores);
  });

  it('returns `null` if there is no acceptable approximate match', () => {
    const result = matchQuote(twoCities, solitude);
    assert.equal(result, null);
  });

  it('returns `null` if quote is empty', () => {
    assert.equal(matchQuote('foobar', ''), null);
  });

  it('returns `null` if text is empty', () => {
    assert.equal(matchQuote('', 'foobar'), null);
  });

  // Each case names the quote, the context supplied, and a longer snippet
  // (`expected`) that identifies which occurrence of the quote should win.
  const contextCases = [
    // Exact prefix matches.
    {
      quote: 'before us',
      prefix: 'we had everything',
      expected: 'before us, we had nothing',
    },
    {
      quote: 'before us',
      prefix: 'we had nothing',
      expected: 'before us, we were all going',
    },

    // Approximate prefix matches.
    {
      quote: 'before us',
      prefix: 'we had every-thing',
      expected: 'before us, we had nothing',
    },
    {
      quote: 'before us',
      prefix: 'we had nout',
      expected: 'before us, we were all going',
    },

    // Exact suffix matches.
    {
      quote: 'we had',
      suffix: 'everything',
      expected: 'we had everything',
    },
    {
      quote: 'we had',
      suffix: 'nothing',
      expected: 'we had nothing',
    },

    // Approximate suffix matches.
    {
      quote: 'we had',
      suffix: 'ever ting',
      expected: 'we had everything',
    },
    {
      quote: 'we had',
      suffix: 'nutting',
      expected: 'we had nothing',
    },
  ];

  for (const [i, { quote, prefix, suffix, expected }] of contextCases.entries()) {
    it(`finds match with best context match (${i})`, () => {
      const result = matchQuote(twoCities, quote, {
        prefix,
        suffix,
      });
      assert.ok(result);
      assert.equal(matchedText(twoCities, result), quote);
      assert.equal(result.start, twoCities.indexOf(expected));
    });
  }

  it('uses `hint` as a tie-breaker to choose between matches with close scores', () => {
    const text = twoCities;
    const posA = text.indexOf('everything before us') + 'everything '.length;
    const posB = text.indexOf('nothing before us') + 'nothing '.length;

    // The quote occurs more than once in the text and no prefix/suffix is
    // given, so several candidates end up with equal scores.
    const misspelled = 'befor us';
    const withHintA = matchQuote(text, misspelled, { hint: posA });
    const withHintB = matchQuote(text, misspelled, { hint: posB });
    const withoutHint = matchQuote(text, misspelled);

    // With a hint, the candidate whose start is nearest to `hint` wins among
    // otherwise equal matches.
    assert.ok(withHintA);
    assert.equal(withHintA.start, posA, 'Wrong match for hint `posA`');

    assert.ok(withHintB);
    assert.equal(withHintB.start, posB, 'Wrong match for hint `posB`');

    // Without a hint, the first match (ie. lowest `match.start`) should be
    // chosen.
    assert.ok(withoutHint);
    assert.equal(withoutHint.start, posA, 'Wrong match with no hint');
  });
});
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,11 @@ append-buffer@^1.0.2:
dependencies:
buffer-equal "^1.0.0"

approx-string-match@^1.1.0:
version "1.1.0"
resolved "https://registry.yarnpkg.com/approx-string-match/-/approx-string-match-1.1.0.tgz#2fb8e1d6dcb640acc1c0d1ae9f0895348d06f4c0"
integrity sha512-j1yQB9XhfGWsvTfHEuNsR/SrUT4XQDkAc0PEjMifyi97931LmNQyLsO6HbuvZ3HeMx+3Dvk8m8XGkUF+8lCeqw==

archy@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/archy/-/archy-1.0.0.tgz#f9c8c13757cc1dd7bc379ac77b2c62a5c2868c40"
Expand Down