-
Notifications
You must be signed in to change notification settings - Fork 311
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: computation of Levenshtein edit distance (#6117)
Subsequent PRs will prove properties of this definition, and implement the `rw_search` tactic using it. Co-authored-by: Scott Morrison <scott.morrison@gmail.com> Co-authored-by: Eric Wieser <wieser.eric@gmail.com> Co-authored-by: Thomas Browning <tb65536@uw.edu> Co-authored-by: Chris Hughes <chrishughes24@gmail.com> Co-authored-by: Oliver Nash <github@olivernash.org> Co-authored-by: sgouezel <sebastien.gouezel@univ-rennes1.fr> Co-authored-by: Alex J Best <alex.j.best@gmail.com> Co-authored-by: Xavier Roblot <46200072+xroblot@users.noreply.github.com> Co-authored-by: michaellee94 <michael_lee1@brown.edu>
- Loading branch information
1 parent
3045aef
commit 979bb0f
Showing
2 changed files
with
291 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,290 @@ | ||
/- | ||
Copyright (c) 2023 Kim Liesinger. All rights reserved. | ||
Released under Apache 2.0 license as described in the file LICENSE. | ||
Authors: Kim Liesinger | ||
-/ | ||
import Mathlib.Data.Nat.Basic | ||
|
||
/-! | ||
# Levenshtein distances | ||
We define the Levenshtein edit distance `levenshtein C xy ys` between two `List α`, | ||
with a customizable cost structure `C` for the `delete`, `insert`, and `substitute` operations. | ||
As an auxiliary function, we define `suffixLevenshtein C xs ys`, which gives the list of distances | ||
from each suffix of `xs` to `ys`. | ||
This is defined by recursion on `ys`, using the internal function `Levenshtein.impl`, | ||
which computes `suffixLevenshtein C xs (y :: ys)` using `xs`, `y`, and `suffixLevenshtein C xs ys`. | ||
(This corresponds to the usual algorithm | ||
using the last two rows of the matrix of distances between suffixes.) | ||
After setting up these definitions, we prove lemmas specifying their behaviour, | ||
particularly | ||
``` | ||
theorem suffixLevenshtein_eq_tails_map : | ||
(suffixLevenshtein C xs ys).1 = xs.tails.map fun xs' => levenshtein C xs' ys := ... | ||
``` | ||
and | ||
``` | ||
theorem levenshtein_cons_cons : | ||
levenshtein C (x :: xs) (y :: ys) = | ||
min (C.delete x + levenshtein C xs (y :: ys)) | ||
(min (C.insert y + levenshtein C (x :: xs) ys) | ||
(C.substitute x y + levenshtein C xs ys)) := ... | ||
``` | ||
-/ | ||
|
||
variable {α β δ : Type _} [AddZeroClass δ] [Min δ] | ||
|
||
namespace Levenshtein | ||
|
||
/-- A cost structure for Levenshtein edit distance. -/ | ||
structure Cost (α β : Type _) (δ : Type _) where | ||
/-- Cost to delete an element from a list. -/ | ||
delete : α → δ | ||
/-- Cost in insert an element into a list. -/ | ||
insert : β → δ | ||
/-- Cost to substitute one elemenet for another in a list. -/ | ||
substitute : α → β → δ | ||
|
||
/-- The default cost structure, for which all operations cost `1`. -/ | ||
@[simps] | ||
def defaultCost [DecidableEq α] : Cost α α ℕ where | ||
delete _ := 1 | ||
insert _ := 1 | ||
substitute a b := if a = b then 0 else 1 | ||
|
||
instance [DecidableEq α] : Inhabited (Cost α α ℕ) := ⟨defaultCost⟩ | ||
|
||
variable (C : Cost α β δ) | ||
|
||
/-- | ||
(Implementation detail for `levenshtein`) | ||
Given a list `xs` and the Levenshtein distances from each suffix of `xs` to some other list `ys`, | ||
compute the Levenshtein distances from each suffix of `xs` to `y :: ys`. | ||
(Note that we don't actually need to know `ys` itself here, so it is not an argument.) | ||
The return value is a list of length `x.length + 1`, | ||
and it is convenient for the recursive calls that we bundle this list | ||
with a proof that it is non-empty. | ||
-/ | ||
def impl | ||
(xs : List α) (y : β) (d : {r : List δ // 0 < r.length}) : {r : List δ // 0 < r.length} := | ||
let ⟨ds, w⟩ := d | ||
xs.zip (ds.zip ds.tail) |>.foldr | ||
(init := ⟨[C.insert y + ds.getLast (List.length_pos.mp w)], by simp⟩) | ||
(fun ⟨x, d₀, d₁⟩ ⟨r, w⟩ => | ||
⟨min (C.delete x + r[0]) (min (C.insert y + d₀) (C.substitute x y + d₁)) :: r, by simp⟩) | ||
|
||
variable {C} | ||
variable (x : α) (xs : List α) (y : β) (d : δ) (ds : List δ) (w : 0 < (d :: ds).length) | ||
|
||
-- Note this lemma has an unspecified proof `w'` on the right-hand-side, | ||
-- which will become an extra goal when rewriting. | ||
theorem impl_cons (w' : 0 < List.length ds) : | ||
impl C (x :: xs) y ⟨d :: ds, w⟩ = | ||
let ⟨r, w⟩ := impl C xs y ⟨ds, w'⟩ | ||
⟨min (C.delete x + r[0]) (min (C.insert y + d) (C.substitute x y + ds[0])) :: r, by simp⟩ := | ||
match ds, w' with | _ :: _, _ => rfl | ||
|
||
-- Note this lemma has two unspecified proofs: `h` appears on the left-hand-side | ||
-- and should be found by matching, but `w'` will become an extra goal when rewriting. | ||
theorem impl_cons_fst_zero (h) (w' : 0 < List.length ds) : | ||
(impl C (x :: xs) y ⟨d :: ds, w⟩).1[0] = | ||
let ⟨r, w⟩ := impl C xs y ⟨ds, w'⟩ | ||
min (C.delete x + r[0]) (min (C.insert y + d) (C.substitute x y + ds[0])) := | ||
match ds, w' with | _ :: _, _ => rfl | ||
|
||
theorem impl_length (d : {r : List δ // 0 < r.length}) (w : d.1.length = xs.length + 1) : | ||
(impl C xs y d).1.length = xs.length + 1 := by | ||
induction xs generalizing d | ||
· case nil => | ||
rfl | ||
· case cons x xs ih => | ||
dsimp [impl] | ||
match d, w with | ||
| ⟨d₁ :: d₂ :: ds, _⟩, w => | ||
dsimp | ||
congr 1 | ||
refine ih ⟨d₂ :: ds, (by simp)⟩ (by simpa using w) | ||
|
||
end Levenshtein | ||
|
||
open Levenshtein | ||
|
||
variable (C : Cost α β δ) | ||
|
||
/-- | ||
`suffixLevenshtein C xs ys` computes the Levenshtein distance | ||
(using the cost functions provided by a `C : Cost α β δ`) | ||
from each suffix of the list `xs` to the list `ys`. | ||
The first element of this list is the Levenshtein distance from `xs` to `ys`. | ||
Note that if the cost functions do not satisfy the inequalities | ||
* `C.delete a + C.insert b ≥ C.substitute a b` | ||
* `C.substitute a b + C.substitute b c ≥ C.substitute a c` | ||
(or if any values are negative) | ||
then the edit distances calculated here may not agree with the general | ||
geodesic distance on the edit graph. | ||
-/ | ||
def suffixLevenshtein (xs : List α) (ys : List β) : {r : List δ // 0 < r.length} := | ||
ys.foldr | ||
(impl C xs) | ||
(xs.foldr (init := ⟨[0], by simp⟩) (fun a ⟨r, w⟩ => ⟨(C.delete a + r[0]) :: r, by simp⟩)) | ||
|
||
variable {C} | ||
|
||
theorem suffixLevenshtein_length (xs : List α) (ys : List β) : | ||
(suffixLevenshtein C xs ys).1.length = xs.length + 1 := by | ||
induction ys | ||
· case nil => | ||
dsimp [suffixLevenshtein] | ||
induction xs | ||
· case nil => rfl | ||
· case cons _ xs ih => | ||
simp_all | ||
· case cons y ys ih => | ||
dsimp [suffixLevenshtein] | ||
rw [impl_length] | ||
exact ih | ||
|
||
-- This is only used in keeping track of estimates. | ||
theorem suffixLevenshtein_eq (xs : List α) (y ys) : | ||
impl C xs y (suffixLevenshtein C xs ys) = suffixLevenshtein C xs (y :: ys) := by | ||
rfl | ||
|
||
variable (C) | ||
|
||
/-- | ||
`levenshtein C xs ys` computes the Levenshtein distance | ||
(using the cost functions provided by a `C : Cost α β δ`) | ||
from the list `xs` to the list `ys`. | ||
Note that if the cost functions do not satisfy the inequalities | ||
* `C.delete a + C.insert b ≥ C.substitute a b` | ||
* `C.substitute a b + C.substitute b c ≥ C.substitute a c` | ||
(or if any values are negative) | ||
then the edit distance calculated here may not agree with the general | ||
geodesic distance on the edit graph. | ||
-/ | ||
def levenshtein (xs : List α) (ys : List β) : δ := | ||
let ⟨r, w⟩ := suffixLevenshtein C xs ys | ||
r[0] | ||
|
||
variable {C} | ||
|
||
theorem suffixLevenshtein_nil_nil : (suffixLevenshtein C [] []).1 = [0] := by | ||
rfl | ||
|
||
-- Not sure if this belongs in the main `List` API, or can stay local. | ||
theorem List.eq_of_length_one (x : List α) (w : x.length = 1) : | ||
have : 0 < x.length := (lt_of_lt_of_eq Nat.zero_lt_one w.symm) | ||
x = [x[0]] := by | ||
match x, w with | ||
| [r], _ => rfl | ||
|
||
theorem suffixLevenshtein_nil' (ys : List β) : | ||
(suffixLevenshtein C [] ys).1 = [levenshtein C [] ys] := | ||
List.eq_of_length_one _ (suffixLevenshtein_length [] _) | ||
|
||
theorem suffixLevenshtein_cons₂ (xs : List α) (y ys) : | ||
suffixLevenshtein C xs (y :: ys) = (impl C xs) y (suffixLevenshtein C xs ys) := | ||
rfl | ||
|
||
theorem suffixLevenshtein_cons₁_aux {x y : {r : List δ // 0 < r.length}} | ||
(w₀ : x.1[0]'x.2 = y.1[0]'y.2) (w : x.1.tail = y.1.tail) : x = y := by | ||
match x, y with | ||
| ⟨hx :: tx, _⟩, ⟨hy :: ty, _⟩ => simp_all | ||
|
||
theorem suffixLevenshtein_cons₁ | ||
(x : α) (xs ys) : | ||
suffixLevenshtein C (x :: xs) ys = | ||
⟨levenshtein C (x :: xs) ys :: | ||
(suffixLevenshtein C xs ys).1, by simp⟩ := by | ||
induction ys | ||
· case nil => | ||
dsimp [levenshtein, suffixLevenshtein] | ||
· case cons y ys ih => | ||
apply suffixLevenshtein_cons₁_aux | ||
· rfl | ||
· rw [suffixLevenshtein_cons₂ (x :: xs), ih, impl_cons] | ||
· rfl | ||
· simp [suffixLevenshtein_length] | ||
|
||
theorem suffixLevenshtein_cons₁_fst (x : α) (xs ys) : | ||
(suffixLevenshtein C (x :: xs) ys).1 = | ||
levenshtein C (x :: xs) ys :: | ||
(suffixLevenshtein C xs ys).1 := by | ||
simp [suffixLevenshtein_cons₁] | ||
|
||
theorem suffixLevenshtein_cons_cons_fst_get_zero | ||
(x : α) (xs y ys) (w) : | ||
(suffixLevenshtein C (x :: xs) (y :: ys)).1[0] = | ||
let ⟨dx, _⟩ := suffixLevenshtein C xs (y :: ys) | ||
let ⟨dy, _⟩ := suffixLevenshtein C (x :: xs) ys | ||
let ⟨dxy, _⟩ := suffixLevenshtein C xs ys | ||
min | ||
(C.delete x + dx[0]) | ||
(min | ||
(C.insert y + dy[0]) | ||
(C.substitute x y + dxy[0])) := by | ||
conv => | ||
lhs | ||
dsimp only [suffixLevenshtein_cons₂] | ||
simp only [suffixLevenshtein_cons₁] | ||
rw [impl_cons_fst_zero] | ||
rfl | ||
|
||
theorem suffixLevenshtein_eq_tails_map (xs ys) : | ||
(suffixLevenshtein C xs ys).1 = xs.tails.map fun xs' => levenshtein C xs' ys := by | ||
induction xs | ||
· case nil => | ||
simp only [List.map, suffixLevenshtein_nil'] | ||
· case cons x xs ih => | ||
simp only [List.map, suffixLevenshtein_cons₁, ih] | ||
|
||
@[simp] | ||
theorem levenshtein_nil_nil : levenshtein C [] [] = 0 := by | ||
simp [levenshtein] | ||
|
||
@[simp] | ||
theorem levenshtein_nil_cons (y) (ys) : | ||
levenshtein C [] (y :: ys) = C.insert y + levenshtein C [] ys := by | ||
dsimp [levenshtein] | ||
congr | ||
rw [List.getLast_eq_get] | ||
congr | ||
rw [show (List.length _) = 1 from _] | ||
induction ys with | ||
| nil => simp | ||
| cons y ys ih => | ||
simp only [List.foldr] | ||
rw [impl_length] <;> simp [ih] | ||
|
||
@[simp] | ||
theorem levenshtein_cons_nil (x : α) (xs : List α) : | ||
levenshtein C (x :: xs) [] = C.delete x + levenshtein C xs [] := | ||
rfl | ||
|
||
@[simp] | ||
theorem levenshtein_cons_cons | ||
(x : α) (xs : List α) (y : β) (ys : List β) : | ||
levenshtein C (x :: xs) (y :: ys) = | ||
min (C.delete x + levenshtein C xs (y :: ys)) | ||
(min (C.insert y + levenshtein C (x :: xs) ys) | ||
(C.substitute x y + levenshtein C xs ys)) := | ||
suffixLevenshtein_cons_cons_fst_get_zero _ _ _ _ _ | ||
|
||
#guard | ||
(suffixLevenshtein Levenshtein.defaultCost "kitten".toList "sitting".toList).1 = | ||
[3, 3, 4, 5, 6, 6, 7] | ||
|
||
#guard levenshtein Levenshtein.defaultCost | ||
"but our fish said, 'no! no!'".toList | ||
"'put me down!' said the fish.".toList = 21 |