forked from rymndhng/clj-diff
-
Notifications
You must be signed in to change notification settings - Fork 3
/
core.cljc
117 lines (100 loc) · 4.56 KB
/
core.cljc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
(ns lambdaisland.clj-diff.core
"Diff, patch and related functions for Clojure sequences."
(:require [lambdaisland.clj-diff.miller :as miller]))
(defn diff
"Create the edit script for transforming sequance a into sequence b.
An edit script is a map with keys :+ and :- for additions and deletions.
Additions are represented as a sequence of vectors. The first item in each
vector is the index where the rest of the items in the vector are to be
inserted. For example [3 b c] means to insert b an c after whatever is
in index 3. Deletions are represented as a sequence of indexes to delete.
For example: the diff of 'abcabba' and 'cbabac' would generate the edit
script below.
{:+ [[2 b] [6 c]], :- [0 1 5]}
An index of -1 may appear in additions and is a special case which means to
add the elements at the beginning of the sequence."
[a b]
(miller/diff a b))
(defn- merge-patch
[s edit-script delete-symbol]
(let [s (vec s)
additions (:+ edit-script)
deletions (:- edit-script)
s (reduce (fn [a b]
(assoc a b delete-symbol))
s
deletions)
s (reduce (fn [a b]
(let [index (first b)
items (rest b)]
(if (= index -1)
(assoc a 0 (conj (vec items) (get a 0)))
(assoc a index (conj items (get a index))))))
s
additions)]
(flatten s)))
(defn patch*
[s edit-script]
(filter #(not (nil? %)) (merge-patch s edit-script nil)))
(defmulti ^{:arglists '([s edit-script])} patch
"Use the instructions in the edit script to transform the sequence s into
a new sequence. If the edit script was created by using diff on a and b then
patch will use the edit script to transform a into b.
(diff a b) -> x, (patch a x) -> b."
#?(:clj (fn [s _] (class s))
:cljs (fn [s _] (when (string? s) :string))))
(defmethod patch :default
[s edit-script]
(patch* s edit-script))
(defmethod patch #?(:clj String :cljs :string)
[s edit-script]
(apply str (patch* s edit-script)))
(defn edit-distance
"Returns the edit distance between the two passed sequences. May also be
passed an edit script. The edit distance is the minimum number of insertions
and deletions required to transform one sequence into another."
([a b]
(miller/edit-distance a b))
([edit-script]
(+ (count (:- edit-script))
(reduce + (map #(count (drop 1 %)) (:+ edit-script))))))
(defn- max-or-zero [coll]
(if (and (coll? coll)
(not (empty? coll)))
(apply max coll)
0))
(defn levenshtein-distance
"Returns the Levenshtein distance between two sequences. May either be passed
the two sequences or a diff of the two sequences.
From [Wikipedia](http://en.wikipedia.org/wiki/Levenshtein_distance):
The Levenshtein distance between two strings is the minimum number of edits
needed to transform one string into the other, with the allowable edit
operations being insertion, deletion and substitution of a single character.
This function works not only with strings but with any Clojure sequence.
Warning! Technically this function is estimating the Levenshtein distance
from a computed diff. Most of the time, it is the same as the real Levenshtein
distance but in same cases it may be larger. The reason for this is that
there may be multiple paths through an edit graph with the same edit
distance but with differing Levenshtein distance. A future improvement to
the diff algorithm whould be to find all paths and prefer the one with the
minimum Levenshtein distance."
([a b]
(levenshtein-distance (diff a b)))
([edit-script]
(let [additions (map #(let [index (first %)
items (rest %)]
(apply vector index (repeat (count items) :a)))
(:+ edit-script))
max-index (max (max-or-zero (map first additions))
(max-or-zero (:- edit-script)))
v (vec (repeat max-index :e))
patched (merge-patch v (merge edit-script {:+ additions}) :d)
edit-groups (filter #(not= :e (first %))
(partition-by #(if (= % :e) :e :edit)
patched))]
(reduce + (map (fn [group]
(max (count (filter #(= % :a) group))
(count (filter #(= % :d) group))))
edit-groups)))))
(defn longest-common-subseq [a b]
(miller/longest-common-subseq a b))