-
-
Notifications
You must be signed in to change notification settings - Fork 59
/
search_utils.clj
157 lines (137 loc) · 5.61 KB
/
search_utils.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
(ns datalevin.search-utils
"Some useful utility functions that can be passed as options to search
engine to customize search."
(:require
[clojure.string :as str]
[datalevin.interpret :as i :refer [inter-fn definterfn]]
[datalevin.stem :as s]
[datalevin.constants :as c])
(:import
[java.text Normalizer Normalizer$Form]
[org.eclipse.collections.impl.list.mutable FastList]
[org.tartarus.snowball SnowballStemmer]))
;; Default tokenizer used by `create-analyzer` when no `:tokenizer` option is
;; supplied. Delegates to the built-in English analyzer in datalevin.search,
;; which returns [term position offset] triples. Declared with `definterfn`
;; (rather than `defn`) — presumably so the fn stays interpretable/serializable
;; per datalevin.interpret's requirements; confirm against that namespace.
(definterfn default-tokenizer [s] (datalevin.search/en-analyzer s))
(defn create-analyzer
  "Builds an analyzer fn, ready to be used as a search engine option.
  Recognized keys of `opts`:
  * `:tokenizer` — fn from a string to a seq of [term, position, offset]
    tokens, where term is a word, position is the sequence number of the
    term, and offset is the character offset of this term.
    e.g. `create-regexp-tokenizer` produces such fn.
  * `:token-filters` — ordered list of token filters; each receives a
    [term, position, offset] token and returns a list of tokens that
    replace it."
  [{:keys [tokenizer token-filters]
    :or   {tokenizer     default-tokenizer
           token-filters []}}]
  (inter-fn
    [s]
    ;; each filter becomes a mapcat transducer; composing them left-to-right
    ;; applies the filters in the order given
    (let [xform (apply comp (mapv (fn [tf] (mapcat tf)) token-filters))]
      (sequence xform (tokenizer s)))))
(def lower-case-token-filter
  "Token filter that lower-cases the term of each [term position offset]
  token."
  (inter-fn
    [t]
    [(update t 0 clojure.string/lower-case)]))
(def unaccent-token-filter
  "Token filter that strips accents and diacritics from each token's term.
  Decomposes the term to NFD form, then removes all non-ASCII code points
  (the combining marks left behind by decomposition)."
  (inter-fn
    [t]
    [(update t 0
             (fn [s]
               (clojure.string/replace
                 (java.text.Normalizer/normalize
                   s java.text.Normalizer$Form/NFD)
                 #"[^\p{ASCII}]" "")))]))
(defn create-stop-words-token-filter
  "Given `stop-word-pred`, a predicate returning `true` when a term is a
  stop word, builds a token filter that drops such tokens."
  [stop-word-pred]
  (inter-fn
    [t]
    (if (stop-word-pred (nth t 0))
      []
      [t])))
(def en-stop-words-token-filter
  "Token filter that drops \"empty\" tokens (for english language), as
  determined by `datalevin.constants/en-stop-words?`."
  (inter-fn
    [t]
    (if (datalevin.constants/en-stop-words? (nth t 0))
      []
      [t])))
(def prefix-token-filter
  "Replaces a token with the series of all its possible prefixes.
  For example: vault -> v, va, vau, vaul, vault
  This is useful for producing efficient autocomplete engines, provided this
  filter is NOT applied at query time."
  (inter-fn
    [[^String word pos start]]
    ;; every prefix keeps the original token's position and offset
    (map (fn [end] [(subs word 0 end) pos start])
         (range 1 (inc (.length word))))))
(defn create-ngram-token-filter
  "Produces character ngrams between min and max size from the token and
  returns everything as tokens. This is useful for producing efficient fuzzy
  search.
  For each start index in the word (ascending), emits every ngram whose size
  lies in [min-gram-size, max-gram-size] (ascending) and that fits entirely
  within the word. The single-arity form emits ngrams of exactly `gram-size`."
  ([min-gram-size max-gram-size]
   (inter-fn [[^String word pos start]]
     ;; Bug fix: the previous loop returned as soon as `idx + gram-size`
     ;; exceeded the word length, even when only the larger gram sizes no
     ;; longer fit — dropping the trailing min-size ngrams (e.g. for
     ;; "abcde" with min 2 / max 4, "de" was never emitted). Here a
     ;; non-fitting gram is simply skipped and scanning continues.
     (let [length (.length word)]
       (persistent!
         (reduce
           (fn [ngrams idx]
             (reduce
               (fn [acc size]
                 (if (<= (+ ^long idx ^long size) length)
                   (conj! acc [(subs word idx (+ ^long idx ^long size))
                               pos start])
                   acc))
               ngrams
               (range min-gram-size (inc ^long max-gram-size))))
           (transient [])
           (range length))))))
  ([gram-size] (create-ngram-token-filter gram-size gram-size)))
(defn create-min-length-token-filter
  "Removes tokens whose term is strictly shorter than `min-length`
  characters."
  [min-length]
  (inter-fn
    [[^String word _ _ :as t]]
    (if (>= (.length word) ^long min-length)
      [t]
      [])))
(defn create-max-length-token-filter
  "Removes tokens whose term is strictly longer than `max-length`
  characters."
  [max-length]
  (inter-fn
    [[^String word _ _ :as t]]
    (if (<= (.length word) ^long max-length)
      [t]
      [])))
(defn create-stemming-token-filter
  "Create a token filter that replaces tokens with their stems.
  The stemming algorithm is Snowball https://snowballstem.org/
  `language` is a string, its value can be one of the following:
  arabic, armenian, basque, catalan, danish, dutch, english, french,
  finnish, german, greek, hindi, hungarian, indonesian, irish, italian,
  lithuanian, nepali, norwegian, portuguese, romanian, russian, serbian,
  swedish, tamil, turkish, spanish, yiddish, and porter"
  [^String language]
  (inter-fn
    [t]
    ;; NOTE(review): a fresh stemmer is obtained on every call rather than
    ;; closed over — presumably so the inter-fn body stays self-contained
    ;; (no captured Java object); confirm against datalevin.interpret.
    (let [^org.tartarus.snowball.SnowballStemmer stemmer
          (datalevin.stem/get-stemmer language)]
      [(update t 0
               (fn [w]
                 (doto stemmer (.setCurrent w) (.stem))
                 (.getCurrent stemmer)))])))
(defn create-regexp-tokenizer
  "Creates a tokenizer that splits text on the given regular expression
  pattern `pat`. Returns a vector of [term position offset] tokens, where
  offset is the character index of the term in the input string."
  [pat]
  (inter-fn
    [^String s]
    ;; NOTE(review): an empty term is emitted when the text begins with a
    ;; separator, or between adjacent separator matches the pattern does not
    ;; consume as one run — behavior preserved from the original; confirm
    ;; this is intended.
    (let [m   (re-matcher pat s)
          len (.length s)]
      (loop [acc        []
             pos        0
             tail-start 0]
        (if (.find m)
          ;; token spans from the end of the previous separator to the start
          ;; of this one
          (recur (conj acc [(subs s tail-start (.start m)) pos tail-start])
                 (inc pos)
                 (.end m))
          ;; flush the trailing token, unless the text ends on a separator
          (if (= tail-start len)
            acc
            (conj acc [(subs s tail-start len) pos tail-start])))))))