-
-
Notifications
You must be signed in to change notification settings - Fork 59
/
search_utils.clj
113 lines (99 loc) · 4.3 KB
/
search_utils.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
(ns datalevin.search-utils
"Some useful utility functions that can be passed as options to search
engine to customize search"
(:require [clojure.string :as str]
[datalevin.interpret :as i]
[datalevin.constants :as c])
(:import [java.text Normalizer Normalizer$Form]))
(defn create-analyzer
  "Builds an analyzer fn that can be handed to the search engine.

  `opts` is a map with these keys:

  * `:tokenizer` - a fn of one string that returns a seq of tokens. Each
     token is `[term, position, offset]`: the word, its sequence number, and
     the character offset at which it starts. `create-regexp-tokenizer`
     produces such a fn.

  * `:token-filters` - an ordered collection of token filters. Each filter
     takes a single token `[term, position, offset]` and returns a collection
     of tokens that replaces it (possibly empty, possibly several).

  The returned fn takes a string, tokenizes it, and runs every token through
  the filters in the order given."
  [{:keys [tokenizer token-filters]}]
  (i/inter-fn
    [s]
    (let [raw-tokens (tokenizer s)
          ;; One `mapcat` transducer per filter. `comp` on transducers runs
          ;; the left-most one first, so filters apply in the listed order.
          pipeline   (apply comp (map mapcat token-filters))]
      (sequence pipeline raw-tokens))))
(def lower-case-token-filter
  "Token filter that lower-cases the term of a token.

  Takes one token and returns a one-element vector containing that token with
  its first element passed through `clojure.string/lower-case`."
  (i/inter-fn
    [token]
    (vector (update token 0 str/lower-case))))
(def unaccent-token-filter
  "This token filter removes accents and diacritics from tokens.

  The term is decomposed with Unicode NFD normalization, which splits an
  accented letter into its base letter followed by combining marks; the
  combining marks (Unicode block Combining Diacritical Marks,
  U+0300 - U+036F) are then deleted. Every other character is kept as is, so
  terms written in non-Latin scripts and letters that have no decomposition
  pass through untouched instead of being erased.

  Takes one token `[term, position, offset]` and returns a one-element vector
  holding the token with its term replaced."
  (i/inter-fn
    [t]
    [(update t 0
             (fn [s]
               (-> (java.text.Normalizer/normalize
                     s java.text.Normalizer$Form/NFD)
                   ;; Strip only the combining accents that NFD exposed.
                   ;; Removing everything non-ASCII would also wipe out
                   ;; whole terms in other scripts.
                   (str/replace #"\p{InCombiningDiacriticalMarks}+" ""))))]))
(def en-stop-words-token-filter
  "Token filter that drops English stop words.

  Takes one token; returns an empty vector when `c/en-stop-words?` is truthy
  for the token's term (its first element), otherwise a one-element vector
  holding the unchanged token."
  (i/inter-fn
    [token]
    (let [term (first token)]
      (if-not (c/en-stop-words? term)
        [token]
        []))))
(def prefix-token-filter
  "Replaces a token with one token for each of its non-empty prefixes,
  shortest first. Every produced token keeps the position and offset of the
  original.

  For example: vault -> v, va, vau, vaul, vault

  This is useful for building efficient autocomplete engines, provided this
  filter is NOT applied at query time."
  (i/inter-fn
    [[^String term position offset]]
    (map (fn [end] [(subs term 0 end) position offset])
         ;; end indices 1..length, so the full term is the last prefix
         (range 1 (inc (.length term))))))
(defn create-ngram-token-filter
  "Creates a token filter that replaces a token with its character ngrams.

  For every start index in the term, one token is produced for each gram size
  from `min-gram-size` up to `max-gram-size` that still fits inside the term.
  Grams are ordered by start index, then by size. Every produced token keeps
  the position and offset of the original token. The result is a vector.

  For example, with sizes 1 to 3: abc -> a, ab, abc, b, bc, c

  The one-argument arity uses `gram-size` as both the minimum and the
  maximum. This is useful for producing efficient fuzzy search."
  ([^long min-gram-size ^long max-gram-size]
   (i/inter-fn
     [[^String word pos start]]
     (let [length (.length word)]
       (loop [idx       0
              gram-size min-gram-size
              ngrams    (transient [])]
         (cond
           ;; Past the end, or not even the shortest gram fits here; it
           ;; cannot fit at any later index either, so we are done.
           (or (= idx length) (< length (+ idx min-gram-size)))
           (persistent! ngrams)

           ;; The current size overflows the term, but shorter grams may
           ;; still fit further right: restart at the next index instead of
           ;; stopping, otherwise trailing grams would be lost.
           (< length (+ idx gram-size))
           (recur (inc idx) min-gram-size ngrams)

           :else
           (let [ngrams (conj! ngrams
                               [(subs word idx (+ idx gram-size)) pos start])]
             (if (< gram-size max-gram-size)
               (recur idx (inc gram-size) ngrams)
               (recur (inc idx) min-gram-size ngrams))))))))
  ([gram-size] (create-ngram-token-filter gram-size gram-size)))
(defn create-min-length-token-filter
  "Creates a token filter that discards tokens whose term is strictly shorter
  than `min-length` characters.

  The produced filter returns `[token]` when the term is long enough and `[]`
  otherwise."
  [^long min-length]
  (i/inter-fn
    [[^String term :as token]]
    (if (>= (.length term) min-length)
      [token]
      [])))
(defn create-max-length-token-filter
  "Creates a token filter that discards tokens whose term is strictly longer
  than `max-length` characters.

  The produced filter returns `[token]` when the term is short enough and
  `[]` otherwise."
  [^long max-length]
  (i/inter-fn
    [[^String term :as token]]
    (if (<= (.length term) max-length)
      [token]
      [])))
(defn create-regexp-tokenizer
  "Creates a tokenizer that splits the given text on the regular expression
  `pat`, which describes the separators between tokens.

  The produced fn takes a string and returns a vector of
  `[term, position, offset]` tokens, where term is the text found between two
  separator matches, position is the sequence number of the term (starting
  at 0), and offset is the character index at which the term starts.

  Empty terms are never emitted and do not consume a position. This covers
  text that starts or ends with a separator, and separator matches that are
  directly adjacent to each other."
  [pat]
  (i/inter-fn
    [^String s]
    (let [matcher    (re-matcher pat s)
          string-end (.length s)]
      (loop [pos                0
             last-separator-end 0
             res                []]
        (if (.find matcher)
          (let [match-start (.start matcher)
                match-end   (.end matcher)]
            (if (< last-separator-end match-start)
              (recur (inc pos)
                     match-end
                     (conj res [(subs s last-separator-end match-start)
                                pos
                                last-separator-end]))
              ;; Nothing between the previous separator and this one:
              ;; skip it without using up a position.
              (recur pos match-end res)))
          ;; No more separators: whatever is left is the final token.
          (if (< last-separator-end string-end)
            (conj res [(subs s last-separator-end string-end)
                       pos
                       last-separator-end])
            res))))))