forked from facebookarchive/duckling_old
-
Notifications
You must be signed in to change notification settings - Fork 0
/
core.clj
444 lines (398 loc) · 16.7 KB
/
core.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
(ns duckling.core
(:require [clojure.tools.logging :refer [errorf warnf infof]]
[clojure.java.io :as io]
[clojure.set :as set]
[clojure.string :as string]
[duckling.corpus :as corpus]
[duckling.engine :as engine]
[duckling.learn :as learn]
[duckling.resource :as res]
[duckling.time.obj :as time]
[duckling.util :as util :refer [?> ?>>]]))
(defrecord RCC [lang lang-key rules corpus classifiers])
(defonce language->data (atom {}))
(defn default-context
"Build a default context for testing. opt can be either :corpus or :now"
[opt]
{:reference-time (case opt
:corpus (time/t -2 2013 2 12 4 30)
:now (time/now))})
(defn- get-classifiers [id]
(get-in @language->data [(keyword id) :classifiers]))
(defn- get-rules [id]
(get-in @language->data [(keyword id) :rules]))
(defn- get-corpus [id]
(get-in @language->data [(keyword id) :corpus]))
(defn- compare-tokens
"Compares two candidate tokens a and b for runtime selection.
wanted-dim is a hash whose keys are the :dim wanted by the caller, the value
can be anything truthy.
Returns nil: not comparable 0: equal 1: greater -1: lesser"
[a b classifiers wanted-dims]
{:pre [(map? classifiers)]}
(let [same-dim (= (:dim a) (:dim b))
wanted-a (get wanted-dims (:dim a))
wanted-b (get wanted-dims (:dim b))
cmp-interval (util/compare-intervals
[(:pos a) (:end a)]
[(:pos b) (:end b)])] ; +1 0 -1 nil
;(printf "Comparing %d and %d \n" (:index a) (:index b))
(if-not same-dim
; unless a is wanted and covers b, or the contrary, they are not comparable
(cond (and wanted-a (= 1 cmp-interval)) 1
(and wanted-b (= -1 cmp-interval)) -1
:else nil)
(if (not= 0 cmp-interval)
cmp-interval ; one interval recovers the other
(compare (:log-prob a) (:log-prob b))))))
(defn- select-winners*
[compare-fn resolve-fn already-selected candidates]
(if (seq candidates)
(let [[maxima others] (util/split-by-partial-max
compare-fn
candidates
(concat already-selected candidates))
new-winners (->> maxima
(mapcat resolve-fn)
(filter :value))] ; remove unresolved
(if (seq maxima)
(recur compare-fn resolve-fn (concat already-selected new-winners) others)
already-selected))
already-selected))
(defn- select-winners
"Winner= token that is not 'smaller' (in the sense of the provided partial
order) than another winner, and that resolves to a value"
[compare-fn log-prob-fn resolve-fn candidates]
(->> candidates
(map #(assoc % :log-prob (log-prob-fn %)))
(select-winners* compare-fn resolve-fn [])
(map #(dissoc % :log-prob))))
(defn analyze
"Parse a sentence, returns the stash and a curated list of winners.
Targets is a coll of {:dim dim :label label} : only winners of these dims are
kept, and they receive a :label key = the label provided.
If no targets specified, all winners are returned."
[s context module targets base-stash]
{:pre [s context module]}
(let [classifiers (get-classifiers module)
_ (when-not (map? classifiers)
(errorf "[duckling] Module %s is not loaded. Did you (load!)?" module))
rules (get-rules module)
stash (engine/pass-all s rules base-stash)
; add an index to tokens in the stash
stash (map #(if (map? %1) (assoc %1 :index %2) %1)
stash
(iterate inc 0))
dim-label (when (seq targets) (into {} (for [{:keys [dim label]} targets]
[(keyword dim) label])))
winners (->> stash
(filter :pos)
; just keep the dims we want, and add the label key
(?>> dim-label (keep #(when-let [label (get dim-label (:dim %))]
(assoc % :label label))))
(select-winners
#(compare-tokens %1 %2 classifiers dim-label)
#(learn/route-prob % classifiers)
#(engine/resolve-token % context module))
; add a confidence key
; low confidence for numbers covered by datetime
(engine/estimate-confidence context module)
; adapt the keys for the outside world
(map (fn [{:keys [pos end text] :as token}]
(merge token {:start pos
:end end
:body text}))))]
{:stash stash :winners winners}))
;--------------------------------------------------------------------------
; REPL utilities
;--------------------------------------------------------------------------
(defn- print-stash
"Print stash to STDOUT"
[stash classifiers winners]
(let [width (count (:text (first stash)))
winners-indices (map :index winners)]
(doseq [[tok i] (reverse (map vector stash (iterate inc 0)))]
(let [pos (:pos tok)
end (:end tok)]
(if pos
(printf "%s %s%s%s %2d | %-9s | %-25s | P = %04.4f | %.20s\n"
(if (some #{(:index tok)} winners-indices) "W" " ")
(apply str (repeat pos \space))
(apply str (repeat (- end pos) \-))
(apply str (repeat (- width end -1) \space))
i
(when-let [x (:dim tok)] (name x))
(when-let [x (-> tok :rule :name)] (name x))
(float (learn/route-prob tok classifiers))
(string/join " + " (mapv #(get-in % [:rule :name]) (:route tok))))
(printf " %s\n" (:text tok)))))))
(defn- print-tokens
"Recursively prints a tree representing a route"
([tokens classifiers]
{:pre [(coll? tokens)]}
(let [tokens (if (vector? tokens)
tokens
[tokens])
tokens (if (= 1 (count tokens))
tokens
[{:route tokens :rule {:name "root"}}])]
(print-tokens tokens classifiers 0)))
([tokens classifiers depth]
(print-tokens tokens classifiers depth ""))
([tokens classifiers depth prefix]
(doseq [[token i] (map vector tokens (iterate inc 1))]
(let [;; determine name to display
name (if-let [name (get-in token [:rule :name])]
name
(str "text: " (:text token)))
p (learn/route-prob token classifiers)
;; prepare children prefix
last? (= i (count tokens))
new-prefix (if last? \space \|)
new-prefix (str prefix new-prefix \space \space \space)]
(when (pos? depth)
(print (format "%s%s-- "
prefix
(if last? \` \|))))
(println (format "%s (%s)" name p))
(print-tokens (:route token)
classifiers
(inc depth)
(if (pos? depth) new-prefix ""))))))
(defn play
"Show processing details for one sentence. Defines a 'details' function."
([module-id s]
(play module-id s nil))
([module-id s targets]
(play module-id s targets (default-context :corpus)))
([module-id s targets context]
(let [targets (when targets (map (fn [dim] {:dim dim :label dim}) targets))
{stash :stash
winners :winners} (analyze s context module-id targets nil)]
;; 1. print stash
(print-stash stash (get-classifiers module-id) winners)
;; 2. print winners
(printf "\n%d winners:\n" (count winners))
(doseq [winner winners]
(printf "%-25s %s %s\n" (str (name (:dim winner))
(if (:latent winner) " (latent)" ""))
(engine/export-value winner {:date-fn str})
(dissoc winner :value :route :rule :pos :text :end :index
:dim :start :latent :body :pred :timezone :values)))
;; 3. ask for details
(printf "For further info: (details idx) where 1 <= idx <= %d\n" (dec (count stash)))
(def details (fn [n]
(print-tokens (nth stash n) (get-classifiers module-id))))
(def token (fn [n]
(nth stash n))))))
;--------------------------------------------------------------------------
; Configuration loading
;--------------------------------------------------------------------------
(defn- gen-config-for-lang
"Generates the full config for a language from directory structure."
[lang]
(->> ["corpus" "rules"]
(map (fn [dir]
(let [files (->> (format "languages/%s/%s" lang dir)
res/get-files
(remove #(clojure.string/starts-with? % "_"))
(map #(subs % 0 (- (count %) 4)))
vec)]
[(keyword dir) files])))
(into {})))
(defn- gen-config-for-langs
"Generates the full config for langs from directory structure."
[langs]
(->> langs
(map (fn [lang]
[(keyword (format "%s$core" lang)) (gen-config-for-lang lang)]))
(into {})))
(defn- read-rules
[lang new-file]
(-> (format "languages/%s/rules/%s.clj" lang new-file)
io/resource
slurp
read-string
engine/rules))
(defn- read-corpus
[lang new-file]
(-> (format "languages/%s/corpus/%s.clj" lang new-file)
io/resource
slurp
read-string
corpus/read-corpus))
(defn- load-corpus
[lang corpus-files]
(->> corpus-files
(pmap (partial read-corpus lang))
(reduce (partial util/merge-according-to {:tests concat :context merge}))))
(defn- load-rules
[lang rules-files]
(->> rules-files
(pmap (partial read-rules lang))
(apply concat)))
(defn- get-dims-for-test
[context module {:keys [text]}]
(mapcat (fn [text]
(try
(->> (analyze text context module nil nil)
:stash
(keep :dim))
(catch Exception e
(warnf "Error while analyzing module=%s context=%s text=%s"
module context text)
[])))
text))
(defn get-dims
"Retrieves all available dimensions for module by running its corpus."
[module {:keys [context tests]}]
(->> tests
(pmap (partial get-dims-for-test context module))
(apply concat)
distinct))
(defn- available-languages []
(->> "languages"
res/get-subdirs
set))
(defn- config-from-languages [langs]
(cond-> (available-languages)
langs (set/intersection (set langs))
true gen-config-for-langs))
(defn- pluck-lang [lang-key]
(-> lang-key name (string/split #"\$") first))
(defn- load-language-data [[lang-key {corpus-files :corpus rules-files :rules}]]
(let [lang (pluck-lang lang-key)
corpus (load-corpus lang corpus-files)
rules (load-rules lang rules-files)
classifiers (learn/train-classifiers corpus rules learn/extract-route-features)]
[lang-key
(map->RCC {:lang lang
:lang-key lang-key
:corpus corpus
:rules rules
:classifiers classifiers})]))
(defn clear! []
(reset! language->data {}))
(defn- load*! [lang-key->config]
(clear!)
(let [data (->> lang-key->config
(pmap load-language-data)
(into {}))]
(doseq [[config-key rcc] data]
(swap! language->data assoc config-key rcc)))
(->> @language->data
(pmap (fn [[module {:keys [corpus]}]]
[module (get-dims module corpus)]))
(into {})))
(defn load!
"Loads rules and classifiers for languages or/and config.
If no language list nor config provided, loads all languages.
Returns a map of loaded modules with available dimensions."
([] (load! (into [] (available-languages))))
([args]
(let [lang-key->config (if (vector? args)
(config-from-languages args)
args)]
(load*! lang-key->config))))
;--------------------------------------------------------------------------
; Corpus running
;--------------------------------------------------------------------------
(defn run-corpus
"Run the corpus given in parameter for the given module.
Returns a list of vectors [0|1 text error-msg]"
[{context :context, tests :tests} module]
(for [test tests
text (:text test)]
(try
(let [{:keys [stash winners]} (analyze text context module nil nil)
winner-count (count winners)
check (first (:checks test)) ; only one test is supported now
check-results (map (partial check context) winners)] ; return nil if OK, [expected actual] if not OK
(if (some #(or (nil? %) (false? %)) check-results)
[0 text nil]
[1 text [(ffirst check-results) (map second check-results)]]))
(catch Exception e
[1 text (.getMessage e)]))))
(defn run
"Runs the corpus and prints the results to the terminal."
([]
(run (keys @language->data)))
([module-id]
(loop [[mod & more] (if (seq? module-id) module-id [module-id])
line 0
acc []]
(if mod
(let [output (run-corpus (get-in @language->data [mod :corpus]) mod)
failed (remove (comp (partial = 0) first) output)]
(doseq [[[error-count text error-msg] i] (map vector failed (iterate inc line))]
(printf "%d FAIL \"%s\"\n Expected %s\n" i text (first error-msg))
(doseq [got (second error-msg)]
(printf " Got %s\n" got)))
(printf "%s: %d examples, %d failed.\n" mod (count output) (count failed))
(recur more (+ line (count failed)) (concat acc (map (fn [[_ t _]] [mod t]) failed))))
(defn c [n]
(let [[mod text] (nth acc n)]
(printf "(play %s \"%s\")\n" mod text)
(play mod text)))))))
;--------------------------------------------------------------------------
; Public API
;--------------------------------------------------------------------------
(defn parse
"Public API. Parses text using given module. If dims are provided as a list of
keywords referencing token dimensions, only these dimensions are extracted.
Context is a map with a :reference-time key. If not provided, the system
current date and time is used."
([module text]
(parse module text []))
([module text dims]
(parse module text dims (default-context :now)))
([module text dims context]
(let [targets (map (fn [x] {:dim x :label x}) dims)]
(->> (analyze text context module targets nil)
:winners
(map #(assoc % :value (engine/export-value % {})))
(map #(select-keys % [:dim :body :value :start :end :latent]))))))
;--------------------------------------------------------------------------
; The stuff below is specific to Wit.ai and will be moved out of Duckling
;--------------------------------------------------------------------------
(defn- generate-context
"Wit.ai internal. Will move to Wit."
[base-context]
(-> base-context
(?> (instance? org.joda.time.DateTime (:reference-time base-context))
(assoc :reference-time {:start (:reference-time base-context)
:grain :second}))))
(defn extract
"API used by Wit.ai (will be moved to Wit)
targets is a coll of maps {:module :dim :label} for instance:
{:module fr$core, :dim duration, :label wit$duration} to get duration results
Returns a single coll of tokens with :body :value :start :end :label (=wisp) :latent"
[sentence context leven-stash targets]
{:pre [(string? sentence)
(map? context)
(:reference-time context)
(vector? targets)]}
(try
(infof "Extracting from '%s' with targets %s" sentence targets)
(letfn [(extract'
[module targets] ; targets specify all the dims we should extract
(let [module (keyword module)
pic-context (generate-context context)]
(when-not (get-in @language->data [module :corpus])
(throw (ex-info "Unknown duckling module" {:module module})))
(->> (analyze sentence pic-context module targets leven-stash)
:winners
(map #(assoc % :value (engine/export-value % {:date-fn str})))
(map #(select-keys % [:label :body :value :start :end :latent])))))]
(->> targets
(group-by :module) ; we want to run each config only once
(mapcat (fn [[module targets]] (extract' module targets)))
vec))
(catch Exception e
(let [err {:e e
:sentence sentence
:context context
:leven-stash leven-stash
:targets targets}]
(errorf e "duckling error err=%s" (pr-str err))
[]))))