Permalink
Browse files

Creation of the interface for calling KenLM language models.

  • Loading branch information...
1 parent 2e869fb commit 01058e4212de08e66b8786745e5e7f75b7676335 @julosaure committed Sep 21, 2012
Showing with 110 additions and 37 deletions.
  1. +62 −0 src/decloder/kenlm.clj
  2. +10 −8 src/decloder/model.clj
  3. +38 −29 src/decloder/translator.clj
@@ -0,0 +1,62 @@
+(ns decloder.kenlm
+ (:require clojure.java.shell)
+ (:require clojure.string)
+ )
+
+;; GLOBALS
+
+(def LM_QUERY "/Users/julien/workspaces/xp/kenlm/query")
+
+(def LM_BIN "/Users/julien/workspaces/clojure/decloder/data/lm/lm_giga_64k_vp_3gram/lm_giga_64k_vp_3gram.kenbinary")
+
+;; UTILS
+
+
+;; FUNCTIONS
+
(defn call-lm
  "Runs the KenLM query binary over LM_BIN, feeding the space-separated
   `n-grams` string to its stdin, and returns the subprocess stdout.
   When the subprocess wrote anything to stderr, prints a diagnostic and
   returns nil instead."
  [n-grams]
  (let [res (clojure.java.shell/sh LM_QUERY LM_BIN :in n-grams)]
    ;; Bug fix: the original condition was (> 0 (count (:err res))),
    ;; which is never true because count is non-negative — so stderr
    ;; output was silently ignored and :out was always returned.
    (if (pos? (count (:err res)))
      (println "Error while querying LM with " n-grams ": " (:err res))
      (:out res)
      )
    )
  )
+
;; Matches the "Total: <log-prob>" summary field that the KenLM query
;; tool appends to each scored line (note the required trailing space).
(def TOTAL_PAT #"Total: ([-0-9.]+) ")

(defn parse-lm-output-line
  "Extracts the total log-probability from one line of KenLM query
   output and returns it as a Double. Returns nil when the line carries
   no Total field (the original threw a NullPointerException there)."
  [line]
  ;; re-find on (pattern, string) replaces the re-matcher/re-find pair;
  ;; the leftover debug printlns are removed, and Double/parseDouble
  ;; replaces the deprecated boxed (Double. s) constructor.
  (when-let [total (second (re-find TOTAL_PAT line))]
    (Double/parseDouble total)
    )
  )
+
+
(defn parse-lm-output
  "Scans the KenLM query output `out` for the line whose text contains
   the first token of `list-n-grams`, and returns that line's total
   log-probability (a Double). Returns nil when no line matches — the
   original loop recurred past the last line and threw a
   NullPointerException on (first nil)."
  [out list-n-grams]
  ;; Pattern/quote escapes every regex metacharacter, unlike the
  ;; original hand-rolled escape map that only handled ( ) * ?.
  (let [first-word (first list-n-grams)
        pat (re-pattern (java.util.regex.Pattern/quote first-word))]
    (some #(when (re-find pat %) (parse-lm-output-line %))
          (clojure.string/split-lines out))
    )
  )
+
+
(defn score-ngrams
  "Scores the non-empty, space-separated `n-grams` string against the
   language model and returns the negated total log-probability, i.e. a
   positive cost (enforced by the postcondition)."
  [n-grams]
  {:pre [(string? n-grams)
         (> (count n-grams) 0)]
   :post [(pos? %)]}
  (let [tokens (clojure.string/split n-grams #" ")
        lm-out (call-lm n-grams)]
    (- (parse-lm-output lm-out tokens))))
@@ -20,21 +20,23 @@
;; FUNCTIONS
(defn read-voc [f]
- {:post [(map? %)]}
+ {:post [(vector? %)
+ (= true (reduce #(and %1 %2) (map #(map? %) %)))]}
(println "Reading vocabulary " f)
(with-open [rdr (BufferedReader. (FileReader. f))]
(loop [line (.readLine rdr)
- token_map {}]
+ token_id_map {}
+ id_token_map {}]
(if line
(let [tab (clojure.string/split line #" ")
token_id (first tab)
token (second tab)]
- (recur (.readLine rdr) (assoc token_map token token_id))
+ (recur (.readLine rdr) (assoc token_id_map token token_id) (assoc id_token_map token_id token))
)
(do
- (println (count token_map) " tokens read.")
- token_map
+ (println (count token_id_map) " tokens read.")
+ [token_id_map id_token_map]
)
)
)
@@ -85,10 +87,10 @@
(defn init-engine []
{:post [(map? %)]}
- (let [voc-src (read-voc VOC_SRC)
- voc-trg (read-voc VOC_TRG)
+ (let [[voc-src-id voc-id-src] (read-voc VOC_SRC)
+ [voc-trg-id voc-id-trg] (read-voc VOC_TRG)
lex-prob (read-lex-prob LEX_PROB)]
;(println (sort (filter #(.startsWith (key %) "ann") voc-src)))
- {:voc-src voc-src, :voc-trg voc-trg, :lex-prob lex-prob}
+ {:voc-src-id voc-src-id, :voc-id-src voc-id-src, :voc-trg-id voc-trg-id, :voc-id-trg voc-id-trg, :lex-prob lex-prob}
))
@@ -3,6 +3,7 @@
(:require clojure.java.io)
(:import [java.io BufferedReader FileReader])
(:require clojure.data.priority-map)
+ (:require decloder.lm)
)
@@ -17,28 +18,33 @@
;; FUNCTIONS
-(defn score-hypothesis [lex-prob pred-hypo]
- {:pre [(= java.lang.Double (type lex-prob))
+(defn score-hypothesis [trg-token lex-prob pred-hypo]
+ {:pre [(string? trg-token)
+ (= java.lang.Double (type lex-prob))
(>= lex-prob 0)
(or (nil? pred-hypo) (= decloder.translator.Hypothesis (type pred-hypo)))]
:post [(>= % 0) (>= % lex-prob)]}
-
+
(if (nil? pred-hypo)
lex-prob
- (+ lex-prob (:score pred-hypo))
+ (let [n-grams (str trg-token " " (:token pred-hypo))]
+ (println "n-gram to score: " n-grams)
+ (+ lex-prob (:score pred-hypo) (decloder.lm/score-ngrams n-grams))
+ )
)
)
-(defn new-hypo [stack lex-prob]
+(defn new-hypo [model stack lex-prob]
{:pre [(map? stack)
(= clojure.lang.MapEntry (type lex-prob))]
:post [(map? %)
(>= (count %) (count stack))]}
- (let [trg-token (key lex-prob)
+ (let [trg-token-id (key lex-prob)
+ trg-token ((model :voc-id-trg) trg-token-id)
lexical-prob (val lex-prob)
pred (Hypothesis. nil 0 nil)
- score (score-hypothesis lexical-prob pred)]
+ score (score-hypothesis trg-token lexical-prob pred)]
(assoc stack (Hypothesis. trg-token score pred) score)
)
)
@@ -50,18 +56,20 @@
(= java.lang.String (type src-token))]
:post [(map? %)
(>= (count %) (count stack))]}
-
- (loop [stack_ stack
- ;lex-probs (filter #(= (first (key %)) src-token) (model :lex-prob))
- lex-probs ((model :lex-prob) src-token)
- tata (println "count lex-probs" (count lex-probs))]
- (if (empty? lex-probs)
- stack_
- (let [lex-prob (first lex-probs)
- trg-token (key lex-prob)
- lexical-prob (val lex-prob)
- score (score-hypothesis lexical-prob top-hypo)]
- (recur (assoc stack_ (Hypothesis. trg-token score top-hypo) score) (rest lex-probs) "dd")
+
+ (let [src-token-id ((model :voc-src-id) src-token)]
+ (loop [stack_ stack
+ lex-probs ((model :lex-prob) src-token-id)
+ tata (println "count lex-probs" (count lex-probs))]
+ (if (empty? lex-probs)
+ stack_
+ (let [lex-prob (first lex-probs)
+ trg-token-id (key lex-prob)
+ lexical-prob (val lex-prob)
+ trg-token ((model :voc-id-trg) trg-token-id)
+ score (score-hypothesis trg-token lexical-prob top-hypo)]
+ (recur (assoc stack_ (Hypothesis. trg-token score top-hypo) score) (rest lex-probs) "dd")
+ )
)
)
)
@@ -88,7 +96,8 @@
pos 0
stacks {}]
- (let [src-token (first src-sentence_)]
+ (let [src-token (first src-sentence_)
+ src-token-id ((model :voc-src-id) src-token)]
(println "Main loop, pos " pos ", src-token " src-token ", count(stacks) " (count-stacks stacks) "(count src-sentence) " (count src-sentence))
(if (nil? (stacks pos))
(recur src-sentence_ pos (assoc stacks pos (clojure.data.priority-map/priority-map)))
@@ -100,15 +109,15 @@
(recur (rest src-sentence_) (+ pos 1) stacks)
- (if (= 0 (count ((model :lex-prob) src-token)))
+ (if (= 0 (count ((model :lex-prob) src-token-id)))
(recur (rest src-sentence_) (+ pos 1) stacks)
(if (= pos 0)
- (let [hypos ((model :lex-prob) src-token)
+ (let [hypos ((model :lex-prob) src-token-id)
;hypos (filter #(= (first (key %)) src-token) (model :lex-prob))
;titi (println "(count hypos) " (count hypos))
- stack_ (reduce new-hypo (stacks 0) hypos)
+ stack_ (reduce (partial new-hypo model) (stacks 0) hypos)
];tata (println "(count stack_) " (count stack_))]
(recur (rest src-sentence_) (+ pos 1) (assoc stacks 0 stack_)))
@@ -183,7 +192,7 @@
)
(defn tokens-to-ids [model s]
- (let [voc-src (model :voc-src)]
+ (let [voc-src (model :voc-src-id)]
(map #(voc-src %) s)
)
)
@@ -211,16 +220,16 @@
(defn translate-sentence [model sentence]
(println "Translating: " sentence)
(let [sent-tok (tokenize-sentence sentence)
- sent-tok-id (tokens-to-ids model sent-tok)]
+ ];sent-tok-id (tokens-to-ids model sent-tok)]
(println "Tokenized: " sent-tok)
- (println "Ids: " sent-tok-id)
+ ;(println "Ids: " sent-tok-id)
(let [;model (filter-src-lex-probs model sent-tok-id)
- graph (search model sent-tok-id)
+ graph (search model sent-tok)
best-path (extract-best-path graph)
- inv-voc-trg (reduce #(assoc %1 (val %2) (key %2)) {} (model :voc-trg))
+ ;inv-voc-trg (reduce #(assoc %1 (val %2) (key %2)) {} (model :voc-trg))
];tt (println (take 10 inv-voc-trg))]
(println best-path)
- (println (ids-to-tokens inv-voc-trg best-path))
+ ;(println (ids-to-tokens inv-voc-trg best-path))
)
)
)

0 comments on commit 01058e4

Please sign in to comment.