Skip to content

Commit

Permalink
Creation of the interface for calling KenLM language models.
Browse files Browse the repository at this point in the history
  • Loading branch information
julosaure committed Sep 21, 2012
1 parent 2e869fb commit 01058e4
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 37 deletions.
62 changes: 62 additions & 0 deletions src/decloder/kenlm.clj
@@ -0,0 +1,62 @@
(ns decloder.kenlm
(:require clojure.java.shell)
(:require clojure.string)
)

;; GLOBALS

;; Path to the KenLM `query` binary used to score n-grams.
;; NOTE(review): hard-coded absolute path to one developer's machine —
;; should be made configurable (env var or config file) before sharing.
(def LM_QUERY "/Users/julien/workspaces/xp/kenlm/query")

;; Path to the binarized language model file passed to the query binary.
;; NOTE(review): same hard-coded-path concern as LM_QUERY above.
(def LM_BIN "/Users/julien/workspaces/clojure/decloder/data/lm/lm_giga_64k_vp_3gram/lm_giga_64k_vp_3gram.kenbinary")

;; UTILS


;; FUNCTIONS

(defn call-lm
  "Runs the external KenLM query binary (LM_QUERY) against the binary model
   (LM_BIN), feeding `n-grams` (a string of space-separated tokens) on stdin.
   Returns the process's stdout as a string on success; on error prints a
   message and returns nil (the return value of println).

   BUG FIX: the original condition was (> 0 (count (:err res))), i.e.
   \"count is negative\", which can never be true — so errors from the LM
   binary were silently swallowed and stderr never reported."
  [n-grams]
  (let [res (clojure.java.shell/sh LM_QUERY LM_BIN :in n-grams)]
    (if (pos? (count (:err res)))
      (println "Error while querying LM with " n-grams ": " (:err res))
      (:out res))))

;; Matches the "Total: <score> " field emitted by the KenLM query tool;
;; capture group 1 is the (possibly negative) log-probability score.
(def TOTAL_PAT #"Total: ([-0-9.]+) ")

(defn parse-lm-output-line
  "Parses one line of KenLM query output and returns the value of its
   \"Total: <n> \" field as a Double.

   Throws IllegalArgumentException when the line has no Total field —
   the original code would instead NPE on (second nil), which hid the
   real cause. Also replaces the deprecated (Double. s) boxing
   constructor with Double/parseDouble, and drops the redundant
   re-matcher (re-find accepts a pattern and a string directly)."
  [line]
  (println line) ; debug trace of the raw LM output line (kept from original)
  (if-let [match (re-find TOTAL_PAT line)]
    (Double/parseDouble (second match))
    (throw (IllegalArgumentException.
             (str "No 'Total:' score found in LM output line: " line)))))


(defn parse-lm-output
  "Scans the multi-line LM output `out` for the first line containing the
   first word of `list-n-grams`, and returns that line's total score via
   parse-lm-output-line.

   Improvements over the original:
   - uses java.util.regex.Pattern/quote to escape the search word, instead
     of a hand-rolled escape map that only covered ( ) * ? and missed
     + [ ] { } . | ^ $ \\ — any of which would corrupt the pattern;
   - guards against running off the end of the output: the original would
     NPE inside re-find once `lines` was exhausted; we now throw an
     explanatory IllegalStateException instead."
  [out list-n-grams]
  (let [first-word (first list-n-grams)
        pat (re-pattern (java.util.regex.Pattern/quote first-word))]
    (loop [lines (clojure.string/split-lines out)]
      (cond
        (empty? lines)
        (throw (IllegalStateException.
                 (str "No LM output line matched first word: " first-word)))

        (re-find pat (first lines))
        (parse-lm-output-line (first lines))

        :else
        (recur (rest lines))))))


(defn score-ngrams
  "Scores the space-separated n-gram string `n-grams` with the external LM
   and returns the negated total log-probability, i.e. a positive cost
   (enforced by the :post condition)."
  [n-grams]
  {:pre [(string? n-grams)
         (> (count n-grams) 0)]
   :post [(pos? %)]}
  (let [tokens (clojure.string/split n-grams #" ")
        lm-out (call-lm n-grams)]
    (- (parse-lm-output lm-out tokens))))
18 changes: 10 additions & 8 deletions src/decloder/model.clj
Expand Up @@ -20,21 +20,23 @@
;; FUNCTIONS ;; FUNCTIONS


(defn read-voc [f] (defn read-voc [f]
{:post [(map? %)]} {:post [(vector? %)
(= true (reduce #(and %1 %2) (map #(map? %) %)))]}


(println "Reading vocabulary " f) (println "Reading vocabulary " f)
(with-open [rdr (BufferedReader. (FileReader. f))] (with-open [rdr (BufferedReader. (FileReader. f))]
(loop [line (.readLine rdr) (loop [line (.readLine rdr)
token_map {}] token_id_map {}
id_token_map {}]
(if line (if line
(let [tab (clojure.string/split line #" ") (let [tab (clojure.string/split line #" ")
token_id (first tab) token_id (first tab)
token (second tab)] token (second tab)]
(recur (.readLine rdr) (assoc token_map token token_id)) (recur (.readLine rdr) (assoc token_id_map token token_id) (assoc id_token_map token_id token))
) )
(do (do
(println (count token_map) " tokens read.") (println (count token_id_map) " tokens read.")
token_map [token_id_map id_token_map]
) )
) )
) )
Expand Down Expand Up @@ -85,10 +87,10 @@
(defn init-engine [] (defn init-engine []
{:post [(map? %)]} {:post [(map? %)]}


(let [voc-src (read-voc VOC_SRC) (let [[voc-src-id voc-id-src] (read-voc VOC_SRC)
voc-trg (read-voc VOC_TRG) [voc-trg-id voc-id-trg] (read-voc VOC_TRG)
lex-prob (read-lex-prob LEX_PROB)] lex-prob (read-lex-prob LEX_PROB)]
;(println (sort (filter #(.startsWith (key %) "ann") voc-src))) ;(println (sort (filter #(.startsWith (key %) "ann") voc-src)))
{:voc-src voc-src, :voc-trg voc-trg, :lex-prob lex-prob} {:voc-src-id voc-src-id, :voc-id-src voc-id-src, :voc-trg-id voc-trg-id, :voc-id-trg voc-id-trg, :lex-prob lex-prob}
)) ))


67 changes: 38 additions & 29 deletions src/decloder/translator.clj
Expand Up @@ -3,6 +3,7 @@
(:require clojure.java.io) (:require clojure.java.io)
(:import [java.io BufferedReader FileReader]) (:import [java.io BufferedReader FileReader])
(:require clojure.data.priority-map) (:require clojure.data.priority-map)
(:require decloder.lm)
) )




Expand All @@ -17,28 +18,33 @@


;; FUNCTIONS ;; FUNCTIONS


(defn score-hypothesis [lex-prob pred-hypo] (defn score-hypothesis [trg-token lex-prob pred-hypo]
{:pre [(= java.lang.Double (type lex-prob)) {:pre [(string? trg-token)
(= java.lang.Double (type lex-prob))
(>= lex-prob 0) (>= lex-prob 0)
(or (nil? pred-hypo) (= decloder.translator.Hypothesis (type pred-hypo)))] (or (nil? pred-hypo) (= decloder.translator.Hypothesis (type pred-hypo)))]
:post [(>= % 0) (>= % lex-prob)]} :post [(>= % 0) (>= % lex-prob)]}

(if (nil? pred-hypo) (if (nil? pred-hypo)
lex-prob lex-prob
(+ lex-prob (:score pred-hypo)) (let [n-grams (str trg-token " " (:token pred-hypo))]
(println "n-gram to score: " n-grams)
(+ lex-prob (:score pred-hypo) (decloder.lm/score-ngrams n-grams))
)
) )
) )


(defn new-hypo [stack lex-prob] (defn new-hypo [model stack lex-prob]
{:pre [(map? stack) {:pre [(map? stack)
(= clojure.lang.MapEntry (type lex-prob))] (= clojure.lang.MapEntry (type lex-prob))]
:post [(map? %) :post [(map? %)
(>= (count %) (count stack))]} (>= (count %) (count stack))]}


(let [trg-token (key lex-prob) (let [trg-token-id (key lex-prob)
trg-token ((model :voc-id-trg) trg-token-id)
lexical-prob (val lex-prob) lexical-prob (val lex-prob)
pred (Hypothesis. nil 0 nil) pred (Hypothesis. nil 0 nil)
score (score-hypothesis lexical-prob pred)] score (score-hypothesis trg-token lexical-prob pred)]
(assoc stack (Hypothesis. trg-token score pred) score) (assoc stack (Hypothesis. trg-token score pred) score)
) )
) )
Expand All @@ -50,18 +56,20 @@
(= java.lang.String (type src-token))] (= java.lang.String (type src-token))]
:post [(map? %) :post [(map? %)
(>= (count %) (count stack))]} (>= (count %) (count stack))]}


(loop [stack_ stack (let [src-token-id ((model :voc-src-id) src-token)]
;lex-probs (filter #(= (first (key %)) src-token) (model :lex-prob)) (loop [stack_ stack
lex-probs ((model :lex-prob) src-token) lex-probs ((model :lex-prob) src-token-id)
tata (println "count lex-probs" (count lex-probs))] tata (println "count lex-probs" (count lex-probs))]
(if (empty? lex-probs) (if (empty? lex-probs)
stack_ stack_
(let [lex-prob (first lex-probs) (let [lex-prob (first lex-probs)
trg-token (key lex-prob) trg-token-id (key lex-prob)
lexical-prob (val lex-prob) lexical-prob (val lex-prob)
score (score-hypothesis lexical-prob top-hypo)] trg-token ((model :voc-id-trg) trg-token-id)
(recur (assoc stack_ (Hypothesis. trg-token score top-hypo) score) (rest lex-probs) "dd") score (score-hypothesis trg-token lexical-prob top-hypo)]
(recur (assoc stack_ (Hypothesis. trg-token score top-hypo) score) (rest lex-probs) "dd")
)
) )
) )
) )
Expand All @@ -88,7 +96,8 @@
pos 0 pos 0
stacks {}] stacks {}]


(let [src-token (first src-sentence_)] (let [src-token (first src-sentence_)
src-token-id ((model :voc-src-id) src-token)]
(println "Main loop, pos " pos ", src-token " src-token ", count(stacks) " (count-stacks stacks) "(count src-sentence) " (count src-sentence)) (println "Main loop, pos " pos ", src-token " src-token ", count(stacks) " (count-stacks stacks) "(count src-sentence) " (count src-sentence))
(if (nil? (stacks pos)) (if (nil? (stacks pos))
(recur src-sentence_ pos (assoc stacks pos (clojure.data.priority-map/priority-map))) (recur src-sentence_ pos (assoc stacks pos (clojure.data.priority-map/priority-map)))
Expand All @@ -100,15 +109,15 @@
(recur (rest src-sentence_) (+ pos 1) stacks) (recur (rest src-sentence_) (+ pos 1) stacks)




(if (= 0 (count ((model :lex-prob) src-token))) (if (= 0 (count ((model :lex-prob) src-token-id)))
(recur (rest src-sentence_) (+ pos 1) stacks) (recur (rest src-sentence_) (+ pos 1) stacks)




(if (= pos 0) (if (= pos 0)
(let [hypos ((model :lex-prob) src-token) (let [hypos ((model :lex-prob) src-token-id)
;hypos (filter #(= (first (key %)) src-token) (model :lex-prob)) ;hypos (filter #(= (first (key %)) src-token) (model :lex-prob))
;titi (println "(count hypos) " (count hypos)) ;titi (println "(count hypos) " (count hypos))
stack_ (reduce new-hypo (stacks 0) hypos) stack_ (reduce (partial new-hypo model) (stacks 0) hypos)
];tata (println "(count stack_) " (count stack_))] ];tata (println "(count stack_) " (count stack_))]
(recur (rest src-sentence_) (+ pos 1) (assoc stacks 0 stack_))) (recur (rest src-sentence_) (+ pos 1) (assoc stacks 0 stack_)))


Expand Down Expand Up @@ -183,7 +192,7 @@
) )


(defn tokens-to-ids [model s] (defn tokens-to-ids [model s]
(let [voc-src (model :voc-src)] (let [voc-src (model :voc-src-id)]
(map #(voc-src %) s) (map #(voc-src %) s)
) )
) )
Expand Down Expand Up @@ -211,16 +220,16 @@
(defn translate-sentence [model sentence] (defn translate-sentence [model sentence]
(println "Translating: " sentence) (println "Translating: " sentence)
(let [sent-tok (tokenize-sentence sentence) (let [sent-tok (tokenize-sentence sentence)
sent-tok-id (tokens-to-ids model sent-tok)] ];sent-tok-id (tokens-to-ids model sent-tok)]
(println "Tokenized: " sent-tok) (println "Tokenized: " sent-tok)
(println "Ids: " sent-tok-id) ;(println "Ids: " sent-tok-id)
(let [;model (filter-src-lex-probs model sent-tok-id) (let [;model (filter-src-lex-probs model sent-tok-id)
graph (search model sent-tok-id) graph (search model sent-tok)
best-path (extract-best-path graph) best-path (extract-best-path graph)
inv-voc-trg (reduce #(assoc %1 (val %2) (key %2)) {} (model :voc-trg)) ;inv-voc-trg (reduce #(assoc %1 (val %2) (key %2)) {} (model :voc-trg))
];tt (println (take 10 inv-voc-trg))] ];tt (println (take 10 inv-voc-trg))]
(println best-path) (println best-path)
(println (ids-to-tokens inv-voc-trg best-path)) ;(println (ids-to-tokens inv-voc-trg best-path))
) )
) )
) )

0 comments on commit 01058e4

Please sign in to comment.