From 01058e4212de08e66b8786745e5e7f75b7676335 Mon Sep 17 00:00:00 2001
From: julosaure
Date: Fri, 21 Sep 2012 16:51:41 -0400
Subject: [PATCH] Creation of the interface for calling Ken Language Models.

Add decloder.kenlm, which runs the external LM query executable on a token
string and parses the "Total:" score from its output. read-voc now returns
both the token->id and the id->token maps, and the translator adds the
language-model score of each extended hypothesis to its lexical score.

---
Review notes (this section is ignored by `git am`):
 - call-lm tests the process exit status; the former test
   (> 0 (count (:err res))) could never be true.
 - parse-lm-output quotes the token with java.util.regex.Pattern/quote and
   raises a descriptive exception when no output line matches.
 - translator.clj requires and calls decloder.kenlm, the namespace created
   by this patch, instead of decloder.lm.
 - The read-voc postcondition uses (every? map? %).
 - Line breaks, indentation and blank context lines were reconstructed from
   a whitespace-collapsed copy of the patch; blob ids on the index lines
   were not recomputed.

 src/decloder/kenlm.clj      | 77 ++++++++++++++++++++++++++++++++++++++
 src/decloder/model.clj      | 18 +++++-----
 src/decloder/translator.clj | 67 ++++++++++++++++++++-----------------
 3 files changed, 125 insertions(+), 37 deletions(-)
 create mode 100644 src/decloder/kenlm.clj

diff --git a/src/decloder/kenlm.clj b/src/decloder/kenlm.clj
new file mode 100644
index 0000000..2deaed0
--- /dev/null
+++ b/src/decloder/kenlm.clj
@@ -0,0 +1,77 @@
+(ns decloder.kenlm
+  (:require clojure.java.shell)
+  (:require clojure.string)
+  )
+
+;; GLOBALS
+
+(def LM_QUERY "/Users/julien/workspaces/xp/kenlm/query")
+
+(def LM_BIN "/Users/julien/workspaces/clojure/decloder/data/lm/lm_giga_64k_vp_3gram/lm_giga_64k_vp_3gram.kenbinary")
+
+;; UTILS
+
+
+;; FUNCTIONS
+
+(defn call-lm
+  "Runs the LM_QUERY executable on LM_BIN, feeding `n-grams` to its standard
+  input. Returns the captured standard output; when the process exits with a
+  non-zero status, prints the captured standard error and returns nil."
+  [n-grams]
+  (let [res (clojure.java.shell/sh LM_QUERY LM_BIN :in n-grams)]
+    ;; Decide on the exit status. The former test, (> 0 (count (:err res))),
+    ;; asked whether 0 is greater than a count and so could never be true.
+    (if (zero? (:exit res))
+      (:out res)
+      (println "Error while querying LM with " n-grams ": " (:err res))
+      )
+    )
+  )
+
+(def TOTAL_PAT #"Total: ([-0-9.]+) ")
+
+(defn parse-lm-output-line
+  "Prints `line`, then returns as a Double the number captured by TOTAL_PAT.
+  Throws IllegalArgumentException when `line` does not match TOTAL_PAT."
+  [line]
+  (let [score (re-find TOTAL_PAT line)]
+    (println line)
+    (when (nil? score)
+      (throw (IllegalArgumentException. (str "No LM total in line: " line))))
+    (Double. (second score))
+    )
+  )
+
+
+(defn parse-lm-output
+  "Returns the score parsed from the first line of `out` that contains the
+  first element of `list-n-grams`. Throws IllegalStateException when `out`
+  is nil or has no such line."
+  [out list-n-grams]
+  (let [first-word (first list-n-grams)
+        ;; Pattern/quote makes the whole token literal; the former hand-made
+        ;; escaping only covered ( ) * and ?.
+        pat (re-pattern (java.util.regex.Pattern/quote first-word))
+        lines (clojure.string/split-lines (str out))
+        line (some #(when (re-find pat %) %) lines)]
+    (if line
+      (parse-lm-output-line line)
+      (throw (IllegalStateException. (str "No LM output line for: " first-word)))
+      )
+    )
+  )
+
+
+(defn score-ngrams
+  "Queries the language model with the space-separated tokens in `n-grams`
+  and returns the negated total parsed from its output."
+  [n-grams]
+  {:pre [(string? n-grams)
+         (> (count n-grams) 0)]
+   :post [(pos? %)]}
+  (let [list-n-grams (clojure.string/split n-grams #" ")
+        out (call-lm n-grams)]
+    (- (parse-lm-output out list-n-grams))
+    )
+  )
diff --git a/src/decloder/model.clj b/src/decloder/model.clj
index 0cf8286..cf70214 100644
--- a/src/decloder/model.clj
+++ b/src/decloder/model.clj
@@ -20,21 +20,23 @@
 ;; FUNCTIONS
 
 (defn read-voc [f]
-  {:post [(map? %)]}
+  {:post [(vector? %)
+          (every? map? %)]}
 
   (println "Reading vocabulary " f)
   (with-open [rdr (BufferedReader. (FileReader. f))]
     (loop [line (.readLine rdr)
-           token_map {}]
+           token_id_map {}
+           id_token_map {}]
       (if line
         (let [tab (clojure.string/split line #" ")
               token_id (first tab)
               token (second tab)]
-          (recur (.readLine rdr) (assoc token_map token token_id))
+          (recur (.readLine rdr) (assoc token_id_map token token_id) (assoc id_token_map token_id token))
           )
         (do
-          (println (count token_map) " tokens read.")
-          token_map
+          (println (count token_id_map) " tokens read.")
+          [token_id_map id_token_map]
           )
         )
       )
@@ -85,10 +87,10 @@
 
 (defn init-engine []
   {:post [(map? %)]}
-  (let [voc-src (read-voc VOC_SRC)
-        voc-trg (read-voc VOC_TRG)
+  (let [[voc-src-id voc-id-src] (read-voc VOC_SRC)
+        [voc-trg-id voc-id-trg] (read-voc VOC_TRG)
         lex-prob (read-lex-prob LEX_PROB)]
     ;(println (sort (filter #(.startsWith (key %) "ann") voc-src)))
-    {:voc-src voc-src, :voc-trg voc-trg, :lex-prob lex-prob}
+    {:voc-src-id voc-src-id, :voc-id-src voc-id-src, :voc-trg-id voc-trg-id, :voc-id-trg voc-id-trg, :lex-prob lex-prob}
     ))
 
diff --git a/src/decloder/translator.clj b/src/decloder/translator.clj
index 13a21bd..894d366 100644
--- a/src/decloder/translator.clj
+++ b/src/decloder/translator.clj
@@ -3,6 +3,7 @@
   (:require clojure.java.io)
   (:import [java.io BufferedReader FileReader])
   (:require clojure.data.priority-map)
+  (:require decloder.kenlm)
   )
 
 
@@ -17,28 +18,33 @@
 
 ;; FUNCTIONS
 
-(defn score-hypothesis [lex-prob pred-hypo]
-  {:pre [(= java.lang.Double (type lex-prob))
+(defn score-hypothesis [trg-token lex-prob pred-hypo]
+  {:pre [(string? trg-token)
+         (= java.lang.Double (type lex-prob))
          (>= lex-prob 0)
          (or (nil? pred-hypo) (= decloder.translator.Hypothesis (type pred-hypo)))]
    :post [(>= % 0)
           (>= % lex-prob)]}
-  
+
   (if (nil? pred-hypo)
     lex-prob
-    (+ lex-prob (:score pred-hypo))
+    (let [n-grams (str trg-token " " (:token pred-hypo))]
+      (println "n-gram to score: " n-grams)
+      (+ lex-prob (:score pred-hypo) (decloder.kenlm/score-ngrams n-grams))
+      )
     )
   )
 
-(defn new-hypo [stack lex-prob]
+(defn new-hypo [model stack lex-prob]
   {:pre [(map? stack)
          (= clojure.lang.MapEntry (type lex-prob))]
    :post [(map? %)
           (>= (count %) (count stack))]}
-  (let [trg-token (key lex-prob)
+  (let [trg-token-id (key lex-prob)
+        trg-token ((model :voc-id-trg) trg-token-id)
         lexical-prob (val lex-prob)
         pred (Hypothesis. nil 0 nil)
-        score (score-hypothesis lexical-prob pred)]
+        score (score-hypothesis trg-token lexical-prob pred)]
     (assoc stack (Hypothesis. trg-token score pred) score)
     )
   )
@@ -50,18 +56,20 @@
          (= java.lang.String (type src-token))]
    :post [(map? %)
           (>= (count %) (count stack))]}
-  
-  (loop [stack_ stack
-         ;lex-probs (filter #(= (first (key %)) src-token) (model :lex-prob))
-         lex-probs ((model :lex-prob) src-token)
-         tata (println "count lex-probs" (count lex-probs))]
-    (if (empty? lex-probs)
-      stack_
-      (let [lex-prob (first lex-probs)
-            trg-token (key lex-prob)
-            lexical-prob (val lex-prob)
-            score (score-hypothesis lexical-prob top-hypo)]
-        (recur (assoc stack_ (Hypothesis. trg-token score top-hypo) score) (rest lex-probs) "dd")
+
+  (let [src-token-id ((model :voc-src-id) src-token)]
+    (loop [stack_ stack
+           lex-probs ((model :lex-prob) src-token-id)
+           tata (println "count lex-probs" (count lex-probs))]
+      (if (empty? lex-probs)
+        stack_
+        (let [lex-prob (first lex-probs)
+              trg-token-id (key lex-prob)
+              lexical-prob (val lex-prob)
+              trg-token ((model :voc-id-trg) trg-token-id)
+              score (score-hypothesis trg-token lexical-prob top-hypo)]
+          (recur (assoc stack_ (Hypothesis. trg-token score top-hypo) score) (rest lex-probs) "dd")
+          )
         )
       )
     )
@@ -88,7 +96,8 @@
          pos 0
          stacks {}]
 
-    (let [src-token (first src-sentence_)]
+    (let [src-token (first src-sentence_)
+          src-token-id ((model :voc-src-id) src-token)]
       (println "Main loop, pos " pos ", src-token " src-token ", count(stacks) " (count-stacks stacks) "(count src-sentence) " (count src-sentence))
       (if (nil? (stacks pos))
         (recur src-sentence_ pos (assoc stacks pos (clojure.data.priority-map/priority-map)))
@@ -100,15 +109,15 @@
 
             (recur (rest src-sentence_) (+ pos 1) stacks)
 
-            (if (= 0 (count ((model :lex-prob) src-token)))
+            (if (= 0 (count ((model :lex-prob) src-token-id)))
 
               (recur (rest src-sentence_) (+ pos 1) stacks)
 
               (if (= pos 0)
-                (let [hypos ((model :lex-prob) src-token)
+                (let [hypos ((model :lex-prob) src-token-id)
                       ;hypos (filter #(= (first (key %)) src-token) (model :lex-prob))
                       ;titi (println "(count hypos) " (count hypos))
-                      stack_ (reduce new-hypo (stacks 0) hypos)
+                      stack_ (reduce (partial new-hypo model) (stacks 0) hypos)
                       ];tata (println "(count stack_) " (count stack_))]
                   (recur (rest src-sentence_) (+ pos 1) (assoc stacks 0 stack_)))
 
@@ -183,7 +192,7 @@
   )
 
 (defn tokens-to-ids [model s]
-  (let [voc-src (model :voc-src)]
+  (let [voc-src (model :voc-src-id)]
     (map #(voc-src %) s)
     )
   )
@@ -211,16 +220,16 @@
 (defn translate-sentence [model sentence]
   (println "Translating: " sentence)
   (let [sent-tok (tokenize-sentence sentence)
-        sent-tok-id (tokens-to-ids model sent-tok)]
+        ];sent-tok-id (tokens-to-ids model sent-tok)]
     (println "Tokenized: " sent-tok)
-    (println "Ids: " sent-tok-id)
+    ;(println "Ids: " sent-tok-id)
     (let [;model (filter-src-lex-probs model sent-tok-id)
-          graph (search model sent-tok-id)
+          graph (search model sent-tok)
           best-path (extract-best-path graph)
-          inv-voc-trg (reduce #(assoc %1 (val %2) (key %2)) {} (model :voc-trg))
+          ;inv-voc-trg (reduce #(assoc %1 (val %2) (key %2)) {} (model :voc-trg))
           ];tt (println (take 10 inv-voc-trg))]
       (println best-path)
-      (println (ids-to-tokens inv-voc-trg best-path))
+      ;(println (ids-to-tokens inv-voc-trg best-path))
       )
     )
   )