Browse files

Integration of serialized binary translation models for faster initialization of the engine.
  • Loading branch information...
1 parent 6592057 commit 404687f4c448f3612d6efda4e6b27a77008ec336 @julosaure committed Sep 26, 2012
Showing with 54 additions and 7 deletions.
  1. +36 −5 src/decloder/model.clj
  2. +16 −0 src/decloder/scripts/makeBinaryTM.clj
  3. +2 −2 src/decloder/translator.clj
View
41 src/decloder/model.clj
@@ -1,7 +1,9 @@
(ns decloder.model
(:require clojure.string)
(:require clojure.java.io)
- (:import [java.io BufferedReader FileReader])
+ (:import [java.io BufferedReader FileReader InputStream ObjectInputStream FileInputStream BufferedInputStream])
+ (:import [java.nio.channels Channels])
+ (:import [java.util.zip GZIPInputStream])
(:import [java.lang Math])
(:require decloder.blm)
)
@@ -13,8 +15,7 @@
(def VOC_TRG "/Users/julien/workspaces/clojure/decloder/data/sentfr/fr-en.trn.trg.vcb")
-(def LEX_PROB "/Users/julien/workspaces/clojure/decloder/data/sentfr/fr-en.t3.final")
-
+(def LEX_PROB "/Users/julien/workspaces/clojure/decloder/data/sentfr/fr-en.t3.final.bin")
;; UTILS
@@ -44,10 +45,29 @@
)
)
-(defn read-lex-prob [f]
+(defn read-lex-prob-bin [f]
+  ;; Deserializes a translation model from the file at path f (a string) and
+  ;; returns the deserialized object. When f ends with ".gz" the byte stream
+  ;; is gunzipped before being handed to the ObjectInputStream.
+  ;;
+  ;; NOTE(review): Java deserialization executes code chosen by the file's
+  ;; contents; only load model files produced by a trusted process.
+  (let [unserializedModel
+        ;; with-open closes every stream (in reverse order) even when the
+        ;; ObjectInputStream constructor or readObject throws. Closing the
+        ;; same underlying stream twice is harmless for java.io streams.
+        (with-open [fis (FileInputStream. f)
+                    bis (BufferedInputStream. (Channels/newInputStream (.getChannel fis)))
+                    ;; .endsWith is safe for paths shorter than the suffix,
+                    ;; unlike subs with a negative start index.
+                    zis (if (.endsWith ^String f ".gz") (GZIPInputStream. bis) bis)
+                    ois (ObjectInputStream. zis)]
+          (.readObject ois))]
+    (println "Unserialized translation model with " (count unserializedModel) " lexical probabilities.")
+    unserializedModel
+    )
+  )
+ ; final InputStream fis = getBufferedInputStream(path);
+ ; return path.getName().endsWith(".gz") ? new ObjectInputStream(new GZIPInputStream(fis)) : new ObjectInputStream(fis); }
+ ;final ObjectInputStream in = openObjIn(path);
+ ;final Object obj = in.readObject();
+ ; in.close();
+
+
+(defn read-lex-prob_ [f]
{:post [(map? %)]}
- (println "Reading lexical probabilities " f)
(with-open [rdr (BufferedReader. (FileReader. f))]
(loop [i 0
line (.readLine rdr)
@@ -72,6 +92,17 @@
)
)
+(defn read-lex-prob [f]
+  {:post [(map? %)]}
+  ;; Loads the lexical probability model stored at path f (a string).
+  ;; Paths ending in ".gz" or ".bin" are read as serialized binary models via
+  ;; read-lex-prob-bin; any other path is parsed by read-lex-prob_.
+  ;; The postcondition requires the loaded model to be a map.
+  (println "Reading lexical probabilities " f)
+  ;; .endsWith simply returns false for paths shorter than the suffix, whereas
+  ;; (subs f (- (count f) n)) throws StringIndexOutOfBoundsException.
+  (if (or (.endsWith ^String f ".gz")
+          (.endsWith ^String f ".bin"))
+    (read-lex-prob-bin f)
+    (read-lex-prob_ f)
+    )
+  )
+
(defn startswith [str pat]
(loop [str_ str
pat_ pat]
View
16 src/decloder/scripts/makeBinaryTM.clj
@@ -0,0 +1,16 @@
+(ns decloder.scripts.makeBinaryTM
+ (:require decloder.model)
+ (:import [java.io ObjectOutputStream OutputStream BufferedOutputStream FileOutputStream])
+ )
+
+(defn -main [inputTM outputBinaryTM]
+  ;; Reads the translation model at path inputTM with
+  ;; decloder.model/read-lex-prob and writes it, Java-serialized, to the file
+  ;; at path outputBinaryTM.
+  ;; The model is loaded before the output file is opened, so a failed read
+  ;; does not create or truncate the output file.
+  (let [model (decloder.model/read-lex-prob inputTM)]
+    ;; with-open flushes and closes the streams even if writeObject throws
+    ;; (e.g. on a non-serializable value), so no file handle is leaked.
+    (with-open [fos (FileOutputStream. outputBinaryTM)
+                out (ObjectOutputStream. (BufferedOutputStream. fos))]
+      (.writeObject out model)
+      )
+    )
+  )
+
View
4 src/decloder/translator.clj
@@ -31,12 +31,12 @@
(if (nil? (:pred pred-hypo))
(let [lm-score (decloder.blm/score-ngrams (model :lm) bi-gram)]
;(println "bi-gram to score: " bi-gram " -> " lm-score)
- (+ lex-prob (:score pred-hypo) lm-score)
+ (+ lex-prob (* 0.5 (:score pred-hypo)) (* 0.1 lm-score))
)
(let [tri-gram (str (:token (:pred pred-hypo)) " " bi-gram)
lm-score (decloder.blm/score-ngrams (model :lm) tri-gram)]
;(println "tri-gram to score: " tri-gram " -> " lm-score)
- (+ lex-prob (:score pred-hypo) lm-score)
+ (+ lex-prob (* 0.5 (:score pred-hypo)) (* 0.1 lm-score))
)
)
)

0 comments on commit 404687f

Please sign in to comment.