Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base fork: julosaure/decloder
base: 453f25f737
...
head fork: julosaure/decloder
compare: 27c8f95dab
Checking mergeability… Don't worry, you can still create the pull request.
  • 2 commits
  • 4 files changed
  • 0 commit comments
  • 1 contributor
Commits on Oct 04, 2012
@julosaure Cleaned code. 74c5d16
@julosaure Tried to store lex probs with priority-map for pruning them; the resulting model is very big on disk whereas it contains less lex probs, not very efficient...
27c8f95
View
2  project.clj
@@ -9,5 +9,5 @@
[berkeleylm/berkeleylm "1.1.2"]
]
- :jvm-opts ["-server" "-Dfile.encoding=UTF-8" "-Xmx1024m"]
+ :jvm-opts ["-server" "-Dfile.encoding=UTF-8" "-Xmx2560m"]
:main decloder.core)
View
45 src/decloder/model.clj
@@ -6,6 +6,7 @@
(:import [java.util.zip GZIPInputStream])
(:import [java.lang Math])
(:require decloder.blm)
+ (:require clojure.data.priority-map)
)
@@ -15,7 +16,11 @@
(def VOC_TRG "/Users/julien/workspaces/clojure/decloder/data/sentfr/fr-en.trn.trg.vcb")
-(def LEX_PROB "/Users/julien/workspaces/clojure/decloder/data/sentfr/fr-en.t3.final.bin")
+(def LEX_PROB "/Users/julien/workspaces/clojure/decloder/data/sentfr/fr-en.t3.final.bin") ;.pc10.bin")
+
+(def PC_FILTER_LEX_PROBS 0.1)
+
+(def MIN_KEEP_LEX_PROBS 10)
;; UTILS
@@ -58,11 +63,6 @@
unserializedModel
)
)
- ; final InputStream fis = getBufferedInputStream(path);
- ; return path.getName().endsWith(".gz") ? new ObjectInputStream(new GZIPInputStream(fis)) : new ObjectInputStream(fis); }
- ;final ObjectInputStream in = openObjIn(path);
- ;final Object obj = in.readObject();
- ; in.close();
(defn read-lex-prob_ [f]
@@ -79,7 +79,7 @@
lex_prob (last tab)
minus_log_lex_prob (- (Math/log (Double. lex_prob)))]
(if (nil? (lex_prob_map token_src))
- (recur (+ i 1) (.readLine rdr) (assoc! lex_prob_map token_src {token_trg minus_log_lex_prob}))
+ (recur (+ i 1) (.readLine rdr) (assoc! lex_prob_map token_src (clojure.data.priority-map/priority-map token_trg minus_log_lex_prob)))
(recur (+ i 1) (.readLine rdr) (assoc! lex_prob_map token_src (assoc (lex_prob_map token_src) token_trg minus_log_lex_prob)))
)
)
@@ -103,27 +103,30 @@
)
)
-(defn startswith [str pat]
- (loop [str_ str
- pat_ pat]
- (if pat_
- (if (= (first pat_) (first str_))
- (recur (rest str_) (rest pat_))
- false
- )
- true
- )
- )
- )
+(defn filter-lex-probs [lex-probs percent-filter]
+ {:post [(= (count lex-probs) (count %))]}
+
+ (loop [filtered-lex-probs (transient {})
+ seq_ (seq lex-probs)
+ nb-lex-probs 0]
+ (if (empty? seq_)
+ (let [_ (println "Filtered model has " nb-lex-probs " lex probs for " (count filtered-lex-probs) " src tokens.")]
+ (persistent! filtered-lex-probs))
+ (let [[src-tok list-trg-probs] (first seq_)
+ max_ (max MIN_KEEP_LEX_PROBS (int (* percent-filter (count list-trg-probs))))
+ ];_ (println src-tok ":" (count list-trg-probs) ":" max_)]
+ (recur (assoc! filtered-lex-probs src-tok (take max_ list-trg-probs)) (rest seq_) (+ nb-lex-probs max_))))))
+
(defn init-engine []
{:post [(map? %)]}
(let [[voc-src-id voc-id-src] (read-voc VOC_SRC)
[voc-trg-id voc-id-trg] (read-voc VOC_TRG)
- lex-prob (read-lex-prob LEX_PROB)
+ lex-probs (read-lex-prob LEX_PROB)
+ ;lex-probs (filter-lex-probs lex-probs PC_FILTER_LEX_PROBS)
lm (decloder.blm/load-lm)]
;(println (sort (filter #(.startsWith (key %) "ann") voc-src)))
- {:voc-src-id voc-src-id, :voc-id-src voc-id-src, :voc-trg-id voc-trg-id, :voc-id-trg voc-id-trg, :lex-prob lex-prob, :lm lm}
+ {:voc-src-id voc-src-id, :voc-id-src voc-id-src, :voc-trg-id voc-trg-id, :voc-id-trg voc-id-trg, :lex-prob lex-probs, :lm lm}
))
View
1  src/decloder/scripts/makeBinaryTM.clj
@@ -6,6 +6,7 @@
(defn -main [inputTM outputBinaryTM]
; Reads a giza translation model and serializes it
(let [model (decloder.model/read-lex-prob inputTM)
+ model (decloder.model/filter-lex-probs model decloder.model/PC_FILTER_LEX_PROBS)
out (ObjectOutputStream. (BufferedOutputStream. (FileOutputStream. outputBinaryTM)))]
(doto out
(.writeObject model)
View
29 src/decloder/translator.clj
@@ -59,10 +59,10 @@
)
)
-(defn extend-hypo [model stack top-hypo src-token]
+(defn extend-hypo [model stack prev-hypo src-token]
{:pre [(map? model)
(map? stack)
- (or (nil? top-hypo) (= decloder.translator.Hypothesis (type top-hypo)))
+ (or (nil? prev-hypo) (= decloder.translator.Hypothesis (type prev-hypo)))
(= java.lang.String (type src-token))]
:post [(map? %)
(>= (count %) (count stack))]}
@@ -70,20 +70,16 @@
(let [src-token-id ((model :voc-src-id) src-token)]
(loop [stack_ stack
lex-probs ((model :lex-prob) src-token-id)
- ];tata (println "count lex-probs" (count lex-probs))]
+ ];_ (println "Extending stack with " (count lex-probs) " hypos")]
(if (empty? lex-probs)
stack_
(let [lex-prob (first lex-probs)
trg-token-id (key lex-prob)
lexical-prob (val lex-prob)
trg-token ((model :voc-id-trg) trg-token-id)
- score (score-hypothesis model trg-token lexical-prob top-hypo)]
- (recur (assoc stack_ (Hypothesis. trg-token score top-hypo) score) (rest lex-probs))
- )
- )
- )
- )
- )
+ score (score-hypothesis model trg-token lexical-prob prev-hypo)]
+ (recur (assoc stack_ (Hypothesis. trg-token score prev-hypo) score) (rest lex-probs)))))))
+
(defn count-stacks [stacks]
(loop [stacks_ stacks
@@ -93,11 +89,7 @@
(let [first-key (first (sort (keys stacks_)))
stack (stacks first-key)
msg (str msg " " first-key ":" (count stack))]
- (recur (dissoc stacks_ first-key) msg)
- )
- )
- )
- )
+ (recur (dissoc stacks_ first-key) msg)))))
(defn shave-stack [stack]
{:pre [(= clojure.data.priority_map.PersistentPriorityMap (type stack))]
@@ -108,6 +100,7 @@
(reduce #(apply assoc %1 %2) (clojure.data.priority-map/priority-map) s1))
stack))
+
(defn search-first-not-empty-prev-stack [stacks pos]
{:post [(map? %)]}
@@ -116,7 +109,8 @@
(if (= 0 (count prev-stack))
(recur (+ 1 prev-stack-not-empty-pos))
prev-stack))))
-
+
+
(defn extend-stack [stack prev-stack model src-token]
{:post [(>= (count %) (count stack))]}
@@ -138,12 +132,11 @@
(let [src-token (first src-sentence_)
src-token-id ((model :voc-src-id) src-token)]
- (println "Main loop, pos " pos ", src-token " src-token ", count(stacks) " (count-stacks stacks) "(count src-sentence) " (count src-sentence))
+ (println "Main loop, pos " pos ", src-token " src-token ", count(stacks) " (count-stacks stacks))
(if (nil? (stacks pos))
(recur src-sentence_ pos (assoc stacks pos (clojure.data.priority-map/priority-map)))
(if (= 0 (count src-sentence_))
- ;(if (>= pos (count src-sentence))
stacks
(if (nil? src-token)

No commit comments for this range

Something went wrong with that request. Please try again.