Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Tried to store lex probs with priority-map for pruning them; the resu…

…lting model is very big on disk even though it contains fewer lex probs; not very efficient...
  • Loading branch information...
commit 27c8f95dabe748c3bbf9f755de374a38be79f5fd 1 parent 74c5d16
julosaure authored
2  project.clj
@@ -9,5 +9,5 @@
9 9 [berkeleylm/berkeleylm "1.1.2"]
10 10 ]
11 11
12   - :jvm-opts ["-server" "-Dfile.encoding=UTF-8" "-Xmx1024m"]
  12 + :jvm-opts ["-server" "-Dfile.encoding=UTF-8" "-Xmx2560m"]
13 13 :main decloder.core)
45 src/decloder/model.clj
@@ -6,6 +6,7 @@
6 6 (:import [java.util.zip GZIPInputStream])
7 7 (:import [java.lang Math])
8 8 (:require decloder.blm)
  9 + (:require clojure.data.priority-map)
9 10 )
10 11
11 12
@@ -15,7 +16,11 @@
15 16
16 17 (def VOC_TRG "/Users/julien/workspaces/clojure/decloder/data/sentfr/fr-en.trn.trg.vcb")
17 18
18   -(def LEX_PROB "/Users/julien/workspaces/clojure/decloder/data/sentfr/fr-en.t3.final.bin")
  19 +(def LEX_PROB "/Users/julien/workspaces/clojure/decloder/data/sentfr/fr-en.t3.final.bin") ;.pc10.bin")
  20 +
  21 +(def PC_FILTER_LEX_PROBS 0.1)
  22 +
  23 +(def MIN_KEEP_LEX_PROBS 10)
19 24
20 25 ;; UTILS
21 26
@@ -58,11 +63,6 @@
58 63 unserializedModel
59 64 )
60 65 )
61   - ; final InputStream fis = getBufferedInputStream(path);
62   - ; return path.getName().endsWith(".gz") ? new ObjectInputStream(new GZIPInputStream(fis)) : new ObjectInputStream(fis); }
63   - ;final ObjectInputStream in = openObjIn(path);
64   - ;final Object obj = in.readObject();
65   - ; in.close();
66 66
67 67
68 68 (defn read-lex-prob_ [f]
@@ -79,7 +79,7 @@
79 79 lex_prob (last tab)
80 80 minus_log_lex_prob (- (Math/log (Double. lex_prob)))]
81 81 (if (nil? (lex_prob_map token_src))
82   - (recur (+ i 1) (.readLine rdr) (assoc! lex_prob_map token_src {token_trg minus_log_lex_prob}))
  82 + (recur (+ i 1) (.readLine rdr) (assoc! lex_prob_map token_src (clojure.data.priority-map/priority-map token_trg minus_log_lex_prob)))
83 83 (recur (+ i 1) (.readLine rdr) (assoc! lex_prob_map token_src (assoc (lex_prob_map token_src) token_trg minus_log_lex_prob)))
84 84 )
85 85 )
@@ -103,27 +103,30 @@
103 103 )
104 104 )
105 105
106   -(defn startswith [str pat]
107   - (loop [str_ str
108   - pat_ pat]
109   - (if pat_
110   - (if (= (first pat_) (first str_))
111   - (recur (rest str_) (rest pat_))
112   - false
113   - )
114   - true
115   - )
116   - )
117   - )
  106 +(defn filter-lex-probs [lex-probs percent-filter]
  107 + {:post [(= (count lex-probs) (count %))]}
  108 +
  109 + (loop [filtered-lex-probs (transient {})
  110 + seq_ (seq lex-probs)
  111 + nb-lex-probs 0]
  112 + (if (empty? seq_)
  113 + (let [_ (println "Filtered model has " nb-lex-probs " lex probs for " (count filtered-lex-probs) " src tokens.")]
  114 + (persistent! filtered-lex-probs))
  115 + (let [[src-tok list-trg-probs] (first seq_)
  116 + max_ (max MIN_KEEP_LEX_PROBS (int (* percent-filter (count list-trg-probs))))
  117 + ];_ (println src-tok ":" (count list-trg-probs) ":" max_)]
  118 + (recur (assoc! filtered-lex-probs src-tok (take max_ list-trg-probs)) (rest seq_) (+ nb-lex-probs max_))))))
118 119
  120 +
119 121 (defn init-engine []
120 122 {:post [(map? %)]}
121 123
122 124 (let [[voc-src-id voc-id-src] (read-voc VOC_SRC)
123 125 [voc-trg-id voc-id-trg] (read-voc VOC_TRG)
124   - lex-prob (read-lex-prob LEX_PROB)
  126 + lex-probs (read-lex-prob LEX_PROB)
  127 + ;lex-probs (filter-lex-probs lex-probs PC_FILTER_LEX_PROBS)
125 128 lm (decloder.blm/load-lm)]
126 129 ;(println (sort (filter #(.startsWith (key %) "ann") voc-src)))
127   - {:voc-src-id voc-src-id, :voc-id-src voc-id-src, :voc-trg-id voc-trg-id, :voc-id-trg voc-id-trg, :lex-prob lex-prob, :lm lm}
  130 + {:voc-src-id voc-src-id, :voc-id-src voc-id-src, :voc-trg-id voc-trg-id, :voc-id-trg voc-id-trg, :lex-prob lex-probs, :lm lm}
128 131 ))
129 132
1  src/decloder/scripts/makeBinaryTM.clj
@@ -6,6 +6,7 @@
6 6 (defn -main [inputTM outputBinaryTM]
7 7 ; Reads a GIZA translation model and serializes it
8 8 (let [model (decloder.model/read-lex-prob inputTM)
  9 + model (decloder.model/filter-lex-probs model decloder.model/PC_FILTER_LEX_PROBS)
9 10 out (ObjectOutputStream. (BufferedOutputStream. (FileOutputStream. outputBinaryTM)))]
10 11 (doto out
11 12 (.writeObject model)

0 comments on commit 27c8f95

Please sign in to comment.
Something went wrong with that request. Please try again.