Permalink
Browse files

padding for ngrams

Signed-off-by: feedbackmine <feedbackmine@feedbackmine.com>
  • Loading branch information...
Ye Dingding authored and feedbackmine committed Jul 7, 2009
1 parent e917a50 commit 3a9dfb806e16e132e0f888c0b4cbdedd8aa88041
Showing with 73,891 additions and 80,992 deletions.
  1. +37 −32 lib/language_detector.rb
  2. +73,838 −80,934 lib/model.yml
  3. +16 −26 test/language_detector_test.rb
View
@@ -1,3 +1,7 @@
+require 'yaml'
+require 'jcode'
+$KCODE = 'u' if RUBY_VERSION < '1.9'
+
class LanguageDetector
def detect text
@profiles ||= load_model
@@ -8,19 +12,19 @@ def detect text
best_distance = nil
@profiles.each {|profile|
distance = profile.compute_distance(p)
-
+
if !best_distance || distance < best_distance
best_distance = distance
best_profile = profile
end
}
return best_profile.name
end
-
+
def self.train
-
+
# For a full list of ISO 639 language tags visit:
-
+
# http://www.loc.gov/standards/iso639-2/englangn.html
#LARGE profiles follow:
@@ -31,7 +35,7 @@ def self.train
#always a good source of data.
#
# http://en.wikipedia.org/wiki/World_War_II
-
+
training_data = [
# af (afrikaans)
[ "ar", "ar-utf8.txt", "utf8", "arabic" ],
@@ -92,7 +96,7 @@ def self.train
]
profiles = []
- training_data.each {|data|
+ training_data.each {|data|
p = Profile.new data[0]
p.init_with_file data[1]
profiles << p
@@ -103,7 +107,7 @@ def self.train
YAML.dump(profiles, f)
}
end
-
+
def load_model
filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
@profiles = YAML.load_file(filename)
@@ -112,12 +116,12 @@ def load_model
class Profile
- PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
- ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
+ PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
+ ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
-
+
LIMIT = 2000
-
+
def compute_distance other_profile
distance = 0
other_profile.ngrams.each {|k, v|
@@ -130,62 +134,62 @@ def compute_distance other_profile
}
return distance
end
-
+
attr_reader :ngrams, :name
-
+
# Creates an empty profile.
#
# name - value stored in @name (readable through the +name+ reader).
#
# Sets up @puctuations as a lookup table mapping every entry of
# PUNCTUATIONS to 1 (the table is_puctuation? reads), and starts @ngrams
# as an empty hash.
def initialize(name)
  @name = name
  @puctuations = PUNCTUATIONS.inject({}) { |table, mark| table.update(mark => 1) }
  @ngrams = {}
end
-
+
# Trains this profile from a text file in the training_data directory that
# sits next to this source file.
#
# filename - name of the file, relative to training_data/.
#
# Every line is passed to _init_with_string, which accumulates n-gram counts
# in a local hash. The n-grams are then sorted by descending count and the
# first LIMIT of them are stored in @ngrams with their 1-based rank.
#
# Prints the resolved path to standard output before reading.
def init_with_file(filename)
  ngram_count = {}

  path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
  puts "training with " + path

  # Block form of File.open closes the handle once reading finishes (or
  # raises), instead of leaving it open until garbage collection.
  File.open(path) do |file|
    file.each_line { |line| _init_with_string line, ngram_count }
  end

  ranked = ngram_count.sort { |left, right| right[1] <=> left[1] }
  ranked.each_with_index do |entry, index|
    rank = index + 1
    @ngrams[entry.first] = rank
    # Stop once LIMIT n-grams have been ranked.
    break if rank >= LIMIT
  end
end
-
+
# Builds this profile's n-gram ranking from a single string.
#
# str - text to analyse.
#
# The n-gram counts gathered by _init_with_string are sorted by descending
# count; the first LIMIT n-grams are stored in @ngrams keyed by n-gram, with
# their 1-based rank as the value.
def init_with_string(str)
  ngram_count = {}
  _init_with_string str, ngram_count

  ranked = ngram_count.sort { |left, right| right[1] <=> left[1] }
  ranked.each_with_index do |entry, index|
    rank = index + 1
    @ngrams[entry.first] = rank
    # Stop once LIMIT n-grams have been ranked.
    break if rank >= LIMIT
  end
end
-
+
# Adds the n-gram counts found in +str+ to +ngram_count+.
#
# str         - text to analyse; it is split into tokens by tokenize.
# ngram_count - hash of n-gram => count, updated through count_ngram.
#
# Each token is counted once for every n-gram length from 2 through 5.
def _init_with_string(str, ngram_count)
  tokenize(str).each do |token|
    (2..5).each { |size| count_ngram token, size, ngram_count }
  end
end
-
+
def tokenize str
tokens = []
s = ''
- str.each_byte {|b|
+ str.each_byte {|b|
if is_puctuation?(b)
tokens << s unless s.empty?
s = ''
@@ -196,12 +200,13 @@ def tokenize str
tokens << s unless s.empty?
return tokens
end
-
+
# Looks +b+ up in the punctuation table.
#
# b - the value to test; tokenize passes the bytes yielded by each_byte.
#
# Returns the table entry for +b+: a truthy value for keys registered in
# the table, nil for anything not present.
def is_puctuation?(b)
  @puctuations[b]
end
-
+
def count_ngram token, n, counts
+ token = "_#{token}#{'_' * (n-1)}" if n > 1 && token.jlength >= n
i = 0
while i + n <= token.length
s = ''
@@ -210,14 +215,14 @@ def count_ngram token, n, counts
s << token[i+j]
j += 1
end
- if counts[s]
+ if counts[s]
counts[s] = counts[s] + 1
- else
+ else
counts[s] = 1
end
i += 1
end
-
+
return counts
end
Oops, something went wrong.

0 comments on commit 3a9dfb8

Please sign in to comment.