Skip to content
Browse files

padding for ngrams

Signed-off-by: feedbackmine <feedbackmine@feedbackmine.com>
  • Loading branch information...
1 parent e917a50 commit 3a9dfb806e16e132e0f888c0b4cbdedd8aa88041 Ye Dingding committed with feedbackmine Jul 7, 2009
Showing with 73,891 additions and 80,992 deletions.
  1. +37 −32 lib/language_detector.rb
  2. +73,838 −80,934 lib/model.yml
  3. +16 −26 test/language_detector_test.rb
View
69 lib/language_detector.rb
@@ -1,3 +1,7 @@
+require 'yaml'
+require 'jcode'
+$KCODE = 'u' if RUBY_VERSION < '1.9'
+
class LanguageDetector
def detect text
@profiles ||= load_model
@@ -8,19 +12,19 @@ def detect text
best_distance = nil
@profiles.each {|profile|
distance = profile.compute_distance(p)
-
+
if !best_distance || distance < best_distance
best_distance = distance
best_profile = profile
end
}
return best_profile.name
end
-
+
def self.train
-
+
# For a full list of ISO 639 language tags visit:
-
+
# http://www.loc.gov/standards/iso639-2/englangn.html
#LARGE profiles follow:
@@ -31,7 +35,7 @@ def self.train
#always a good source of data.
#
# http://en.wikipedia.org/wiki/World_War_II
-
+
training_data = [
# af (afrikaans)
[ "ar", "ar-utf8.txt", "utf8", "arabic" ],
@@ -92,7 +96,7 @@ def self.train
]
profiles = []
- training_data.each {|data|
+ training_data.each {|data|
p = Profile.new data[0]
p.init_with_file data[1]
profiles << p
@@ -103,7 +107,7 @@ def self.train
YAML.dump(profiles, f)
}
end
-
+
def load_model
filename = File.expand_path(File.join(File.dirname(__FILE__), "model.yml"))
@profiles = YAML.load_file(filename)
@@ -112,12 +116,12 @@ def load_model
class Profile
- PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
- ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
+ PUNCTUATIONS = [?\n, ?\r, ?\t, ?\s, ?!, ?", ?#, ?$, ?%, ?&, ?', ?(, ?), ?*, ?+, ?,, ?-, ?., ?/,
+ ?0, ?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9,
?:, ?;, ?<, ?=, ?>, ??, ?@, ?[, ?\\, ?], ?^, ?_, ?`, ?{, ?|, ?}, ?~]
-
+
# Highest rank handed out when a profile is built: init_with_file and
# init_with_string stop storing n-grams in @ngrams once the rank exceeds LIMIT.
LIMIT = 2000
-
+
def compute_distance other_profile
distance = 0
other_profile.ngrams.each {|k, v|
@@ -130,62 +134,62 @@ def compute_distance other_profile
}
return distance
end
-
+
attr_reader :ngrams, :name
-
+
# Creates an empty profile labelled +name+.
#
# name - identifier exposed through the :name reader (the training code
#        passes a language tag such as "ar").
#
# Sets up the punctuation lookup table (one entry per element of
# PUNCTUATIONS, each mapped to 1) and an empty n-gram ranking.
def initialize(name)
  @name = name
  # The misspelled ivar name is kept on purpose: is_puctuation? reads it.
  @puctuations = PUNCTUATIONS.inject({}) do |table, mark|
    table[mark] = 1
    table
  end
  @ngrams = {}
end
-
+
# Builds this profile's n-gram ranking from a training file.
#
# filename - name of a file inside the training_data/ directory that sits
#            next to this source file.
#
# Each line of the file is passed to _init_with_string, which accumulates
# n-gram counts in ngram_count. The n-grams are then sorted by descending
# count and stored in @ngrams as ngram => rank (1 = most frequent), stopping
# once the rank exceeds LIMIT. The resolved path is printed to stdout.
#
# NOTE(review): the handle returned by File.open is never closed explicitly.
def init_with_file filename
ngram_count = {}
-
+
path = File.expand_path(File.join(File.dirname(__FILE__), "training_data/" + filename))
puts "training with " + path
- File.open(path).each_line{ |line|
- _init_with_string line, ngram_count
+ File.open(path).each_line{ |line|
+ _init_with_string line, ngram_count
}
-
+
a = ngram_count.sort {|a,b| b[1] <=> a[1]}
i = 1
- a.each {|t|
+ a.each {|t|
@ngrams[t[0]] = i
i += 1
break if i > LIMIT
}
end
-
+
# Builds this profile's n-gram ranking from an in-memory string.
#
# str - text to analyse.
#
# Counts n-grams via _init_with_string, sorts them by descending count and
# stores them in @ngrams as ngram => rank (1 = most frequent), stopping once
# the rank exceeds LIMIT.
def init_with_string str
ngram_count = {}
-
+
_init_with_string str, ngram_count
-
+
a = ngram_count.sort {|a,b| b[1] <=> a[1]}
i = 1
- a.each {|t|
+ a.each {|t|
@ngrams[t[0]] = i
i += 1
break if i > LIMIT
}
end
-
+
# Splits +str+ into tokens and adds the 2-, 3-, 4- and 5-gram counts of every
# token to +ngram_count+.
#
# str         - text to tokenize (see tokenize).
# ngram_count - Hash of ngram => count; updated in place by count_ngram.
def _init_with_string str, ngram_count
tokens = tokenize(str)
tokens.each {|token|
count_ngram token, 2, ngram_count
count_ngram token, 3, ngram_count
count_ngram token, 4, ngram_count
count_ngram token, 5, ngram_count
- }
+ }
end
-
+
def tokenize str
tokens = []
s = ''
- str.each_byte {|b|
+ str.each_byte {|b|
if is_puctuation?(b)
tokens << s unless s.empty?
s = ''
@@ -196,12 +200,13 @@ def tokenize str
tokens << s unless s.empty?
return tokens
end
-
+
# Reports whether +b+ is one of the separator characters held in the
# @puctuations lookup table.
#
# b - a byte value (Integer, as yielded by String#each_byte) or a
#     one-character String.
#
# Character literals such as ?! evaluate to Integers on Ruby 1.8 but to
# one-character Strings on Ruby 1.9+, so the key type of the table depends on
# the interpreter while each_byte always yields Integers. The lookup therefore
# tries +b+ as given and, for a byte value, falls back to its one-character
# String form.
#
# Returns the stored table value (truthy) on a hit, nil otherwise.
def is_puctuation? b
  direct_hit = @puctuations[b]
  return direct_hit if direct_hit

  # Only values in 0..255 have a single-byte character form.
  return nil unless b.is_a?(Integer) && b >= 0 && b <= 255

  @puctuations[b.chr]
end
-
+
def count_ngram token, n, counts
+ token = "_#{token}#{'_' * (n-1)}" if n > 1 && token.jlength >= n
i = 0
while i + n <= token.length
s = ''
@@ -210,14 +215,14 @@ def count_ngram token, n, counts
s << token[i+j]
j += 1
end
- if counts[s]
+ if counts[s]
counts[s] = counts[s] + 1
- else
+ else
counts[s] = 1
end
i += 1
end
-
+
return counts
end
View
154,772 lib/model.yml
73,838 additions, 80,934 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
42 test/language_detector_test.rb
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
require 'test/unit'
require File.dirname(__FILE__) + '/../lib/language_detector'
@@ -9,55 +10,44 @@ def test_is_puctuation
assert !p.is_puctuation?(?A)
assert !p.is_puctuation?(?a)
end
-
+
# tokenize must split on the punctuation, digit and whitespace characters in
# the sample and return only the remaining words.
def test_tokenize
  profile = Profile.new("test")
  expected_tokens = ["this", "is", "A", "test"]
  assert_equal expected_tokens, profile.tokenize("this is ,+_ A \t 123 test")
end
-
+
# Checks the n-gram counts produced for the token 'words' with n = 1..6.
# The added (+) expectations include n-grams containing "_": one leading
# underscore and up to n-1 trailing underscores around the token. n = 1 has
# no padded entries, and n = 6 (longer than the token) yields an empty Hash.
def test_count_ngram
p = Profile.new("test")
assert_equal({"w"=>1, "o"=>1, "r"=>1, "d"=>1, "s"=>1}, p.count_ngram('words', 1, {}))
- assert_equal({"wo"=>1, "or"=>1, "rd"=>1, "ds"=>1}, p.count_ngram('words', 2, {}))
- assert_equal({"wor"=>1, "ord"=>1, "rds"=>1}, p.count_ngram('words', 3, {}))
- assert_equal({"word"=>1, "ords"=>1}, p.count_ngram('words', 4, {}))
- assert_equal({"words"=>1}, p.count_ngram('words', 5, {}))
+ assert_equal({"wo"=>1, "or"=>1, "rd"=>1, "ds"=>1, "_w" => 1, "s_" => 1}, p.count_ngram('words', 2, {}))
+ assert_equal({"wor"=>1, "ord"=>1, "rds"=>1, "_wo" => 1, "ds_" => 1, "s__" => 1}, p.count_ngram('words', 3, {}))
+ assert_equal({"word"=>1, "ords"=>1, "_wor" => 1, "rds_" => 1, "ds__" => 1, "s___" => 1}, p.count_ngram('words', 4, {}))
+ assert_equal({"words"=>1, "_word" => 1, "ords_" => 1, "rds__" => 1, "ds___" => 1, "s____" => 1}, p.count_ngram('words', 5, {}))
assert_equal({}, p.count_ngram('words', 6, {}))
end
-
+
# Checks the ngram => rank table built by init_with_string for a short
# sample. The removed (-) assertion compared p.ngrams with a Hash literal;
# the added (+) one compares a list of [ngram, rank] pairs with the result of
# p.ngrams.sort_by.
#
# NOTE(review): sort_by is given a comparator-style |a,b| block. For a Hash
# the block receives (key, value), so a[1] <=> b[1] does not order the pairs
# by rank -- confirm the intended ordering (sort vs. sort_by).
def test_init_with_string
p = Profile.new("test")
p.init_with_string("this is ,+_ A \t 123 test")
- assert_equal({"st"=>12,
- "hi"=>7,
- "tes"=>3,
- "es"=>4,
- "te"=>6,
- "est"=>5,
- "his"=>8,
- "test"=>2,
- "this"=>9,
- "th"=>10,
- "thi"=>11,
- "is"=>1}, p.ngrams)
+ assert_equal([["st", 12], ["hi", 7], ["tes", 3], ["es", 4], ["te", 6], ["est", 5], ["his", 8], ["test", 2], ["this", 9], ["th", 10], ["thi", 11], ["is", 1]], p.ngrams.sort_by { |a,b| a[1] <=> b[1] })
end
-
+
# Smoke test: training from the bundled "bg-utf8.txt" file must leave the
# profile with at least one ranked n-gram.
def test_init_with_file
  profile = Profile.new("test")
  profile.init_with_file("bg-utf8.txt")
  ranked_ngrams = profile.ngrams
  assert !ranked_ngrams.empty?
end
-
+
# Checks compute_distance on two cases: two profiles built from the same text
# must be at distance 0, and a profile built from "xxxx" is compared against
# the sample profile. The expected value for the second case changes from
# 6000 (removed line) to 24000 (added line) in this diff.
def test_compute_distance
p1 = Profile.new("test")
p1.init_with_string("this is ,+_ A \t 123 test")
p2 = Profile.new("test")
p2.init_with_string("this is ,+_ A \t 123 test")
assert_equal 0, p1.compute_distance(p2)
-
+
p3 = Profile.new("test")
p3.init_with_string("xxxx")
- assert_equal 6000, p1.compute_distance(p3)
+ assert_equal 24000, p1.compute_distance(p3)
end
end
@@ -70,14 +60,14 @@ def test_detect
assert_equal "fr", d.detect("serait désigné peu après PDG d'Antenne 2 et de FR 3. Pas même lui ! Le")
assert_equal "it", d.detect("studio dell'uomo interiore? La scienza del cuore umano, che")
assert_equal "ro", d.detect("taiate pe din doua, in care vezi stralucind brun sau violet cristalele interioare")
- assert_equal "pl", d.detect("na porozumieniu, na ³±czeniu si³ i ¶rodków. Dlatego szukam ludzi, którzy")
+ assert_equal "pl", d.detect("na porozumieniu, na ³±czeniu si³ i ¶rodków. Dlatego szukam ludzi, którzy")
assert_equal "de", d.detect("sagt Hühsam das war bei Über eine Annonce in einem Frankfurter der Töpfer ein. Anhand von gefundenen gut kennt, hatte ihm die wahren Tatsachen Sechzehn Adorno-Schüler erinnern und daß ein Weiterdenken der Theorie für ihre Festlegung sind drei Jahre Erschütterung Einblick in die Abhängigkeit der Bauarbeiten sei")
assert_equal "fi", d.detect("koulun arkistoihin pölyttymään, vaan nuoret saavat itse vaikuttaa ajatustensa eteenpäinviemiseen esimerkiksi")
- #assert_equal "sv", d.detect("enligt all sannolikhet för att få ro oavsiktligt intagit en för")
+# assert_equal "sv", d.detect("enligt all sannolikhet för att få ro oavsiktligt intagit en för")
assert_equal "hu", d.detect("esôzéseket egy kissé túlméretezte, ebbôl kifolyólag a Földet egy hatalmas árvíz mosta el")
assert_equal "fi", d.detect("koulun arkistoihin pölyttymään, vaan nuoret saavat itse vaikuttaa ajatustensa eteenpäinviemiseen esimerkiksi")
assert_equal "nl", d.detect("tegen de kabinetsplannen. Een speciaal in het leven geroepen Landelijk")
- #assert_equal "da", d.detect("viksomhed, 58 pct. har et arbejde eller er under uddannelse, 76 pct. forsørges ikke længere af Kolding")
+ assert_equal "da", d.detect("viksomhed, 58 pct. har et arbejde eller er under uddannelse, 76 pct. forsørges ikke længere af Kolding")
assert_equal "cs", d.detect("datují rokem 1862. Naprosto zakázán byl v pocitech smutku, beznadìje èi jiné")
assert_equal "no", d.detect("hånd på den enda hvitere restaurant-duken med en bevegelse så forfinet")
assert_equal "pt", d.detect("popular. Segundo o seu biógrafo, a Maria Adelaide auxiliava muita gente")

0 comments on commit 3a9dfb8

Please sign in to comment.
Something went wrong with that request. Please try again.