Skip to content

Commit

Permalink
Added Oniguruma to support UCS-2 regular expressions.
Browse files Browse the repository at this point in the history
Ported the Detector class with tests (failing).
UCS2String mimics the String class.
Added language profiles copied from http://language-detection.googlecode.com/svn/trunk/profiles.
  • Loading branch information
jasiek committed Mar 28, 2011
1 parent 6b7f721 commit 77be06e
Show file tree
Hide file tree
Showing 58 changed files with 364 additions and 6 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
source :gemcutter
gem "i18n", "0.5.0"
gem "activesupport", "3.0.5"
gem "oniguruma", "1.1.0"

group :development do
gem "bundler", "~> 1.0.0"
Expand Down
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ GEM
bundler (~> 1.0.0)
git (>= 1.2.5)
rake
oniguruma (1.1.0)
rake (0.8.7)
rcov (0.9.9)

Expand All @@ -19,4 +20,5 @@ DEPENDENCIES
bundler (~> 1.0.0)
i18n (= 0.5.0)
jeweler (~> 1.5.2)
oniguruma (= 1.1.0)
rcov
6 changes: 6 additions & 0 deletions lib/langusta.rb
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
$: << File.expand_path(File.dirname(__FILE__))

require 'iconv'
require 'oniguruma'

# Langusta: a Ruby port of the language-detection library, operating on
# big-endian UCS-2 strings (see UCS2String). Classes are autoloaded on
# first reference.
module Langusta
  VERSION = '0.0.1'

  autoload :UCS2String, 'langusta/ucs2_string'
  autoload :Language, 'langusta/language'
  autoload :LangProfile, 'langusta/lang_profile'
  autoload :JavaPropertyReader, 'langusta/java_property_reader'
  autoload :UnicodeBlock, 'langusta/unicode_block'
  autoload :NGram, 'langusta/n_gram'
  autoload :DetectorFactory, 'langusta/detector_factory'
  autoload :Detector, 'langusta/detector'
end
184 changes: 183 additions & 1 deletion lib/langusta/detector.rb
Original file line number Diff line number Diff line change
@@ -1,2 +1,184 @@
class Langusta::Detector
module Langusta
  # Detects the language of an appended UCS-2 text by repeatedly sampling
  # n-grams and updating per-language probabilities (naive-Bayes style, as in
  # the original Java language-detection library).
  class Detector
    attr_accessor :verbose, :alpha, :max_text_length

    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05
    ITERATION_LIMIT = 1000
    PROB_THRESHOLD = 0.1
    CONV_THRESHOLD = 0.99999
    BASE_FREQ = 10000
    UNKNOWN_LANG = "unknown"

    URL_REGEX = Oniguruma::ORegexp.new("https?://[-_.?&~;+=/#0-9A-Za-z]+", :encoding => Oniguruma::ENCODING_UTF16_BE)
    MAIL_REGEX = Oniguruma::ORegexp.new("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", :encoding => Oniguruma::ENCODING_UTF16_BE)
    # Collapses a run of UCS-2 spaces into one. Bug fix: was "(\x00\x20)*" —
    # a zero-width match at every position, which would insert a space
    # between every pair of characters.
    SPACE_RUN_REGEX = Oniguruma::ORegexp.new("(\x00\x20)+", :encoding => Oniguruma::ENCODING_UTF16_BE)

    # factory - a DetectorFactory carrying the trained word/language
    # probability map and the ordered language list.
    def initialize(factory)
      @word_lang_prob_map = factory.word_lang_prob_map
      @lang_list = factory.lang_list
      @text = UCS2String.new('')
      @langprob = nil
      @alpha = ALPHA_DEFAULT
      @n_trial = 7
      @max_text_length = 10000
      @prior_map = nil
      @verbose = false
    end

    # Appends a UCS2String to the text under analysis, scrubbing URLs and
    # e-mail addresses and collapsing whitespace.
    def append(text)
      text.gsub!(URL_REGEX, "\x00\x20")
      text.gsub!(MAIL_REGEX, "\x00\x20")
      text.each_char do |c|
        # NOTE(review): NGram.normalize's result is discarded here, so this
        # loop currently has no effect — presumably the normalized characters
        # should be collected back into the text; confirm against NGram.
        NGram.normalize(c)
      end
      # Bug fix: gsub! returns nil when no substitution occurred, so the old
      # `@text = text.gsub!(...)` could set @text to nil.
      text.gsub!(SPACE_RUN_REGEX, "\x00\x20")
      @text = text
    end

    # Returns the most probable language name, or UNKNOWN_LANG when nothing
    # clears PROB_THRESHOLD.
    def detect
      probabilities = get_probabilities()
      # get_probabilities yields language names sorted by descending
      # probability (see sort_probability), so the best guess is first.
      (probabilities.length > 0) ? probabilities.first : UNKNOWN_LANG
    end

    # Runs @n_trial randomized trials over the extracted n-grams and
    # accumulates the averaged per-language probabilities into @langprob.
    def detect_block
      cleaning_text()
      ngrams = extract_ngrams()
      raise "no features in text" if ngrams.empty?
      # Bug fix: must start from 0.0 (was an array of nils, so `+=` raised).
      @langprob = Array.new(@lang_list.length, 0.0)

      @n_trial.times do  # bug fix: was bare `n_trial` (undefined method)
        prob = init_probability()
        # Jitter the smoothing parameter per trial.
        # NOTE(review): next_gaussian is not defined in this file — confirm
        # it is provided elsewhere (the Java original uses Random#nextGaussian).
        alpha = @alpha + next_gaussian() * ALPHA_WIDTH

        i = 0
        Kernel.loop do
          r = Kernel.rand(ngrams.length)
          update_lang_prob(prob, ngrams[r], alpha)  # bug fix: Array#get is Java, use []
          # Bug fix: `if i % 5` was always true (0 is truthy in Ruby);
          # check convergence only every 5th iteration, and advance i.
          if i % 5 == 0
            break if self.class.normalize_prob(prob) > CONV_THRESHOLD || i >= ITERATION_LIMIT
            # TODO: verbose logging
          end
          i += 1
        end
        @langprob.length.times do |j|
          @langprob[j] += prob[j] / @n_trial
        end
        # TODO: verbose logging
      end
      @langprob
    end

    # Installs a prior probability map ({lang_name => weight}); weights are
    # normalized to sum to 1. Raises when a weight is negative or all are zero.
    def set_prior_map(prior_map)
      # Bug fix: was `Array.new[@lang_list.length]` (indexing a fresh empty
      # array, i.e. nil); also the loop below read the array instead of the
      # supplied hash.
      @prior_map = Array.new(@lang_list.length, 0.0)
      sump = 0.0
      @prior_map.length.times do |i|
        lang = @lang_list[i]
        if prior_map.has_key?(lang)
          p = prior_map[lang]
          raise "probability must be non-negative" if p < 0
          @prior_map[i] = p
          sump += p
        end
      end
      raise "at least one language probability must be non-zero" if sump <= 0
      @prior_map.map! { |p| p / sump }
    end

    # Normalizes prob in place so it sums to 1; returns the maximum
    # normalized probability.
    def self.normalize_prob(prob)
      maxp = 0.0; sump = 0.0
      prob.each do |p|
        sump += p
      end
      prob.map! do |p|
        q = p / sump
        maxp = q if q > maxp
        q
      end
      maxp
    end

    private
    # Drops Latin characters when the text is predominantly non-Latin, so
    # stray ASCII does not confuse detection of e.g. CJK text.
    def cleaning_text
      non_latin_count = latin_count = 0
      @text.each_char do |c|
        # Bug fix: was `c < "\00z"` — the symmetric strip test below uses
        # `c > "\x00z"`, so 'z' itself belongs to the Latin range.
        if c <= "\x00z" && c >= "\x00A"
          latin_count += 1
        elsif c > "\x03\x00" && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
          non_latin_count += 1
        end
      end
      if latin_count * 2 < non_latin_count
        # Bug fix: StringIO was never required and its result was not
        # re-wrapped; build a byte buffer and keep @text a UCS2String.
        without_latin = ''
        @text.each_char do |c|
          without_latin << c if c > "\x00z" || c < "\x00A"
        end
        @text = UCS2String.new(without_latin)
      end
    end

    # Slides an NGram window over the text and collects every 1..N gram that
    # appears in the trained probability map.
    def extract_ngrams
      list = []
      ngram = NGram.new
      @text.each_char do |char|
        ngram.add(char)
        (1..NGram::N_GRAM).each do |n|  # bug fix: constant access, not NGram.N_GRAM
          w = ngram.get(n)
          list << w if w && @word_lang_prob_map.has_key?(w)
        end
      end
      list
    end

    # Lazily runs detection, returning language names sorted by descending
    # probability (only those above PROB_THRESHOLD).
    def get_probabilities
      detect_block() if @langprob.nil?
      # Bug fix: the sorted result was computed and discarded; return it.
      sort_probability(@langprob)
    end

    # Initial per-language distribution: the prior map when set, uniform
    # otherwise.
    def init_probability
      prob = Array.new(@lang_list.length)
      if @prior_map
        prob = @prior_map.clone
      else
        prob.length.times do |i|
          prob[i] = 1.0 / @lang_list.length
        end
      end
      prob
    end

    # Pairs probabilities with language names, keeps those above
    # PROB_THRESHOLD, and returns the names ordered best-first.
    def sort_probability(prob)
      prob.zip(@lang_list).
        select { |p, lang| p > PROB_THRESHOLD }.
        sort_by { |p, lang| -p }.  # bug fix: was ascending, best guess must come first
        map { |p, lang| lang }
    end

    # Multiplies each language's probability by the word's smoothed
    # conditional probability. Returns false when the word is unknown.
    def update_lang_prob(prob, word, alpha)
      return false if word.nil? || ! @word_lang_prob_map.has_key?(word)

      lang_prob_map = @word_lang_prob_map[word]
      # TODO: verbose logging
      weight = alpha / BASE_FREQ
      prob.length.times do |i|
        # Bug fix: the per-word array is sparse (nil for languages that never
        # saw the word); treat missing entries as 0.
        prob[i] *= weight + (lang_prob_map[i] || 0.0)
      end
      true
    end

    # Debug helper: "lang:prob" pairs for every language above a tiny floor.
    def word_prob_to_string(prob)
      prob.zip(@lang_list).select do |p, lang|
        p > 0.00001
      end.map do |p, lang|
        "%s:%.5f" % [lang, p]  # bug fix: arguments were reversed (float into %s, string into %f)
      end.join(' ')
    end
  end
end
39 changes: 39 additions & 0 deletions lib/langusta/detector_factory.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
module Langusta
  # Raised when profiles are misused (duplicates, none loaded, ...).
  class LangDetectException < StandardError; end

  # Accumulates language profiles and hands out configured Detector
  # instances built from them.
  class DetectorFactory
    attr_reader :word_lang_prob_map, :lang_list

    def initialize
      @word_lang_prob_map = {}
      @lang_list = []
    end

    # Registers a profile at position `index` of `langsize` total languages,
    # folding its word frequencies into the shared word -> [prob-per-language]
    # map. Raises LangDetectException on a duplicate profile name.
    def add_profile(profile, index, langsize)
      raise LangDetectException.new("duplicate the same language profile") if @lang_list.include?(profile.name)
      @lang_list << profile.name
      profile.freq.each_key do |word|
        @word_lang_prob_map[word] ||= Array.new(langsize)
        occurrences = profile.freq[word]
        total = profile.n_words[word.length - 1]
        @word_lang_prob_map[word][index] = 1.0 * occurrences / total
      end
    end

    # Builds a Detector; when `alpha` is given it overrides the default
    # smoothing parameter.
    def create(alpha=nil)
      detector = create_detector()
      detector.alpha = alpha if alpha
      detector
    end

    def create_detector
      raise LangDetectException.new("need to load profiles") if @lang_list.empty?
      Detector.new(self)
    end
  end
end
7 changes: 2 additions & 5 deletions lib/langusta/lang_profile.rb
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
require 'set'
require 'active_support/core_ext/string'

TODO_NGRAM_LEN = 3

class Langusta::LangProfile
MINIMUM_FREQ = 2
LESS_FREQ_RATIO = 100_000

attr_reader :name, :freq
attr_reader :name, :freq, :n_words

def initialize(name=nil)
@name = name
Expand All @@ -18,7 +15,7 @@ def initialize(name=nil)
def add(gram)
return if @name.nil? or gram.nil?
length = gram.mb_chars.size
return if length < 1 or length > TODO_NGRAM_LEN
return if length < 1 or length > NGram.N_GRAM
@n_words[length - 1] += 1
@freq[gram] ||= 0
@freq[gram] += 1
Expand Down
31 changes: 31 additions & 0 deletions lib/langusta/ucs2_string.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
module Langusta
  # Minimal String-like wrapper over a raw big-endian UCS-2 byte string
  # (two bytes per character). Enumerable over characters.
  class UCS2String
    include Enumerable

    # The raw UCS-2 byte string.
    attr_reader :underlying

    def initialize(underlying)
      # Identity conversion — presumably intended to validate that the bytes
      # are well-formed UCS-2. NOTE(review): confirm; Iconv is absent from
      # Ruby >= 1.9.3 stdlib.
      @underlying = Iconv.conv("ucs-2", "ucs-2", underlying)
    end

    # Returns the 2-byte sequence of the character at character index
    # `index`. Bug fix: was `index / 2`, which mapped character indices 0
    # and 1 to the same bytes; a char index must be scaled *up* to a byte
    # offset.
    def [](index)
      @underlying[index * 2, 2]
    end

    # In-place substitution via Oniguruma; returns nil when nothing matched
    # (Oniguruma's gsub! contract), otherwise the substituted string.
    def gsub!(oregexp, subst)
      oregexp.gsub!(@underlying, subst)
    end

    # Non-mutating concatenation: returns a new UCS2String (the receiver is
    # left untouched despite the << name). Bug fix: was `self.new`, which
    # raises NoMethodError because `new` is a class method.
    def <<(ucs2string)
      raise TypeError unless ucs2string.is_a?(UCS2String)
      self.class.new(@underlying + ucs2string.underlying)
    end

    # Yields each character as its 2-byte string slice.
    def each_char(&blk)
      (0..(@underlying.length - 2)).step(2) do |index|
        blk.call(@underlying[index, 2])
      end
    end
    alias :each :each_char
  end
end
1 change: 1 addition & 0 deletions profiles/af

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/ar

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/bg

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/bn

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/cs

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/da

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/de

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/el

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/en

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/es

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/fa

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/fi

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/fr

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/gu

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/he

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/hi

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/hr

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/hu

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/id

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/it

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/ja

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/kn

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/ko
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"freq":{"D":3892,"E":3299,"F":3285,"G":3045,"A":6416,"B":4070,"C":6617,"L":3054,"M":5299,"N":2815,"O":2578,"H":2337,"I":4798,"J":1557,"K":2201,"U":1696,"T":4435,"W":1804,"V":1999,"P":5115,"S":7482,"R":3099,"Y":511,"X":864,"Z":322,"f":4104,"g":6365,"d":8971,"e":33474,"b":3763,"c":11043,"a":29207,"n":24208,"o":24326,"l":15315,"m":9717,"j":569,"k":3352,"h":9005,"i":26119,"w":2253,"v":3158,"u":10429,"t":20234,"s":16203,"r":23102,"q":406,"p":6165,"z":1050,"y":5016,"x":1132,"·":3643,"é":335,"α":340," l":537," m":1211," n":447," o":1533," h":426," i":705," k":705," d":1030," e":627," f":736," g":373," a":1340," b":551," c":1312," t":1282," v":337," p":1037," s":1367," r":515," J":1393," K":1837," H":1901," I":2769," N":1730," O":1347," L":1992," M":3624," B":2598," C":4248," A":3943," F":2209," G":2328," D":2552," E":1916," Y":379," X":512,"и":342,"о":346," S":4739," R":2101," P":3310,"а":357," W":1489," V":980," U":1053," T":2804," ·":608,"가가》":1806,"A ":1090,"F ":430,"Da":387,"Co":1100,"Ch":633,"FA":344,"G ":347,"De":463,"Di":385,"Ge":351,"I ":810,"Fr":387,"B ":341,"C ":1323,"BS":411,"Ar":322,"D ":656,"Ba":510,"An":388,"Al":410,"Br":354,"Ca":585,"E ":441,"Be":332,"Bo":326,"Le":342,"Li":387,"N ":341,"La":422,"Lo":321,"Me":417,"Mi":434,"O ":403,"Ma":994,"Mo":474,"Ne":370,"Na":336,"P ":715,"Gr":424,"Ha":385,"He":353,"II":481,"In":800,"Ja":347,"L ":466,"Jo":379,"M ":639,"Un":351,"Tr":351,"Th":790,"Te":335,"V ":541,"St":667,"TV":438,"Wi":435,"Pr":609,"S ":1197,"Pa":670,"R ":385,"Se":487,"Sc":370,"So":342,"U ":356,"Sa":396,"Re":588,"Ro":375,"T ":528,"b ":354,"a ":3271,"i ":1146,"ge":1157,"ga":443,"ff":332,"fi":424,"fo":504,"Int":339,"he":2310,"ha":1401,"gi":481,"gh":427,"gu":409,"gr":369,"g ":1575,"ea":1479,"ec":1049,"ed":1247,"de":1816,"di":1071,"do":684,"dr":338,"ew":384,"ev":385,"h ":1328,"fe":378,"eg":405,"ee":580,"el":1725,"ei":535,"ep":401,"eo":702,"en":3472,"em":983,"et":1358,"es":2573,"er":5897,"ca":1157,"e ":7980,"bo":372,"bl":393,"bi":443,"be":654,"da":619,"f 
":1427,"cu":340,"ct":1002,"co":1189,"ck":563,"ci":913,"ch":1939,"ce":1477,"c ":925,"ay":495,"ba":511,"d ":2871,"at":3499,"as":1398,"ar":3373,"av":444,"au":580,"al":3307,"ai":767,"ap":659,"am":1381,"an":5017,"ac":1106,"ad":861,"ab":469,"ag":721,"ae":329,"nt":2416,"ns":1211,"no":896,"nn":583,"of":1300,"oc":729,"od":611,"ob":397,"om":1422,"on":5415,"ol":1638,"og":510,"ot":772,"os":950,"ov":494,"ou":1250,"op":740,"oo":649,"or":3530,"r ":3525,"ow":507,"pe":982,"pa":741,"pl":372,"po":651,"ph":572,"pi":450,"lo":1098,"ll":1500,"lu":455,"lt":410,"ly":332,"o ":1739,"ma":1374,"mb":376,"me":1635,"mi":890,"mm":492,"mp":610,"mo":691,"mu":332,"p ":731,"na":1797,"nc":1140,"nd":2000,"ne":1878,"ng":2212,"ni":1786,"ke":469,"m ":2197,"km":481,"li":1968,"le":2422,"ld":489,"la":1786,"n ":6058,"ht":434,"hu":339,"hi":1040,"ho":848,"id":750,"ic":2901,"ia":1671,"ig":816,"if":357,"ie":1267,"k ":934,"ir":791,"is":2128,"it":2101,"iu":388,"iv":722,"il":1499,"im":625,"in":4140,"io":2942,"ip":462,"l ":3328,"y ":2799,"wa":428,"vi":761,"ve":1364,"va":525,"x ":549,"ui":350,"ul":813,"ue":443,"ur":1440,"us":1642,"ut":842,"um":757,"un":1114,"up":321,"ty":669,"tu":765,"tt":612,"ua":440,"uc":387,"w ":493,"to":1445,"ts":387,"tr":1222,"te":3099,"ti":3945,"th":1754,"ta":1709,"su":334,"ss":1114,"st":2539,"sp":332,"so":726,"sc":476,"se":1269,"sh":547,"si":1432,"u ":440,"sa":574,"rr":396,"rs":871,"rt":1127,"ru":469,"ry":772,"ro":2074,"rn":743,"rm":593,"rl":533,"rk":352,"ri":2784,"rg":488,"re":2795,"rd":807,"rc":621,"ra":2892,"t ":3426,"s ":5482,"pr":409,"ys":349,"丞丞 ":694,"丞一 ":1816,"》가":1345,"《가":2228,"丞丕 ":551,"Com":345,"アアア":475,"》가 ":1068,"一一一":5909,"一一丞":1736,"一一丕":1309,"丕一 ":1155,"Pro":348," ·가":327,"가·가":2886," 《":2439," 》":343," 〈":356,"あ":978," 가 ":130222,"》":2546,"《":2558,"〉":392,"〈":395,"ア":871,"丕丞 ":472,"一丞丞":570,"丕丕 ":348,"가가 
":783362,"一丞一":1754,"一丞丕":441,"가가·":2619,"一丕一":1043,"一丕丞":389,"丕丞一":339,"両":634,"丞":11346,"丐":1267,"丕":7734,"一":36326,"丞一":4590,"丞丕":1151,"丞丞":1515,"丕丞":1076,"丕丕":711,"丕一":2824,"丐一":586,"一丕":3304,"一丐":489,"一丞":5004,"一一":15752," 丞":3275," 丐":491," 丕":2250," 一":11320,"The":618,"丞一丕":423,"丞一丞":536,"丞一一":1649,"ああ":644," 一 ":651,"ber":375,"ce ":804,"al ":1627,"ant":323,"ang":422,"anc":360,"and":945,"ame":329,"all":348,"an ":1234,"ard":453,"ari":335,"art":327,"ar ":393,"ate":490,"ati":1694,"アア":629,"가》":1921,"丕一一":978,"丕一丞":350,"ity":425,"ist":505,"ive":498,"is ":521,"ion":2383,"가가가":1019221,"》 ":1160," Ge":348," Fr":383," Ha":384," He":345," Gr":419," Ja":344," In":792,"har":332," Jo":377," La":409," Le":335," Li":369," Ma":978," Mi":422," Me":403,"he ":1035," Ne":354," Na":322," Mo":461,"her":322," An":382," Al":401," Ba":502," Be":321," Br":346," Ca":565," Ch":622," Co":1074," Da":378," Di":376," De":454," Wi":415," Pa":649," Ro":366," Re":579," Pr":602," St":642," Th":749," Te":327," Tr":341," Sa":393," Sc":357," Se":468," So":330," Un":342,"ian":427," in":405,"ic ":729,"ia ":540," 丞丞":589," 丞一":1845," of":1120," 丞丕":471,"ich":356," km":461,"ica":619," an":470," 가가":780590,"ine":530,"ing":1131," 丕丞":431," co":491,"in ":695," 丕一":1167,"ill":401," de":521," 一丞":2175," 一丕":1192," 一一":6508," th":680,"est":422,"ess":440,"er ":2043,"es ":1105,"eri":458,"era":435,"et ":385,"ers":564,"ern":454,"en ":605,"ell":389,"enc":362,"ent":998,"el ":452,"ge ":450,"for":417," 《가":2128,"de ":544,"cti":343,"ch ":513,"che":357,"ed ":647,"ect":433,"·가가":3042,"der":419,"rea":453,"re ":598,"rch":365,"rd ":425,"rat":517,"ran":534,"ric":460,"ry ":656,"rt ":353," 가":911452,"se ":376,"st ":511,"ss ":355,"ste":469,"sti":465,"str":431,"te ":613,"가":2734665,"per":399,"ng ":1232,"nce":521,"ne ":683,"nal":636,"nd ":828,"가·":2913,"가 ":919426,"nte":685,"nt ":696,"ns ":350,"m가":495,"of ":1107,"or ":608,"ore":443,"on ":2671,"ona":592,"ons":366,"le ":899,"lan":381,"A가":441,"lli":322,"ll 
":408,"S가":359,"man":401,"丞丕一":331,"men":485,"C가":415,"A가 ":326,"丞丞一":486,"·가":3221,"《가가":2147,"ver":541,"ve ":360,"가》가":891,"us ":933,"um ":349,"ty ":554,"tra":420,"tor":430,"tin":357,"tio":1917,"tic":456,"th ":330,"ter":1180,"the":803,"一丞 ":2036,"一丕 ":1523,"一一 ":6123,"丞 ":3597,"丐 ":360,"丕 ":2805,"가가":1808522,"가》 ":998,"一 ":10399,"あああ":427},"n_words":[3186351,4098367,2871816],"name":"ko"}
1 change: 1 addition & 0 deletions profiles/mk

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/ml

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/mr

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/ne

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/nl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/no

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/pa

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/pl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/pt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/ro

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/ru

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/sk

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/so

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/sq

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/sv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/sw

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/ta

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/te

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/th

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/tl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/tr

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/uk

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/ur

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/vi

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/zh-cn

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions profiles/zh-tw

Large diffs are not rendered by default.

46 changes: 46 additions & 0 deletions test/test_detector.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
require 'test/helper'

# Exercises Detector end-to-end on three tiny corpora. All literals are
# big-endian UCS-2 byte strings ("\x00a" == 'a', "\x30\x42" == U+3042).
class DetectorTest < Test::Unit::TestCase
  TRAINING_EN = "\x00a \x00a \x00a \x00b \x00b \x00c \x00c \x00d \x00e"
  TRAINING_FR = "\x00a \x00b \x00b \x00c \x00c \x00c \x00d \x00d \x00d"
  TRAINING_JP = "\x30\x42 \x30\x42 \x30\x42 \x30\x44 \x30\x46 \x30\x48 \x30\x48"

  def setup
    @factory = DetectorFactory.new
    [["en", TRAINING_EN], ["fr", TRAINING_FR], ["jp", TRAINING_JP]].each_with_index do |(name, corpus), index|
      @factory.add_profile(build_profile(name, corpus), index, 3)
    end
  end

  def test_detector1
    detector = @factory.create()
    detector.append(UCS2String.new("\x00a"))
    assert_equal("en", detector.detect())
  end

  def test_detector2
    detector = @factory.create()
    detector.append(UCS2String.new("\x00b\x00\x20\x00d"))
    assert_equal("fr", detector.detect())
  end

  def test_detector3
    detector = @factory.create()
    # NOTE(review): "\x40\x42" (U+4042) looks like a typo for "\x30\x42"
    # (U+3042, as in TRAINING_JP) — confirm against the upstream Java test.
    detector.append(UCS2String.new("\x30\x42\x30\x42\x30\x42\x40\x42\x00a"))
    assert_equal("jp", detector.detect())
  end

  private

  # Builds a LangProfile named `name` from a space-separated training corpus.
  def build_profile(name, corpus)
    profile = LangProfile.new(name)
    corpus.split(/ /).each do |w|
      profile.add(w)
    end
    profile
  end
end
5 changes: 5 additions & 0 deletions test/test_detector_factory.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
require 'test/helper'

# Placeholder suite for DetectorFactory — no assertions yet.
# TODO: cover add_profile (duplicate names, probability computation) and
# create/create_detector (alpha override, "need to load profiles" error).
class DetectorFactoryTest < Test::Unit::TestCase

end

0 comments on commit 77be06e

Please sign in to comment.