-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
Ported the Detector class with tests (failing). UCS2String mimics a String class. Added language profiles copied from http://language-detection.googlecode.com/svn/trunk/profiles.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,18 @@ | ||
$: << File.expand_path(File.dirname(__FILE__))

require 'iconv'
require 'oniguruma'

# Langusta: Ruby port of the langdetect (language-detection) Java library.
module Langusta
  VERSION = '0.0.1'

  # Lazily load each component on first constant reference.
  # (The original registered :Detector twice; one registration suffices.)
  autoload :UCS2String, 'langusta/ucs2_string'
  autoload :Language, 'langusta/language'
  autoload :LangProfile, 'langusta/lang_profile'
  autoload :Detector, 'langusta/detector'
  autoload :DetectorFactory, 'langusta/detector_factory'
  autoload :JavaPropertyReader, 'langusta/java_property_reader'
  autoload :UnicodeBlock, 'langusta/unicode_block'
  autoload :NGram, 'langusta/n_gram'
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,184 @@ | ||
module Langusta
  # Port of langdetect's Detector: estimates the language of a UCS-2 encoded
  # text by repeatedly updating per-language probabilities from the text's
  # n-gram frequencies (random-walk trials averaged over @n_trial rounds).
  class Detector
    attr_accessor :verbose, :alpha, :max_text_length

    ALPHA_DEFAULT = 0.5
    ALPHA_WIDTH = 0.05
    ITERATION_LIMIT = 1000
    PROB_THRESHOLD = 0.1
    CONV_THRESHOLD = 0.99999
    BASE_FREQ = 10000
    UNKNOWN_LANG = "unknown"

    # Patterns run against the UTF-16BE byte buffers held by UCS2String.
    URL_REGEX = Oniguruma::ORegexp.new("https?://[-_.?&~;+=/#0-9A-Za-z]+", :encoding => Oniguruma::ENCODING_UTF16_BE)
    MAIL_REGEX = Oniguruma::ORegexp.new("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+", :encoding => Oniguruma::ENCODING_UTF16_BE)

    # factory - DetectorFactory carrying the loaded language profiles.
    def initialize(factory)
      @word_lang_prob_map = factory.word_lang_prob_map
      @lang_list = factory.lang_list
      @text = UCS2String.new('')
      @langprob = nil
      @alpha = ALPHA_DEFAULT
      @n_trial = 7
      @max_text_length = 10000
      @prior_map = nil
      @verbose = false
    end

    # Appends text (a UCS2String) to the detection buffer, blanking URLs and
    # e-mail addresses and collapsing runs of spaces.
    def append(text)
      text.gsub!(URL_REGEX, "\x00\x20")
      text.gsub!(MAIL_REGEX, "\x00\x20")
      text.each_char do |c|
        # NOTE(review): the normalized character is discarded here; the Java
        # original appends the normalized char to the buffer. Confirm once
        # NGram.normalize is ported.
        NGram.normalize(c)
      end
      # "+" (not "*"): collapse one-or-more spaces. "*" also matches the empty
      # string at every position and would insert spaces throughout the text.
      text.gsub!(Oniguruma::ORegexp.new("(\x00\x20)+", :encoding => Oniguruma::ENCODING_UTF16_BE), "\x00\x20")
      # gsub! returns nil when nothing matched, so never assign from its result.
      @text = text
    end

    # Returns the most probable language code, or UNKNOWN_LANG when no
    # candidate clears PROB_THRESHOLD.
    def detect
      probabilities = get_probabilities()
      (probabilities.length > 0) ? probabilities.first.lang : UNKNOWN_LANG
    end

    # Runs @n_trial random-walk trials over the extracted n-grams and stores
    # the averaged probability vector in @langprob.
    def detect_block
      cleaning_text()
      ngrams = extract_ngrams()
      raise "no features in text" if ngrams.empty?
      # Accumulator must start at 0.0 — Array.new(n) yields nils, which would
      # blow up on +=.
      langprob = Array.new(@lang_list.length, 0.0)

      @n_trial.times do
        prob = init_probability()
        alpha = @alpha + next_gaussian() * ALPHA_WIDTH

        i = 0
        Kernel.loop do
          r = Kernel.rand(ngrams.length)
          update_lang_prob(prob, ngrams[r], alpha)
          i += 1
          # Convergence is only checked every 5th iteration (as in the Java
          # original); a bare `i % 5` is always truthy in Ruby.
          if i % 5 == 0
            break if self.class.normalize_prob(prob) > CONV_THRESHOLD || i >= ITERATION_LIMIT
            # verbose
          end
        end
        langprob.length.times do |j|
          langprob[j] += prob[j] / @n_trial
        end
        # verbose
      end
      @langprob = langprob
    end

    # Installs per-language prior probabilities from a Hash of lang => prob.
    # Raises if any prior is negative or all priors are zero.
    def set_prior_map(prior_map)
      @prior_map = Array.new(@lang_list.length, 0.0)
      sump = 0.0
      @prior_map.length.times do |i|
        lang = @lang_list[i]
        # Look keys up in the *argument*, not in the array being built.
        if prior_map.has_key?(lang)
          p = prior_map[lang]
          raise "probability must be non-negative" if p < 0
          @prior_map[i] = p
          sump += p
        end
      end
      raise "more one of prob must be non-zero" if sump <= 0
      @prior_map.map! do |p|
        p / sump
      end
    end

    # Normalizes prob in place so it sums to 1.0; returns the maximum element.
    def self.normalize_prob(prob)
      maxp = 0.0; sump = 0.0
      prob.each do |p|
        sump += p
      end
      prob.map! do |p|
        q = p / sump
        maxp = q if q > maxp
        q
      end
      maxp
    end

    private
    # Strips Latin characters when the text is dominated by a non-Latin
    # script, so that stray Latin words do not skew detection.
    def cleaning_text
      non_latin_count = latin_count = 0
      @text.each_char do |c|
        if c < "\x00z" && c >= "\x00A"
          latin_count += 1
        elsif c > "\x03\x00" && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
          non_latin_count += 1
        end
      end
      if latin_count * 2 < non_latin_count
        text_without_latin = ''
        @text.each_char do |c|
          text_without_latin << c if c > "\x00z" || c < "\x00A"
        end
        # Re-wrap so @text stays a UCS2String (the StringIO#to_s the original
        # used returns an inspect string, not the buffer contents).
        @text = UCS2String.new(text_without_latin)
      end
    end

    # Collects every 1..N_GRAM gram of @text that appears in the profiles.
    def extract_ngrams
      list = []
      ngram = NGram.new
      @text.each_char do |char|
        ngram.add(char)
        # N_GRAM is a constant on NGram, so use :: (NGram.N_GRAM would be a
        # method call).
        (1..NGram::N_GRAM).each do |n|
          w = ngram.get(n)
          list << w if w && @word_lang_prob_map.has_key?(w)
        end
      end
      list
    end

    # Lazily runs detection, then returns Language results sorted by
    # descending probability, filtered to those above PROB_THRESHOLD.
    def get_probabilities
      detect_block() if @langprob.nil?
      sort_probability(@langprob)
    end

    # Initial probability vector: the prior map when set, otherwise uniform.
    def init_probability
      prob = Array.new(@lang_list.length)
      if @prior_map
        prob = @prior_map.clone
      else
        prob.length.times do |i|
          prob[i] = 1.0 / @lang_list.length
        end
      end
      prob
    end

    # Converts the raw probability vector into Language objects, best first.
    # NOTE(review): assumes Language.new(lang, prob) with a #lang reader —
    # confirm against langusta/language once ported.
    def sort_probability(prob)
      prob.zip(@lang_list).
        select { |p, lang| p > PROB_THRESHOLD }.
        sort_by { |p, lang| -p }.
        map { |p, lang| Language.new(lang, p) }
    end

    # Multiplies each language's probability by the word's (smoothed)
    # per-language frequency. Returns false for unknown words.
    def update_lang_prob(prob, word, alpha)
      return false if word.nil? || ! @word_lang_prob_map.has_key?(word)

      lang_prob_map = @word_lang_prob_map[word]
      # verbose
      weight = alpha / BASE_FREQ
      prob.length.times do |i|
        # Profiles may leave a slot nil for languages lacking this word.
        prob[i] *= weight + (lang_prob_map[i] || 0.0)
      end
      true
    end

    # Debug formatting: "lang:prob" pairs for probabilities above 1e-5.
    def word_prob_to_string(prob)
      prob.zip(@lang_list).select do |p, lang|
        p > 0.00001
      end.map do |p, lang|
        # %s takes the language name, %.5f the probability — the original had
        # the arguments swapped, which raises on the float conversion.
        "%s:%.5f" % [lang, p]
      end.join(' ')
    end
  end
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
module Langusta
  # Raised for profile-loading and configuration errors.
  class LangDetectException < StandardError; end

  # Builds Detector instances from a set of loaded language profiles.
  # Holds the shared word => per-language-probability map.
  class DetectorFactory
    attr_reader :word_lang_prob_map, :lang_list

    def initialize
      @word_lang_prob_map = {}
      @lang_list = []
    end

    # Registers a language profile.
    # profile  - object responding to #name, #freq (word => count) and
    #            #n_words (total counts indexed by gram length - 1).
    # index    - this language's slot in every probability vector.
    # langsize - total number of languages that will be registered.
    # Raises LangDetectException when the language was already added.
    def add_profile(profile, index, langsize)
      raise LangDetectException.new("duplicate the same language profile") if @lang_list.include?(profile.name)
      @lang_list << profile.name
      profile.freq.keys.each do |word|
        # Default slots to 0.0 (not nil) so later arithmetic on languages
        # that lack this word is well-defined.
        @word_lang_prob_map[word] ||= Array.new(langsize, 0.0)
        prob = profile.freq[word].to_f / profile.n_words[word.length - 1]
        @word_lang_prob_map[word][index] = prob
      end
    end

    # Creates a Detector, optionally overriding its smoothing alpha.
    def create(alpha=nil)
      detector = create_detector()
      detector.alpha = alpha if alpha
      detector
    end

    # Raises LangDetectException when no profiles have been loaded yet.
    def create_detector
      raise LangDetectException.new("need to load profiles") if @lang_list.empty?
      Detector.new(self)
    end
  end
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
module Langusta
  # Thin wrapper around a UCS-2 (UTF-16BE) byte buffer, mimicking the String
  # operations the detector needs. One character == two bytes.
  class UCS2String
    include Enumerable

    attr_reader :underlying

    def initialize(underlying)
      # Identity conversion validates that the bytes are well-formed UCS-2.
      # NOTE(review): Iconv was removed from Ruby's stdlib after 1.8 —
      # confirm the targeted runtime.
      @underlying = Iconv.conv("ucs-2", "ucs-2", underlying)
    end

    # Returns the 2-byte character at character position index.
    def [](index)
      # Character i lives at byte offset i * 2 (the original divided, which
      # returned the wrong character for every index > 1).
      @underlying[index * 2, 2]
    end

    # Delegates in-place substitution to the Oniguruma regexp.
    # Returns nil when nothing was substituted (ORegexp#gsub! semantics).
    def gsub!(oregexp, subst)
      oregexp.gsub!(@underlying, subst)
    end

    # Concatenation. Returns a new UCS2String; does not mutate the receiver
    # despite the << name (kept for interface compatibility).
    def <<(ucs2string)
      raise TypeError unless ucs2string.is_a?(UCS2String)
      # Instances have no #new — construct via the class.
      self.class.new(@underlying + ucs2string.underlying)
    end

    # Yields each character as its 2-byte substring.
    def each_char(&blk)
      (0..(@underlying.length - 2)).step(2) do |index|
        blk.call(@underlying[index, 2])
      end
    end
    alias :each :each_char
  end
end
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"freq":{"D":3892,"E":3299,"F":3285,"G":3045,"A":6416,"B":4070,"C":6617,"L":3054,"M":5299,"N":2815,"O":2578,"H":2337,"I":4798,"J":1557,"K":2201,"U":1696,"T":4435,"W":1804,"V":1999,"P":5115,"S":7482,"R":3099,"Y":511,"X":864,"Z":322,"f":4104,"g":6365,"d":8971,"e":33474,"b":3763,"c":11043,"a":29207,"n":24208,"o":24326,"l":15315,"m":9717,"j":569,"k":3352,"h":9005,"i":26119,"w":2253,"v":3158,"u":10429,"t":20234,"s":16203,"r":23102,"q":406,"p":6165,"z":1050,"y":5016,"x":1132,"·":3643,"é":335,"α":340," l":537," m":1211," n":447," o":1533," h":426," i":705," k":705," d":1030," e":627," f":736," g":373," a":1340," b":551," c":1312," t":1282," v":337," p":1037," s":1367," r":515," J":1393," K":1837," H":1901," I":2769," N":1730," O":1347," L":1992," M":3624," B":2598," C":4248," A":3943," F":2209," G":2328," D":2552," E":1916," Y":379," X":512,"и":342,"о":346," S":4739," R":2101," P":3310,"а":357," W":1489," V":980," U":1053," T":2804," ·":608,"가가》":1806,"A ":1090,"F ":430,"Da":387,"Co":1100,"Ch":633,"FA":344,"G ":347,"De":463,"Di":385,"Ge":351,"I ":810,"Fr":387,"B ":341,"C ":1323,"BS":411,"Ar":322,"D ":656,"Ba":510,"An":388,"Al":410,"Br":354,"Ca":585,"E ":441,"Be":332,"Bo":326,"Le":342,"Li":387,"N ":341,"La":422,"Lo":321,"Me":417,"Mi":434,"O ":403,"Ma":994,"Mo":474,"Ne":370,"Na":336,"P ":715,"Gr":424,"Ha":385,"He":353,"II":481,"In":800,"Ja":347,"L ":466,"Jo":379,"M ":639,"Un":351,"Tr":351,"Th":790,"Te":335,"V ":541,"St":667,"TV":438,"Wi":435,"Pr":609,"S ":1197,"Pa":670,"R ":385,"Se":487,"Sc":370,"So":342,"U ":356,"Sa":396,"Re":588,"Ro":375,"T ":528,"b ":354,"a ":3271,"i ":1146,"ge":1157,"ga":443,"ff":332,"fi":424,"fo":504,"Int":339,"he":2310,"ha":1401,"gi":481,"gh":427,"gu":409,"gr":369,"g ":1575,"ea":1479,"ec":1049,"ed":1247,"de":1816,"di":1071,"do":684,"dr":338,"ew":384,"ev":385,"h ":1328,"fe":378,"eg":405,"ee":580,"el":1725,"ei":535,"ep":401,"eo":702,"en":3472,"em":983,"et":1358,"es":2573,"er":5897,"ca":1157,"e ":7980,"bo":372,"bl":393,"bi":443,"be":654,"da":619,"f 
":1427,"cu":340,"ct":1002,"co":1189,"ck":563,"ci":913,"ch":1939,"ce":1477,"c ":925,"ay":495,"ba":511,"d ":2871,"at":3499,"as":1398,"ar":3373,"av":444,"au":580,"al":3307,"ai":767,"ap":659,"am":1381,"an":5017,"ac":1106,"ad":861,"ab":469,"ag":721,"ae":329,"nt":2416,"ns":1211,"no":896,"nn":583,"of":1300,"oc":729,"od":611,"ob":397,"om":1422,"on":5415,"ol":1638,"og":510,"ot":772,"os":950,"ov":494,"ou":1250,"op":740,"oo":649,"or":3530,"r ":3525,"ow":507,"pe":982,"pa":741,"pl":372,"po":651,"ph":572,"pi":450,"lo":1098,"ll":1500,"lu":455,"lt":410,"ly":332,"o ":1739,"ma":1374,"mb":376,"me":1635,"mi":890,"mm":492,"mp":610,"mo":691,"mu":332,"p ":731,"na":1797,"nc":1140,"nd":2000,"ne":1878,"ng":2212,"ni":1786,"ke":469,"m ":2197,"km":481,"li":1968,"le":2422,"ld":489,"la":1786,"n ":6058,"ht":434,"hu":339,"hi":1040,"ho":848,"id":750,"ic":2901,"ia":1671,"ig":816,"if":357,"ie":1267,"k ":934,"ir":791,"is":2128,"it":2101,"iu":388,"iv":722,"il":1499,"im":625,"in":4140,"io":2942,"ip":462,"l ":3328,"y ":2799,"wa":428,"vi":761,"ve":1364,"va":525,"x ":549,"ui":350,"ul":813,"ue":443,"ur":1440,"us":1642,"ut":842,"um":757,"un":1114,"up":321,"ty":669,"tu":765,"tt":612,"ua":440,"uc":387,"w ":493,"to":1445,"ts":387,"tr":1222,"te":3099,"ti":3945,"th":1754,"ta":1709,"su":334,"ss":1114,"st":2539,"sp":332,"so":726,"sc":476,"se":1269,"sh":547,"si":1432,"u ":440,"sa":574,"rr":396,"rs":871,"rt":1127,"ru":469,"ry":772,"ro":2074,"rn":743,"rm":593,"rl":533,"rk":352,"ri":2784,"rg":488,"re":2795,"rd":807,"rc":621,"ra":2892,"t ":3426,"s ":5482,"pr":409,"ys":349,"丞丞 ":694,"丞一 ":1816,"》가":1345,"《가":2228,"丞丕 ":551,"Com":345,"アアア":475,"》가 ":1068,"一一一":5909,"一一丞":1736,"一一丕":1309,"丕一 ":1155,"Pro":348," ·가":327,"가·가":2886," 《":2439," 》":343," 〈":356,"あ":978," 가 ":130222,"》":2546,"《":2558,"〉":392,"〈":395,"ア":871,"丕丞 ":472,"一丞丞":570,"丕丕 ":348,"가가 
":783362,"一丞一":1754,"一丞丕":441,"가가·":2619,"一丕一":1043,"一丕丞":389,"丕丞一":339,"両":634,"丞":11346,"丐":1267,"丕":7734,"一":36326,"丞一":4590,"丞丕":1151,"丞丞":1515,"丕丞":1076,"丕丕":711,"丕一":2824,"丐一":586,"一丕":3304,"一丐":489,"一丞":5004,"一一":15752," 丞":3275," 丐":491," 丕":2250," 一":11320,"The":618,"丞一丕":423,"丞一丞":536,"丞一一":1649,"ああ":644," 一 ":651,"ber":375,"ce ":804,"al ":1627,"ant":323,"ang":422,"anc":360,"and":945,"ame":329,"all":348,"an ":1234,"ard":453,"ari":335,"art":327,"ar ":393,"ate":490,"ati":1694,"アア":629,"가》":1921,"丕一一":978,"丕一丞":350,"ity":425,"ist":505,"ive":498,"is ":521,"ion":2383,"가가가":1019221,"》 ":1160," Ge":348," Fr":383," Ha":384," He":345," Gr":419," Ja":344," In":792,"har":332," Jo":377," La":409," Le":335," Li":369," Ma":978," Mi":422," Me":403,"he ":1035," Ne":354," Na":322," Mo":461,"her":322," An":382," Al":401," Ba":502," Be":321," Br":346," Ca":565," Ch":622," Co":1074," Da":378," Di":376," De":454," Wi":415," Pa":649," Ro":366," Re":579," Pr":602," St":642," Th":749," Te":327," Tr":341," Sa":393," Sc":357," Se":468," So":330," Un":342,"ian":427," in":405,"ic ":729,"ia ":540," 丞丞":589," 丞一":1845," of":1120," 丞丕":471,"ich":356," km":461,"ica":619," an":470," 가가":780590,"ine":530,"ing":1131," 丕丞":431," co":491,"in ":695," 丕一":1167,"ill":401," de":521," 一丞":2175," 一丕":1192," 一一":6508," th":680,"est":422,"ess":440,"er ":2043,"es ":1105,"eri":458,"era":435,"et ":385,"ers":564,"ern":454,"en ":605,"ell":389,"enc":362,"ent":998,"el ":452,"ge ":450,"for":417," 《가":2128,"de ":544,"cti":343,"ch ":513,"che":357,"ed ":647,"ect":433,"·가가":3042,"der":419,"rea":453,"re ":598,"rch":365,"rd ":425,"rat":517,"ran":534,"ric":460,"ry ":656,"rt ":353," 가":911452,"se ":376,"st ":511,"ss ":355,"ste":469,"sti":465,"str":431,"te ":613,"가":2734665,"per":399,"ng ":1232,"nce":521,"ne ":683,"nal":636,"nd ":828,"가·":2913,"가 ":919426,"nte":685,"nt ":696,"ns ":350,"m가":495,"of ":1107,"or ":608,"ore":443,"on ":2671,"ona":592,"ons":366,"le ":899,"lan":381,"A가":441,"lli":322,"ll 
":408,"S가":359,"man":401,"丞丕一":331,"men":485,"C가":415,"A가 ":326,"丞丞一":486,"·가":3221,"《가가":2147,"ver":541,"ve ":360,"가》가":891,"us ":933,"um ":349,"ty ":554,"tra":420,"tor":430,"tin":357,"tio":1917,"tic":456,"th ":330,"ter":1180,"the":803,"一丞 ":2036,"一丕 ":1523,"一一 ":6123,"丞 ":3597,"丐 ":360,"丕 ":2805,"가가":1808522,"가》 ":998,"一 ":10399,"あああ":427},"n_words":[3186351,4098367,2871816],"name":"ko"} |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
require 'test/helper' | ||
|
||
# Exercises end-to-end detection over three tiny synthetic corpora.
class DetectorTest < Test::Unit::TestCase
  # Training corpora as UCS-2 big-endian byte strings ("\x00a" is 'a';
  # "\x30\x42" etc. are hiragana codepoints).
  TRAINING_EN = "\x00a \x00a \x00a \x00b \x00b \x00c \x00c \x00d \x00e"
  TRAINING_FR = "\x00a \x00b \x00b \x00c \x00c \x00c \x00d \x00d \x00d"
  TRAINING_JP = "\x30\x42 \x30\x42 \x30\x42 \x30\x44 \x30\x46 \x30\x48 \x30\x48"

  def setup
    @factory = DetectorFactory.new
    corpora = [["en", TRAINING_EN], ["fr", TRAINING_FR], ["jp", TRAINING_JP]]
    corpora.each_with_index do |(name, corpus), index|
      @factory.add_profile(build_profile(name, corpus), index, corpora.length)
    end
  end

  def test_detector1
    detector = @factory.create()
    detector.append(UCS2String.new("\x00a"))
    assert_equal("en", detector.detect())
  end

  def test_detector2
    detector = @factory.create()
    detector.append(UCS2String.new("\x00b\x00\x20\x00d"))
    assert_equal("fr", detector.detect())
  end

  def test_detector3
    detector = @factory.create()
    detector.append(UCS2String.new("\x30\x42\x30\x42\x30\x42\x40\x42\x00a"))
    assert_equal("jp", detector.detect())
  end

  private

  # Builds a LangProfile for name from a space-separated training corpus.
  def build_profile(name, corpus)
    profile = LangProfile.new(name)
    corpus.split(/ /).each do |word|
      profile.add(word)
    end
    profile
  end
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
require 'test/helper' | ||
|
||
class DetectorFactoryTest < Test::Unit::TestCase
  # TODO: placeholder — no DetectorFactory test cases have been ported yet
  # (profile registration, duplicate-profile error, empty-factory error).
end