Permalink
Browse files

Added support for language name, code, and reliability

  • Loading branch information...
1 parent 1bc5a17 commit d3c6ad8e68f5a801b26060d4baf95dffaeea9165 Andrew Kane committed Feb 5, 2012
Showing with 187 additions and 988 deletions.
  1. +19 −4 .gitignore
  2. +6 −0 Gemfile
  3. +33 −0 README.md
  4. +0 −173 README.rdoc
  5. +2 −15 Rakefile
  6. +0 −48 build.sh
  7. +0 −28 build.win.cmd
  8. +20 −0 cld.gemspec
  9. +0 −1 encodings/compact_lang_det/.#compact_lang_det_impl.h
  10. +0 −1 encodings/compact_lang_det/.#ext_lang_enc.h
  11. +0 −1 encodings/compact_lang_det/.#getonescriptspan.cc
  12. +0 −1 encodings/compact_lang_det/.#tote.cc
  13. +0 −1 encodings/compact_lang_det/.#tote.h
  14. +0 −1 encodings/compact_lang_det/win/.#cld_unilib_windows.cc
  15. +31 −0 ext/cld/Makefile
  16. 0 { → ext/cld}/base/basictypes.h
  17. 0 { → ext/cld}/base/build_config.h
  18. 0 { → ext/cld}/base/casts.h
  19. 0 { → ext/cld}/base/commandlineflags.h
  20. 0 { → ext/cld}/base/crash.h
  21. 0 { → ext/cld}/base/dynamic_annotations.h
  22. 0 { → ext/cld}/base/global_strip_options.h
  23. 0 { → ext/cld}/base/log_severity.h
  24. 0 { → ext/cld}/base/logging.h
  25. 0 { → ext/cld}/base/macros.h
  26. 0 { → ext/cld}/base/port.h
  27. 0 { → ext/cld}/base/scoped_ptr.h
  28. 0 { → ext/cld}/base/stl_decl.h
  29. 0 { → ext/cld}/base/stl_decl_msvc.h
  30. 0 { → ext/cld}/base/string_util.h
  31. 0 { → ext/cld}/base/strtoint.h
  32. 0 { → ext/cld}/base/template_util.h
  33. 0 { → ext/cld}/base/type_traits.h
  34. 0 { → ext/cld}/base/vlog_is_on.h
  35. 0 { → ext/cld}/cld_encodings.h
  36. 0 { → ext/cld}/encodings/compact_lang_det/#cldutil.cc#
  37. 0 { → ext/cld}/encodings/compact_lang_det/#cldutil.h#
  38. 0 { → ext/cld}/encodings/compact_lang_det/#compact_lang_det_impl.h#
  39. 0 { → ext/cld}/encodings/compact_lang_det/#ext_lang_enc.cc#
  40. 0 { → ext/cld}/encodings/compact_lang_det/#ext_lang_enc.h#
  41. 0 { → ext/cld}/encodings/compact_lang_det/#getonescriptspan.cc#
  42. 0 { → ext/cld}/encodings/compact_lang_det/#getonescriptspan.h#
  43. 0 { → ext/cld}/encodings/compact_lang_det/#tote.cc#
  44. 0 { → ext/cld}/encodings/compact_lang_det/#tote.h#
  45. 0 { → ext/cld}/encodings/compact_lang_det/cldutil.cc
  46. 0 { → ext/cld}/encodings/compact_lang_det/cldutil.h
  47. 0 { → ext/cld}/encodings/compact_lang_det/cldutil_dbg.h
  48. 0 { → ext/cld}/encodings/compact_lang_det/cldutil_dbg_empty.cc
  49. 0 { → ext/cld}/encodings/compact_lang_det/compact_lang_det.cc
  50. 0 { → ext/cld}/encodings/compact_lang_det/compact_lang_det.h
  51. 0 { → ext/cld}/encodings/compact_lang_det/compact_lang_det_impl.cc
  52. 0 { → ext/cld}/encodings/compact_lang_det/compact_lang_det_impl.h
  53. 0 { → ext/cld}/encodings/compact_lang_det/compact_lang_det_unittest_small.cc
  54. 0 { → ext/cld}/encodings/compact_lang_det/compile.cmd
  55. 0 { → ext/cld}/encodings/compact_lang_det/ext_lang_enc.cc
  56. 0 { → ext/cld}/encodings/compact_lang_det/ext_lang_enc.h
  57. 0 { → ext/cld}/encodings/compact_lang_det/generated/cld_generated_score_deltaoctachrome_0406.cc
  58. 0 { → ext/cld}/encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc
  59. 0 { → ext/cld}/encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc
  60. 0 { → ext/cld}/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc
  61. 0 { → ext/cld}/encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz_0.cc
  62. 0 { → ext/cld}/encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc
  63. 0 { → ext/cld}/encodings/compact_lang_det/generated/compact_lang_det_generated_longwords8_0.cc
  64. 0 { → ext/cld}/encodings/compact_lang_det/generated/compact_lang_det_generated_meanscore.h
  65. 0 { → ext/cld}/encodings/compact_lang_det/generated/compact_lang_det_generated_quads_0.cc
  66. 0 { → ext/cld}/encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc
  67. 0 { → ext/cld}/encodings/compact_lang_det/getonescriptspan.cc
  68. 0 { → ext/cld}/encodings/compact_lang_det/getonescriptspan.h
  69. 0 { → ext/cld}/encodings/compact_lang_det/letterscript_enum.cc
  70. 0 { → ext/cld}/encodings/compact_lang_det/letterscript_enum.h
  71. 0 { → ext/cld}/encodings/compact_lang_det/subsetsequence.cc
  72. 0 { → ext/cld}/encodings/compact_lang_det/subsetsequence.h
  73. 0 { → ext/cld}/encodings/compact_lang_det/subsetsequence_unittest.cc
  74. 0 { → ext/cld}/encodings/compact_lang_det/tote.cc
  75. 0 { → ext/cld}/encodings/compact_lang_det/tote.h
  76. 0 { → ext/cld}/encodings/compact_lang_det/unittest_data.h
  77. 0 { → ext/cld}/encodings/compact_lang_det/utf8propjustletter.h
  78. 0 { → ext/cld}/encodings/compact_lang_det/utf8propletterscriptnum.h
  79. 0 { → ext/cld}/encodings/compact_lang_det/utf8scannotjustletterspecial.h
  80. 0 { → ext/cld}/encodings/compact_lang_det/win/#cld_unilib_windows.cc#
  81. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_basictypes.h
  82. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_commandlineflags.h
  83. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_google.h
  84. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_htmlutils.h
  85. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_htmlutils_google3.cc
  86. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_htmlutils_windows.cc
  87. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_logging.h
  88. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_macros.h
  89. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_strtoint.h
  90. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_unicodetext.cc
  91. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_unicodetext.h
  92. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_unilib.h
  93. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_unilib_google3.cc
  94. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_unilib_windows.cc
  95. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_utf.h
  96. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_utf8statetable.cc
  97. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_utf8statetable.h
  98. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_utf8utils.h
  99. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_utf8utils_google3.cc
  100. 0 { → ext/cld}/encodings/compact_lang_det/win/cld_utf8utils_windows.cc
  101. 0 { → ext/cld}/encodings/compact_lang_det/win/normalizedunicodetext.cc
  102. 0 { → ext/cld}/encodings/compact_lang_det/win/normalizedunicodetext.h
  103. 0 { → ext/cld}/encodings/internal/encodings.cc
  104. 0 { → ext/cld}/encodings/lang_enc.h
  105. 0 { → ext/cld}/encodings/proto/encodings.pb.h
  106. 0 { → ext/cld}/encodings/public/encodings.h
  107. +2 −7 ext/cld/extconf.rb
  108. 0 { → ext/cld}/languages/internal/#languages.cc#
  109. 0 { → ext/cld}/languages/internal/languages.cc
  110. 0 { → ext/cld}/languages/proto/languages.pb.h
  111. 0 { → ext/cld}/languages/public/languages.h
  112. +56 −0 ext/cld/thunk.cc
  113. +15 −6 lib/cld.rb
  114. +3 −0 lib/cld/version.rb
  115. +0 −570 test/test.rb
  116. +0 −131 thunk.cc
View
@@ -1,5 +1,20 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
pkg
-doc
-Manifest
-*.o
-*.so
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+ext/cld/*.o
+ext/cld/*.a
+ext/cld/*.so
View
@@ -0,0 +1,6 @@
+source 'https://rubygems.org'
+
+# Specify your gem's dependencies in cld.gemspec
+gemspec
+
+gem "rake"
View
@@ -0,0 +1,33 @@
+# Compact Language Detection
+
+Blazing-fast language detection for Ruby provided by
+Google Chrome's Compact Language Detector.
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+ gem 'cld'
+
+And then execute:
+
+ $ bundle
+
+Or install it yourself as:
+
+ $ gem install cld
+
+## Usage
+
+```ruby
+lang = CLD.detect_language("This is a test")
+# => {:name => "ENGLISH", :code => "en", :reliable => true}
+```
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Added some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
View
@@ -1,173 +0,0 @@
-This is a wrapper of the Compact Language Detection library from Chrome.
-To use :
-require "cld"
-language = CLD.detect_language("piece of text")
-is_english = CLD.english?("我不是英文")
-
-
-detect_language returns a unique integer representing each language, here are the languages:
-ENGLISH = 0,
-DANISH = 1,
-DUTCH = 2,
-FINNISH = 3,
-FRENCH = 4,
-GERMAN = 5,
-HEBREW = 6,
-ITALIAN = 7,
-JAPANESE = 8,
-KOREAN = 9,
-NORWEGIAN = 10,
-POLISH = 11,
-PORTUGUESE = 12,
-RUSSIAN = 13,
-SPANISH = 14,
-SWEDISH = 15,
-CHINESE = 16,
-CZECH = 17,
-GREEK = 18,
-ICELANDIC = 19,
-LATVIAN = 20,
-LITHUANIAN = 21,
-ROMANIAN = 22,
-HUNGARIAN = 23,
-ESTONIAN = 24,
-TG_UNKNOWN_LANGUAGE = 25,
-UNKNOWN_LANGUAGE = 26,
-BULGARIAN = 27,
-CROATIAN = 28,
-SERBIAN = 29,
-IRISH = 30,
-GALICIAN = 31,
-TAGALOG = 32,
-TURKISH = 33,
-UKRAINIAN = 34,
-HINDI = 35,
-MACEDONIAN = 36,
-BENGALI = 37,
-INDONESIAN = 38,
-LATIN = 39,
-MALAY = 40,
-MALAYALAM = 41,
-WELSH = 42,
-NEPALI = 43,
-TELUGU = 44,
-ALBANIAN = 45,
-TAMIL = 46,
-BELARUSIAN = 47,
-JAVANESE = 48,
-OCCITAN = 49,
-URDU = 50,
-BIHARI = 51,
-GUJARATI = 52,
-THAI = 53,
-ARABIC = 54,
-CATALAN = 55,
-ESPERANTO = 56,
-BASQUE = 57,
-INTERLINGUA = 58,
-KANNADA = 59,
-PUNJABI = 60,
-SCOTS_GAELIC = 61,
-SWAHILI = 62,
-SLOVENIAN = 63,
-MARATHI = 64,
-MALTESE = 65,
-VIETNAMESE = 66,
-FRISIAN = 67,
-SLOVAK = 68,
-CHINESE_T = 69,
-FAROESE = 70,
-SUNDANESE = 71,
-UZBEK = 72,
-AMHARIC = 73,
-AZERBAIJANI = 74,
-GEORGIAN = 75,
-TIGRINYA = 76,
-PERSIAN = 77,
-BOSNIAN = 78,
-SINHALESE = 79,
-NORWEGIAN_N = 80,
-PORTUGUESE_P = 81,
-PORTUGUESE_B = 82,
-XHOSA = 83,
-ZULU = 84,
-GUARANI = 85,
-SESOTHO = 86,
-TURKMEN = 87,
-KYRGYZ = 88,
-BRETON = 89,
-TWI = 90,
-YIDDISH = 91,
-SERBO_CROATIAN= 92,
-SOMALI = 93,
-UIGHUR = 94,
-KURDISH = 95,
-MONGOLIAN = 96,
-ARMENIAN = 97,
-LAOTHIAN = 98,
-SINDHI = 99,
-RHAETO_ROMANCE= 100,
-AFRIKAANS = 101,
-LUXEMBOURGISH = 102,
-BURMESE = 103,
-KHMER = 104,
-TIBETAN = 105,
-DHIVEHI = 106,
-CHEROKEE = 107,
-SYRIAC = 108,
-LIMBU = 109,
-ORIYA = 110,
-ASSAMESE = 111,
-CORSICAN = 112,
-INTERLINGUE = 113,
-KAZAKH = 114,
-LINGALA = 115,
-MOLDAVIAN = 116,
-PASHTO = 117,
-QUECHUA = 118,
-SHONA = 119,
-TAJIK = 120,
-TATAR = 121,
-TONGA = 122,
-YORUBA = 123,
-CREOLES_AND_PIDGINS_ENGLISH_BASED = 124,
-CREOLES_AND_PIDGINS_FRENCH_BASED = 125,
-CREOLES_AND_PIDGINS_PORTUGUESE_BASED = 126,
-CREOLES_AND_PIDGINS_OTHER = 127,
-MAORI = 128,
-WOLOF = 129,
-ABKHAZIAN = 130,
-AFAR = 131,
-AYMARA = 132,
-BASHKIR = 133,
-BISLAMA = 134,
-DZONGKHA = 135,
-FIJIAN = 136,
-GREENLANDIC = 137,
-HAUSA = 138,
-HAITIAN_CREOLE= 139,
-INUPIAK = 140,
-INUKTITUT = 141,
-KASHMIRI = 142,
-KINYARWANDA = 143,
-MALAGASY = 144,
-NAURU = 145,
-OROMO = 146,
-RUNDI = 147,
-SAMOAN = 148,
-SANGO = 149,
-SANSKRIT = 150,
-SISWANT = 151,
-TSONGA = 152,
-TSWANA = 153,
-VOLAPUK = 154,
-ZHUANG = 155,
-KHASI = 156,
-SCOTS = 157,
-GANDA = 158,
-MANX = 159,
-MONTENEGRIN = 160,
-NUM_LANGUAGES = 161,
-
-Thanks to Mike McCandless for finding this code and writing a python version
-Thanks to the Chrome Authors.
View
@@ -1,15 +1,2 @@
-require 'rubygems'
-require 'rake'
-require 'echoe'
-
-Echoe.new('cld', '0.4.0') do |p|
- p.description = "Compact Language Detection from chrome"
- p.url = "http://github.com/jtoy/cld"
- p.author = "Jason Toy"
- p.email = "jtoy@jtoy.net"
- p.ignore_pattern = ["tmp/*", "script/*"]
- p.runtime_dependencies = ["ffi"]
- p.development_dependencies = []
-end
-
-#Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext }
+#!/usr/bin/env rake
+require "bundler/gem_tasks"
View
@@ -1,48 +0,0 @@
-#!/bin/bash
-
-CFLAGS="-fPIC -I. -O2 -DCLD_WINDOWS"
-LDFLAGS=-L.
-CC=g++
-AR=ar
-
-rm -f *.o
-rm -f libcld.a
-
-SOURCES="encodings/compact_lang_det/cldutil.cc \
- encodings/compact_lang_det/cldutil_dbg_empty.cc \
- encodings/compact_lang_det/compact_lang_det.cc \
- encodings/compact_lang_det/compact_lang_det_impl.cc \
- encodings/compact_lang_det/ext_lang_enc.cc \
- encodings/compact_lang_det/getonescriptspan.cc \
- encodings/compact_lang_det/letterscript_enum.cc \
- encodings/compact_lang_det/tote.cc \
- encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc \
- encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc \
- encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc \
- encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc \
- encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc \
- encodings/compact_lang_det/win/cld_htmlutils_windows.cc \
- encodings/compact_lang_det/win/cld_unilib_windows.cc \
- encodings/compact_lang_det/win/cld_utf8statetable.cc \
- encodings/compact_lang_det/win/cld_utf8utils_windows.cc \
- encodings/internal/encodings.cc \
- languages/internal/languages.cc \
- thunk.cc"
-
- #encodings/compact_lang_det/win/cld_unicodetext.cc \
-
-echo
-echo "Compile..."
-$CC -c $CFLAGS $SOURCES
-
-echo
-echo "Make libcld.a"
-$AR rcs libcld.a *.o
-
-echo
-#$CC -DCLD_WINDOWS -I. -L. -o example example.cc -lcld -lstdc++
-$CC -DCLD_WINDOWS -I. -L. -shared -o cld.so -lstdc++ *.o
-
-
-echo
-echo "Done!"
View
@@ -1,28 +0,0 @@
-REM "c:\Program Files\Microsoft Visual Studio 8\vc\vcvarsall.bat"
-
-set CFLAGS=/nologo /I. /O2 /DCLD_WINDOWS /DWIN32 /EHsc
-set LDFLAGS=-L.
-set CC=cl.exe
-set AR=lib.exe
-
-del *.obj
-del libcld.lib
-
-set SOURCES=encodings/compact_lang_det/cldutil.cc encodings/compact_lang_det/cldutil_dbg_empty.cc encodings/compact_lang_det/compact_lang_det.cc encodings/compact_lang_det/compact_lang_det_impl.cc encodings/compact_lang_det/ext_lang_enc.cc encodings/compact_lang_det/getonescriptspan.cc encodings/compact_lang_det/letterscript_enum.cc encodings/compact_lang_det/tote.cc encodings/compact_lang_det/generated/cld_generated_score_quadchrome_0406.cc encodings/compact_lang_det/generated/compact_lang_det_generated_cjkbis_0.cc encodings/compact_lang_det/generated/compact_lang_det_generated_ctjkvz.cc encodings/compact_lang_det/generated/compact_lang_det_generated_deltaoctachrome.cc encodings/compact_lang_det/generated/compact_lang_det_generated_quadschrome.cc encodings/compact_lang_det/win/cld_htmlutils_windows.cc encodings/compact_lang_det/win/cld_unilib_windows.cc encodings/compact_lang_det/win/cld_utf8statetable.cc encodings/compact_lang_det/win/cld_utf8utils_windows.cc encodings/internal/encodings.cc languages/internal/languages.cc
-
-REM encodings/compact_lang_det/win/cld_unicodetext.cc \
-
-echo ""
-echo "Compile..."
-%CC% /c %CFLAGS% %SOURCES%
-
-echo ""
-echo "Make libcld"
-%AR% *.obj -OUT:libcld.lib
-
-echo ""
-echo "Compile example.cc"
-%CC% %CFLAGS% %LFLAGS% example.cc libcld.lib
-
-echo
-echo "Done!"
View
@@ -0,0 +1,20 @@
+# -*- encoding: utf-8 -*-
+require File.expand_path('../lib/cld/version', __FILE__)
+
+Gem::Specification.new do |gem|
+ gem.authors = ["Jason Toy"]
+ gem.email = ["jtoy@jtoy.net"]
+ gem.description = %q{Compact Language Detection for Ruby}
+ gem.summary = %q{Compact Language Detection for Ruby}
+ gem.homepage = "http://github.com/jtoy/cld"
+
+ gem.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+ gem.files = `git ls-files`.split("\n")
+ gem.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+ gem.extensions = ["ext/cld/extconf.rb"]
+ gem.name = "cld"
+ gem.require_paths = ["lib"]
+ gem.version = CLD::VERSION
+
+ gem.add_dependency "ffi"
+end
Oops, something went wrong.

0 comments on commit d3c6ad8

Please sign in to comment.