113 changes: 65 additions & 48 deletions guesslanguage/core.py → libindic/guesslanguage/core.py
@@ -1,34 +1,36 @@
''' Guess the language of text.
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Guess the language of text.
#
# Based on guesslanguage.cpp by Jacob R Rideout for KDE
# http://websvn.kde.org/branches/work/sonnet-refactoring/common/nlp/guesslanguage.cpp?view=markup
# which itself is based on Language::Guess by Maciej Ceglowski
# http://languid.cantbedone.org/
#
# Copyright (c) 2008, Kent S Johnson
#
# C++ version is Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
# Perl version is (c) 2004-6 Maciej Ceglowski
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Note: Language::Guess is GPL-licensed. KDE developers received permission
# from the author to distribute their port under LGPL:
# http://lists.kde.org/?l=kde-sonnet&m=116910092228811&w=2

Based on guesslanguage.cpp by Jacob R Rideout for KDE
http://websvn.kde.org/branches/work/sonnet-refactoring/common/nlp/guesslanguage.cpp?view=markup
which itself is based on Language::Guess by Maciej Ceglowski
http://languid.cantbedone.org/
Copyright (c) 2008, Kent S Johnson
C++ version is Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
Perl version is (c) 2004-6 Maciej Ceglowski
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Note: Language::Guess is GPL-licensed. KDE developers received permission
from the author to distribute their port under LGPL:
http://lists.kde.org/?l=kde-sonnet&m=116910092228811&w=2
'''

import codecs
import os
Expand All @@ -40,7 +42,7 @@

try:
from collections import defaultdict
except:
except BaseException:
class defaultdict(dict):
def __init__(self, default_factory=None, *a, **kw):
if (default_factory is not None and
Expand Down Expand Up @@ -85,8 +87,11 @@ def __repr__(self):

MIN_LENGTH = 20

# Lists of language codes, one list per script-named group.  Adjacent string
# literals are joined by the parser, so each long list is still produced by
# a single whitespace split and no backslash continuations are needed.
BASIC_LATIN = (
    "en_US ceb ha so tlh id haw la sw eu nr "
    "nso zu_ZA xh ss st tn ts"
).split()
EXTENDED_LATIN = (
    "cs af_ZA pl_PL hr_HR ro sk sl tr hu_HU az et sq "
    "ca es fr de nl it_IT da is nb sv fi lv pt ve lt "
    "tl cy"
).split()
# Every Latin candidate: the basic list followed by the extended list.
ALL_LATIN = BASIC_LATIN + EXTENDED_LATIN
CYRILLIC = ["ru", "uk", "kk", "uz", "mn", "sr", "mk", "bg", "ky"]
ARABIC = ["ar", "fa", "ps", "ur"]
Expand Down Expand Up @@ -211,7 +216,6 @@ def __repr__(self):
"ts": "Tsonga",
"tw": "Twi",
"uk": "Ukrainian",
"uk": "Ukranian",
"ur": "Urdu",
"uz": "Uzbek",
"ve": "Venda",
Expand Down Expand Up @@ -306,7 +310,6 @@ def __repr__(self):
"tr": 26500,
"tw": 1499,
"uk": 26510,
"uk": 26520,
"ur": 26530,
"uz": 26540,
"vi": 26550,
Expand Down Expand Up @@ -339,7 +342,7 @@ def guessLanguage(text):
return UNKNOWN

if isinstance(text, str):
text = unicode(text, 'utf-8')
text = unicode(text, 'utf-8') # noqa: F821

text = normalize(text)

Expand Down Expand Up @@ -407,7 +410,7 @@ def find_runs(text):
# and extended additional latin if over 10% (for Vietnamese)
relevant_runs = []
for key, value in run_types.items():
pct = (value*100) / totalCount
pct = (value * 100) / totalCount
if pct >= 40:
relevant_runs.append(key)
elif key == "Basic Latin" and (pct >= 15):
Expand All @@ -429,21 +432,23 @@ def _identify(sample, scripts):
if "Greek and Coptic" in scripts:
return "el"

if "Katakana" in scripts or "Hiragana" in scripts or "Katakana Phonetic Extensions" in scripts:
if "Katakana" in scripts or "Hiragana" in scripts or \
"Katakana Phonetic Extensions" in scripts:
return "ja"

if "CJK Unified Ideographs" in scripts or "Bopomofo" in scripts \
or "Bopomofo Extended" in scripts or "KangXi Radicals" in scripts:

# This is in both Ceglowski and Rideout
# I can't imagine why...
# or "Arabic Presentation Forms-A" in scripts
# This is in both Ceglowski and Rideout
# I can't imagine why...
# or "Arabic Presentation Forms-A" in scripts
return "zh"

if "Cyrillic" in scripts:
return check(sample, CYRILLIC)

if "Arabic" in scripts or "Arabic Presentation Forms-A" in scripts or "Arabic Presentation Forms-B" in scripts:
if "Arabic" in scripts or "Arabic Presentation Forms-A" in scripts or \
"Arabic Presentation Forms-B" in scripts:
return check(sample, ARABIC)

if "Devanagari" in scripts:
Expand Down Expand Up @@ -496,8 +501,12 @@ def createOrderedModel(content):
trigrams = defaultdict(int) # QHash<QString,int>
content = content.lower()

for i in xrange(0, len(content)-2):
trigrams[content[i:i+3]] += 1
try:
ranges = xrange(0, len(content) - 2) # noqa: F821
except:
ranges = range(0, len(content) - 2)
for i in ranges:
trigrams[content[i:i + 3]] += 1

return sorted(trigrams.keys(), key=lambda k: (-trigrams[k], k))

Expand All @@ -522,8 +531,12 @@ def distance(model, knownModel):
def _makeNonAlphaRe():
nonAlpha = [u'[^']
for i in range(sys.maxunicode):
c = unichr(i)
if c.isalpha(): nonAlpha.append(c)
try:
c = unichr(i) # noqa: F821
except:
c = chr(i)
if c.isalpha():
nonAlpha.append(c)
nonAlpha.append(u']')
nonAlpha = u"".join(nonAlpha)
return re.compile(nonAlpha)
Expand Down Expand Up @@ -563,13 +576,17 @@ def guessLanguageId(self, text):
return lang

def getScriptName(self, text):
return dumps(detect_lang(text))
return detect_lang(text)

def get_module_name(self):
    """Return the name of this module as a string."""
    return "Guess Language"

def get_info(self):
    """Return a short description of this module and the work it is based on."""
    # Adjacent literals are concatenated by the parser into one string, so
    # no ``+`` operators or backslash continuations are needed.
    return (
        "Guess the language of given text. This module can "
        "detect more than 50 languages. Based on "
        "Language::Guess by Maciej Ceglowski"
        "(http://languid.cantbedone.org/)"
    )


def getInstance():
Expand Down
@@ -1,6 +1,6 @@
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#Language Detection based on unicode range
# Language Detection based on unicode range
# Copyright 2008 Santhosh Thottingal <santhosh.thottingal@gmail.com>
# http://www.smc.org.in
#
Expand Down Expand Up @@ -36,7 +36,7 @@ def _detect_lang(text):
word = words[word_iter]
if(word):
orig_word = word
#remove the punctuations
# remove the punctuations
for punct in string.punctuation:
word = word.replace(punct, " ")
length = len(word)
Expand Down
File renamed without changes.
Empty file.
Expand Up @@ -12,20 +12,38 @@
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Foundation, Inc., 51 Franklin Street, 5th Floor, Boston, MA 02110-1301 USA
'''

import unittest
from testtools import TestCase

from blocks import unicodeBlock
from ..blocks import unicodeBlock
from itertools import chain


class BlocksTest(TestCase):

def setUp(self):
super(BlocksTest, self).setUp()

def assertBlock(self, name, c):
    """Assert that code point ``c`` is classified into the block ``name``.

    Args:
        name: Expected block name, compared against what ``unicodeBlock``
            returns for the character.
        c: Integer code point of the character to classify.
    """
    # ``unichr`` exists only on Python 2; on Python 3 the lookup raises
    # NameError and ``chr`` is used instead.  Catching just NameError keeps
    # genuine errors (such as an out-of-range code point) visible.
    try:
        to_char = unichr  # noqa: F821
    except NameError:
        to_char = chr
    char = to_char(c)
    block = unicodeBlock(char)
    # ``assertEqual`` replaces the deprecated ``assertEquals`` alias, and the
    # already-computed ``block`` is reused rather than classifying twice.
    self.assertEqual(name, block,
                     '%s != %s for %r' % (name, block, char))

class blocks_test(unittest.TestCase):
def test_unicodeBlock(self):
for c in range(128):
self.assertBlock('Basic Latin', c)

for c in range(0x80, 0x180) + range(0x250, 0x2B0):
self.assertBlock('Extended Latin', c)
try:
for c in range(0x80, 0x180) + range(0x250, 0x2B0):
self.assertBlock('Extended Latin', c)
except:
for c in chain(range(0x80, 0x180), range(0x250, 0x2B0)):
self.assertBlock('Extended Latin', c)

self.assertBlock('Thai', 0xE00)
self.assertBlock('Thai', 0xE7F)
Expand All @@ -34,16 +52,3 @@ def test_unicodeBlock(self):
self.assertBlock('Tibetan', 0xF00)
self.assertBlock('Tibetan', 0xFFF)
self.assertBlock('Cyrillic', 0x421)

def assertBlock(self, name, c):
c = unichr(c)
block = unicodeBlock(c)
self.assertEquals(name, unicodeBlock(c), '%s != %s for %r' % (name, block, c))


def setUp(self):
pass


if __name__ == '__main__':
unittest.main()
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions requirements.txt
@@ -0,0 +1 @@
silpa_common
28 changes: 28 additions & 0 deletions setup.cfg
@@ -0,0 +1,28 @@
[metadata]
name = libindic-guesslanguage
summary = Detect primary language of text
description-file =
README.md
author = Santhosh Thottingal
author-email = santhosh.thottingal@gmail.com
home-page = http://silpa.org.in
classifier =
#Environment :: Web Environment
#Framework :: Flask
Intended Audience :: Developers
Intended Audience :: Information Technology
License :: OSI Approved
License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)
Programming Language :: Python
Programming Language :: Python :: 3

[files]
namespace_packages = libindic
packages = libindic

[build-sphinx]
all_files = 1
source-dir = docs

[wheel]
universal = 1
23 changes: 4 additions & 19 deletions setup.py
@@ -1,23 +1,8 @@

#!/usr/bin/env python

from setuptools import setup, find_packages

name = "guesslanguage"
from setuptools import setup

setup(
name = name,
version = "0.2.1",
url = "http://silpa.org.in/Guess_Language",
license = "LGPL-3.0",
description = "Guess primary language of given text",
author = "Santhosh Thottingal",
author_email = "santhosh.thottingal@gmail.com",
long_description = """Guess the language of given text.Even
works for text containing multiple languages""",
packages = find_packages(),
include_package_data = True,
setup_requires = ['setuptools-git'],
install_requires = ['setuptools','silpa_common'],
zip_safe = False,
)
setup_requires=['pbr'],
pbr=True,
)
6 changes: 6 additions & 0 deletions test-requirements.txt
@@ -0,0 +1,6 @@
testrepository
python-subunit
flake8
mccabe
coverage
coveralls
49 changes: 0 additions & 49 deletions tests/blocks_test.py

This file was deleted.

20 changes: 20 additions & 0 deletions tox.ini
@@ -0,0 +1,20 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.

[tox]
envlist = py35, py27, pep8

[testenv]
commands = {envpython} setup.py test
deps =
-rrequirements.txt
-rtest-requirements.txt

[testenv:pep8]
deps=
-rrequirements.txt
-rtest-requirements.txt
commands=
flake8 --ignore F401 libindic