diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cae10c9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+*.pyc
+*#*
+*.DS_STORE
+*.log
+*Data.fs*
+*flymake*
+*egg*
+build/
+__pycache__/
+/.Python
+/bin/
+/include/
+/lib/
+/pip-selfcheck.json
+.tox/
+comments/
+dist/
+*silly*
+extras/
+.cache/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bc6ac55
--- /dev/null
+++ b/README.md
@@ -0,0 +1,84 @@
+## tok
+
+[![PyPI](https://img.shields.io/pypi/v/tok.svg?style=flat-square)](https://pypi.python.org/pypi/tok/)
+[![PyPI](https://img.shields.io/pypi/pyversions/tok.svg?style=flat-square)](https://pypi.python.org/pypi/tok/)
+
+Fastest and most complete/customizable tokenizer in Python.
+
+Roughly 25x faster than spaCy's and NLTK's regex-based tokenizers.
+
+### Installation
+
+    pip install tok
+
+It depends on [textsearch](https://github.com/kootenpv/textsearch).
+
+### Usage
+
+By default it handles contractions, URLs (http), (float) numbers and currencies.
+
+```python
+from tok import word_tokenize
+word_tokenize("I wouldn't do that.... would you?")
+['I', 'would', 'not', 'do', 'that', '...', 'would', 'you', '?']
+```
+
+Or configure it yourself:
+
+```python
+from tok import Tokenizer
+tokenizer = Tokenizer(protected_words=["some.thing"]) # still using the defaults
+tokenizer.word_tokenize("I want to protect some.thing")
+['I', 'want', 'to', 'protect', 'some.thing']
+```
+
+Split by sentences:
+
+```python
+from tok import sent_tokenize
+sent_tokenize("I wouldn't do that.... would you?")
+[['I', 'would', 'not', 'do', 'that', '...'], ['would', 'you', '?']]
+```
+
+For more options, check the documentation of the `Tokenizer` class.
+
+### Further customization
+
+Given:
+
+```python
+from tok import Tokenizer
+t = Tokenizer(protected_words=["some.thing"]) # still using the defaults
+```
+
+You can add your own rules to the tokenizer by using:
+
+- `t.keep(x, reason)`: Whenever it finds x, it will not add whitespace around it, so x is kept as a single token.
+- `t.split(x, reason)`: Whenever it finds x, it will surround it with whitespace, thus creating a token.
+- `t.drop(x, reason)`: Whenever it finds x, it will remove it but add a split.
+- `t.strip(x, reason)`: Whenever it finds x, it will remove it without splitting.
+
+```python
+t.drop("bla", "bla is not needed")
+t.word_tokenize("Please remove bla, thank you")
+['Please', 'remove', ',', 'thank', 'you']
+```
+
+### Explainable
+
+Explain what happened:
+
+```python
+t.explain("bla")
+[{'from': 'bla', 'to': ' ', 'explanation': 'bla is not needed'}]
+```
+
+See everything that has been added (it will help you understand how it works):
+
+```python
+t.explain_dict
+```
+
+### Contributing
+
+It would be greatly appreciated if you want to contribute to this library, or, for example, to add more languages to [contractions](https://github.com/kootenpv/contractions).
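Following the `drop` example in the README above, here is a minimal sketch of the `split` and `strip` hooks. The `/` and `*` rules are illustrative additions rather than defaults, and the output shown is what those rules should produce, not a captured run:

```python
from tok import Tokenizer

t = Tokenizer()
t.split("/", "slashes separate alternatives")      # '/' becomes its own token
t.strip("*", "asterisks are markup, not content")  # removed without creating a split
t.word_tokenize("Drink tea and/or *good* coffee")
['Drink', 'tea', 'and', '/', 'or', 'good', 'coffee']
```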
diff --git a/deploy.py b/deploy.py
new file mode 100644
index 0000000..e9f6ab1
--- /dev/null
+++ b/deploy.py
@@ -0,0 +1,28 @@
+""" File unrelated to the package, except for convenience in deploying """
+import re
+import sh
+import os
+
+commit_count = sh.git('rev-list', ['--all']).count('\n')
+
+with open('setup.py') as f:
+    setup = f.read()
+
+setup = re.sub("MICRO_VERSION = '[0-9]+'", "MICRO_VERSION = '{}'".format(commit_count), setup)
+
+major = re.search("MAJOR_VERSION = '([0-9]+)'", setup).groups()[0]
+minor = re.search("MINOR_VERSION = '([0-9]+)'", setup).groups()[0]
+micro = re.search("MICRO_VERSION = '([0-9]+)'", setup).groups()[0]
+version = '{}.{}.{}'.format(major, minor, micro)
+
+with open('setup.py', 'w') as f:
+    f.write(setup)
+
+with open('tok/__init__.py') as f:
+    init = f.read()
+
+with open('tok/__init__.py', 'w') as f:
+    f.write(re.sub('__version__ = "[0-9.]+"', '__version__ = "{}"'.format(version), init))
+
+py_version = "python3.7" if sh.which("python3.7") is not None else "python"
+os.system('{} setup.py sdist bdist_wheel upload'.format(py_version))
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..f528236
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,8 @@
+[metadata]
+description-file = README.md
+
+[bdist_rpm]
+doc_files = README.md
+
+[wheel]
+universal = 1
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..7e03863
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,53 @@
+from setuptools import find_packages
+from setuptools import setup
+
+MAJOR_VERSION = '0'
+MINOR_VERSION = '0'
+MICRO_VERSION = '1'
+VERSION = "{}.{}.{}".format(MAJOR_VERSION, MINOR_VERSION, MICRO_VERSION)
+
+with open("README.md") as f:
+    LONG_DESCRIPTION = f.read()
+
+setup(
+    name='tok',
+    version=VERSION,
+    description="Fast and customizable tokenizer",
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type="text/markdown",
+    url='https://github.com/kootenpv/tok',
+    author='Pascal van Kooten',
+    author_email='kootenpv@gmail.com',
+    license='MIT',
+    packages=find_packages(),
+    include_package_data=True,
+    install_requires=["textsearch", "tldextract"],
+    classifiers=[
+        'Environment :: Console',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Customer Service',
+        'Intended Audience :: System Administrators',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: Microsoft',
+        'Operating System :: MacOS :: MacOS X',
+        'Operating System :: Unix',
+        'Operating System :: POSIX',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Topic :: Software Development',
+        'Topic :: Software Development :: Build Tools',
+        'Topic :: Software Development :: Debuggers',
+        'Topic :: Software Development :: Libraries',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+        'Topic :: System :: Software Distribution',
+        'Topic :: System :: Systems Administration',
+        'Topic :: Utilities',
+    ],
+    zip_safe=False,
+    platforms='any',
+)
diff --git a/tok/__init__.py b/tok/__init__.py
new file mode 100644
index 0000000..8fafca1
--- /dev/null
+++ b/tok/__init__.py
@@ -0,0 +1,208 @@
+__project__ = "tokenize"
+__version__ = "0.0.1"
+__repo__ = "https://github.com/kootenpv/tok"
+
+import string
+from textsearch import TextSearch
+from contractions import contractions_dict
+
+
+class Tokenizer:
+    def __init__(
+        self,
+        handle_http=False,
+        handle_domains=False,
+        numbers=True,
+        combine_punctuation=True,
+        eol="\n",
+        currencies=("$",),
+        protected_words=None,
+        contractions=True,
+        language="en",
+    ):
+        # set(), set() should fall back to just using __iter__ of the automaton for a speed boost
+        if language != "en" and contractions:
+            raise ValueError("No contractions known for languages other than English.")
+        self.contractions = contractions
+        self.tokenizer = None
+        self.handle_http = handle_http
+        self.handle_domains = handle_domains
+        self.combine_punctuation = combine_punctuation
+        self.numbers = numbers
+        self.eol = eol
+        self.currencies = currencies or []
+        self.protected_words = protected_words or []
+        self.explain_dict = {}
+        self.setup()
+
+    def setup(self):
+        self.tokenizer = TextSearch("sensitive", "norm", set(), set())
+        self.add_base_cases()
+        self.add_currencies()
+        self.add_words(self.protected_words)
+        if self.handle_http:
+            self.tokenizer.add_http_handler(keep_result=True)
+            for word in ["http://", "https://", "www."]:
+                self.explain_dict[
+                    word
+                ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
+        if self.handle_domains:
+            self.add_domain_handler()
+        if self.contractions:
+            if self.contractions == True:
+                self.contractions = contractions_dict
+            self.add_words(self.contractions)
+
+    def add_words(self, words):
+        # [("cannot", "can not"), ("can't", "can n't"), ("mr.", "mr.")]
+        words = words.items() if isinstance(words, dict) else words
+        if words and isinstance(words[0], str):
+            words = [(x, x) for x in words]
+        for x, y in words:
+            REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
+            REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
+            REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."
+            self.add(x, y, REASON_AS_IS)
+            self.add(x.upper(), y.upper(), REASON_UPPER)
+            self.add(x[0].upper() + x[1:], y[0].upper() + y[1:], REASON_TITLE)
+
+    def add_domain_handler(self):
+        import re
+        from tldextract.tldextract import TLD_EXTRACTOR
+
+        valid_re = re.compile("^[a-zA-Z.]+$")
+        tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]
+
+        for x in tlds:
+            self.add(x, x, "Added by domain handler; keeps the token intact.")
+
+    def add_base_cases(self):
+        if self.numbers:
+            for x in "0123456789":
+                self.keep(x + ",")
+                self.keep(x + ".")
+
+        # self.tokenizer.add(" !", " ! ")
+
+        if self.combine_punctuation:
+            # combine multiples
+            R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
+            for s in "!.?-":
+                for i in range(2, 10):
+                    # one of these is a splitting char
+                    if i == 1 and s == "-":
+                        continue
+                    c = s * i
+                    e = s * 3 if i > 1 else s
+                    # end = "$$" if i == 1 or s != "-" else " "
+                    end = " \n" if i == 1 or s != "-" else " "
+                    self.add(c, " {}{}".format(e, end), R_COMBINE.format(c, e + end))
+
+            for i in range(2, 10):
+                # self.tokenizer.add("\n" * i, "$$")
+                self.add("\n" * i, " \n ", "merges newlines")
+
+        for s in "!.?-\n":
+            self.split(s, "Splits on '{}', creating a new sentence.".format(s))
+
+        self.split("- ")
+
+        self.split("...")
+
+        # does not work
+        # self.tokenizer.add_regex_handler(["!?"], "[!]+[?]+[!?]+", True, return_value=" !? ")
+
+        self.split("!?")
+        self.split("!?!")
+        self.split("!!?")
+        self.split("!??")
+        self.split("?!!")
+        self.split("?!?")
+        self.split("??!")
+
+        for x in string.ascii_letters:
+            self.keep(" " + x + ".")
+
+        # for x in string.ascii_letters:
+        #     self.tokenizer.add("\n" + x, "\n" + x)
+
+        self.split(",")
+
+        # quotes (make sure we add all the exceptions)
+        self.split("'")
+        self.split('"')
+
+    def keep(self, x, reason=None):
+        """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
+        self.tokenizer.add(x, x)
+        self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace("x", repr(x)).rstrip()
+
+    def split(self, x, reason=None):
+        """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
+        self.tokenizer.add(x, " {} ".format(x))
+        self.explain_dict[x] = (
+            reason or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip()
+        )
+
+    def drop(self, x, reason=None):
+        """ Whenever it finds x, it will remove it but add a split."""
+        self.tokenizer.add(x, " ")
+        self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace("x", repr(x)).rstrip()
+
+    def strip(self, x, reason=None):
+        """ Whenever it finds x, it will remove it without splitting. """
+        self.tokenizer.add(x, "")
+        self.explain_dict[x] = (
+            reason or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip()
+        )
+
+    def add(self, x, y, reason):
+        self.tokenizer.add(x, y)
+        self.explain_dict[x] = reason
+
+    def explain(self, char_or_chars):
+        keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
+        if not keys:
+            return {
+                "explanation": "No explanation, meaning there is nothing specified for the input"
+            }
+        return [
+            {"from": x, "to": self.tokenizer._root_dict[x], "explanation": self.explain_dict[x]}
+            for x in keys
+        ]
+
+    def remove(self, x, reason=None):
+        self.tokenizer.remove(x)
+        self.explain_dict[x] = reason or "removing '{}'".format(x)
+
+    def add_currencies(self):
+        for currency in self.currencies:
+            self.split(currency)
+
+            for num in "0123456789":
+                # to prevent the . and , from being treated as punct
+                for punc in ",.":
+                    s = "{currency}{num}{punc}".format(currency=currency, num=num, punc=punc)
+                    r = " {currency} {num}{punc}".format(currency=currency, num=num, punc=punc)
+                    self.add(s, r, "protecting currency from being seen as a number.")
+
+    def word_tokenize(self, z, return_entities=False, to_lower=False):
+        if return_entities:
+            a, b = self.tokenizer.replace(" " + z, return_entities=True)
+            return a.split(), b
+        res = self.tokenizer.replace(" " + z).split()
+        if to_lower:
+            res = [x.lower() for x in res]
+        return res
+
+    def word_newlined_tokenize(self, z):
+        sentences = self.sent_tokenize(z)
+        return sum([x + ["\n"] for x in sentences[:-1]], []) + sentences[-1]
+
+    def sent_tokenize(self, z):
+        return [x.split() for x in self.tokenizer.replace(z).split("\n") if x.strip()]
+
+
+t = Tokenizer(handle_http=True, handle_domains=False)
+word_tokenize = t.word_tokenize
+sent_tokenize = t.sent_tokenize