Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 7aabb58
Showing
6 changed files
with
401 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
*.pyc | ||
*#* | ||
.DS_Store
*.log | ||
*Data.fs* | ||
*flymake* | ||
*egg* | ||
build/ | ||
__pycache__/ | ||
/.Python | ||
/bin/ | ||
/include/ | ||
/lib/ | ||
/pip-selfcheck.json | ||
.tox/ | ||
comments/ | ||
dist/ | ||
*silly* | ||
extras/ | ||
.cache/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
## tok | ||
|
||
[![PyPI](https://img.shields.io/pypi/v/tok.svg?style=flat-square)](https://pypi.python.org/pypi/tok/) | ||
[![PyPI](https://img.shields.io/pypi/pyversions/tok.svg?style=flat-square)](https://pypi.python.org/pypi/tok/) | ||
|
||
Fastest and most complete/customizable tokenizer in Python. | ||
|
||
Roughly 25x faster than spacy's and nltk's regex based tokenizers. | ||
|
||
### Installation | ||
|
||
pip install tok | ||
|
||
It depends on [textsearch](https://github.com/kootenpv/textsearch). | ||
|
||
### Usage | ||
|
||
By default it handles contractions, http, (float) numbers and currencies. | ||
|
||
```python | ||
from tok import word_tokenize | ||
word_tokenize("I wouldn't do that.... would you?") | ||
['I', 'would', 'not', 'do', 'that', '...', 'would', 'you', '?'] | ||
``` | ||
|
||
Or configure it yourself: | ||
|
||
```python | ||
from tok import Tokenizer | ||
tokenizer = Tokenizer(protected_words=["some.thing"]) # still using the defaults | ||
tokenizer.word_tokenize("I want to protect some.thing") | ||
['I', 'want', 'to', 'protect', 'some.thing'] | ||
``` | ||
|
||
Split by sentences: | ||
|
||
```python | ||
from tok import sent_tokenize | ||
sent_tokenize("I wouldn't do that.... would you?") | ||
[['I', 'would', 'not', 'do', 'that', '...'], ['would', 'you', '?']] | ||
``` | ||
|
||
for more options check the documentation of the `Tokenizer`. | ||
|
||
### Further customization | ||
|
||
Given: | ||
|
||
```python | ||
from tok import Tokenizer | ||
t = Tokenizer(protected_words=["some.thing"]) # still using the defaults | ||
``` | ||
|
||
You can add your own ideas to the tokenizer by using: | ||
|
||
- `t.keep(x, reason)`: Whenever it finds x, it will not add whitespace. Prevents direct tokenization. | ||
- `t.split(x, reason)`: Whenever it finds x, it will surround it by whitespace, thus creating a token. | ||
- `t.drop(x, reason)`: Whenever it finds x, it will remove it but add a split. | ||
- `t.strip(x, reason)`: Whenever it finds x, it will remove it without splitting. | ||
|
||
```python | ||
t.drop("bla", "bla is not needed")
t.word_tokenize("Please remove bla, thank you") | ||
['Please', 'remove', ',', 'thank', 'you'] | ||
``` | ||
|
||
### Explainable | ||
|
||
Explain what happened: | ||
|
||
```python | ||
t.explain("bla") | ||
[{'from': 'bla', 'to': ' ', 'explanation': 'bla is not needed'}] | ||
``` | ||
|
||
See everything in there (will help you understand how it works): | ||
|
||
```python | ||
t.explain_dict | ||
``` | ||
|
||
### Contributing | ||
|
||
It would be greatly appreciated if you want to contribute to this library, or for example to add more languages to [contractions](https://github.com/kootenpv/contractions).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
""" File unrelated to the package, except for convenience in deploying """
import re
import sh
import os

# Pattern for pulling a single version component out of setup.py.
_COMPONENT_RE = "{} = '([0-9]+)'"

# The micro version is the total commit count across all refs, so every
# deploy automatically gets a fresh, monotonically increasing version.
commit_count = sh.git('rev-list', ['--all']).count('\n')

with open('setup.py') as fh:
    setup_src = fh.read()

# Stamp the commit count into setup.py before reading the version back out.
setup_src = re.sub("MICRO_VERSION = '[0-9]+'",
                   "MICRO_VERSION = '{}'".format(commit_count),
                   setup_src)

parts = [re.search(_COMPONENT_RE.format(name), setup_src).group(1)
         for name in ('MAJOR_VERSION', 'MINOR_VERSION', 'MICRO_VERSION')]
version = '{}.{}.{}'.format(*parts)

with open('setup.py', 'w') as fh:
    fh.write(setup_src)

# Keep tok/__init__.py's __version__ in lockstep with setup.py.
with open('tok/__init__.py') as fh:
    init_src = fh.read()

with open('tok/__init__.py', 'w') as fh:
    fh.write(re.sub('__version__ = "[0-9.]+"',
                    '__version__ = "{}"'.format(version),
                    init_src))

# Prefer python3.7 when available for building/uploading the distribution.
py_version = "python3.7" if sh.which("python3.7") is not None else "python"
os.system('{} setup.py sdist bdist_wheel upload'.format(py_version))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
[metadata] | ||
description_file = README.md
|
||
[bdist_rpm] | ||
doc_files = README.md | ||
|
||
[bdist_wheel]
universal = 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from setuptools import find_packages
from setuptools import setup

# Version components; MICRO_VERSION is rewritten by deploy.py at release time.
MAJOR_VERSION = '0'
MINOR_VERSION = '0'
MICRO_VERSION = '1'
VERSION = "{}.{}.{}".format(MAJOR_VERSION, MINOR_VERSION, MICRO_VERSION)

# Reuse the README verbatim as the PyPI long description.
with open("README.md") as f:
    LONG_DESCRIPTION = f.read()

setup(
    name='tok',
    version=VERSION,
    description="Fast and customizable tokenizer",
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    url='https://github.com/kootenpv/tok',
    author='Pascal van Kooten',
    author_email='kootenpv@gmail.com',
    license='MIT',
    packages=find_packages(),
    include_package_data=True,
    install_requires=["textsearch", "tldextract"],
    classifiers=[
        'Environment :: Console',
        'Intended Audience :: Developers',
        'Intended Audience :: Customer Service',
        'Intended Audience :: System Administrators',
        # Fixed: classifier previously said GPLv3, contradicting license='MIT' above.
        'License :: OSI Approved :: MIT License',
        'Operating System :: Microsoft',
        'Operating System :: MacOS :: MacOS X',
        'Operating System :: Unix',
        'Operating System :: POSIX',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Topic :: Software Development',
        'Topic :: Software Development :: Build Tools',
        'Topic :: Software Development :: Debuggers',
        'Topic :: Software Development :: Libraries',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: System :: Software Distribution',
        'Topic :: System :: Systems Administration',
        'Topic :: Utilities',
    ],
    zip_safe=False,
    platforms='any',
)
Oops, something went wrong.