
Commit

initial commit
kootenpv committed Jul 4, 2019
0 parents commit 7aabb58
Showing 6 changed files with 401 additions and 0 deletions.
20 changes: 20 additions & 0 deletions .gitignore
@@ -0,0 +1,20 @@
*.pyc
*#*
*.DS_STORE
*.log
*Data.fs*
*flymake*
*egg*
build/
__pycache__/
/.Python
/bin/
/include/
/lib/
/pip-selfcheck.json
.tox/
comments/
dist/
*silly*
extras/
.cache/
84 changes: 84 additions & 0 deletions README.md
@@ -0,0 +1,84 @@
## tok

[![PyPI](https://img.shields.io/pypi/v/tok.svg?style=flat-square)](https://pypi.python.org/pypi/tok/)
[![PyPI](https://img.shields.io/pypi/pyversions/tok.svg?style=flat-square)](https://pypi.python.org/pypi/tok/)

The fastest and most complete and customizable tokenizer in Python.

Roughly 25x faster than the regex-based tokenizers in spaCy and NLTK.

### Installation

pip install tok

It depends on [textsearch](https://github.com/kootenpv/textsearch).

### Usage

By default it handles contractions, URLs, (floating-point) numbers, and currencies.

```python
from tok import word_tokenize
word_tokenize("I wouldn't do that.... would you?")
['I', 'would', 'not', 'do', 'that', '...', 'would', 'you', '?']
```
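
For instance, the defaults described above can be exercised like this (a minimal sketch; the exact token boundaries may vary between versions, so outputs are not shown):

```python
from tok import word_tokenize

# URLs should survive as single tokens instead of being split on punctuation.
word_tokenize("Visit https://example.com for details")

# Floating-point numbers and currency symbols are also handled by default.
word_tokenize("That costs $9.99 today")
```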

Or configure it yourself:

```python
from tok import Tokenizer
tokenizer = Tokenizer(protected_words=["some.thing"]) # still using the defaults
tokenizer.word_tokenize("I want to protect some.thing")
['I', 'want', 'to', 'protect', 'some.thing']
```

Split by sentences:

```python
from tok import sent_tokenize
sent_tokenize("I wouldn't do that.... would you?")
[['I', 'would', 'not', 'do', 'that', '...'], ['would', 'you', '?']]
```

For more options, check the documentation of the `Tokenizer` class.

### Further customization

Given:

```python
from tok import Tokenizer
t = Tokenizer(protected_words=["some.thing"]) # still using the defaults
```

You can add your own rules to the tokenizer using:

- `t.keep(x, reason)`: whenever it finds `x`, it will not add whitespace around it, so `x` is never split into separate tokens.
- `t.split(x, reason)`: whenever it finds `x`, it will surround it with whitespace, turning `x` into its own token.
- `t.drop(x, reason)`: whenever it finds `x`, it will remove it but still introduce a split at that position.
- `t.strip(x, reason)`: whenever it finds `x`, it will remove it without introducing a split.

```python
t.drop("bla", "bla is not needed")
t.word_tokenize("Please remove bla, thank you")
['Please', 'remove', ',', 'thank', 'you']
```
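
The other three hooks work analogously; here is a sketch based on the semantics described above (the sample strings and rules are illustrative, and the resulting splits are not verified output):

```python
# keep: protect "e.g." so the periods inside it never trigger a split.
t.keep("e.g.", "abbreviation should stay intact")

# split: turn every hyphen into its own token.
t.split("-", "treat hyphens as separate tokens")

# strip: silently remove trademark signs without adding a split.
t.strip("™", "trademark signs are noise")

t.word_tokenize("e.g. state-of-the-art Tok™")
```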

### Explainable

Explain what happened:

```python
t.explain("bla")
[{'from': 'bla', 'to': ' ', 'explanation': 'bla is not needed'}]
```

Inspect everything it contains (this will help you understand how it works):

```python
t.explain_dict
```
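
For example, to skim a few of the registered rules (assuming `explain_dict` is a plain mapping, as the name and the `explain` output above suggest):

```python
# Print a handful of entries; each maps a matched string to how it is handled.
for key, value in list(t.explain_dict.items())[:5]:
    print(key, "->", value)
```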

### Contributing

Contributions to this library would be greatly appreciated, as would, for example, adding more languages to [contractions](https://github.com/kootenpv/contractions).
28 changes: 28 additions & 0 deletions deploy.py
@@ -0,0 +1,28 @@
""" File unrelated to the package, except for convenience in deploying """
import re
import sh
import os

commit_count = sh.git('rev-list', ['--all']).count('\n')

with open('setup.py') as f:
setup = f.read()

setup = re.sub("MICRO_VERSION = '[0-9]+'", "MICRO_VERSION = '{}'".format(commit_count), setup)

major = re.search("MAJOR_VERSION = '([0-9]+)'", setup).groups()[0]
minor = re.search("MINOR_VERSION = '([0-9]+)'", setup).groups()[0]
micro = re.search("MICRO_VERSION = '([0-9]+)'", setup).groups()[0]
version = '{}.{}.{}'.format(major, minor, micro)

with open('setup.py', 'w') as f:
f.write(setup)

with open('tok/__init__.py') as f:
init = f.read()

with open('tok/__init__.py', 'w') as f:
f.write(re.sub('__version__ = "[0-9.]+"', '__version__ = "{}"'.format(version), init))

py_version = "python3.7" if sh.which("python3.7") is not None else "python"
os.system('{} setup.py sdist bdist_wheel upload'.format(py_version))
8 changes: 8 additions & 0 deletions setup.cfg
@@ -0,0 +1,8 @@
[metadata]
description-file = README.md

[bdist_rpm]
doc_files = README.md

[bdist_wheel]
universal = 1
53 changes: 53 additions & 0 deletions setup.py
@@ -0,0 +1,53 @@
from setuptools import find_packages
from setuptools import setup

MAJOR_VERSION = '0'
MINOR_VERSION = '0'
MICRO_VERSION = '1'
VERSION = "{}.{}.{}".format(MAJOR_VERSION, MINOR_VERSION, MICRO_VERSION)

with open("README.md") as f:
LONG_DESCRIPTION = f.read()

setup(
name='tok',
version=VERSION,
description="Fast and customizable tokenizer",
long_description=LONG_DESCRIPTION,
long_description_content_type="text/markdown",
url='https://github.com/kootenpv/tok',
author='Pascal van Kooten',
author_email='kootenpv@gmail.com',
license='MIT',
packages=find_packages(),
include_package_data=True,
install_requires=["textsearch", "tldextract"],
classifiers=[
'Environment :: Console',
'Intended Audience :: Developers',
'Intended Audience :: Customer Service',
'Intended Audience :: System Administrators',
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
'Operating System :: Microsoft',
'Operating System :: MacOS :: MacOS X',
'Operating System :: Unix',
'Operating System :: POSIX',
'Programming Language :: Python',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Topic :: Software Development',
'Topic :: Software Development :: Build Tools',
'Topic :: Software Development :: Debuggers',
'Topic :: Software Development :: Libraries',
'Topic :: Software Development :: Libraries :: Python Modules',
'Topic :: System :: Software Distribution',
'Topic :: System :: Systems Administration',
'Topic :: Utilities',
],
zip_safe=False,
platforms='any',
)
