diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..cae10c9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,20 @@
+*.pyc
+*#*
+*.DS_STORE
+*.log
+*Data.fs*
+*flymake*
+*egg*
+build/
+__pycache__/
+/.Python
+/bin/
+/include/
+/lib/
+/pip-selfcheck.json
+.tox/
+comments/
+dist/
+*silly*
+extras/
+.cache/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..bc6ac55
--- /dev/null
+++ b/README.md
@@ -0,0 +1,84 @@
+## tok
+
+[![PyPI](https://img.shields.io/pypi/v/tok.svg?style=flat-square)](https://pypi.python.org/pypi/tok/)
+[![PyPI](https://img.shields.io/pypi/pyversions/tok.svg?style=flat-square)](https://pypi.python.org/pypi/tok/)
+
+Fastest and most complete/customizable tokenizer in Python.
+
+Roughly 25x faster than spaCy's and NLTK's regex-based tokenizers.
+
+### Installation
+
+    pip install tok
+
+It depends on [textsearch](https://github.com/kootenpv/textsearch).
+
+### Usage
+
+By default it handles contractions, URLs (http), (float) numbers and currencies.
+
+```python
+from tok import word_tokenize
+word_tokenize("I wouldn't do that.... would you?")
+['I', 'would', 'not', 'do', 'that', '...', 'would', 'you', '?']
+```
+
+Or configure it yourself:
+
+```python
+from tok import Tokenizer
+tokenizer = Tokenizer(protected_words=["some.thing"]) # still using the defaults
+tokenizer.word_tokenize("I want to protect some.thing")
+['I', 'want', 'to', 'protect', 'some.thing']
+```
+
+Split by sentences:
+
+```python
+from tok import sent_tokenize
+sent_tokenize("I wouldn't do that.... would you?")
+[['I', 'would', 'not', 'do', 'that', '...'], ['would', 'you', '?']]
+```
+
+For more options, check the documentation of the `Tokenizer` class.
+
+### Further customization
+
+Given:
+
+```python
+from tok import Tokenizer
+t = Tokenizer(protected_words=["some.thing"]) # still using the defaults
+```
+
+You can add your own rules to the tokenizer by using:
+
+- `t.keep(x, reason)`: Whenever it finds x, it will not add whitespace around it, so x is kept as a single token.
+- `t.split(x, reason)`: Whenever it finds x, it will surround it with whitespace, thus creating a token.
+- `t.drop(x, reason)`: Whenever it finds x, it will remove it but add a split.
+- `t.strip(x, reason)`: Whenever it finds x, it will remove it without splitting.
+
+```python
+t.drop("bla", "bla is not needed")
+t.word_tokenize("Please remove bla, thank you")
+['Please', 'remove', ',', 'thank', 'you']
+```
+
+### Explainable
+
+Explain what happened:
+
+```python
+t.explain("bla")
+[{'from': 'bla', 'to': ' ', 'explanation': 'bla is not needed'}]
+```
+
+See everything that has been added (it will help you understand how it works):
+
+```python
+t.explain_dict
+```
+
+### Contributing
+
+It would be greatly appreciated if you want to contribute to this library, or, for example, to add more languages to [contractions](https://github.com/kootenpv/contractions).
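Following the `drop` example in the README above, here is a minimal sketch of the `split` and `strip` hooks. The `/` and `*` rules are illustrative additions rather than defaults, and the output shown is what those rules should produce, not a captured run:

```python
from tok import Tokenizer

t = Tokenizer()
t.split("/", "slashes separate alternatives")      # '/' becomes its own token
t.strip("*", "asterisks are markup, not content")  # removed without creating a split
t.word_tokenize("Drink tea and/or *good* coffee")
['Drink', 'tea', 'and', '/', 'or', 'good', 'coffee']
```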
diff --git a/deploy.py b/deploy.py
new file mode 100644
index 0000000..e9f6ab1
--- /dev/null
+++ b/deploy.py
@@ -0,0 +1,28 @@
+""" File unrelated to the package, except for convenience in deploying """
+import re
+import sh
+import os
+
+commit_count = sh.git('rev-list', ['--all']).count('\n')
+
+with open('setup.py') as f:
+    setup = f.read()
+
+setup = re.sub("MICRO_VERSION = '[0-9]+'", "MICRO_VERSION = '{}'".format(commit_count), setup)
+
+major = re.search("MAJOR_VERSION = '([0-9]+)'", setup).groups()[0]
+minor = re.search("MINOR_VERSION = '([0-9]+)'", setup).groups()[0]
+micro = re.search("MICRO_VERSION = '([0-9]+)'", setup).groups()[0]
+version = '{}.{}.{}'.format(major, minor, micro)
+
+with open('setup.py', 'w') as f:
+    f.write(setup)
+
+with open('tok/__init__.py') as f:
+    init = f.read()
+
+with open('tok/__init__.py', 'w') as f:
+    f.write(re.sub('__version__ = "[0-9.]+"', '__version__ = "{}"'.format(version), init))
+
+py_version = "python3.7" if sh.which("python3.7") is not None else "python"
+os.system('{} setup.py sdist bdist_wheel upload'.format(py_version))
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..f528236
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,8 @@
+[metadata]
+description-file = README.md
+
+[bdist_rpm]
+doc_files = README.md
+
+[wheel]
+universal = 1
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..7e03863
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,53 @@
+from setuptools import find_packages
+from setuptools import setup
+
+MAJOR_VERSION = '0'
+MINOR_VERSION = '0'
+MICRO_VERSION = '1'
+VERSION = "{}.{}.{}".format(MAJOR_VERSION, MINOR_VERSION, MICRO_VERSION)
+
+with open("README.md") as f:
+    LONG_DESCRIPTION = f.read()
+
+setup(
+    name='tok',
+    version=VERSION,
+    description="Fast and customizable tokenizer",
+    long_description=LONG_DESCRIPTION,
+    long_description_content_type="text/markdown",
+    url='https://github.com/kootenpv/tok',
+    author='Pascal van Kooten',
+    author_email='kootenpv@gmail.com',
+    license='MIT',
+    packages=find_packages(),
+    include_package_data=True,
+    install_requires=["textsearch", "tldextract"],
+    classifiers=[
+        'Environment :: Console',
+        'Intended Audience :: Developers',
+        'Intended Audience :: Customer Service',
+        'Intended Audience :: System Administrators',
+        'License :: OSI Approved :: MIT License',
+        'Operating System :: Microsoft',
+        'Operating System :: MacOS :: MacOS X',
+        'Operating System :: Unix',
+        'Operating System :: POSIX',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Topic :: Software Development',
+        'Topic :: Software Development :: Build Tools',
+        'Topic :: Software Development :: Debuggers',
+        'Topic :: Software Development :: Libraries',
+        'Topic :: Software Development :: Libraries :: Python Modules',
+        'Topic :: System :: Software Distribution',
+        'Topic :: System :: Systems Administration',
+        'Topic :: Utilities',
+    ],
+    zip_safe=False,
+    platforms='any',
+)
diff --git a/tok/__init__.py b/tok/__init__.py
new file mode 100644
index 0000000..8fafca1
--- /dev/null
+++ b/tok/__init__.py
@@ -0,0 +1,208 @@
+__project__ = "tokenize"
+__version__ = "0.0.1"
+__repo__ = "https://github.com/kootenpv/tok"
+
+import string
+from textsearch import TextSearch
+from contractions import contractions_dict
+
+
+class Tokenizer:
+    def __init__(
+        self,
+        handle_http=False,
+        handle_domains=False,
+        numbers=True,
+        combine_punctuation=True,
+        eol="\n",
+        currencies=("$",),
+        protected_words=None,
+        contractions=True,
+        language="en",
+    ):
+        # set(), set() should fall back to just using __iter__ of the automaton for a speed boost
+        if language != "en" and contractions:
+            raise ValueError("No contractions known for languages other than English.")
+        self.contractions = contractions
+        self.tokenizer = None
+        self.handle_http = handle_http
+        self.handle_domains = handle_domains
+        self.combine_punctuation = combine_punctuation
+        self.numbers = numbers
+        self.eol = eol
+        self.currencies = currencies or []
+        self.protected_words = protected_words or []
+        self.explain_dict = {}
+        self.setup()
+
+    def setup(self):
+        self.tokenizer = TextSearch("sensitive", "norm", set(), set())
+        self.add_base_cases()
+        self.add_currencies()
+        self.add_words(self.protected_words)
+        if self.handle_http:
+            self.tokenizer.add_http_handler(keep_result=True)
+            for word in ["http://", "https://", "www."]:
+                self.explain_dict[
+                    word
+                ] = "regex: when it finds '{}' it will stop after it finds a space.".format(word)
+        if self.handle_domains:
+            self.add_domain_handler()
+        if self.contractions:
+            if self.contractions == True:
+                self.contractions = contractions_dict
+            self.add_words(self.contractions)
+
+    def add_words(self, words):
+        # [("cannot", "can not"), ("can't", "can n't"), ("mr.", "mr.")]
+        words = words.items() if isinstance(words, dict) else words
+        if words and isinstance(words[0], str):
+            words = [(x, x) for x in words]
+        for x, y in words:
+            REASON_AS_IS = "protected word: adds word as is, prevents splitting it."
+            REASON_UPPER = "protected word: adds word uppercased, prevents splitting it."
+            REASON_TITLE = "protected word: adds word titlecased, prevents splitting it."
+            self.add(x, y, REASON_AS_IS)
+            self.add(x.upper(), y.upper(), REASON_UPPER)
+            self.add(x[0].upper() + x[1:], y[0].upper() + y[1:], REASON_TITLE)
+
+    def add_domain_handler(self):
+        import re
+        from tldextract.tldextract import TLD_EXTRACTOR
+
+        valid_re = re.compile("^[a-zA-Z.]+$")
+        tlds = ["." + x for x in TLD_EXTRACTOR.tlds if valid_re.match(x)]
+
+        for x in tlds:
+            self.add(x, x, "Added by domain handler; keeps the token intact.")
+
+    def add_base_cases(self):
+        if self.numbers:
+            for x in "0123456789":
+                self.keep(x + ",")
+                self.keep(x + ".")
+
+        # self.tokenizer.add(" !", " ! ")
+
+        if self.combine_punctuation:
+            # combine multiples
+            R_COMBINE = "combine punctuation: merges '{}' into '{}' and starts a new sentence."
+            for s in "!.?-":
+                for i in range(2, 10):
+                    # one of these is a splitting char
+                    if i == 1 and s == "-":
+                        continue
+                    c = s * i
+                    e = s * 3 if i > 1 else s
+                    # end = "$$" if i == 1 or s != "-" else " "
+                    end = " \n" if i == 1 or s != "-" else " "
+                    self.add(c, " {}{}".format(e, end), R_COMBINE.format(c, e + end))
+
+            for i in range(2, 10):
+                # self.tokenizer.add("\n" * i, "$$")
+                self.add("\n" * i, " \n ", "merges newlines")
+
+        for s in "!.?-\n":
+            self.split(s, "Splits on '{}', creating a new sentence.".format(s))
+
+        self.split("- ")
+
+        self.split("...")
+
+        # does not work
+        # self.tokenizer.add_regex_handler(["!?"], "[!]+[?]+[!?]+", True, return_value=" !? ")
+
+        self.split("!?")
+        self.split("!?!")
+        self.split("!!?")
+        self.split("!??")
+        self.split("?!!")
+        self.split("?!?")
+        self.split("??!")
+
+        for x in string.ascii_letters:
+            self.keep(" " + x + ".")
+
+        # for x in string.ascii_letters:
+        #     self.tokenizer.add("\n" + x, "\n" + x)
+
+        self.split(",")
+
+        # quotes (make sure we add all the exceptions)
+        self.split("'")
+        self.split('"')
+
+    def keep(self, x, reason=None):
+        """ Whenever it finds x, it will not add whitespace. Prevents direct tokenization. """
+        self.tokenizer.add(x, x)
+        self.explain_dict[x] = reason or "keep:" + self.keep.__doc__.replace("x", repr(x)).rstrip()
+
+    def split(self, x, reason=None):
+        """ Whenever it finds x, it will surround it by whitespace, thus creating a token. """
+        self.tokenizer.add(x, " {} ".format(x))
+        self.explain_dict[x] = (
+            reason or "split:" + self.split.__doc__.replace("x", repr(x)).rstrip()
+        )
+
+    def drop(self, x, reason=None):
+        """ Whenever it finds x, it will remove it but add a split."""
+        self.tokenizer.add(x, " ")
+        self.explain_dict[x] = reason or "drop:" + self.drop.__doc__.replace("x", repr(x)).rstrip()
+
+    def strip(self, x, reason=None):
+        """ Whenever it finds x, it will remove it without splitting. """
+        self.tokenizer.add(x, "")
+        self.explain_dict[x] = (
+            reason or "strip:" + self.strip.__doc__.replace("x", repr(x)).rstrip()
+        )
+
+    def add(self, x, y, reason):
+        self.tokenizer.add(x, y)
+        self.explain_dict[x] = reason
+
+    def explain(self, char_or_chars):
+        keys = [x for x in self.tokenizer._root_dict if char_or_chars in x]
+        if not keys:
+            return {
+                "explanation": "No explanation, meaning there is nothing specified for the input"
+            }
+        return [
+            {"from": x, "to": self.tokenizer._root_dict[x], "explanation": self.explain_dict[x]}
+            for x in keys
+        ]
+
+    def remove(self, x, reason=None):
+        self.tokenizer.remove(x)
+        self.explain_dict[x] = reason or "removing '{}'".format(x)
+
+    def add_currencies(self):
+        for currency in self.currencies:
+            self.split(currency)
+
+            for num in "0123456789":
+                # to prevent the . and , from being treated as punct
+                for punc in ",.":
+                    s = "{currency}{num}{punc}".format(currency=currency, num=num, punc=punc)
+                    r = " {currency} {num}{punc}".format(currency=currency, num=num, punc=punc)
+                    self.add(s, r, "protecting currency from being seen as a number.")
+
+    def word_tokenize(self, z, return_entities=False, to_lower=False):
+        if return_entities:
+            a, b = self.tokenizer.replace(" " + z, return_entities=True)
+            return a.split(), b
+        res = self.tokenizer.replace(" " + z).split()
+        if to_lower:
+            res = [x.lower() for x in res]
+        return res
+
+    def word_newlined_tokenize(self, z):
+        sentences = self.sent_tokenize(z)
+        return sum([x + ["\n"] for x in sentences[:-1]], []) + sentences[-1]
+
+    def sent_tokenize(self, z):
+        return [x.split() for x in self.tokenizer.replace(z).split("\n") if x.strip()]
+
+
+t = Tokenizer(handle_http=True, handle_domains=False)
+word_tokenize = t.word_tokenize
+sent_tokenize = t.sent_tokenize