65 changes: 56 additions & 9 deletions docs/templates/preprocessing/text.md
@@ -2,8 +2,10 @@
## text_to_word_sequence

```python
keras.preprocessing.text.text_to_word_sequence(text,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
keras.preprocessing.text.text_to_word_sequence(text,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=" ")
```

Split a sentence into a list of words.
@@ -12,29 +14,74 @@ Split a sentence into a list of words.

- __Arguments__:
- __text__: str.
- __filters__: list (or concatenation) of characters to filter out, such as punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' , includes basic punctuation, tabs, and newlines.
- __filters__: list (or concatenation) of characters to filter out, such as
punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', which includes
basic punctuation, tabs, and newlines.
- __lower__: boolean. Whether to set the text to lowercase.
- __split__: str. Separator for word splitting.
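
As a rough illustration of the documented behavior, here is a standalone sketch (the function name and exact filtering details are assumptions, not the Keras implementation):

```python
def simple_text_to_word_sequence(text,
                                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                 lower=True,
                                 split=' '):
    # Hypothetical standalone mirror of the documented behavior.
    if lower:
        text = text.lower()
    # Replace every filtered character with the split marker.
    text = text.translate({ord(c): split for c in filters})
    # Drop empty strings produced by consecutive separators.
    return [w for w in text.split(split) if w]
```

For example, `simple_text_to_word_sequence('The cat sat on the mat.')` yields `['the', 'cat', 'sat', 'on', 'the', 'mat']`: the trailing period is filtered out and the text is lowercased before splitting.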

## one_hot

```python
keras.preprocessing.text.one_hot(text, n,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" ")
keras.preprocessing.text.one_hot(text,
n,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=" ")
```

One-hot encode a text into a list of word indexes in a vocabulary of size n.
One-hot encodes a text into a list of word indexes in a vocabulary of size n.

This is a wrapper around the `hashing_trick` function, using `hash` as the hashing function.

- __Return__: List of integers in [1, n]. Each integer encodes a word (uniqueness is not guaranteed).

- __Arguments__: Same as `text_to_word_sequence` above.
- __Arguments__:
- __text__: str.
- __n__: int. Size of vocabulary.
- __filters__: list (or concatenation) of characters to filter out, such as
punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', which includes
basic punctuation, tabs, and newlines.
- __lower__: boolean. Whether to set the text to lowercase.
- __split__: str. Separator for word splitting.
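
The wrapper relationship can be sketched with standalone toy versions (hypothetical names, assuming Python's builtin `hash`; not the Keras API itself):

```python
def toy_hashing_trick(text, n, hash_function=hash, lower=True, split=' '):
    # Map each word into [1, n-1]; 0 is reserved and never assigned.
    words = (text.lower() if lower else text).split(split)
    return [hash_function(w) % (n - 1) + 1 for w in words]

def toy_one_hot(text, n, lower=True, split=' '):
    # one_hot is simply hashing_trick with the builtin `hash`.
    return toy_hashing_trick(text, n, hash_function=hash, lower=lower, split=split)
```

Within a single run, repeated words map to the same index, but distinct words may collide, which is why uniqueness is not guaranteed.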

## hashing_trick

```python
keras.preprocessing.text.hashing_trick(text,
n,
hash_function=None,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=' ')
```

Converts a text to a sequence of indices in a fixed-size hashing space.

- __Return__:
A list of integer word indices (uniqueness is not guaranteed).
- __Arguments__:
- __text__: str.
- __n__: Dimension of the hashing space.
- __hash_function__: defaults to the Python `hash` function; can be 'md5' or
any function that takes a string as input and returns an int.
Note that `hash` is not a stable hashing function, so
it is not consistent across different runs, while 'md5'
is a stable hashing function.
- __filters__: list (or concatenation) of characters to filter out, such as
punctuation. Default: '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', which includes
basic punctuation, tabs, and newlines.
- __lower__: boolean. Whether to set the text to lowercase.
- __split__: str. Separator for word splitting.
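
The stability of the 'md5' option can be illustrated with a standalone sketch (a hypothetical helper, not the Keras function):

```python
from hashlib import md5

def md5_index(word, n):
    # md5 of the word is deterministic across runs and machines,
    # reduced to an index in [1, n-1]; 0 is reserved.
    return int(md5(word.encode()).hexdigest(), 16) % (n - 1) + 1
```

By contrast, Python's builtin `hash` is randomized per process for strings (hash randomization), so indices computed with it generally differ between runs.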

## Tokenizer

```python
keras.preprocessing.text.Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True, split=" ", char_level=False)
keras.preprocessing.text.Tokenizer(num_words=None,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=" ",
char_level=False)
```

Class for vectorizing texts and/or turning texts into sequences (i.e. lists of word indexes, where the word of rank i in the dataset, starting at 1, has index i).
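
The rank-based indexing described above can be sketched as a toy class (a minimal illustration under assumed behavior, not the Keras `Tokenizer`):

```python
from collections import OrderedDict

class MiniTokenizer:
    """Toy word tokenizer: index 1 = most frequent word; 0 is reserved."""

    def __init__(self, num_words=None, lower=True, split=' '):
        self.num_words = num_words
        self.lower = lower
        self.split = split
        self.word_index = {}

    def _words(self, text):
        return (text.lower() if self.lower else text).split(self.split)

    def fit_on_texts(self, texts):
        counts = OrderedDict()
        for text in texts:
            for w in self._words(text):
                counts[w] = counts.get(w, 0) + 1
        # Rank words by frequency (ties keep first-seen order);
        # indexes start at 1 because 0 is reserved.
        ranked = sorted(counts, key=counts.get, reverse=True)
        self.word_index = {w: i + 1 for i, w in enumerate(ranked)}

    def texts_to_sequences(self, texts):
        limit = self.num_words
        return [[self.word_index[w] for w in self._words(text)
                 if w in self.word_index
                 and (limit is None or self.word_index[w] < limit)]
                for text in texts]
```

With `num_words` set, only the most frequent words (those with index below the limit) survive in the output sequences, mirroring the vocabulary-capping behavior the real class provides.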
55 changes: 52 additions & 3 deletions keras/preprocessing/text.py
@@ -8,11 +8,13 @@

import string
import sys
import warnings
from collections import OrderedDict
from hashlib import md5

import numpy as np
from six.moves import range
from six.moves import zip
from collections import OrderedDict
import warnings

if sys.version_info < (3,):
maketrans = string.maketrans
@@ -45,11 +47,58 @@ def one_hot(text, n,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=' '):
"""One-hot encodes a text into a list of word indexes of size n.

This is a wrapper around the `hashing_trick` function using `hash` as the
hashing function; uniqueness of the word-to-index mapping is not guaranteed.
"""
return hashing_trick(text, n,
hash_function=hash,
filters=filters,
lower=lower,
split=split)


def hashing_trick(text, n,
hash_function=None,
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
lower=True,
split=' '):
"""Converts a text to a sequence of indexes in a fixed-size hashing space.

# Arguments
text: Input text (string).
n: Dimension of the hashing space.
hash_function: if `None`, uses the Python `hash` function; can be 'md5' or
any function that takes a string as input and returns an int.
Note that `hash` is not a stable hashing function, so
it is not consistent across different runs, while 'md5'
is a stable hashing function.
filters: Sequence of characters to filter out.
lower: Whether to convert the input to lowercase.
split: Sentence split marker (string).

# Returns
A list of integer word indices (uniqueness is not guaranteed).

`0` is a reserved index that won't be assigned to any word.

Two or more words may be assigned to the same index, due to possible
collisions by the hashing function.
The [probability](https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
of a collision depends on the dimension of the hashing space and
the number of distinct objects.
"""
if hash_function is None:
hash_function = hash
elif hash_function == 'md5':
hash_function = lambda w: int(md5(w.encode()).hexdigest(), 16)

seq = text_to_word_sequence(text,
filters=filters,
lower=lower,
split=split)
return [(abs(hash(w)) % (n - 1) + 1) for w in seq]
return [(hash_function(w) % (n - 1) + 1) for w in seq]


class Tokenizer(object):
21 changes: 19 additions & 2 deletions tests/keras/preprocessing/text_test.py
@@ -1,6 +1,7 @@
from keras.preprocessing.text import Tokenizer, one_hot
import pytest
import numpy as np
import pytest

from keras.preprocessing.text import Tokenizer, one_hot, hashing_trick


def test_one_hot():
@@ -11,6 +12,22 @@ def test_one_hot():
assert np.min(encoded) >= 0


def test_hashing_trick_hash():
text = 'The cat sat on the mat.'
encoded = hashing_trick(text, 5)
assert len(encoded) == 6
assert np.max(encoded) <= 4
assert np.min(encoded) >= 1


def test_hashing_trick_md5():
text = 'The cat sat on the mat.'
encoded = hashing_trick(text, 5, hash_function='md5')
assert len(encoded) == 6
assert np.max(encoded) <= 4
assert np.min(encoded) >= 1


def test_tokenizer():
texts = ['The cat sat on the mat.',
'The dog sat on the log.',