# How to Prepare Text Data With Keras

#### Split Words with `text_to_word_sequence`

In [2]:
from keras.preprocessing.text import text_to_word_sequence
# Sample sentence: padded with a space on each side and ending in a period.
text = " The quick brown fox jumped over the lazy dog. "
# Break the sentence into a list of word tokens.
result = text_to_word_sequence(text)
# Display the tokens.
print(result)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


#### Encoding with `one_hot`

In [7]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence

# Sentence to encode.
text = " The quick brown fox jumped over the lazy dog. "

# Estimate the vocabulary size as the number of distinct tokens.
tokens = text_to_word_sequence(text)
words = set(tokens)
print(words)
vocab_size = len(words)
print(f"\n\n {vocab_size}")

# Encode the sentence as one integer per word. The size passed to one_hot is
# the vocabulary estimate enlarged by 30%.
# NOTE: despite the name, the result is a flat list of integers rather than
# one-hot vectors, and different words can receive the same integer (the
# printed output repeats values for distinct words).
encoding_size = round(vocab_size * 1.3)
result = one_hot(text, encoding_size)
print(f"\n\n {result}")

{'the', 'lazy', 'brown', 'jumped', 'over', 'quick', 'fox', 'dog'}


 8


 [9, 4, 8, 6, 9, 4, 9, 8, 7]


#### Hash Encoding with `hashing_trick`

In [9]:
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence
# Sentence to encode.
text = "The quick brown fox jumped over the lazy dog."

# Estimate the vocabulary size as the number of distinct tokens.
tokens = text_to_word_sequence(text)
words = set(tokens)
print(words)
vocab_size = len(words)
print(f"\n\n {vocab_size}")

# Encode the sentence as one integer per word, using md5 as the hash function
# and a size 30% larger than the vocabulary estimate.
# NOTE(review): md5 is presumably chosen so the encoding is reproducible
# between runs -- confirm against the Keras documentation.
encoding_size = round(vocab_size * 1.3)
result = hashing_trick(text, encoding_size, hash_function="md5")
print(f"\n\n {result}")

{'the', 'lazy', 'brown', 'jumped', 'over', 'quick', 'fox', 'dog'}


 8


 [6, 4, 1, 2, 7, 5, 6, 2, 6]


#### Tokenizer API

In [11]:
from keras.preprocessing.text import Tokenizer
# Five short documents to learn a vocabulary from.
docs = [
    " Well done! ",
    " Good work ",
    " Great effort ",
    " nice work ",
    " Excellent! ",
]

# Build a tokenizer and fit it on the documents.
t = Tokenizer()
t.fit_on_texts(docs)

# Report what the tokenizer learned:
#   word_counts    -- occurrences of each word across all documents
#   document_count -- number of documents seen during fitting
#   word_index     -- integer index assigned to each word
#   word_docs      -- number of documents each word appears in
print(t.word_counts)
print("\n\n", t.document_count)
print("\n\n", t.word_index)
print("\n\n", t.word_docs)

# Encode the documents as a matrix of per-document word counts:
# one row per document, with columns corresponding to word_index values.
encoded_docs = t.texts_to_matrix(docs, mode="count")
print("\n\n", encoded_docs)

OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])


 5


 {'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}


 defaultdict(<class 'int'>, {'done': 1, 'well': 1, 'work': 2, 'good': 1, 'great': 1, 'effort': 1, 'nice': 1, 'excellent': 1})


 [[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
