# Develop Deep Learning Models for Natural Language in Python

## 7 - How to Prepare Data with Keras

### 7.2 - Split words with *text_to_word_sequence*

In [3]:
from keras.preprocessing.text import text_to_word_sequence

# Sample sentence to tokenize (the spelling "jumed" is part of the input as given)
text = 'The quick brown fox jumed over the lazy dog.'

# Split the sentence into word tokens; the printed result shows the tokens
# lower-cased and with the trailing period removed.
words = text_to_word_sequence(text)
print(words)

['the', 'quick', 'brown', 'fox', 'jumed', 'over', 'the', 'lazy', 'dog']


### 7.3 - Encoding with *one_hot*

In [5]:
from keras.preprocessing.text import text_to_word_sequence, one_hot

text = 'The quick brown fox jumed over the lazy dog.'

# The vocabulary is the set of distinct tokens in the sentence
vocab = set(text_to_word_sequence(text))
vocab_size = len(vocab)
print('Vocabulary Size:', vocab_size)

# Hash each word of the sentence to an integer. The hash space is made ~30%
# larger than the vocabulary to reduce (not eliminate) collisions between words.
encode = one_hot(text, n=round(vocab_size * 1.3))
print('one_hot Encode:', encode)

Vocabulary Size: 8
one_hot Encode: [8, 6, 9, 8, 4, 7, 8, 4, 8]


### 7.4 - Encoding with *hashing_trick*

In [2]:
from keras.preprocessing.text import text_to_word_sequence, hashing_trick

text = 'The quick brown fox jumed over the lazy dog.'

# The vocabulary is the set of distinct tokens in the sentence
vocab = set(text_to_word_sequence(text))
vocab_size = len(vocab)
print('Vocabulary Size:', vocab_size)

# Hash each word of the sentence to an integer using md5. The hash space is
# made ~30% larger than the vocabulary to reduce (not eliminate) collisions.
encode = hashing_trick(text, n=round(vocab_size * 1.3), hash_function='md5')
print('hashing_trick Encode:', encode)

Vocabulary Size: 8
hashing_trick Encode: [6, 4, 1, 2, 9, 5, 6, 2, 6]


### 7.5 - *Tokenizer* API

In [13]:
from keras.preprocessing.text import Tokenizer

# Five short documents used to fit the tokenizer
docs = [
    'Well done!',
    'Good work.',
    'Great effort.',
    'nice work',
    'Excellent!',
]

# Create the tokenizer with default settings and fit it on the documents
t = Tokenizer()
t.fit_on_texts(docs)

# Summarize what was learned, one item per line: word counts, number of
# documents seen, word -> integer index, and the word_docs mapping
print(t.word_counts, t.document_count, t.word_index, t.word_docs, sep='\n')

# Encode each document as one row of word counts; the column position is the
# word's integer index (column 0 is not assigned to any word)
encoded_docs = t.texts_to_matrix(docs, mode='count')
print(encoded_docs)

OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])
5
{'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}
defaultdict(<class 'int'>, {'done': 1, 'well': 1, 'good': 1, 'work': 2, 'effort': 1, 'great': 1, 'nice': 1, 'excellent': 1})
[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
