In [2]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [3]:
# Two toy sentences used to demonstrate the Keras Tokenizer.
samples = [
    "The cat sat on the mat.",
    "The dog ate my homework.",
]

In [4]:
# Tokenizer whose vocabulary is capped by num_words; per the Keras docs only the
# most frequent words (up to that cap) are kept when texts are vectorized.
tokenizer = Tokenizer(num_words=20000)

In [5]:
# Build the word index from the sample sentences (shown below via tokenizer.word_index).
tokenizer.fit_on_texts(samples)

In [6]:
# Encode each sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

In [8]:
# Vectorize each sentence into one row of a 2D float array (one row per sample, see output).
tokenizer.texts_to_matrix(samples)

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [11]:
# Show the word -> integer index mapping built by fit_on_texts.
tokenizer.word_index

{'the': 1,
 'cat': 2,
 'sat': 3,
 'on': 4,
 'mat': 5,
 'dog': 6,
 'ate': 7,
 'my': 8,
 'homework': 9}

In [103]:
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D

In [13]:
# Instantiate an Embedding layer (20000 possible indices, 20-dimensional vectors).
# As the last expression in the cell, only the layer's repr is displayed.
Embedding(input_dim=20000, output_dim=20)

<keras.layers.embeddings.Embedding at 0x7f302ef76198>

In [22]:
from keras.datasets import imdb
from keras import preprocessing

In [23]:
# Vocabulary size cap handed to imdb.load_data(num_words=...).
max_features = 50000

In [24]:
# Load the IMDB dataset as (features, labels) pairs for the train and test splits,
# with the vocabulary limited through num_words=max_features.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [74]:
# Bring every review in both splits to a common length of 1000 tokens so that
# each split becomes a rectangular 2D array.
x_train, x_test = [
    preprocessing.sequence.pad_sequences(split, maxlen=1000)
    for split in (x_train, x_test)
]


(100,)

In [61]:
from keras.models import Sequential

In [62]:
from keras.layers import Flatten, Dense, Embedding

In [63]:
# Position of the largest entry in x_train; with no axis argument, argmax returns
# an index into the flattened array (hence the large value in the output).
x_train.argmax()

1186610

In [75]:
# Start a fresh sequential model with an embedding layer as its first layer.
model = Sequential()
# input_dim is tied to max_features (the num_words cap used when loading IMDB) so
# the embedding table always covers every word index that can occur in the data.
# input_length=1000 must match the maxlen used when padding x_train / x_test.
model.add(Embedding(input_dim=max_features,
                    output_dim=20, input_length=1000))

In [76]:
# Collapse the (1000, 20) embedding output of each sample into one 20000-long vector
# (see the model summary below).
model.add(Flatten())

In [77]:
# Single sigmoid output unit on top of the flattened embeddings.
model.add(Dense(units=1, activation='sigmoid'))
# Binary cross-entropy loss, RMSprop optimizer, accuracy reported as 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)
# Print the layer-by-layer overview of output shapes and parameter counts.
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1000, 20)          1000000   
_________________________________________________________________
flatten_8 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 20001     
Total params: 1,020,001
Trainable params: 1,020,001
Non-trainable params: 0
_________________________________________________________________


In [67]:
# Display the model's weight variables: the embedding matrix followed by the
# dense layer's kernel and bias.
model.weights

[<tf.Variable 'embedding_8/embeddings:0' shape=(50000, 20) dtype=float32, numpy=
 array([[-0.00103805,  0.01243638,  0.00695888, ...,  0.0140628 ,
         -0.03706864,  0.03919477],
        [-0.01429489,  0.01941842,  0.00401769, ...,  0.01499332,
         -0.02780882, -0.03840978],
        [ 0.04311088,  0.01399045, -0.03531728, ..., -0.04103826,
          0.04177375,  0.02169676],
        ...,
        [ 0.01629743, -0.00032008, -0.00167928, ..., -0.0121116 ,
         -0.00659166, -0.04698461],
        [-0.03777728, -0.03675102, -0.02585849, ..., -0.01359762,
          0.03570898, -0.02002659],
        [-0.04374249, -0.03850909, -0.02263874, ...,  0.00511963,
          0.00982546,  0.01311047]], dtype=float32)>,
 <tf.Variable 'dense_6/kernel:0' shape=(2000, 1) dtype=float32, numpy=
 array([[ 0.03432419],
        [ 0.01642423],
        [-0.04538822],
        ...,
        [ 0.02656068],
        [ 0.00789237],
        [ 0.03650248]], dtype=float32)>,
 <tf.Variable 'dense_6/bias:0' shape

In [78]:
# Train for 3 epochs in mini-batches of 32, holding out 20% of the training
# samples for validation; the returned History object is kept for inspection.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.2,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


# Display the weight variables again; this cell sits after the fit cell,
# presumably to look at the trained values (no output was captured).
model.weights

From raw text to word embeddings

In [81]:
 from keras.preprocessing.text import Tokenizer

In [82]:
from keras.preprocessing.sequence import pad_sequences

In [83]:
import numpy as np

In [86]:
maxlen = 500  # sequence length cap: texts are cut off after 500 words
max_words = 50000  # vocabulary cap: only the top 50000 words in the training dataset are considered

In [88]:
# Toy corpus standing in for raw text: two short sentences.
corpus = [
    "The cat sat on the mat.",
    "The dog ate my homework.",
]

In [90]:
tokenizer = Tokenizer(num_words=max_words)  # vocabulary capped at max_words
tokenizer.fit_on_texts(corpus)  # build the word index from the corpus
sequences = tokenizer.texts_to_sequences(corpus)  # one list of word indices per sentence

In [91]:
# Show the integer-encoded sentences (one inner list per corpus entry).
sequences

[[1, 2, 3, 4, 1, 5], [1, 6, 7, 8, 9]]

In [92]:
# Keep a handle on the word -> index mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [96]:
# Report the vocabulary size, i.e. the number of distinct words in word_index.
print('Found %s unique tokens' % len(word_index))

Found 9 unique tokens


In [117]:
# Pad the toy corpus into a 2D integer tensor of shape (len(sequences), maxlen).
# It is kept under its own name: the corpus has no labels, so it must not replace
# the IMDB features that are later fitted against y_train.
x_corpus = pad_sequences(sequences=sequences, maxlen=maxlen)

# Re-pad the IMDB reviews to maxlen so they match the input_length of the model
# built below and stay aligned one-to-one with the labels in y_train.
x_train = pad_sequences(x_train, maxlen=maxlen)

In [109]:
# Size of each learned word vector; used as output_dim of the Embedding layer below.
embedding_dim = 8

In [110]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [113]:
# 1D convnet for text: an Embedding layer feeding a stack of Conv1D / MaxPooling1D
# layers, closed by a global pooling layer that turns the 3D feature maps into 2D
# so that a Dense layer can be added on top for classification or regression.
model = Sequential([
    # Takes a 2D integer tensor of shape (len(sequences), maxlen) and returns a 3D
    # floating-point tensor of shape (len(sequences), maxlen, embedding_dim), which
    # a 1D convolution layer can then process. The embedding weights are initially
    # random, as with any other layer, and are gradually adjusted via
    # backpropagation during training.
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=maxlen),
    Conv1D(filters=32, kernel_size=7, activation='relu'),
    MaxPooling1D(pool_size=5),
    Conv1D(filters=32, kernel_size=7, activation='relu'),
    # Reduces the 3D tensor to a 2D tensor of shape (len(sequences), 32) by taking
    # the maximum over the sequence axis.
    GlobalMaxPooling1D(),
    # Single sigmoid output unit.
    Dense(units=1, activation='sigmoid'),
])
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 500, 8)            400000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 494, 32)           1824      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 98, 32)            0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 92, 32)            7200      
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 32)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 33        
Total params: 409,057
Trainable params: 409,057
Non-trainable params: 0
_______________________________________________

In [115]:
from keras.optimizers import RMSprop

In [116]:
# Compile with RMSprop at a reduced learning rate of 1e-4, binary cross-entropy
# loss, and accuracy reported as 'acc'.
# `learning_rate` is the current argument name; `lr` is a deprecated alias that
# newer Keras releases no longer accept.
model.compile(optimizer=RMSprop(learning_rate=1e-4),
              loss='binary_crossentropy',
              metrics=['acc'])

In [None]:
# Train for 5 epochs in mini-batches of 128, holding out 20% of the samples for
# validation; the returned History object is kept in `history`.
# NOTE(review): x_train and y_train must contain the same number of samples, and
# x_train's second dimension must equal the model's input_length (maxlen) —
# verify this, since x_train is reassigned earlier in the notebook.
history = model.fit(x_train, y_train, epochs=5, batch_size=128,
                   validation_split=0.2)