In [25]:
import numpy as np

from keras.layers import Embedding, Flatten, Dense
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import set_random_seed

In [36]:
# Sizes shared by the tokenizer, the padding step and the Embedding layer.
max_length = 20        # length every review is padded to (pad_sequences maxlen / Embedding input_length)
vocabulary_size = 100  # Tokenizer num_words and Embedding input dimension

In [43]:
# Toy dataset: two short reviews and one binary label per review, in the same order.
# NOTE(review): presumably 1 marks a positive review and 0 a negative one - confirm.
reviews = [
    "I love programming",
    "I don't love running",
]
labels = [1, 0]

In [37]:
# Preprocessing, step 1: turn each review into a list of integer token ids.

# Build a tokenizer capped at `vocabulary_size` and learn its word index from the reviews.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(reviews)

# Map every review to its sequence of token ids.
sequences = tokenizer.texts_to_sequences(reviews)

# NOTE(review): the saved output under this cell lists 4 and 7 token ids, but the
# current `reviews` contain only 3 and 4 words - it looks like it was produced from
# an earlier dataset. Re-run the notebook top to bottom to refresh it.
print(sequences)

[[1, 2, 3, 4], [5, 6, 7, 8, 9, 1, 2]]


In [38]:
# Preprocessing, step 2: left-pad every id sequence with zeros so that all rows
# share the same length (`max_length`) and stack into a single 2-D array.
x_train = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='pre',
)

# Show the container type, the array shape and the padded ids, one per line.
print(type(x_train), x_train.shape, x_train, sep='\n')

<class 'numpy.ndarray'>
(2, 20)
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 5 6 7 8 9 1 2]]


In [39]:
# Convert the Python list of labels into a NumPy array for training.
y_train = np.array(labels)

# Show the container type, the array shape and the label values, one per line.
print(type(y_train), y_train.shape, y_train, sep='\n')

<class 'numpy.ndarray'>
(2,)
[1 0]


In [47]:
# define the model

# Seed the Python, NumPy and backend random generators right before the layers
# are created, so the randomly initialised weights are the same on every run of
# this cell. Without it the weights and training results differ on each
# Restart & Run All.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
set_random_seed(42)

# use an Embedding Layer with an input dimension of 100 (our vocabulary size)
# an output dimension of 64 (the size of the embedding vectors)
# an input length of the size of our padded sequences
model = Sequential()
# Keep a handle on the layer so its learned weights can be inspected after training.
embedding_layer = Embedding(vocabulary_size, 64, input_length=max_length)
model.add(embedding_layer)
# Flatten the (max_length, 64) embedding output into one vector per review.
model.add(Flatten())
# Single sigmoid unit: one output per review for the binary label.
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 20, 64)            6400      
                                                                 
 flatten_5 (Flatten)         (None, 1280)              0         
                                                                 
 dense_5 (Dense)             (None, 1)                 1281      
                                                                 
Total params: 7681 (30.00 KB)
Trainable params: 7681 (30.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [48]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as an additional metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [49]:
# train the model for 10 epochs on the padded reviews and their labels
# Keep the History object that fit() returns: it holds the per-epoch metrics in
# `history.history`. Assigning it also stops the notebook from echoing an
# address-bearing `<keras...History at 0x...>` repr as the cell output.
history = model.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


2023-10-12 11:48:51.384160: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2b2f18b20>

In [50]:
# Fetch the layer's weights once; the first array is the embedding matrix
# (one row per token id, one column per embedding dimension).
embedding_matrix = embedding_layer.get_weights()[0]
print(embedding_matrix.shape)

# Row 0 is the vector for token id 0, the value used as filler in the padded sequences.
print(embedding_matrix[0])

(100, 64)
[-1.3688568e-02  3.1681504e-02  5.0974125e-03 -5.2420989e-02
 -3.7845936e-02 -5.9228759e-02 -1.1415168e-02  2.0011872e-02
  5.2482765e-02  3.3049151e-02  9.3408925e-03 -2.2770107e-02
 -3.6578223e-02 -9.2308773e-03 -2.5252922e-02 -3.1616647e-02
  1.4326822e-02  4.7166947e-02 -2.1721521e-02 -8.5460655e-03
 -9.4988216e-03 -3.6859244e-02  6.0028523e-02 -1.3310545e-02
  3.4475483e-02 -5.4265440e-02 -3.2195617e-03 -3.3574153e-02
  3.7458472e-02 -6.3645467e-03 -5.0528217e-02  1.5997929e-02
  2.8862590e-02 -9.3663298e-03 -3.5441056e-02 -3.5396726e-03
  5.9001733e-02  1.7202215e-02  5.7534810e-02  1.3650018e-02
  5.9324065e-03  3.7962202e-02 -3.5332229e-02 -3.3120099e-02
  6.5675034e-05 -1.5795229e-02  4.1016761e-02 -4.5903515e-02
 -5.5608559e-02 -6.7924894e-03  5.7816962e-03  3.1421568e-02
  5.5812687e-02 -4.4761121e-02  3.6175933e-02  6.5704118e-03
 -5.8289029e-02  8.1923651e-03 -3.3283625e-02 -6.7556491e-03
  3.6661927e-02 -2.1452025e-02 -1.0955386e-02  5.5169135e-02]
