In [2]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!rm -r aclImdb/train/unsup

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  6189k      0  0:00:13  0:00:13 --:--:-- 8317k


In [7]:
import os, pathlib, shutil, random
from tensorflow import keras
batch_size = 32
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)

train_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/train", batch_size=batch_size
)
val_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/val", batch_size=batch_size
)
test_ds = keras.utils.text_dataset_from_directory(
    "aclImdb/test", batch_size=batch_size
)


FileExistsError: [Errno 17] File exists: 'aclImdb/val/neg'

### loaded onlt text

In [44]:
for text_batch, label_batch in train_ds.take(1):
  for i in range(1):
    print("Review", text_batch.numpy()[i])
    print("Label", label_batch.numpy()[i])

Review b"God I love this movie. If you grew up in the 80's and love Heavy Metal, this is the Movie for you. They really don't get much better than this. The Fastway soundtrack is one of the best soundtracks ever. I put on the record when it first came out and spent the next month learning every song on guitar note for note. The plot outline is your standard Heavy Metal horror movie. Kid's favorite singer dies. Kid plays record backwards. Hero comes back in demonic form and rocks the town. What more could you ask for?<br /><br />If you haven't seen it yet, rush out and buy it. You will not be disappointed. Metal Rules..."
Label 1


In [40]:
text_only_train_ds = train_ds.map(lambda x, y: x)

In [41]:
text_only_train_ds

<_MapDataset element_spec=TensorSpec(shape=(None,), dtype=tf.string, name=None)>

### Strip the sentence after 600 words 

In [14]:
from tensorflow.keras import layers

max_length = 600
max_tokens = 20000
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)
text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [39]:
int_train_ds

<_ParallelMapDataset element_spec=(TensorSpec(shape=(None, 600), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [16]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

max_tokens = 20000  # Define max_tokens or pass the correct value

class EmbeddedLayer(keras.Layer):
    def call(self, x):
        return tf.one_hot(x, depth=max_tokens)
        

inputs = keras.Input(shape=(None,), dtype="int64")
embedded = EmbeddedLayer()(inputs)

x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.summary()

In [18]:
callbacks = [
    keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model("one_hot_bidir_lstm.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
[1m218/625[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m10:30[0m 2s/step - accuracy: 0.5310 - loss: 0.6885

KeyboardInterrupt: 

#### Understanding word embeddings

### embedding Layer trained from scratch
Embedding layer maps integer indices to dense vectors. It looks integers as input , looks up these inetegers into internal dictionary and returns associated vectors. 

In [21]:
embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)

In [22]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("embeddings_bidir_gru.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model("embeddings_bidir_gru.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 170ms/step - accuracy: 0.6365 - loss: 0.6169 - val_accuracy: 0.8564 - val_loss: 0.3634
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 175ms/step - accuracy: 0.8335 - loss: 0.4115 - val_accuracy: 0.7558 - val_loss: 0.6231
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 357ms/step - accuracy: 0.8682 - loss: 0.3473 - val_accuracy: 0.8684 - val_loss: 0.3455
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 175ms/step - accuracy: 0.8923 - loss: 0.2876 - val_accuracy: 0.8714 - val_loss: 0.3097
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2760s[0m 4s/step - accuracy: 0.9107 - loss: 0.2449 - val_accuracy: 0.8806 - val_loss: 0.3208
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1521s[0m 2s/step - accuracy: 0.9295 - loss: 0.2021 - val_accuracy: 0.8820 - val_loss: 0.3574
Epoch 7/

**Using an `Embedding` layer with masking enabled**

In [29]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(
    input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("embeddings_bidir_gru_with_masking.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=1, callbacks=callbacks)
model = keras.models.load_model("embeddings_bidir_gru_with_masking.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 197ms/step - accuracy: 0.6830 - loss: 0.5677 - val_accuracy: 0.8466 - val_loss: 0.3556
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 31ms/step - accuracy: 0.8329 - loss: 0.3875
Test acc: 0.836


### download pretarined word embeddings

In [25]:
#!curl -O http://nlp.stanford.edu/data/glove.6B.zip
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   308  100   308    0     0   1264      0 --:--:-- --:--:-- --:--:--  1267
[glove.6B.zip]
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of glove.6B.zip or
        glove.6B.zip.zip, and cannot find glove.6B.zip.ZIP, period.


### load the GloveVector

In [26]:
import numpy as np
path_to_glove_file = "data/glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


### Printing Dictionary element

In [28]:
 for key, value in embeddings_index.items():
        print(f"{key}: {value}")
        break;

the: [-0.038194 -0.24487   0.72812  -0.39961   0.083172  0.043953 -0.39141
  0.3344   -0.57545   0.087459  0.28787  -0.06731   0.30906  -0.26384
 -0.13231  -0.20757   0.33395  -0.33848  -0.31743  -0.48336   0.1464
 -0.37304   0.34577   0.052041  0.44946  -0.46971   0.02628  -0.54155
 -0.15518  -0.14107  -0.039722  0.28277   0.14393   0.23464  -0.31021
  0.086173  0.20397   0.52624   0.17164  -0.082378 -0.71787  -0.41531
  0.20335  -0.12763   0.41367   0.55187   0.57908  -0.33477  -0.36559
 -0.54857  -0.062892  0.26584   0.30205   0.99775  -0.80481  -3.0243
  0.01254  -0.36942   2.2167    0.72201  -0.24978   0.92136   0.034514
  0.46745   1.1079   -0.19358  -0.074575  0.23353  -0.052062 -0.22044
  0.057162 -0.15806  -0.30798  -0.41625   0.37972   0.15006  -0.53212
 -0.2055   -1.2526    0.071624  0.70565   0.49744  -0.42063   0.26148
 -1.538    -0.30223  -0.073438 -0.28312   0.37104  -0.25217   0.016215
 -0.017099 -0.38984   0.87424  -0.72569  -0.51058  -0.52028  -0.1459
  0.8278    0.27

In [34]:
embedding_dim = 100

vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))
for key, value in word_index.items():
        print(f"{key}: {value}")
        break;


: 0


### maxTokens is the size of the vocabulary

### Lets create the embedding_matrix from pretrianed matrix for the vocabaulry of ours. GloVe vectors

In [35]:

embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### loading vector into embedding layer

In [36]:
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

In [37]:
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

callbacks = [
    keras.callbacks.ModelCheckpoint("glove_embeddings_sequence_model.keras",
                                    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)
model = keras.models.load_model("glove_embeddings_sequence_model.keras")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 111ms/step - accuracy: 0.6331 - loss: 0.6308 - val_accuracy: 0.7928 - val_loss: 0.4455
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 116ms/step - accuracy: 0.7867 - loss: 0.4673 - val_accuracy: 0.8342 - val_loss: 0.3838
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 263ms/step - accuracy: 0.8186 - loss: 0.4100 - val_accuracy: 0.8412 - val_loss: 0.3643
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 117ms/step - accuracy: 0.8361 - loss: 0.3775 - val_accuracy: 0.8542 - val_loss: 0.3324
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 115ms/step - accuracy: 0.8499 - loss: 0.3536 - val_accuracy: 0.8434 - val_loss: 0.3541
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 117ms/step - accuracy: 0.8615 - loss: 0.3320 - val_accuracy: 0.8728 - val_loss: 0.3024
Epoch 7/1