In [1]:
import string

In [2]:
class Vectorizer:
    def standardize(self, text):
        text = text.lower()
        return "".join(char for char in text if char not in string.punctuation)
    
    def tokenize(self, text):
        text = self.standardize(text)
        return text.split()
    
    def make_vocab(self, dataset):
        self.vocab = {"": 0, "[UNK]": 1}
        for text in dataset:
            tokens = self.tokenize(text)
            for token in tokens:
                if token not in self.vocab:
                    self.vocab[token] = len(self.vocab)
        self.inverse_vocab = dict(
            (v, k) for k, v in self.vocab.items()
        )
        
    def encode(self, text):
        tokens = self.tokenize(text)
        return [self.vocab.get(token, 1) for token in tokens]
    
    def decode(self, int_sequence):
        return " ".join(
            self.inverse_vocab.get(i, "[UNK]") for i in int_sequence
        )

In [7]:
vectorizer = Vectorizer()
dataset = [
    "Out of the mid-wood's twilight",
    "Into the meadow's dawn",
    "Ivory limbed and brown-eyed",
    "Flashes my Faun!",
]
vectorizer.make_vocab(dataset)

In [8]:
test_sentence = "Out of the mid-wood's dawn there is ivory"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

[2, 3, 4, 5, 9, 1, 1, 10]


In [9]:
decoded_sentence = vectorizer.decode(encoded_sentence)
print(decoded_sentence)

out of the midwoods dawn [UNK] [UNK] ivory


In [12]:
from tensorflow.keras.layers import TextVectorization
text_vectorizer = TextVectorization(
    output_mode="int"
)

In [13]:
text_vectorizer.adapt(dataset)

In [14]:
text_vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'twilight',
 'out',
 'of',
 'my',
 'midwoods',
 'meadows',
 'limbed',
 'ivory',
 'into',
 'flashes',
 'faun',
 'dawn',
 'browneyed',
 'and']

In [19]:
vocabulary = text_vectorizer.get_vocabulary()
encoded_sentence = text_vectorizer(test_sentence)
print(encoded_sentence)

tf.Tensor([ 4  5  2  7 14  1  1 10], shape=(8,), dtype=int64)


In [20]:
inverse_vocab = dict(enumerate(vocabulary))
decoded_sentence = " ".join(inverse_vocab[int(i)] for i in encoded_sentence)
print(decoded_sentence)

out of the midwoods dawn [UNK] [UNK] ivory


In [22]:
!cat aclImdb/train/pos/4077_10.txt

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy

In [23]:
import os, pathlib, shutil, random

In [25]:
base_dir = pathlib.Path("./aclImdb")
train_dir = base_dir / "train"
val_dir = base_dir / "val"
test_dir = base_dir / "test"
for category in ("neg", "pos"):
    os.makedirs(val_dir / category)
    files= os.listdir(train_dir / category)
    random.Random(42).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname, val_dir / category / fname)

FileExistsError: [Errno 17] File exists: 'aclImdb/val/neg'

In [40]:
from tensorflow import keras

batch_size = 32

train_ds = keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size
)
val_ds = keras.utils.text_dataset_from_directory(
    val_dir, batch_size=batch_size
)
test_ds = keras.utils.text_dataset_from_directory(
    test_dir, batch_size=batch_size
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [41]:
inputs, targets = next(iter(train_ds))

In [42]:
print(inputs.shape)
print(inputs.dtype)
print(targets.shape)
print(targets.dtype)
print(inputs[0])
print(targets[0])

(32,)
<dtype: 'string'>
(32,)
<dtype: 'int32'>
tf.Tensor(b'Well, you might not actually SEE any women in love in this movie, but you\'ll certainly hear women TALKING about love, and men talking about love, and women talking about men, and men talking about women, and men talking about men, and everyone talking about death, and talking, and talking, until you yourself will want to scream and do something that requires no talking at all, like paint your bedroom or water your plants.<br /><br />Welcome to the world of D.H. Lawrence, where psycho-babble reigns supreme, and where no one can get down to living a productive life because everyone is too busy talking about how unproductive their lives are. Spending time with the characters in a D.H. Lawrence novel is like being locked in a closet with a group of your most self-absorbed acquaintances who you would run away from if you met them at a party. When I read "Women in Love," I so desperately wanted to strangle every single character in 

In [43]:
text_vectorization = TextVectorization(
    max_tokens=20000,
    output_mode="multi_hot"
)

In [44]:
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)

binary_1gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

In [45]:
inputs, targets = next(iter(binary_1gram_train_ds))
print(inputs.shape)
print(inputs.dtype)
print(targets.shape)
print(targets.dtype)
print(inputs[0])
print(targets[0])

(32, 20000)
<dtype: 'float32'>
(32,)
<dtype: 'int32'>
tf.Tensor([1. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
tf.Tensor(0, shape=(), dtype=int32)


In [46]:
from tensorflow.keras import layers

def get_model(max_tokens=20000, hidden_dim=16):
    inputs = layers.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation='relu')(inputs)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [47]:
model = get_model()
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_4 (Dense)             (None, 16)                320016    
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [48]:
callbacks = [keras.callbacks.ModelCheckpoint("binary_1gram.keras", save_best_only=True)]
history = model.fit(binary_1gram_train_ds.cache(), epochs=10, validation_data=binary_1gram_val_ds.cache(), callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [52]:
model = keras.models.load_model("binary_1gram.keras")
model.evaluate(binary_1gram_test_ds.cache())



[0.2953505218029022, 0.8813999891281128]

In [53]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode='multi_hot'
)

In [54]:
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

In [55]:
model = get_model()
callbacks = [keras.callbacks.ModelCheckpoint("binary_2gram.keras", save_best_only=True)]
history = model.fit(binary_2gram_train_ds.cache(), epochs=10, validation_data=binary_2gram_val_ds.cache(), callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [56]:
model = keras.models.load_model("binary_2gram.keras")
model.evaluate(binary_2gram_test_ds.cache())



[0.26423379778862, 0.8988400101661682]

In [58]:
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,
    output_mode='tf_idf'
)

In [59]:
text_vectorization.adapt(text_only_train_ds)

tfidf_2gram_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

2023-12-28 13:06:07.927430: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.


In [60]:
model = get_model()
callbacks = [keras.callbacks.ModelCheckpoint("tfidf_2gram.keras", save_best_only=True)]
model.fit(tfidf_2gram_train_ds.cache(), epochs=10, validation_data=tfidf_2gram_val_ds.cache(), callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fe37c3b7790>

In [61]:
model = keras.models.load_model("tfidf_2gram.keras")
model.evaluate(tfidf_2gram_test_ds.cache())



[0.29364439845085144, 0.8820000290870667]

In [62]:
inputs = keras.Input(shape=(1,), dtype='string')
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = keras.Model(inputs=inputs, outputs=outputs)

In [72]:
import tensorflow as tf

raw_text_data = tf.convert_to_tensor(["The movie was amazing, I loved every part of it, would watch again with my friends, "
                                      "I love everything about it! Great!"])
predictions = inference_model(raw_text_data)
print(f"{float(predictions[0]* 100):.2f} % positive")

87.04 % positive


In [73]:
max_length = 600
max_tokens = 20000

text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=max_length,
    output_mode="int"
)

text_vectorization.adapt(text_only_train_ds)

int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)
int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y), num_parallel_calls=4)

In [74]:
inputs = keras.Input(shape=(None,), dtype='int64')
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

2023-12-28 13:20:17.471851: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.


Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_7 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot (TFOpLambda)     (None, None, 20000)       0         
                                                                 
 bidirectional (Bidirection  (None, 64)                5128448   
 al)                                                             
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5128513 (19.56 MB)
Trainable params: 5128513 (19.56 MB)
Non-trainable params: 0 (0.00 Byte)
___________________

In [75]:
callbacks = [keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fe37c3d5d50>

In [76]:
model = keras.models.load_model("one_hot_bidir_lstm.keras")
model.evaluate(int_test_ds)



[0.29682227969169617, 0.8810799717903137]

In [77]:
inputs = keras.Input(shape=(None,), dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = keras.Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=["accuracy"])

In [78]:
callbacks = [keras.callbacks.ModelCheckpoint("embedding_lstm.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fe3182945d0>

In [79]:
model = keras.models.load_model("embedding_lstm.keras")
model.evaluate(int_test_ds)



[0.3296020030975342, 0.8694800138473511]

In [81]:
inputs = keras.Input(shape=(None, ), dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=['accuracy'])

In [82]:
callbacks = [keras.callbacks.ModelCheckpoint("embedding_lstm_masked.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10


2023-12-28 13:49:22.883109: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	for Tuple type infernce function 0
	while inferring type of node 'cond_36/output/_23'
2023-12-28 13:49:23.006447: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fe318687ad0>

In [83]:
model = keras.models.load_model("embedding_lstm_masked.keras")
model.evaluate(int_test_ds)



[0.2979428172111511, 0.8759999871253967]

In [84]:
import numpy as np
path_to_glove_file = "glove.6B.100d.txt"

embeddings_index = {}

with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs
print(f"Found {len(embeddings_index)} word vectors")

Found 400000 word vectors


In [85]:
embedding_dim = 100

vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))

embeddings_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector

In [86]:
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embeddings_matrix),
    trainable=False,
    mask_zero=True
)
inputs = layers.Input(shape=(None,), dtype='int64')
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

In [87]:
callbacks = [keras.callbacks.ModelCheckpoint("glove_embeddings_sequence.keras", save_best_only=True)]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10, callbacks=callbacks)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fe013c1cd10>

In [88]:
model = keras.models.load_model("glove_embeddings_sequence.keras")
model.evaluate(int_test_ds)



[0.29214969277381897, 0.877240002155304]