In [1]:
import os
import sys
import shutil
import random
import pathlib

import tensorflow as tf

### Downloading the data

In [2]:
DATASET_DIR = pathlib.Path("aclImdb")

if not DATASET_DIR.exists():
    !curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
    !tar -xf aclImdb_v1.tar.gz # this untars the archive to a folder called aclImdb
    !rm -r aclImdb/train/unsup

MODELS_DIR = pathlib.Path("models")
MODELS_DIR.mkdir(exist_ok=True)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  20.2M      0  0:00:03  0:00:03 --:--:-- 20.2M


In [3]:
# Split the data into train/val folders: 20% of each class is moved from
# train/ into val/. Guarded so that re-running the cell does not move more files.
TRAIN_DIR = DATASET_DIR / "train"
VAL_DIR = DATASET_DIR / "val"
TEST_DIR = DATASET_DIR / "test"
for category in ("neg", "pos"):
    if not os.path.isdir(VAL_DIR / category):    # do this only once
        os.makedirs(VAL_DIR / category)          # make 'neg'/'pos' dir in validation
        # os.listdir returns entries in arbitrary order: sort first so the seeded
        # shuffle selects the same validation files on every machine/filesystem
        files = sorted(os.listdir(TRAIN_DIR / category))
        random.Random(1337).shuffle(files)       # shuffle using a seed
        num_val_samples = int(0.2 * len(files))  # 20% of our samples for validation
        # explicit start index: `files[-0:]` would select *every* file when the
        # validation count rounds down to zero
        val_files = files[len(files) - num_val_samples:]
        for fname in val_files:                  # move our files
            shutil.move(TRAIN_DIR / category / fname,
                        VAL_DIR / category / fname)

In [4]:
batch_size = 32

# One batched text dataset per split; each directory is scanned exactly once
# (the training set was previously built twice by two identical calls).
train_ds = tf.keras.utils.text_dataset_from_directory(
    TRAIN_DIR, batch_size=batch_size
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    VAL_DIR, batch_size=batch_size
)
test_ds = tf.keras.utils.text_dataset_from_directory(
    TEST_DIR, batch_size=batch_size
)

Found 20000 files belonging to 2 classes.
Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [5]:
# Drop the second element of each training pair and keep only the first (the
# text); this text-only dataset is what the vectorization layers are adapted on.
text_only_train_ds = train_ds.map(lambda text, _label: text)

---

### Bigrams with TF-IDF encoding

####  TF-IDF: Term Frequency / Inverse Document Frequency

- the most frequent terms in documents tell us something about the **topic**;  
  ("flower" appearing often in a doc about flowers)
- however, **some terms** appear **often and everywhere**;  
  ("the" and "a" are frequent in the whole dataset);
- to counteract that, divide the frequency in one doc by the frequency in all docs:

$$
\bbox[5px,border:2px solid red]
{
\text{TF-IDF} = \frac{\text{frequency in one document}}{\text{frequency in all documents (dataset)}}
}
$$

<small>[See Wikipedia for the exact maths](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)</small>

In [6]:
# Vectorizer over n-grams up to length 2, with the vocabulary capped at 20000 entries.
text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    ngrams=2,
    output_mode="count", # configuring the `TextVectorization` layer to return token counts
)

# Build the vocabulary from the text-only training dataset.
text_vectorization.adapt(text_only_train_ds)

In [7]:
# Index -> token lookup, used to decode the encoded sentence below.
vocabulary = text_vectorization.get_vocabulary()
inverse_vocab = {index: token for index, token in enumerate(vocabulary)}

tokens = text_vectorization("the quick brown fox jumps over the quick brown dog")
print(tokens, end="\n\n") # instead of just 1 at the word index we get a count!

# Report only the vocabulary slots that are non-zero for this sentence;
# the index tensor is flattened into plain Python ints for indexing/printing.
words = tf.where(tokens > 0)
for word in words.numpy().ravel().tolist():
    print(f"token {word:>4} | frequency: {tokens[word]} | decoded: {inverse_vocab[word]}")

tf.Tensor([8 2 0 ... 0 0 0], shape=(20000,), dtype=int64)

token    0 | frequency: 8 | decoded: [UNK]
token    1 | frequency: 2 | decoded: the
token  153 | frequency: 1 | decoded: over
token  568 | frequency: 1 | decoded: over the
token 1535 | frequency: 1 | decoded: dog
token 3247 | frequency: 1 | decoded: fox
token 3439 | frequency: 2 | decoded: quick
token 4986 | frequency: 2 | decoded: brown
token 6162 | frequency: 1 | decoded: jumps


In [8]:
# Same n-gram / vocabulary settings as the count vectorizer, but this one
# replaces `text_vectorization` with a layer that emits TF-IDF weights.
text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    ngrams=2,
    output_mode="tf_idf", # configuring `TextVectorization` to return TF-IDF-weighted outputs
)

# Re-adapt on the text-only training dataset.
text_vectorization.adapt(text_only_train_ds)

In [9]:
# Index -> token lookup for the TF-IDF vectorizer, used for decoding below.
vocabulary = text_vectorization.get_vocabulary()
inverse_vocab = {index: token for index, token in enumerate(vocabulary)}

tokens = text_vectorization("the quick brown fox jumps over the quick brown dog")
print(tokens, end="\n\n") # instead of just 1 or a count at the word index we get the tf-idf quantity!

# Report only the vocabulary slots that are non-zero for this sentence;
# the index tensor is flattened into plain Python ints for indexing/printing.
words = tf.where(tokens > 0)
for word in words.numpy().ravel().tolist():
    print(f"token {word:>4} | tf-idf: {tokens[word]:10.7f} | decoded: {inverse_vocab[word]}")

tf.Tensor([42.621906   1.3945451  0.        ...  0.         0.         0.       ], shape=(20000,), dtype=float32)

token    0 | tf-idf: 42.6219063 | decoded: [UNK]
token    1 | tf-idf:  1.3945451 | decoded: the
token  153 | tf-idf:  1.9289606 | decoded: over
token  568 | tf-idf:  2.9110560 | decoded: over the
token 1535 | tf-idf:  3.9795589 | decoded: dog
token 3247 | tf-idf:  4.7938089 | decoded: fox
token 3439 | tf-idf:  9.0870848 | decoded: quick
token 4986 | tf-idf: 10.1312609 | decoded: brown
token 6162 | tf-idf:  5.0974345 | decoded: jumps


---

### Training a simple model with TF-IDF

In [10]:
# our datasets: the same mapping is applied to every split -- the first element
# of each pair goes through `text_vectorization`, the second passes through unchanged
tfidf_2gram_train_ds, tfidf_2gram_val_ds, tfidf_2gram_test_ds = (
    split_ds.map(
        lambda x, y: (text_vectorization(x), y),
        num_parallel_calls=4)
    for split_ds in (train_ds, val_ds, test_ds)
)

In [11]:
def get_model(max_tokens=20000, hidden_dim=16, clear=True):
    """
    Build and compile a small dense binary classifier.

    Args:
        max_tokens: length of the input vector fed to the model.
        hidden_dim: number of units in the single hidden Dense layer.
        clear: when True, call `tf.keras.backend.clear_session()` before building.

    Returns:
        A compiled `tf.keras.Model` ending in a single sigmoid unit.
    """
    if clear:
        tf.keras.backend.clear_session()

    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation="relu")(inputs)
    regularized = layers.Dropout(0.5)(hidden)
    outputs = layers.Dense(1, activation="sigmoid")(regularized)

    model = tf.keras.Model(inputs, outputs)
    model.compile(
        loss="binary_crossentropy", # our labels are only 0 or 1 (negative/positive)
        optimizer="rmsprop",
        metrics=["accuracy"],
    )
    return model

In [13]:
# Fresh model with the default sizes; print its architecture.
model = get_model()
model.summary()

# Checkpoint to MODELS_DIR; with save_best_only=True the file is only
# overwritten when the monitored metric improves.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        filepath=str(MODELS_DIR / "tfidf_2gram.keras"),
        save_best_only=True,
    ),
]

# Both datasets are cached, so the vectorization mapping is not recomputed
# on every epoch.
model.fit(
    x=tfidf_2gram_train_ds.cache(),
    epochs=10,
    validation_data=tfidf_2gram_val_ds.cache(),
    callbacks=callbacks,
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1009s[0m 2s/step - accuracy: 0.6466 - loss: 0.6488 - val_accuracy: 0.8810 - val_loss: 0.3096
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8164 - loss: 0.3942 - val_accuracy: 0.8734 - val_loss: 0.3099
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8296 - loss: 0.3440 - val_accuracy: 0.8766 - val_loss: 0.3162
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8509 - loss: 0.3137 - val_accuracy: 0.8780 - val_loss: 0.3296
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8753 - loss: 0.2847 - val_accuracy: 0.8852 - val_loss: 0.3262
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8925 - loss: 0.2548 - val_accuracy: 0.8864 - val_loss: 0.3484
Epoch 7/10
[1m625/625[0m

<keras.src.callbacks.history.History at 0x7ba9fdff8820>

In [16]:
# and testing
# Reload the checkpoint written during training; the path is passed as a str,
# matching how it was handed to ModelCheckpoint.
model = tf.keras.models.load_model(str(MODELS_DIR / "tfidf_2gram.keras"))
# `evaluate` returns (loss, accuracy) here. A named variable is used rather than
# `_`, because assigning to `_` overrides IPython's "last output" variable.
test_loss, acc = model.evaluate(tfidf_2gram_test_ds)
print(f"Test acc: {acc:.3f}") # this did not beat the bigram model

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m982s[0m 1s/step - accuracy: 0.8751 - loss: 0.3087
Test acc: 0.875


### Save models to Google Drive


In [19]:
EXPORT=False

if EXPORT:
    # zip models
    !zip tfidf.models.zip {MODELS_DIR}/*
    # connect to drive
    from google.colab import drive
    drive.mount('/content/drive')
    # copy zip to drive (adjust folder as needed)
    !cp tfidf.models.zip drive/MyDrive/IS53024B-Artificial-Intelligence/models