In [1]:
import tensorflow as tf
import os, pathlib, shutil, random

### Downloading the data

Commands to download and uncompress the IMDb dataset:

```bash
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!rm -r aclImdb/train/unsup
```

#### Reminder

In Colab, use a bang (!) to run a bash command. You can also run them in a terminal (without the initial !).

#### Other reminder

You can use symlinks in Linux to avoid downloading things twice!

```bash
# will create a folder called 'linked_dataset' in the current dir, linked to the other one
!ln -s /path/to/already/downloaded/dataset linked_dataset
```

In Google Drive, the same can be achieved by selecting the directory you want to link to, pressing Shift+Z, and then selecting the target directory.

In [2]:
# code to split the data into train/val folders
# (moves 20% of each training category into a sibling 'val' directory)
base_dir = pathlib.Path("aclImdb")
train_dir = base_dir / "train"
val_dir = base_dir / "val"
test_dir = base_dir / "test"
for category in ("neg", "pos"):
    if not os.path.isdir(val_dir / category):    # do this only once
        # List the training files *before* creating the validation directory:
        # if the training directory is missing we fail here, without leaving
        # behind an empty 'val' folder that would make later runs skip the split.
        # os.listdir returns entries in arbitrary (platform-dependent) order, so
        # sort first: otherwise the seeded shuffle is not reproducible.
        files = sorted(os.listdir(train_dir / category))
        os.makedirs(val_dir / category)          # make 'neg'/'pos' dir in validation
        random.Random(1337).shuffle(files)       # shuffle using a seed
        num_val_samples = int(0.2 * len(files))  # 20% of our samples for validation
        # Slice from an explicit start index: `files[-0:]` would be the whole
        # list, i.e. every file would be moved if num_val_samples were 0.
        val_files = files[len(files) - num_val_samples:]
        for fname in val_files:                  # move our files
            shutil.move(train_dir / category / fname,
                        val_dir / category / fname)

In [9]:
# Build one batched dataset per split; each element is a (texts, labels) batch,
# with the label taken from the 'neg' / 'pos' sub-directory of each file.
batch_size = 32
# NOTE: the training dataset used to be created twice in a row (copy-paste);
# the second call only re-scanned the directory and overwrote the first result.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, batch_size=batch_size
)
val_ds = tf.keras.utils.text_dataset_from_directory(
    val_dir, batch_size=batch_size
)
test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir, batch_size=batch_size
)

Found 20000 files belonging to 2 classes.
Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [10]:
# keep only the raw texts (drop the labels): this is what the vectorizers adapt on
text_only_train_ds = train_ds.map(lambda text, label: text)

---

### Bigrams with TF-IDF encoding

####  TF-IDF: Term Frequency / Inverse Document Frequency

- the most frequent terms in documents tell us something about the **topic**;  
  ("flower" appearing often in a doc about flowers)
- however, **some terms** appear **often and everywhere**;  
  ("the" and "a" are frequent in the whole dataset);  
- to counteract that, divide the frequency in one doc by the frequency in all docs:

$$
\bbox[5px,border:2px solid red]
{
\text{TF-IDF} = \frac{\text{frequency in one document}}{\text{frequency in all documents (dataset)}}
}
$$

<small>[See Wikipedia for the exact maths](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)</small>

In [11]:
# Vectorizer working on unigrams + bigrams, limited to a 20000-entry vocabulary.
# With output_mode="count", each text becomes a vector holding, at every
# vocabulary index, how many times that token occurs in the text.
text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    ngrams=2,
    output_mode="count",
)

# build the vocabulary from the training texts
text_vectorization.adapt(text_only_train_ds)

In [12]:
# Sanity check: vectorize one sentence, then decode every non-zero entry.
vocabulary = text_vectorization.get_vocabulary()
inverse_vocab = {index: term for index, term in enumerate(vocabulary)}  # index -> token string
tokens = text_vectorization("the quick brown fox jumps over the quick brown dog")
print(tokens) # instead of just 1 at the word index we get a count!
print()
words = tf.where(tokens > 0)                 # indices of the non-zero entries
for word in words.numpy().ravel().tolist():  # iterate over them as plain Python ints
    print(f"token {word:>4} | frequency: {tokens[word]} | decoded: {inverse_vocab[word]}")

tf.Tensor([8. 2. 0. ... 0. 0. 0.], shape=(20000,), dtype=float32)

token    0 | frequency: 8.0 | decoded: [UNK]
token    1 | frequency: 2.0 | decoded: the
token  152 | frequency: 1.0 | decoded: over
token  588 | frequency: 1.0 | decoded: over the
token 1567 | frequency: 1.0 | decoded: dog
token 3088 | frequency: 1.0 | decoded: fox
token 3124 | frequency: 2.0 | decoded: quick
token 4447 | frequency: 2.0 | decoded: brown
token 6116 | frequency: 1.0 | decoded: jumps


In [13]:
# Same unigram + bigram vectorizer as before (20000-entry vocabulary), but with
# output_mode="tf_idf": each vocabulary index now holds a TF-IDF weight
# instead of a raw count. This rebinds `text_vectorization`.
text_vectorization = tf.keras.layers.TextVectorization(
    max_tokens=20000,
    ngrams=2,
    output_mode="tf_idf",
)
# build the vocabulary (and the statistics TF-IDF needs) from the training texts
text_vectorization.adapt(text_only_train_ds)

In [14]:
# Sanity check: vectorize one sentence, then decode every non-zero entry.
vocabulary = text_vectorization.get_vocabulary()
inverse_vocab = {index: term for index, term in enumerate(vocabulary)}  # index -> token string
tokens = text_vectorization("the quick brown fox jumps over the quick brown dog")
print(tokens) # instead of just 1 or a count at the word index we get the tf-idf quantity!
print()
words = tf.where(tokens > 0)                 # indices of the non-zero entries
for word in words.numpy().ravel().tolist():  # iterate over them as plain Python ints
    print(f"token {word:>4} | tf-idf: {tokens[word]:10.7f} | decoded: {inverse_vocab[word]}")

tf.Tensor([42.640827   1.3944945  0.        ...  0.         0.         0.       ], shape=(20000,), dtype=float32)

token    0 | tf-idf: 42.6408272 | decoded: [UNK]
token    1 | tf-idf:  1.3944945 | decoded: the
token  152 | tf-idf:  1.9256997 | decoded: over
token  588 | tf-idf:  2.9419792 | decoded: over the
token 1567 | tf-idf:  4.0243702 | decoded: dog
token 3088 | tf-idf:  4.7474136 | decoded: fox
token 3124 | tf-idf:  8.9111671 | decoded: quick
token 4447 | tf-idf:  9.9094706 | decoded: brown
token 6116 | tf-idf:  5.0974345 | decoded: jumps


---

### Training a simple model with TF-IDF

In [15]:
# our datasets
tfidf_2gram_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [17]:
def get_model(max_tokens=20000, hidden_dim=16, clear=True):
    """
    Build and compile a small dense binary classifier.

    Architecture: Input(max_tokens) -> Dense(hidden_dim, relu) -> Dropout(0.5)
    -> Dense(1, sigmoid).

    Args:
        max_tokens: size of the input vectors (one entry per vocabulary token).
        hidden_dim: number of units in the hidden dense layer.
        clear: if True, call `tf.keras.backend.clear_session()` before
            building the model.

    Returns:
        The compiled `tf.keras.Model` (rmsprop, binary cross-entropy, accuracy).
    """
    if clear:
        tf.keras.backend.clear_session()

    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation="relu")(inputs)
    hidden = layers.Dropout(0.5)(hidden)
    outputs = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = tf.keras.Model(inputs, outputs)
    classifier.compile(
        loss="binary_crossentropy", # our labels are only 0 or 1 (negative/positive)
        optimizer="rmsprop",
        metrics=["accuracy"],
    )
    return classifier

In [18]:
model = get_model()
model.summary()

# NOTE: in DLWP, he uses the current `.keras` format, but ModelCheckpoint
# in TF 2.13 now breaks when using it with `save_best_only`...
# see: https://github.com/keras-team/tf-keras/issues/151
checkpoint_path = str(base_dir / "tfidf_2gram.h5")
callbacks = [
    # save_best_only: the file is only overwritten when the monitored quantity improves
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_best_only=True)
]

# cache the vectorized batches so the texts are not re-vectorized at every epoch
model.fit(
    x=tfidf_2gram_train_ds.cache(),
    epochs=10,
    validation_data=tfidf_2gram_val_ds.cache(),
    callbacks=callbacks,
)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f288a3ad0d0>

In [19]:
# and testing
model = tf.keras.models.load_model(base_dir / "tfidf_2gram.h5")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds, verbose=0)[1]:.3f}") # this did not beat the bigram model

Test acc: 0.881
