<a href="https://colab.research.google.com/github/jonathas-1993/Topicos_UFAM/blob/main/4_TextVectorization_tf_idf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# %%
# =========================================
# Sentiment Analysis IMDB - TF-IDF
# Neste notebook, vamos baixar o dataset IMDB, preparar datasets de treino, validação e teste,
# aplicar TextVectorization com TF-IDF e treinar um modelo simples de classificação binária.
# =========================================

# %%
# =========================================
# Importações
# =========================================
import os
import pathlib
import random
import shutil
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [12]:
# %%
# =========================================
# Download e descompactação do dataset IMDB
# =========================================
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!rm -r aclImdb/train/unsup  # Remove reviews não rotulados

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  4697k      0  0:00:17  0:00:17 --:--:-- 7436k


In [13]:
# %%
# =========================================
# Criar diretórios de validação (20% do treino)
# =========================================
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"

for category in ("neg", "pos"):
    os.makedirs(val_dir / category, exist_ok=True)
    # Re-running this cell must not carve another 20% out of the already
    # reduced training set, so skip a category that has been split before.
    if any((val_dir / category).iterdir()):
        continue
    # os.listdir returns names in arbitrary order; sort first so the seeded
    # shuffle selects the same validation files on every machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)  # Fixed seed for reproducibility
    num_val_samples = int(0.2 * len(files))
    # Slice from an explicit start index: files[-0:] would select every file.
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname, val_dir / category / fname)

In [14]:
# %%
# =========================================
# Criar datasets do TensorFlow
# =========================================
batch_size = 32

# Build the three batched text datasets in the order train, validation, test;
# each directory is read with the same batch size.
train_ds, val_ds, test_ds = (
    keras.utils.text_dataset_from_directory(split_dir, batch_size=batch_size)
    for split_dir in (train_dir, val_dir, base_dir / "test")
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [15]:
# %%
# =========================================
# Preparar TF-IDF com TextVectorization
# =========================================
max_tokens = 20000

text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="tf_idf"
)

# Adapt the vocabulary using the training texts only, so nothing from the
# validation or test sets influences the vectorizer.
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)


def vectorize_dataset(dataset):
    """Return `dataset` with each text batch passed through `text_vectorization`.

    Labels are forwarded unchanged. The degree of parallelism is left to
    tf.data (AUTOTUNE) instead of a hard-coded worker count.
    """
    return dataset.map(
        lambda x, y: (text_vectorization(x), y),
        num_parallel_calls=tf.data.AUTOTUNE,
    )


# Map every split through the same vectorizer.
tfidf_train_ds = vectorize_dataset(train_ds)
tfidf_val_ds = vectorize_dataset(val_ds)
tfidf_test_ds = vectorize_dataset(test_ds)

In [16]:
# %%
# =========================================
# Criar o modelo
# =========================================
def get_model(max_tokens=20000, hidden_dim=16):
    """Build and compile a small dense binary classifier.

    Args:
        max_tokens: Length of each input vector; the input shape is (max_tokens,).
        hidden_dim: Number of units in the single hidden ReLU layer.

    Returns:
        A compiled `keras.Model`: Dense(hidden_dim, relu) -> Dropout(0.5) ->
        Dense(1, sigmoid), using the rmsprop optimizer, binary cross-entropy
        loss and the accuracy metric.
    """
    features = keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation="relu")(features)
    hidden = layers.Dropout(0.5)(hidden)
    prediction = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = keras.Model(inputs=features, outputs=prediction)
    classifier.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
    return classifier

# Instantiate the classifier with an input width equal to the vectorizer's
# vocabulary size, then print its layer summary.
model = get_model(max_tokens, hidden_dim=16)
model.summary()

In [17]:
# %%
# =========================================
# Checkpoint para salvar melhores pesos
# =========================================
# File that receives the best weights seen during training.
checkpoint_filepath = "/tmp/checkpoint.weights.h5"

# Save weights only, and only when validation accuracy reaches a new maximum.
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
)

In [18]:
# %%
# =========================================
# Treinamento
# =========================================
# Keep the returned History so the per-epoch metrics remain available after
# training; assigning it also stops the bare object repr from becoming the
# cell's output. The datasets are cached so the vectorized batches can be
# reused across epochs.
history = model.fit(
    tfidf_train_ds.cache(),
    validation_data=tfidf_val_ds.cache(),
    epochs=5,
    callbacks=[model_checkpoint_callback]
)


Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 19ms/step - accuracy: 0.7727 - loss: 0.4846 - val_accuracy: 0.9018 - val_loss: 0.2749
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9038 - loss: 0.2594 - val_accuracy: 0.8994 - val_loss: 0.2963
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.9224 - loss: 0.2174 - val_accuracy: 0.8962 - val_loss: 0.3110
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9325 - loss: 0.1965 - val_accuracy: 0.8984 - val_loss: 0.3277
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9419 - loss: 0.1769 - val_accuracy: 0.8948 - val_loss: 0.3543


<keras.src.callbacks.history.History at 0x7d58bf398bf0>

In [19]:
# %%
# =========================================
# Avaliação com os melhores pesos
# =========================================
# Restore the weights saved by the checkpoint callback before scoring the test set.
model.load_weights(checkpoint_filepath)

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(tfidf_test_ds)
test_loss, test_acc = test_metrics
print(f"Test accuracy: {test_acc:.3f}")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.8892 - loss: 0.2948
Test accuracy: 0.888
