In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.model_selection import train_test_split

# --- 2. Load and prepare the IMDB dataset from TFDS ---

# Load the IMDB Reviews dataset (it comes pre-labelled: 0 for negative, 1 for positive)
(ds_train, ds_test), ds_info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    as_supervised=True,  # Each example is a (text, label) pair
    with_info=True
)

# Helper that converts a tf.data.Dataset into plain lists of texts and labels
def dataset_to_lists(dataset):
    """Materialise a supervised dataset as two parallel Python lists.

    Args:
        dataset: Iterable dataset of (text, label) pairs; it is walked once
            through ``tfds.as_numpy``.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` holds each text decoded from
        UTF-8 bytes to ``str``; ``labels`` holds each label cast to ``int``.
        Both lists keep the dataset's iteration order.
    """
    decoded_pairs = [
        (raw_text.decode('utf-8'), int(raw_label))
        for raw_text, raw_label in tfds.as_numpy(dataset)
    ]
    texts = [review for review, _ in decoded_pairs]
    labels = [sentiment for _, sentiment in decoded_pairs]
    return texts, labels

# Materialise both loaded splits as plain Python lists.
train_texts, train_labels = dataset_to_lists(ds_train)
test_texts, test_labels = dataset_to_lists(ds_test)

# Hold out part of the training data for validation (10% here);
# random_state is fixed so the split is the same on every run.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,
    random_state=42,
)



Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.IBB7L3_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.IBB7L3_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.IBB7L3_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [5]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from sklearn.model_selection import train_test_split

# Model and preprocessing parameters
max_seq_length = 512   # Maximum sequence length
d_model = 256         # Dimension of the embeddings and of the model
max_tokens = 100000     # Vocabulary size

# --- 1. Define helper functions ---

# Helpers that generate the positional encoding (required by the Transformer)
def get_angles(pos, i, d_model):
    """Compute the raw angles used by the sinusoidal positional encoding.

    Args:
        pos: Position index or array of position indices.
        i: Embedding-dimension index or array of indices; each consecutive
            even/odd pair shares the same rate because of the ``i // 2``.
        d_model: Size of the embedding dimension.

    Returns:
        ``pos / 10000 ** (2 * (i // 2) / d_model)``, broadcast over the
        shapes of ``pos`` and ``i``.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    inverse_frequency = 1 / np.power(10000, exponent)
    return pos * inverse_frequency

def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to encode.
        d_model: Size of the embedding dimension (columns).

    Returns:
        A ``tf.float32`` tensor of shape ``(1, position, d_model)``; the
        leading axis of size 1 lets it broadcast over a batch.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_indices = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, column_indices, d_model)

    # Sine goes on the even columns, cosine on the odd ones (in place).
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    batched_table = table[np.newaxis, ...]
    return tf.cast(batched_table, dtype=tf.float32)


def encoder_branch(input_tensor, d_model, num_heads=5, ffn_units=(128, 64), dropout_rate=0.2):
    """Apply one Transformer-encoder-style block to a sequence tensor.

    The block is: multi-head self-attention -> residual add -> layer norm ->
    feed-forward stack -> residual add -> layer norm.

    Args:
        input_tensor: Sequence tensor fed to the attention layer as query,
            value and key. Its last dimension is expected to equal
            ``d_model`` so that the residual additions are shape-compatible.
        d_model: Output width of the final feed-forward projection; the
            per-head key dimension is ``d_model // 2``.
        num_heads: Number of attention heads (default 5, the value that was
            previously hard-coded).
        ffn_units: Widths of the hidden ReLU layers of the feed-forward
            network, in order (default ``(128, 64)``, as before). Each hidden
            layer is followed by dropout.
        dropout_rate: Dropout rate applied after each hidden feed-forward
            layer (default 0.2, as before).

    Returns:
        The encoded tensor after the second layer normalization.
    """
    # Multi-head self-attention: the same tensor is used as query, value and key.
    attention_output = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=d_model // 2
    )(input_tensor, input_tensor, input_tensor)

    # First residual connection + normalization.
    x = attention_output + input_tensor
    x_before_ffn = tf.keras.layers.LayerNormalization()(x)

    # Feed-forward network: hidden ReLU layers with dropout, then a linear
    # projection back to d_model so the result can be added to its input.
    x = x_before_ffn
    for units in ffn_units:
        x = tf.keras.layers.Dense(units, activation='relu')(x)
        x = tf.keras.layers.Dropout(dropout_rate)(x)
    x = tf.keras.layers.Dense(d_model)(x)

    # Second residual connection + normalization.
    x = x + x_before_ffn
    x = tf.keras.layers.LayerNormalization()(x)
    return x

# Builds the Transformer-based sentiment classifier
def build_transformer_sentiment_classifier(texts, max_seq_length, d_model, max_tokens):
    """Create the sentiment model together with its adapted text vectorizer.

    The returned model takes integer token sequences, not raw text: callers
    must run their texts through the returned vectorizer first.

    Args:
        texts: Corpus the ``TextVectorization`` layer is adapted on.
        max_seq_length: Fixed length of the integer sequences emitted by the
            vectorizer and expected by the model input.
        d_model: Embedding / model dimension.
        max_tokens: Vocabulary size cap for the vectorizer; also used as the
            embedding layer's ``input_dim``.

    Returns:
        A ``(model, vectorizer)`` tuple: the uncompiled ``tf.keras.Model``
        with a single sigmoid output, and the adapted vectorizer.
    """
    # Text vectorization (tokenization) layer, fitted on the given texts.
    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens,
        output_mode="int",
        output_sequence_length=max_seq_length,
    )
    vectorizer.adapt(texts)

    # Model input: one fixed-length sequence of integer token ids.
    token_ids = tf.keras.Input(shape=(max_seq_length,), dtype=tf.int32)

    # Token embeddings plus the positional encoding.
    token_embeddings = tf.keras.layers.Embedding(
        input_dim=max_tokens, output_dim=d_model
    )(token_ids)
    position_table = positional_encoding(max_seq_length, d_model)
    encoder_input = token_embeddings + position_table

    # Transformer encoder block.
    encoded = encoder_branch(encoder_input, d_model)

    # Global average pooling condenses the sequence into a single vector.
    pooled = tf.keras.layers.GlobalAveragePooling1D()(encoded)

    # Final classification layer: one sigmoid unit for binary classification.
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(pooled)

    classifier = tf.keras.Model(inputs=token_ids, outputs=probability)
    return classifier, vectorizer

# --- 3. Build and compile the model ---

model, vectorizer = build_transformer_sentiment_classifier(
    train_texts, max_seq_length, d_model, max_tokens
)

# Convert the texts of every split into integer sequences with the vectorizer
train_sequences = vectorizer(tf.constant(train_texts))
val_sequences = vectorizer(tf.constant(val_texts))
test_sequences = vectorizer(tf.constant(test_texts))

# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# --- 4. Train the model ---

# Labels are plain Python lists, so wrap them in arrays for fit().
train_targets = np.array(train_labels)
val_targets = np.array(val_labels)

history = model.fit(
    train_sequences,
    train_targets,
    validation_data=(val_sequences, val_targets),
    epochs=4,
    batch_size=32,
)

Epoch 1/4
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 59ms/step - accuracy: 0.5228 - loss: 0.7033 - val_accuracy: 0.5172 - val_loss: 0.6912
Epoch 2/4
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 30ms/step - accuracy: 0.7355 - loss: 0.5096 - val_accuracy: 0.8352 - val_loss: 0.3600
Epoch 3/4
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 30ms/step - accuracy: 0.8791 - loss: 0.2904 - val_accuracy: 0.8464 - val_loss: 0.3554
Epoch 4/4
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 30ms/step - accuracy: 0.9170 - loss: 0.2110 - val_accuracy: 0.8732 - val_loss: 0.3097


In [None]:
def get_model_prediction(texts):
    """Predict binary sentiment classes for raw texts.

    Relies on the notebook-level ``vectorizer`` and ``model`` objects.

    Args:
        texts: Raw text inputs; they are wrapped in ``tf.constant`` and
            vectorized before prediction.

    Returns:
        Integer array with 1 where the model output is above 0.5 and 0
        otherwise, in the same shape as the output of ``model.predict``.
    """
    # Reuse the already-adapted vectorizer so the token ids match training.
    token_sequences = vectorizer(tf.constant(texts))
    probabilities = model.predict(token_sequences)
    return (probabilities > 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = get_model_prediction(test_texts)

# Overall accuracy on the test split. The value is stored and printed
# explicitly: a bare expression in the middle of a cell is evaluated and then
# silently discarded, so it would never appear in the cell output.
test_accuracy = accuracy_score(test_labels, y_pred)
print(f"Test accuracy: {test_accuracy:.4f}")

# Per-class precision, recall and F1 report
print(classification_report(test_labels, y_pred))

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step
              precision    recall  f1-score   support

           0       0.84      0.89      0.87     12500
           1       0.88      0.83      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

