In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# 1. Cargar y preprocesar los datos
df = pd.read_csv("../data/raw/goodreads_data_sample.csv")

# Normalizar columnas necesarias
df['Book'] = df['Book'].fillna('')
df['Description'] = df['Description'].fillna('')
df['Genres'] = df['Genres'].fillna("[]")

# Crear la columna 'text' combinando título y descripción
df['text'] = df['Book'] + ". " + df['Description']

# Convertir la columna 'Genres' de cadenas a listas reales
df['tags'] = df['Genres'].apply(eval)

# 2. Codificar las etiquetas (tags)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['tags'])

# 3. Convertir texto a embeddings
model_embed = SentenceTransformer('all-MiniLM-L6-v2')
X_embeddings = model_embed.encode(df['text'].tolist(), show_progress_bar=True)

# 4. Dividir los datos en entrenamiento y validación
X_train, X_val, y_train, y_val = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)

# 5. Definir el modelo Keras
input_dim = X_train.shape[1]
num_classes = y_train.shape[1]

model = Sequential([
    Dense(128, activation='relu', input_shape=(input_dim,)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='sigmoid')  # Para clasificación multilabel
])

# Compilar el modelo
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# 6. Entrenar el modelo
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# 7. Predecir etiquetas para nuevos datos
# Libros de prueba
test_books = [
    {
        "text": "Messi: Edición revisada y actualizada. Una biografía del astro argentino.",
        "expected_tags": ["Biography", "Sports", "Football"]
    },
    {
        "text": "The Great Gatsby. A novel about the American dream and the roaring twenties.",
        "expected_tags": ["Classics", "Fiction", "Literature"]
    },
    {
        "text": "A Brief History of Time. Stephen Hawking explains the universe and black holes.",
        "expected_tags": ["Science", "Nonfiction", "Physics"]
    },
    {
        "text": "The Catcher in the Rye. A story about teenage rebellion and identity.",
        "expected_tags": ["Classics", "Fiction", "Young Adult"]
    },
    {
        "text": "The Art of War. Ancient Chinese military strategy by Sun Tzu.",
        "expected_tags": ["Philosophy", "History", "Nonfiction"]
    },
    {
        "text": "Harry Potter and the Chamber of Secrets. The second book in the Harry Potter series.",
        "expected_tags": ["Fantasy", "Fiction", "Young Adult"]
    }
]

# Predecir etiquetas para los libros de prueba
for book in test_books:
    sample_embedding = model_embed.encode([book["text"]])
    predictions = model.predict(sample_embedding)
    predicted_tags = mlb.inverse_transform((predictions > 0.5).astype(int))
    print(f"Book: {book['text']}")
    print(f"Expected Tags: {book['expected_tags']}")
    print(f"Predicted Tags: {predicted_tags}")
    print("-" * 50)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 719ms/step - accuracy: 0.0000e+00 - loss: 0.6995 - val_accuracy: 0.0000e+00 - val_loss: 0.6949
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.0000e+00 - loss: 0.6930 - val_accuracy: 0.0000e+00 - val_loss: 0.6941
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step - accuracy: 0.0000e+00 - loss: 0.6921 - val_accuracy: 0.0000e+00 - val_loss: 0.6932
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.0000e+00 - loss: 0.6866 - val_accuracy: 0.0000e+00 - val_loss: 0.6921
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.0000e+00 - loss: 0.6849 - val_accuracy: 0.0000e+00 - val_loss: 0.6909
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.0000e+00 - loss: 0.6842 - val_accuracy: 0.0000e+00 - val_loss: 0.6897
Epoch 7/