In [9]:
import numpy as np
import pandas as pd
import joblib

from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import stopwords

# 1) Cargar datos
books_df = pd.read_csv("../data/processed/books.csv")

# Combinar stopwords de nltk con otras posibles palabras irrelevantes
my_stop_words = {'to-read', 'closed', 'abandoned-books'
'rating-top', 'literature', 'not-interested', 'libricos', 'recommended',
'000-next', 
'001-ladder-top', '002-ladder-short-term', '003-ladder-medium-term', '004-ladder-long-term', 
'005-ladder-maybe-someday', 
'_cristina', '_giorgia', '_natalia', '_nieves', '_pilar', '_sindy', '_víctor', 'chełmińska'
}
stop_words = set(my_stop_words)
stop_words = stop_words.union(set(stopwords.words('english')))
stop_words = stop_words.union(set(stopwords.words('spanish')))
stop_words = stop_words.union(set(stopwords.words('french')))
stop_words = stop_words.union(set(stopwords.words('italian')))

# 3. Procesar los tags: convertirlos en listas y filtrar stop-words
def filter_stopwords(tags):
    """
    Filtra las stop-words de una lista de etiquetas.
    """
    return [tag for tag in tags if tag.lower() not in stop_words]

books_df['text'] = books_df['book_title'] + " " + books_df['blurb']
# Filtyer stopwords from text
books_df['tags'] = books_df['tags'].apply(lambda x: filter_stopwords([t.strip() for t in str(x).split(',')]))
books_df['text'] = books_df['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# 7) Cargar Sentence-BERT y vectorizar
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') 
X = model.encode(books_df['text'].tolist(), show_progress_bar=True)

# 8) Binarizar las etiquetas
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(books_df['tags'])

# 9) Separar train y test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 10) Construir el Random Forest en modo multi-output
rf_base = RandomForestClassifier(random_state=42)
multi_rf = MultiOutputClassifier(rf_base)

# 11) Definir la grilla de hiperparámetros
param_grid = {
    'estimator__n_estimators': [10, 20],
    'estimator__max_depth': [5, 10],
    'estimator__min_samples_leaf': [1, 2],
}

# 12) GridSearchCV
grid_search = GridSearchCV(
    multi_rf,
    param_grid=param_grid,
    cv=2, 
    scoring='f1_micro',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

print("Mejores hiperparámetros:", grid_search.best_params_)
print("Mejor puntuación (CV):", grid_search.best_score_)

best_rf = grid_search.best_estimator_

# 13) Evaluar en test
test_score = best_rf.score(X_test, y_test)
print("Puntuación final en test:", test_score)

# 14) Guardar modelo y binarizador
joblib.dump(best_rf, "../model/book_tagging_rf.joblib")
joblib.dump(mlb, "../model/book_tagging_rf_mlb.joblib")


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Fitting 2 folds for each of 8 candidates, totalling 16 fits
Mejores hiperparámetros: {'estimator__max_depth': 10, 'estimator__min_samples_leaf': 1, 'estimator__n_estimators': 10}
Mejor puntuación (CV): 0.07163693459466232
Puntuación final en test: 0.05172413793103448


['../model/book_tagging_rf_mlb.joblib']