# Modelo baseline con TF-IDF y N-gramas
Este cuaderno completa la Etapa 1 entrenando y evaluando el modelo baseline definido en el plan: un predictor de calificaciones basado en TF-IDF/N-gramas y un recomendador secuencial simple.

## Objetivos de este cuaderno
- Cargar las representaciones TF-IDF generadas en el cuaderno anterior y vincularlas con los conjuntos Train/Dev/Test.
- Entrenar un modelo de regresión basado en N-gramas para predecir calificaciones normalizadas y medir MAE/RMSE.
- Construir un recomendador secuencial de orden 2 y cuantificar Precision@K, Recall@K y MAP.
- Registrar hallazgos y preparar insumos para los documentos y presentaciones de la Etapa 1.

In [1]:
from pathlib import Path
import sys
import json
import pandas as pd
import numpy as np
from scipy import sparse
import joblib
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score
PROJECT_ROOT = Path("/home/lctr/SEMESTRES/SEMESTRE_5/MINERIA_DE_TEXTO/PROYECTO")
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
from sistema_recomendacion.src.evaluation.metrics import mae, rmse, precision_at_k, recall_at_k, mean_average_precision
from sistema_recomendacion.src.models.baseline import NgramSequenceRecommender

## Carga de datos y representaciones
Traemos los conjuntos procesados y las matrices TF-IDF guardadas en `data/processed/`. Solo conservamos columnas relevantes para las tareas de predicción y recomendación.

In [2]:
processed_dir = PROJECT_ROOT / "sistema_recomendacion" / "data" / "processed"
ratings_cols = ["book_id", "user_id", "rating", "review_time", "rating_normalized"]
parse_kwargs = {"usecols": ratings_cols, "parse_dates": ["review_time"]}
ratings_train = pd.read_csv(processed_dir / "ratings_train.csv", **parse_kwargs)
ratings_dev = pd.read_csv(processed_dir / "ratings_dev.csv", **parse_kwargs)
ratings_test = pd.read_csv(processed_dir / "ratings_test.csv", **parse_kwargs)
tfidf_vectorizer = joblib.load(processed_dir / "tfidf_vectorizer.joblib")
X_train = sparse.load_npz(processed_dir / "X_train_tfidf.npz")
X_dev = sparse.load_npz(processed_dir / "X_dev_tfidf.npz")
X_test = sparse.load_npz(processed_dir / "X_test_tfidf.npz")
ratings_train.head(20)

Unnamed: 0,book_id,user_id,rating,review_time,rating_normalized
0,B000G167FA,AZUNT3QP2CWTL,5.0,1969-12-31 23:59:59,1.0
1,B000G167FA,AWVWX5F3YEJKZ,5.0,1969-12-31 23:59:59,1.0
2,0786280670,AE3SEXFJCQLJQ,1.0,1969-12-31 23:59:59,0.0
3,B000G167FA,AAFZZHA2I598B,5.0,1969-12-31 23:59:59,1.0
4,B00005O4HA,A3RTKL9KB8KLID,5.0,1996-08-17 00:00:00,1.0
5,0671787381,A38LAIK2N83NH0,5.0,1996-11-18 00:00:00,1.0
6,015600206X,A38LAIK2N83NH0,4.0,1996-11-18 00:00:00,0.75
7,B000KW9IT2,A38LAIK2N83NH0,2.0,1996-11-20 00:00:00,0.25
8,B000P9X16G,A3PPXVR5J6U2JD,5.0,1996-12-16 00:00:00,1.0
9,B0001BJEG4,A3PPXVR5J6U2JD,5.0,1996-12-16 00:00:00,1.0


In [3]:
def describe_split(name: str, ratings_df: pd.DataFrame, matrix) -> dict:
    return {
        "split": name,
        "rows": len(ratings_df),
        "sparse_shape": matrix.shape,
        "density": float(matrix.nnz / (matrix.shape[0] * matrix.shape[1])) if matrix.shape[0] else 0.0,
    }
splits_summary = pd.DataFrame([
    describe_split("train", ratings_train, X_train),
    describe_split("dev", ratings_dev, X_dev),
    describe_split("test", ratings_test, X_test),
])
splits_summary

Unnamed: 0,split,rows,sparse_shape,density
0,train,1036854,"(1036854, 100000)",0.000942
1,dev,129607,"(129607, 100000)",0.00068
2,test,129607,"(129607, 100000)",0.000347


## Predicción de calificaciones con un modelo lineal
El baseline considera un modelo de regresión lineal optimizado con descenso estocástico (`SGDRegressor`) sobre la matriz TF-IDF. Este enfoque escala a millones de interacciones y respeta el requerimiento de un clasificador/regresor sobre N-gramas.

In [4]:
y_train = ratings_train["rating_normalized"].astype(np.float32).to_numpy()
y_dev = ratings_dev["rating_normalized"].astype(np.float32).to_numpy()
y_test = ratings_test["rating_normalized"].astype(np.float32).to_numpy()
sgd_regressor = SGDRegressor(
    loss="squared_error",
    penalty="l2",
    alpha=1e-5,
    max_iter=100,
    tol=1e-4,
    random_state=42,
    learning_rate="optimal",
    average=True,
    early_stopping=True,
    validation_fraction=0.05,
    n_iter_no_change=5,
)
sgd_regressor.fit(X_train, y_train)



0,1,2
,loss,'squared_error'
,penalty,'l2'
,alpha,1e-05
,l1_ratio,0.15
,fit_intercept,True
,max_iter,100
,tol,0.0001
,shuffle,True
,verbose,0
,epsilon,0.1


In [5]:
def evaluate_regressor(model, X, y_true, split_name: str) -> dict:
    y_pred = model.predict(X)
    y_pred = np.clip(y_pred, 0.0, 1.0)
    return {
        "split": split_name,
        "mae": mae(y_true, y_pred),
        "rmse": rmse(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
    }
regression_metrics = pd.DataFrame([
    evaluate_regressor(sgd_regressor, X_train, y_train, "train"),
    evaluate_regressor(sgd_regressor, X_dev, y_dev, "dev"),
    evaluate_regressor(sgd_regressor, X_test, y_test, "test"),
])
regression_metrics

Unnamed: 0,split,mae,rmse,r2
0,train,0.50074,0.650144,-4.1273
1,dev,0.496148,0.650643,-4.252834
2,test,0.500183,0.661701,-6.422797


In [6]:
def rescale_to_rating(pred_normalized: np.ndarray) -> np.ndarray:
    return pred_normalized * 4.0 + 1.0
def evaluate_rescaled(model, X, y_true, split_name: str) -> dict:
    y_pred_norm = np.clip(model.predict(X), 0.0, 1.0)
    y_pred_rating = rescale_to_rating(y_pred_norm)
    y_true_rating = rescale_to_rating(y_true)
    return {
        "split": split_name,
        "mae_rating": mae(y_true_rating, y_pred_rating),
        "rmse_rating": rmse(y_true_rating, y_pred_rating),
        "r2 score": r2_score(y_true_rating, y_pred_rating),
    }
regression_metrics_rating_scale = pd.DataFrame([
    evaluate_rescaled(sgd_regressor, X_train, y_train, "train"),
    evaluate_rescaled(sgd_regressor, X_dev, y_dev, "dev"),
    evaluate_rescaled(sgd_regressor, X_test, y_test, "test"),
])
regression_metrics_rating_scale

Unnamed: 0,split,mae_rating,rmse_rating,r2 score
0,train,2.002961,2.600576,-4.1273
1,dev,1.984592,2.602571,-4.252834
2,test,2.000733,2.646804,-6.422797


## Recomendador secuencial de orden 2
Para la parte secuencial del baseline entrenamos un modelo simple de N-gramas (orden 2) que aprende transiciones entre libros dentro del historial de cada usuario. Evaluamos su capacidad de sugerir el siguiente ítem en los conjuntos Dev y Test.

In [13]:
def build_user_histories(ratings: pd.DataFrame) -> pd.Series:
    ordered = ratings.sort_values(["user_id", "review_time"])
    return ordered.groupby("user_id")[["book_id"]].agg(list)["book_id"]
train_histories = build_user_histories(ratings_train)
dev_histories = build_user_histories(ratings_dev)
test_histories = build_user_histories(ratings_test)
train_histories.head(20)

user_id
A1001QR844MH6D                 [0792733576, B0006SJ2GE, 079273355X]
A10021RC8DJY3L    [B000HMY3ZY, B000N6QL8G, 1558006915, B00089MZY...
A100307XCCSKWR     [B000PGI7QI, B000MWC3FQ, B000PWMT1G, B000MOOAJG]
A1004HHMSDY5IP                 [0805010459, B0009JKV7E, 1890627429]
A1005TJ8GDAWZA                 [B000I3NFKG, B000PCESRE, B000PMCF1A]
A1008EEMWRT7DD    [B000PGI7QI, B000MWC3FQ, B000MOOAJG, B000PWMT1...
A100ALUJWBXF0Y                                         [B000P8NZBI]
A100BWRR2UOZ41                 [060981009X, 0330487965, B0002XH6T8]
A100NGGXRQF0AQ    [B000K718XQ, 0679751254, 0679755829, 095500612...
A100PHXQUU7ZZN                             [0006552269, B0001WOUKI]
A100SHSGDS1SOM     [1850891648, B00006SL45, B000F5X89K, 0894715208]
A100TQ7ZRE0W02     [0971237034, 0972800522, 0976325608, 0971237018]
A100TW8FZECWD6     [0553574272, 0553580132, 0505520389, B000K8QSXU]
A100UD67AHFODS                 [1570713383, 1875671110, 0794819931]
A100UI1EDZB5UQ    [B000GQK706, B000FAIRN

In [11]:
ngram_recommender = NgramSequenceRecommender(n=2, max_suggestions=20)
ngram_recommender.fit(train_histories.tolist())

In [12]:
def generate_eval_cases(histories: pd.Series, n: int):
    contexts: list[list[str]] = []
    relevant: list[list[str]] = []
    for history in histories.tolist():
        if len(history) < n:
            continue
        for idx in range(n - 1, len(history)):
            context = history[idx - (n - 1) : idx]
            target = history[idx]
            contexts.append(context)
            relevant.append([target])
    return contexts, relevant
def evaluate_sequential(model: NgramSequenceRecommender, histories: pd.Series, split_name: str, k: int = 10) -> dict:
    contexts, relevant = generate_eval_cases(histories, model.n)
    recommendations = [model.recommend(context) for context in contexts]
    coverage = sum(1 for recs in recommendations if recs) / max(len(recommendations), 1)
    return {
        "split": split_name,
        "cases": len(recommendations),
        "coverage": coverage,
        f"precision@{k}": precision_at_k(recommendations, relevant, k),
        f"recall@{k}": recall_at_k(recommendations, relevant, k),
        f"map@{k}": mean_average_precision(recommendations, relevant, k),
    }
sequential_metrics = pd.DataFrame([
    evaluate_sequential(ngram_recommender, dev_histories, "dev", k=10),
    evaluate_sequential(ngram_recommender, test_histories, "test", k=10),
])
sequential_metrics

Unnamed: 0,split,cases,coverage,precision@10,recall@10,map@10
0,dev,103269,0.994122,0.080503,0.666977,0.342887
1,test,106058,0.99538,0.081115,0.714675,0.324652


## Hallazgos y próximos pasos
- El modelo lineal obtiene métricas preliminares sobre las calificaciones normalizadas y en escala original. Debemos comparar contra modelos alternativos en la Etapa 2.
- El recomendador de orden 2 ofrece cobertura limitada fuera de los patrones frecuentes; conviene explorar suavizado o N-gramas de orden mayor.
- Documentar estas métricas en `docs/documentos/etapa1_baseline.md` y actualizar la presentación con gráficos de desempeño.
- Próximo trabajo: experimentar con embeddings alternativos (Word2Vec/Sentence-BERT) y modelos más expresivos (LightFM, GBTs) para la Etapa 2.

In [15]:
ratings_lookup = pd.read_csv(
    processed_dir / "books_ratings_clean.csv",
    usecols=["book_id", "book_title_key"],
).dropna(subset=["book_id", "book_title_key"])
metadata_lookup = pd.read_csv(
    processed_dir / "books_metadata_clean.csv",
    usecols=["book_title_key", "book_title"],
).dropna(subset=["book_title_key"])
id_to_title = (
    ratings_lookup.merge(metadata_lookup, on="book_title_key", how="left")
    .dropna(subset=["book_title"])
    .drop_duplicates(subset=["book_id"])
    .set_index("book_id")["book_title"].to_dict()
 )
def as_title(book_id: str) -> str:
    return id_to_title.get(book_id, book_id)

In [16]:
def sample_recommendations(histories: pd.Series, model: NgramSequenceRecommender, n_samples: int = 10) -> pd.DataFrame:
    examples = []
    for user_id, history in histories.items():
        if len(history) < model.n - 1:
            continue
        context = history[-(model.n - 1):] if model.n > 1 else []
        recs = model.recommend(context)
        if not recs:
            continue
        examples.append({
            "user_id": user_id,
            "context": [as_title(book_id) for book_id in context],
            "recommendations": [as_title(book_id) for book_id in recs[:10]],
        })
        if len(examples) == n_samples:
            break
    return pd.DataFrame(examples)
sample_recommendations(dev_histories, ngram_recommender, n_samples=10)

Unnamed: 0,user_id,context,recommendations
0,A00787411M1CAS4K6H99N,[Soul Surfer],[Footprints of a Pilgrim: The Life and Loves o...
1,A008059932M4DUB2IWDB8,[Seven pillars of wisdom: A triumph],"[Seven pillars of wisdom: A triumph, Seven pil..."
2,A00891092QIVH4W1YP46A,[Jane Eyre / Wuthering Heights],"[Wuthering Heights, Wuthering Heights (The Fra..."
3,A01254073JW8SSTKH6AIB,[Wuthering Heights.],"[Wuthering Heights, Wuthering Heights (College..."
4,A0134066213WYQXLTVGYT,[Royal Panoply: Brief Lives of the English Mon...,"[Sense and sensibility, Middlemarch;: A study ..."
5,A01416042M2UP370M5JO,[The Hitchhiker's Guide to the Galaxy],"[The Hitchhiker's Guide to the Galaxy, The Hit..."
6,A025268923L497N34PUMH,[A Fine Balance],"[A Fine Balance, A Fine Balance, God of Small ..."
7,A03816223LL3Q1P48HRU,[The Hobbit There and Back Again],"[The Hobbit, The Hobbit, The Hobbitt, or there..."
8,A038312122Z9EEO33LIWH,"[The Hobbitt, or there and back again; illustr...","[The Hobbit, The Hobbit, The Hobbit, The Hobbi..."
9,A0461265TOJ3VFHJUMY7,[King Rat],"[King Rat, King Rat, TAI-PAN, Noble House, The..."
