# Procesamiento de datos

In [None]:
!pip install lightfm

# Montar drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

text_embeddings = np.load("/content/drive/MyDrive/datasets/RecSys/combined_text_embeddings.npy", allow_pickle=True).item()

image_embeddings_0 = np.load("/content/drive/MyDrive/datasets/RecSys/image_embeddings_parte_0.npy", allow_pickle=True).item()
image_embeddings_2 = np.load("/content/drive/MyDrive/datasets/RecSys/image_embeddings_parte_2.npy", allow_pickle=True).item()
image_embeddings_3 = np.load("/content/drive/MyDrive/datasets/RecSys/image_embeddings_parte_3.npy", allow_pickle=True).item()

image_embeddings = {**image_embeddings_0, **image_embeddings_2, **image_embeddings_3}

interacciones = pd.read_csv('/content/drive/MyDrive/datasets/RecSys/interaction.csv')

item_info = pd.read_csv('/content/drive/MyDrive/datasets/RecSys/item_info.csv')


if isinstance(interacciones, np.ndarray):  # Si es una matriz NumPy
    interacciones = pd.DataFrame(interacciones, columns=['user_id', 'item_id', 'interaction_type'])

from scipy.sparse import coo_matrix

# Si interacciones es una matriz dispersa, conviértela a DataFrame
if isinstance(interacciones, coo_matrix):
    interacciones_df = pd.DataFrame({
        'user_id': interacciones.row,
        'item_id': interacciones.col,
        'timestamp': interacciones.data
    })
    interacciones = interacciones_df

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

def create_dataset_batched(text_embeddings, image_embeddings, interacciones, output_file="dataset.csv"):
    # Encuentra la intersección de ítems
    def normalize_ids(ids, remove_extension=False):
        normalized = set()
        for id_ in ids:
            normalized_id = id_.strip().lower()
            if remove_extension and '.' in normalized_id:
                normalized_id = normalized_id.split('.')[0]
            normalized.add(normalized_id)
        return normalized

    # Normalizar los IDs
    text_items = normalize_ids(text_embeddings.keys())
    image_items = normalize_ids(image_embeddings.keys(), remove_extension=True)
    interaction_items = normalize_ids(interacciones['item_id'].unique())
    common_items = text_items.intersection(image_items).intersection(interaction_items)

    # Filtrar las interacciones para los ítems comunes
    interacciones = interacciones[interacciones['item_id'].isin(common_items)]

    # Procesar por lotes y guardar
    chunk_size = 10000
    with open(output_file, 'w') as f:
        for i in range(0, len(interacciones), chunk_size):
            batch = interacciones.iloc[i:i + chunk_size]
            data = []
            for _, row in batch.iterrows():
                user_id = row['user_id']
                item_id = row['item_id']

                # Obtener los embeddings de texto
                text_emb_tag = text_embeddings[item_id].get('tag', [float('nan')])
                text_emb_title = text_embeddings[item_id].get('title', [float('nan')])
                text_emb_description = text_embeddings[item_id].get('description', [float('nan')])

                # Obtener el embedding de imagen (eliminar la extensión .jpg)
                image_emb = image_embeddings.get(f"{item_id}.jpg", [float('nan')])

                # Agregar los datos al lote
                data.append([user_id, item_id] + list(text_emb_tag) + list(text_emb_title) + list(text_emb_description) + list(image_emb) + [row["timestamp"]])

            # Crear DataFrame por cada lote
            df = pd.DataFrame(data, columns=['user_id', 'item_id', 'text_emb_tag', 'text_emb_title', 'text_emb_description', 'image_emb', 'timestamp'])
            df.to_csv(f, mode='a', header=f.tell() == 0, index=False)

    return f"Dataset guardado en {output_file}"


In [None]:
create_dataset_batched(text_embeddings, image_embeddings, interacciones, output_file="/content/drive/MyDrive/datasets/RecSys/dataset.csv")

# Modelo LightFM


In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

# Cargar dataset
df = pd.read_csv('/content/drive/MyDrive/datasets/RecSys/dataset.csv')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

def reduce_and_apply_pca(dataframe, n_components=2, n_interactions=100):

    # Se obtiene de manera aleatoria n interacciones
    reduced_df = dataframe.sample(n=n_interactions, random_state=42)

    # Seleccionar las columnas de embeddings
    embedding_columns = ['text_emb_tag', 'text_emb_title', 'text_emb_description', 'image_emb']

    # Convertir las cadenas de texto de los embeddings en arrays numéricos
    for col in embedding_columns:
        reduced_df[col] = reduced_df[col].apply(lambda x: np.fromstring(x.strip('[]'), sep=' ') if pd.notnull(x) else np.zeros(300))

    # Asegurarse de que todos los embeddings tengan la misma longitud
    max_length = max(reduced_df[col].apply(len).max() for col in embedding_columns)
    for col in embedding_columns:
        reduced_df[col] = reduced_df[col].apply(lambda x: np.pad(x, (0, max_length - len(x)), 'constant'))

    # Concatenar todos los embeddings en una sola matriz
    embeddings_matrix = np.hstack([np.vstack(reduced_df[col].values) for col in embedding_columns])

    # Aplicar PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(embeddings_matrix)

    # Crear un DataFrame con las componentes principales
    principal_df = pd.DataFrame(data=principal_components, columns=[f'PC{i+1}' for i in range(n_components)])

    # Añadir las columnas 'timestamp', 'user_id' e 'item_id' de vuelta al DataFrame
    final_df = pd.concat([reduced_df[['timestamp', 'user_id', 'item_id']].reset_index(drop=True), principal_df], axis=1)

    return final_df

pca_df = reduce_and_apply_pca(df, n_components=200, n_interactions = 50000)
pca_df.head()

Unnamed: 0,timestamp,user_id,item_id,PC1,PC2,PC3,PC4,PC5,PC6,PC7,...,PC191,PC192,PC193,PC194,PC195,PC196,PC197,PC198,PC199,PC200
0,1607004670,u1926709,i21633,-0.63069,0.141525,-0.109315,-0.044869,0.248739,-0.162289,0.129708,...,0.044192,0.00683,-0.001708,0.03667,0.008651,0.025679,0.000744,-0.048505,-0.014301,0.034325
1,1560168977,u2600231,i176803,0.168497,-0.348206,0.163946,0.068984,0.013752,-0.275159,0.058521,...,-0.000327,0.001342,0.043496,-0.002934,-0.03049,0.020761,-0.036156,0.00707,-0.01163,0.021489
2,1648519202,u9746763,i176675,0.220104,-0.073572,-0.101976,-0.316948,0.16633,0.01109,0.001095,...,-0.00703,0.023593,-0.10214,-0.030644,-0.001675,-0.033163,-0.058924,-0.029537,0.008213,-0.006305
3,1633166809,u6205552,i162587,0.186036,0.196992,-0.228251,-0.041422,-0.173273,-0.047366,0.222782,...,0.018019,-0.063919,0.055177,-0.003887,0.043706,0.093922,-0.008833,-0.005835,0.025865,0.003686
4,1562029845,u2733424,i132119,-0.622688,0.146248,-0.156496,-0.085645,0.150999,-0.097624,-0.157683,...,0.022461,-0.036589,-0.015778,-0.013724,-0.027335,0.010161,-0.030331,0.011521,0.048029,0.004221


In [None]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/316.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━[0m [32m225.3/316.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp310-cp310-linux_x86_64.whl size=804887 sha256=b8cde550072bf8ffc68690367484d2d85795ca043db5114298a7bcb549f0ebdd
  Stored in directory: /root/.cache/pip/wheels/4f/9b/7e/0b256f2168511d8fa4dae4fae0200fdbd729eb424a912ad636
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [None]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k
from sklearn.decomposition import PCA
from scipy.spatial.distance import jaccard
from lightfm.cross_validation import random_train_test_split


def prepare_lightfm_data(pca_df):

    # Crear el dataset de LightFM
    dataset = Dataset()

    # Extraer los IDs únicos de usuarios y artículos
    user_ids = pca_df['user_id'].unique()
    item_ids = pca_df['item_id'].unique()

    # Extraer las características de artículos
    item_features = pca_df[['item_id'] + [f'PC{i+1}' for i in range(pca_df.shape[1] - 3)]].drop_duplicates('item_id')

    # Construir el dataset
    dataset.fit(users=user_ids, items=item_ids,  item_features=[f'PC{i+1}' for i in range(pca_df.shape[1] - 3)])

    # Construir la matriz de interacciones con pesos
    interactions, _ = dataset.build_interactions([
        (row['user_id'], row['item_id'], row['timestamp'])
        for _, row in pca_df.iterrows()
    ])

    # Construir las matrices de características
    item_features_matrix = dataset.build_item_features([(row['item_id'], [f'PC{i+1}' for i in range(pca_df.shape[1] - 3)]) for _, row in item_features.iterrows()])

    # Dividir los datos en entrenamiento y prueba
    train_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.2)


    # Obtener los mapeos entre IDs originales y índices internos
    mapping = dataset.mapping()

    ## Une los 2 mappings para tener un item_id_map, el 2 y 3 corresponden a los items
    item_id_map = {**mapping[2], **mapping[3]}

    # Crear mapeos inversos
    index_to_item_id = {v: k for k, v in item_id_map.items()}

    return train_interactions, test_interactions, item_features_matrix, index_to_item_id


train_interactions, test_interactions, item_features_matrix , index_to_item_id = prepare_lightfm_data(pca_df)


In [None]:
# Entrenar el modelo solo con el conjunto de entrenamiento
model = LightFM(loss='bpr')
model.fit(train_interactions, item_features=item_features_matrix, epochs=30, num_threads=2)

In [None]:
import pandas as pd
item_info = pd.read_csv('/content/drive/MyDrive/datasets/RecSys/item_info.csv')


In [None]:
item_categories = item_info.set_index('item_id')['tag'].to_dict()

In [None]:
# Usamos predict_rank para obtener el ranking de los ítems por usuario
ranked_items = model.predict_rank(test_interactions, train_interactions=train_interactions, item_features=item_features_matrix)

In [103]:
import numpy as np

def precision_at_k(relevant_items, recommended_items, k):
    recommended_at_k = recommended_items[:k]
    relevant_at_k = set(relevant_items).intersection(recommended_at_k)
    return len(relevant_at_k) / k

def dcg_at_k(relevant_items, recommended_items, k):
    dcg = 0.0
    for i in range(min(len(recommended_items), k)):
        if recommended_items[i] in relevant_items:
            dcg += 1 / np.log2(i + 2)  # log2(i + 2) porque índice inicia en 0
    return dcg

def idcg_at_k(k):
    idcg = 0.0
    for i in range(k):
        idcg += 1 / np.log2(i + 2)
    return idcg

def ndcg_at_k(relevant_items, recommended_items, k):
    dcg = dcg_at_k(relevant_items, recommended_items, k)
    idcg = idcg_at_k(k)
    return dcg / idcg if idcg > 0 else 0

def novelty(recommended_items, item_popularity):
    # Filtrar ítems válidos
    recommended_items_filtered = [
        item for item in recommended_items
        if item in item_popularity and item_popularity[item] > 0
    ]

    if len(recommended_items_filtered) == 0:
        return 0.0

    # Calcular la novedad
    novelty_score = sum(np.log(1 / item_popularity[item]) for item in recommended_items_filtered)
    return novelty_score


def diversity(recommended_items, item_categories):
    unique_categories = set()
    for item_index in recommended_items:
        # Convertir índice interno al ID real del ítem
        item_id = index_to_item_id.get(item_index)
        if item_id is not None:
            # Obtener categorías del ítem real
            categories = item_categories.get(item_id, [])
            if isinstance(categories, list):
                unique_categories.update(categories)
            else:
                unique_categories.add(categories)

    # Calcular diversidad como proporción de categorías únicas
    return len(unique_categories) / len(recommended_items) if len(recommended_items) > 0 else 0


def evaluate_model(model, ranked_items, test_interactions, item_popularity, item_categories, k=10):
    n_users, n_items = test_interactions.shape
    precision_scores = []
    ndcg_scores = []
    novelty_scores = []
    diversity_scores = []

    for user_id in range(n_users):
        # Obtener las interacciones del usuario en la matriz dispersa
        relevant_items = test_interactions.getrow(user_id).indices.tolist()

        # Si no hay interacciones en el conjunto de prueba, saltamos a este usuario
        if len(relevant_items) == 0:
            continue

        # Obtenemos el ranking de ítems para el usuario y lo convertimos a array denso
        recommended_items = ranked_items[user_id].toarray().flatten()  # Convierte a array denso

        # Obtener los índices de los k ítems más recomendados
        recommended_items = recommended_items.argsort()[::-1][:k]  # Los k primeros ítems

        # Calcular métricas
        precision_scores.append(precision_at_k(relevant_items, recommended_items, k))
        ndcg_scores.append(ndcg_at_k(relevant_items, recommended_items, k))
        novelty_scores.append(novelty(recommended_items, item_popularity))
        diversity_scores.append(diversity(recommended_items, item_categories))

    # Si no se han evaluado métricas, evitar errores al calcular la media
    if len(precision_scores) == 0:
        return {
            "precision@k": 0,
            "ndcg@k": 0,
            "novelty": 0,
            "diversity": 0
        }

    return {
        "precision@k": np.mean(precision_scores),
        "ndcg@k": np.mean(ndcg_scores),
        "novelty": np.mean(novelty_scores),
        "diversity": np.mean(diversity_scores)
    }

# Calcular la popularidad de los ítems (frecuencia relativa)
item_popularity = np.array(train_interactions.sum(axis=0)).flatten()
item_popularity = {i: pop / train_interactions.sum() for i, pop in enumerate(item_popularity)}

# Evaluar el modelo
metrics = evaluate_model(model, ranked_items, test_interactions, item_popularity, item_categories, k=10)

# Imprimir las métricas
print(f'Precision@k=10: {metrics["precision@k"]}')
print(f'NDCG@k=10: {metrics["ndcg@k"]}')
print(f'Novelty: {metrics["novelty"]}')
print(f'Diversity: {metrics["diversity"]}')


Precision@k=10: 0.023274664196387216
NDCG@k=10: 0.04825902893403268
Novelty: 101.58596399438169
Diversity: 0.8844025011579438


In [104]:
import numpy as np

def precision_at_k(relevant_items, recommended_items, k):
    relevant_items_set = set(relevant_items)
    recommended_items_set = set(recommended_items[:k])
    intersection = relevant_items_set.intersection(recommended_items_set)
    return len(intersection) / k if k > 0 else 0

def ndcg_at_k(relevant_items, recommended_items, k):
    dcg = 0
    idcg = 0
    for i in range(k):
        if recommended_items[i] in relevant_items:
            dcg += 1 / np.log2(i + 2)
        idcg += 1 / np.log2(i + 2) if i < len(relevant_items) else 0
    return dcg / idcg if idcg > 0 else 0

def novelty(recommended_items, item_popularity):
    # Filtrar los ítems recomendados que están en item_popularity
    recommended_items_filtered = [item for item in recommended_items if item in item_popularity]

    if len(recommended_items_filtered) == 0:
        return 0.0  # Si no hay ítems válidos, retornar novedad 0

    # Calcular la novedad
    return np.mean([np.log2(1 + item_popularity[item]) for item in recommended_items_filtered])


def diversity(recommended_items, item_categories):
    unique_categories = set()
    for item_index in recommended_items:
        # Convertir índice interno al ID real del ítem
        item_id = index_to_item_id.get(item_index)
        if item_id is not None:
            # Obtener categorías del ítem real
            categories = item_categories.get(item_id, [])
            if isinstance(categories, list):
                unique_categories.update(categories)
            else:
                unique_categories.add(categories)

    # Calcular diversidad como proporción de categorías únicas
    return len(unique_categories) / len(recommended_items) if len(recommended_items) > 0 else 0

def evaluate_most_popular_model(train_interactions, test_interactions, item_popularity, item_categories, k=10):
    # Obtener los ítems más populares, ordenados por la cantidad de interacciones
    most_popular_items = np.argsort(-np.array(list(item_popularity.values())))

    # Crear un array de recomendaciones con los ítems más populares para cada usuario
    recommended_items_for_all_users = np.tile(most_popular_items[:k], (test_interactions.shape[0], 1))

    return evaluate_metrics(recommended_items_for_all_users, test_interactions, item_popularity, item_categories, k)

def evaluate_random_model(n_items, test_interactions, item_popularity, item_categories, k=10):
    # Generar recomendaciones aleatorias para cada usuario
    random_recommendations = np.random.randint(0, n_items, size=(test_interactions.shape[0], k))

    return evaluate_metrics(random_recommendations, test_interactions, item_popularity, item_categories, k)

def evaluate_metrics(recommended_items, test_interactions, item_popularity, item_categories, k=10):
    precision_scores = []
    ndcg_scores = []
    novelty_scores = []
    diversity_scores = []

    n_users = test_interactions.shape[0]

    for user_id in range(n_users):
        # Obtener las interacciones del usuario en la matriz dispersa
        relevant_items = test_interactions.getrow(user_id).indices

        # Si no hay interacciones en el conjunto de prueba, saltamos a este usuario
        if len(relevant_items) == 0:
            continue

        # Obtener las recomendaciones para este usuario
        recommended_items_for_user = recommended_items[user_id]

        # Calcular métricas
        precision_scores.append(precision_at_k(relevant_items, recommended_items_for_user, k))
        ndcg_scores.append(ndcg_at_k(relevant_items, recommended_items_for_user, k))
        novelty_scores.append(novelty(recommended_items_for_user, item_popularity))
        diversity_scores.append(diversity(recommended_items_for_user, item_categories))

    # Si no se han evaluado métricas, evitar errores al calcular la media
    if len(precision_scores) == 0:
        return {
            "precision@k": 0,
            "ndcg@k": 0,
            "novelty": 0,
            "diversity": 0
        }

    return {
        "precision@k": np.mean(precision_scores),
        "ndcg@k": np.mean(ndcg_scores),
        "novelty": np.mean(novelty_scores),
        "diversity": np.mean(diversity_scores)
    }

n_items = train_interactions.shape[1]

metrics_most_popular = evaluate_most_popular_model(train_interactions, test_interactions, item_popularity, item_categories, k=10)
metrics_random = evaluate_random_model(n_items, test_interactions, item_popularity, item_categories, k=10)

print("Most Popular Model Metrics:")
print(metrics_most_popular)

print("\nRandom Model Metrics:")
print(metrics_random)


Most Popular Model Metrics:
{'precision@k': 0.00010421491431218156, 'ndcg@k': 0.0004839403413930735, 'novelty': 0.000346205068568967, 'diversity': 0.6999999999999998}

Random Model Metrics:
{'precision@k': 4.631773969430292e-05, 'ndcg@k': 0.0002450781464161313, 'novelty': 4.965140473232798e-05, 'diversity': 0.8749652616952294}
