In [24]:
# !pip cache purge
# !pip uninstall -y numpy scipy rectools
# !pip install "numpy<1.29.0" "scipy<1.11.0" rectools
# !pip install lightning_fabric
# !pip install rectools
# !pip install rectools[torch]
# !pip install gensim
# !pip install -U sentence-transformers

In [2]:
# Get access to Google disk
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
# Import libraries
import os
import re
import pickle
import warnings
import itertools
import typing as tp
from pathlib import Path
from collections import Counter
from itertools import chain
from tqdm import tqdm
from typing import List, Dict, Optional

import pandas as pd
import numpy as np
import torch
import pyspark
import threadpoolctl
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.spatial.distance import euclidean
from sklearn.preprocessing import normalize
from sklearn.cluster import AgglomerativeClustering

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import col, to_timestamp
from sentence_transformers import SentenceTransformer

In [4]:
# Import libraries
from lightning_fabric import seed_everything
from pytorch_lightning import Trainer, LightningModule
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, Callback

from rectools import Columns, ExternalIds
from rectools.dataset import Dataset
from rectools.metrics import NDCG, Recall, Serendipity, calc_metrics
from rectools.models import BERT4RecModel, SASRecModel, load_model
from rectools.models.nn.item_net import IdEmbeddingsItemNet
from rectools.models.nn.transformers.base import TransformerModelBase

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import (
    MAP,
    CoveredUsers,
    AvgRecPopularity,
    Intersection,
    HitRate,
    Serendipity,
)
from rectools.models import PopularModel, EASEModel, SASRecModel, BERT4RecModel
from rectools.model_selection import TimeRangeSplitter, cross_validate
from rectools.models.nn.item_net import CatFeaturesItemNet, IdEmbeddingsItemNet
from rectools.visuals import MetricsApp

# Silence every Python warning for the rest of the session.
warnings.simplefilter("ignore")

# Enable deterministic behaviour with CUDA >= 10.2
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# Fix the random seed and ask torch to use deterministic algorithms only.
RANDOM_STATE=60
torch.use_deterministic_algorithms(True)
seed_everything(RANDOM_STATE, workers=True)

INFO:lightning_fabric.utilities.seed:Seed set to 60


60

In [5]:
%cd "/content/drive/MyDrive/Colab Notebooks/diploma/scripts/"
import process_data

/content/drive/MyDrive/Colab Notebooks/diploma/scripts


In [25]:
# Create (or reuse) the SparkSession
spark = SparkSession.builder \
    .appName("PetCo") \
    .getOrCreate()

In [7]:
HEAD_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/diploma/'

# Функции

In [8]:
def generate_item_descriptions(catalog: pd.DataFrame, features: List[str]) -> Dict[int, str]:
    """
    Build a plain-text description for every catalog item.

    For every requested feature except ``'Brand'`` the cell is expected to hold a
    sequence of strings; each string is lower-cased and its ``-`` characters are
    replaced by spaces. The brand is appended last and verbatim (no lower-casing,
    hyphens kept). The brand cell may be a single string or a sequence of strings.
    Cells of any other type (e.g. NaN for a missing value) are skipped.

    :param catalog: DataFrame with an ``item_id`` column and the columns named in ``features``.
    :param features: Names of the catalog columns to include in the description.
    :return: Dictionary {item_id: description, tokens joined by single spaces}.
    """
    sequence_types = (list, tuple, np.ndarray)
    list_features = [name for name in features if name != 'Brand']
    use_brand = 'Brand' in features

    descriptions = {}
    # `total` lets tqdm show a real progress bar instead of a bare counter.
    for _, row in tqdm(catalog.iterrows(), total=len(catalog)):
        words = []
        for name in list_features:
            values = row[name]
            if isinstance(values, sequence_types):
                words.extend(value.replace("-", " ").lower() for value in values)
        if use_brand:
            brand = row['Brand']
            if isinstance(brand, str):
                words.append(brand)
            elif isinstance(brand, sequence_types):
                # The brand may be stored as a list; a plain `str` check would drop it silently.
                words.extend(value for value in brand if isinstance(value, str))
        descriptions[row['item_id']] = " ".join(words)

    return descriptions

In [9]:
def generate_item_embeddings(item_descriptions: Dict[int, str], model: SentenceTransformer) -> Dict[int, np.ndarray]:
    """
    Encode every item description into a dense vector with a SentenceTransformer model.

    All descriptions are handed to ``model.encode`` in a single call so the model can
    batch them, instead of running one forward pass per item.

    NOTE(review): batched encoding may differ from per-item encoding in the last float
    digits - confirm this is acceptable if embeddings cached earlier must be reproduced.

    :param item_descriptions: Dictionary {item_id: item description (string)}.
    :param model: SentenceTransformer model used to encode the descriptions.
    :return: Dictionary {item_id: item embedding (numpy array)}.
    """
    if not item_descriptions:
        return {}

    item_ids = list(item_descriptions.keys())
    texts = [item_descriptions[item_id] for item_id in item_ids]

    # One batched call; the library's own progress bar replaces the per-item tqdm loop.
    vectors = model.encode(texts, show_progress_bar=True)

    return dict(zip(item_ids, vectors))

In [10]:
def create_user_histories(interactions_df: pd.DataFrame) -> Dict[int, List[int]]:
    """
    Collect, for every user, the list of items the user interacted with.

    :param interactions_df: Interactions with at least the columns 'user_id' and 'item_id'.
    :return: Dictionary {user_id: list of the user's item_id values}.
    """
    rows_by_user = interactions_df.groupby('user_id')

    # One entry per user group; the group's item_id column becomes a plain Python list.
    return {
        uid: user_rows['item_id'].tolist()
        for uid, user_rows in tqdm(rows_by_user)
    }

In [11]:
def generate_user_embeddings(user_histories: Dict[int, List[int]], item_embeddings: Dict[int, np.ndarray]) -> tp.Tuple[Dict[int, np.ndarray], int]:
    """
    Build user embeddings by averaging the embeddings of the items in each user's history.

    Items that have no entry in ``item_embeddings`` are skipped and counted. A user for
    whom no item embedding is found gets a zero vector shaped like the item embeddings.

    :param user_histories: Dictionary {user_id: list of item ids the user interacted with}.
    :param item_embeddings: Dictionary {item_id: item embedding}.
    :return: Tuple ``(user_embeddings, missed_items)``: the dictionary
             {user_id: user embedding (numpy array)} and the number of history entries
             for which no item embedding was found.
    :raises ValueError: if a zero vector is needed but ``item_embeddings`` is empty,
                        so the embedding shape cannot be inferred.
    """
    user_embeddings = {}
    missed_items = 0

    for user_id, item_ids in tqdm(user_histories.items()):
        item_vectors = []

        # Gather the embeddings that exist for this user's items
        for item_id in item_ids:
            if item_id in item_embeddings:
                item_vectors.append(item_embeddings[item_id])
            else:
                missed_items += 1

        if item_vectors:
            # Average the item embeddings of the user
            user_embeddings[user_id] = np.mean(item_vectors, axis=0)
        elif item_embeddings:
            # No known items for this user: fall back to a zero vector of the item-embedding shape
            user_embeddings[user_id] = np.zeros(next(iter(item_embeddings.values())).shape)
        else:
            raise ValueError(
                "item_embeddings is empty: cannot infer the embedding shape for the zero-vector fallback."
            )

    return user_embeddings, missed_items

In [12]:
def normalize_vector(vector: np.ndarray) -> np.ndarray:
    """
    Scale a vector to unit length.

    :param vector: Vector to normalize.
    :return: ``vector`` divided by its norm; a vector whose norm is 0 is returned unchanged.
    """
    length = np.linalg.norm(vector)
    # Division by a zero norm is undefined, so such a vector is passed through as is.
    return vector if length == 0 else vector / length

def find_nearest_users(user_id: int, user_embeddings: Dict[int, np.ndarray], top_n: int = 5) -> List[int]:
    """
    Find the users closest to a given user by Euclidean distance between normalized embeddings.

    :param user_id: ID of the user whose neighbours are searched for.
    :param user_embeddings: Dictionary {user_id: user embedding}.
    :param top_n: Number of nearest users to return.
    :return: Up to ``top_n`` user ids ordered from the closest to the farthest.
    :raises ValueError: if ``user_id`` is not a key of ``user_embeddings``.
    """
    if user_id not in user_embeddings:
        raise ValueError(f"User ID {user_id} not found in the user embeddings.")

    query = normalize_vector(user_embeddings[user_id])

    # Distance from the query user to every other user (each embedding is normalized first).
    scored = [
        (candidate_id, euclidean(query, normalize_vector(candidate_vector)))
        for candidate_id, candidate_vector in user_embeddings.items()
        if candidate_id != user_id
    ]

    # Smaller distance means more similar, so sort ascending.
    ranked = sorted(scored, key=lambda pair: pair[1])

    return [candidate_id for candidate_id, _ in ranked[:top_n]]

In [13]:
def show_user_description(user_id, interactions: Optional[pd.DataFrame] = None):
  """
  Print how often each category, pet type, brand and delivery option occurs in a user's history.

  For each of the columns 'Category', 'Pet', 'Brand' and 'Delivery' the values of all
  rows of the user are flattened and counted; the counts are printed as a dictionary
  sorted by descending count (ties in alphabetical order).

  Every element of a list-valued cell is counted. Counting per element (rather than
  keying by the first element of each distinct list) keeps different lists that share
  a first element from overwriting each other's counts. Cells that are neither a
  sequence nor a string (e.g. NaN for an item missing from the catalog) are skipped.

  :param user_id: ID of the user to describe.
  :param interactions: Interactions joined with item features; must contain 'user_id',
                       'Category', 'Pet', 'Brand' and 'Delivery'. When omitted, the
                       module-level ``actions_with_item_features`` frame is used.
  :return: None; the summary is printed.
  """
  if interactions is None:
    interactions = actions_with_item_features
  user_actions = interactions[interactions['user_id'] == user_id]

  def count_values(column):
    # Flatten the cells of one column and count every value.
    counts = Counter()
    for cell in user_actions[column]:
      if isinstance(cell, (list, tuple, np.ndarray)):
        counts.update(cell)
      elif isinstance(cell, str):
        counts[cell] += 1
    return dict(sorted(counts.items(), key=lambda pair: (-pair[1], str(pair[0]))))

  sections = (
    ('item categories:', 'Category'),
    ('item pets:', 'Pet'),
    ('item brands:', 'Brand'),
    ('item deliveries:', 'Delivery'),
  )
  for title, column in sections:
    print(title)
    print(count_values(column), end='\n\n')

In [14]:
def agglomerative_clustering(user_embeddings: Dict[int, np.ndarray], n_clusters: int = 5) -> Dict[int, int]:
    """
    Cluster users with Ward-linkage agglomerative clustering of their normalized embeddings.

    :param user_embeddings: Dictionary {user_id: embedding}.
    :param n_clusters: Number of clusters to produce.
    :return: Dictionary {user_id: cluster_label}.
    """
    ids = list(user_embeddings.keys())
    vectors = list(user_embeddings.values())

    # Stack the embeddings into one matrix (one row per user) and normalize it.
    unit_rows = normalize(np.array(vectors))

    labels = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward').fit_predict(unit_rows)

    # Rows of the matrix follow the order of `ids`, so labels pair up positionally.
    return dict(zip(ids, labels))

In [15]:
def cluster_users_histories(actions: pd.DataFrame, catalog: pd.DataFrame, model: SentenceTransformer, embeddings_path: str, features: Optional[List[str]] = None, n_clusters: int = 10):
  """
  Cluster users by the averaged text embeddings of the items in their histories.

  Item embeddings are cached on disk. If ``embeddings_path`` exists it is loaded as is
  (``catalog``, ``model`` and ``features`` are then not used). Otherwise the embeddings
  are computed from the item descriptions and saved to that path.

  :param actions: Interactions with the columns 'user_id' and 'item_id'.
  :param catalog: Item catalog used to build the text descriptions.
  :param model: SentenceTransformer used to encode the descriptions (cache miss only).
  :param embeddings_path: Pickle file holding {item_id: embedding}.
  :param features: Catalog columns used for the descriptions; defaults to ['Category'].
  :param n_clusters: Number of user clusters.
  :return: Dictionary {user_id: cluster_label}.
  """
  if features is None:
    features = ['Category']

  if os.path.exists(embeddings_path):
    # NOTE(review): pickle.load can execute arbitrary code - only load files you created yourself.
    with open(embeddings_path, "rb") as f:
      item_embeddings = pickle.load(f)
  else:
    items_text_descriptions = generate_item_descriptions(catalog, features=features)
    item_embeddings = generate_item_embeddings(items_text_descriptions, model)
    with open(embeddings_path, "wb") as f:
      pickle.dump(item_embeddings, f)

  users_histories = create_user_histories(actions)

  user_embeddings, missed_items = generate_user_embeddings(users_histories, item_embeddings)
  print(f'Item embeddings were not found {missed_items} times')

  clusters = agglomerative_clustering(user_embeddings, n_clusters=n_clusters)

  return clusters

In [16]:
def count_clusters(clusters: Dict[int, int]) -> Dict[int, int]:
    """
    Count how many users fall into each cluster.

    :param clusters: Dictionary {user_id: cluster_label}.
    :return: Dictionary {cluster_label: number of users}, largest cluster first.
    """
    sizes = Counter(clusters.values())

    # Stable sort: clusters of equal size keep the order in which they were first seen.
    largest_first = sorted(sizes.items(), key=lambda pair: pair[1], reverse=True)

    return dict(largest_first)

In [17]:
def get_user_histories_by_cluster(cluster: int, clusters: Dict[int, int], interactions_with_features: pd.DataFrame, n_examples: int = 3) -> None:
    """
    Print the descriptions of up to ``n_examples`` users that belong to a cluster.

    Users are taken in the iteration order of ``clusters``; after each description a
    separator line of 200 dashes is printed. Nothing is printed when ``n_examples``
    is zero or negative, or when the cluster has no users.

    :param cluster: Label of the cluster to show examples for.
    :param clusters: Dictionary {user_id: cluster_label}.
    :param interactions_with_features: Currently not used by this function;
                                       ``show_user_description`` is called with the user id only.
    :param n_examples: Maximum number of users to print.
    :return: None; the descriptions are printed.
    """
    members = (user_id for user_id, user_cluster in clusters.items() if user_cluster == cluster)

    # islice caps the number of examples, including the n_examples <= 0 case.
    for user_id in itertools.islice(members, max(n_examples, 0)):
        show_user_description(user_id)
        print('-'*200)

In [18]:
def get_cluster_examples(clusters: Dict[int, int]):
  """
  Pick one example user for every cluster.

  :param clusters: Dictionary {user_id: cluster_label}.
  :return: Dictionary {cluster_label: first user_id found with that label},
           with labels in the sorted order produced by ``np.unique``.
  """
  not_found = object()
  examples = {}

  for label in tqdm(np.unique(list(clusters.values()))):
    # First user (in dictionary order) assigned to this label, if any.
    first_member = next(
      (uid for uid, assigned in clusters.items() if assigned == label),
      not_found,
    )
    if first_member is not not_found:
      examples[label] = first_member

  return examples

# Данные


In [28]:
# sasrec_data_folder_path = HEAD_DIRECTORY+'data/sasrec_format/'
# output_name = 'actions'
# data_actions_processed = spark.read.parquet(sasrec_data_folder_path+output_name).orderBy(['user_id', 'datetime'])

In [30]:
# sasrec_data_folder_path = HEAD_DIRECTORY+'data/sasrec_format/'
# output_name = 'items'
# data_items_processed = spark.read.parquet(sasrec_data_folder_path+output_name).orderBy('id', 'feature', 'value')
# data_items_cleaned = spark.read.parquet(HEAD_DIRECTORY+'data/cleaned_data/data_items')

In [32]:
# Load the cached pandas versions of the interactions and the item features.
# The commented lines show how the pickles were produced from the Spark frames.
# actions, items_features = process_data.processed_datasets_to_pandas(data_actions_processed, data_items_processed)
# actions.to_pickle(HEAD_DIRECTORY+'data/sasrec_format/actions.pkl')
# items_features.to_pickle(HEAD_DIRECTORY+'data/sasrec_format/items.pkl')
actions = pd.read_pickle(HEAD_DIRECTORY+'data/sasrec_format/actions.pkl')
items_features = pd.read_pickle(HEAD_DIRECTORY+'data/sasrec_format/items.pkl')

In [33]:
# Load the cached item catalog; the commented lines show how it was built from the cleaned Spark data.
# catalog = data_items_cleaned.select('customer_id', 'name', 'group_ids_intersect', 'How_to_get_it', 'Primary_Brand', 'Primary_Pet_Type').toPandas()
# catalog.rename(columns={'customer_id': 'item_id'}, inplace=True)
# catalog['item_id'] = catalog['item_id'].apply(lambda x: int(x))
# catalog = catalog.rename(columns={'group_ids_intersect': 'Category', 'How_to_get_it': 'Delivery', 'Primary_Brand': 'Brand', 'Primary_Pet_Type': 'Pet'})
# catalog.to_pickle(HEAD_DIRECTORY+'data/sasrec_format/catalog.pkl')
catalog = pd.read_pickle(HEAD_DIRECTORY+'data/sasrec_format/catalog.pkl')
# Left join keeps every interaction; an item absent from the catalog gets NaN in the catalog columns.
actions_with_item_features = actions.merge(catalog, on='item_id', how='left')

In [34]:
# Validation set: the last interaction (by datetime) of every user
val_df = actions.loc[actions.groupby("user_id")["datetime"].idxmax()]

# Training set: everything except each user's last interaction
# NOTE(review): rows are dropped by index label, which assumes `actions` has a unique index - confirm.
train_df = actions.drop(val_df.index)

In [35]:
# model = SentenceTransformer('nfhakim/topic-clustering-v1')

In [36]:
# Load the cached {user_id: cluster_label} mapping; the commented lines show how it was computed and saved.
# clusters = cluster_users_histories(actions=train_df, catalog=catalog, model=model, embeddings_path=HEAD_DIRECTORY+'embeddings/item_embeddings_val_1.pkl', n_clusters=10)
# with open(HEAD_DIRECTORY+'clustering/users_clusters_10.pkl', "wb") as f:
#         pickle.dump(clusters, f)
with open(HEAD_DIRECTORY+'clustering/users_clusters_10.pkl', "rb") as f:
        clusters = pickle.load(f)

# Cluster sizes, largest first
print(count_clusters(clusters))

{0: 3960, 1: 3645, 6: 3160, 7: 2148, 3: 1671, 9: 1617, 4: 1208, 5: 1190, 2: 444, 8: 278}


In [37]:
clusters_examples = get_cluster_examples(clusters)
# with open(HEAD_DIRECTORY+'others/clusters_examples.pkl', "wb") as f:
#         pickle.dump(clusters_examples, f)
# with open(HEAD_DIRECTORY+'others/clusters_examples.pkl', "rb") as f:
#         clusters_examples = pickle.load(f)

100%|██████████| 10/10 [00:00<00:00, 59747.92it/s]


# Модель

In [41]:
# Construct dataset
# Only train_df is passed, so each user's held-out last interaction (val_df) is not part of the dataset.
features_final_names = ['Category', 'Delivery', 'Brand', 'Pet']
dataset = Dataset.construct(
    interactions_df=train_df,
    item_features_df=items_features,
    cat_item_features=features_final_names,
)

In [42]:
# SASRec with softmax loss; the item network combines id embeddings with categorical feature embeddings.
model = SASRecModel(
    deterministic=True,
    loss="softmax",
    item_net_block_types=(IdEmbeddingsItemNet, CatFeaturesItemNet),  # Use item ids and cat features
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [43]:
# Checkpoint last epoch
last_epoch_ckpt = ModelCheckpoint(filename="last_epoch")

trainer = Trainer(
    # accelerator="gpu",
    # devices=2,
    min_epochs=2,
    max_epochs=20,
    deterministic=True,
    limit_train_batches=10,  # use only 10 batches per epoch (test-run setting)
    logger = CSVLogger(save_dir=HEAD_DIRECTORY+'models'),
    callbacks=[last_epoch_ckpt],  # pass our callbacks for checkpoints
    enable_progress_bar=True,
    enable_model_summary=True,
)

# Replace trainer with our custom one
# NOTE(review): `_trainer` is a private attribute of the model - confirm it is still honoured after a rectools upgrade.
model._trainer = trainer

# Fit model. Everything will happen under the hood
model.fit(dataset)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name        | Type                     | Params | Mode 
-----------------------------------------------------------------
0 | torch_model | TransformerTorchBackbone | 6.1 M  | train
-----------------------------------------------------------------
6.1 M     Trainable params
0         Non-trainable params
6.1 M     Total params
24.317    Total estimated model params size (MB)
37        Modules in train mode
0         Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


<rectools.models.nn.transformers.sasrec.SASRecModel at 0x7beb01a0bf10>

In [44]:
# ckpt_path = os.path.join(model.fit_trainer.log_dir, "checkpoints", "last_epoch.ckpt")
# NOTE(review): the checkpoint path is pinned to one logger run ("version_7"), so `model` is replaced by
# whatever that run saved, which is not necessarily the fit from the previous cell - confirm.
ckpt_path = HEAD_DIRECTORY+'models/lightning_logs/version_7/checkpoints/last_epoch.ckpt'
model = SASRecModel.load_from_checkpoint(ckpt_path)
# loaded.recommend(users=VAL_USERS[:1], dataset=dataset, filter_viewed=True, k=5)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


# Избранные примеры рекомендаций

In [57]:
%%time
# Top-10 recommendations (filter_viewed=True) for the example user of cluster 0.
test_user = clusters_examples[0]
recos = model.recommend(users=[test_user], dataset=dataset, k=10, filter_viewed=True, on_unsupported_targets="warn")

CPU times: user 66.9 ms, sys: 1.04 ms, total: 68 ms
Wall time: 67.5 ms


In [59]:
history_df = actions_with_item_features[actions_with_item_features["user_id"] == test_user]
history_df

Unnamed: 0,user_id,item_id,datetime,weight,name,Category,Delivery,Brand,Pet
167,162384,5082437,2024-09-28 22:38:18,1,Nature's Miracle Premium Clumping Corn Cob Cat...,"[cat-repeat-delivery-products, repeat-delivery...","[One Time Delivery, Repeat Delivery]",[Nature's Miracle],[Cat]
168,162384,5003927,2024-09-28 22:41:58,1,Arm & Hammer Clump & Seal Multi-Cat Odor Seali...,"[same-day-delivery-cat-products, buy-online-pi...","[Same Day Delivery, Free Pickup Today, One Tim...",[Arm & Hammer],[Cat]
169,162384,5054729,2024-09-28 22:42:46,1,Purina Pro Plan Complete Essentials Seafood St...,"[cat-repeat-delivery-products, summer-hydratin...","[One Time Delivery, Repeat Delivery]",[Purina Pro Plan],[Cat]
170,162384,5206204,2024-09-28 22:47:24,1,Fancy Feast Grain Free Poultry and Beef Feast ...,"[cat-repeat-delivery-products, new-brands-low-...","[Same Day Delivery, Free Pickup Today, One Tim...",[Fancy Feast],[Cat]
171,162384,5206216,2024-09-28 22:47:55,1,Fancy Feast Gravy Lovers Poultry and Beef Gril...,"[cat-food-variety-packs, same-day-delivery-cat...","[Same Day Delivery, Free Pickup Today, One Tim...",[Fancy Feast],[Cat]
172,162384,5206056,2024-09-28 22:48:32,1,Fancy Feast Grilled Seafood Collection Wet Cat...,"[new-brands-low-prices, cat-food-variety-packs...","[Same Day Delivery, Free Pickup Today, One Tim...",[Fancy Feast],[Cat]
173,162384,5190467,2024-09-28 22:49:10,1,Fancy Feast Grilled Seafood Collection in Wet ...,"[new-brands-low-prices, cat-repeat-delivery-pr...","[Free Pickup Today, One Time Delivery, Repeat ...",[Fancy Feast],[Cat]
174,162384,5190460,2024-09-28 22:54:42,1,Fancy Feast Grilled Chicken and Beef in Wet Ca...,"[cat-repeat-delivery-products, new-brands-low-...","[One Time Delivery, Repeat Delivery]",[Fancy Feast],[Cat]
175,162384,5107345,2024-09-28 23:01:12,1,Rachael Ray Nutrish Savory Bites Tasty Salmon ...,"[repeat-delivery-eligible-products, responsibl...","[One Time Delivery, Repeat Delivery]",[Rachael Ray Nutrish],[Cat]
176,162384,5206115,2024-09-28 23:06:07,1,Friskies Chicken Lovers Prime Filets and Shred...,"[same-day-delivery-cat-products, cat-food-vari...","[Same Day Delivery, Free Pickup Today, One Tim...",[Friskies],[Cat]


In [60]:
recommended_df = recos.merge(catalog, on="item_id")
recommended_df

Unnamed: 0,user_id,item_id,score,rank,name,Category,Delivery,Brand,Pet
0,162384,5081747,7.541914,1,"Sheba Perfect Portions Gravy Roasted Chicken, ...","[cat-food-with-grain, cat-food-variety-packs, ...","[Same Day Delivery, Free Pickup Today, One Tim...",[Sheba],[Cat]
1,162384,5022978,6.843294,2,"Sheba Perfect Portions Cuts in Gravy, with Sus...","[cat-food-with-grain, same-day-delivery-cat-pr...","[Same Day Delivery, Free Pickup Today, One Tim...",[Sheba],[Cat]
2,162384,5053243,6.448388,3,Fancy Feast Grain Free Gourmet Naturals Pate W...,"[new-brands-low-prices, cat-repeat-delivery-pr...","[Same Day Delivery, Free Pickup Today, One Tim...",[Fancy Feast],[Cat]
3,162384,5206215,6.293166,4,Fancy Feast Grain Free Seafood Classic Collect...,"[new-brands-low-prices, cat-food-variety-packs...","[Same Day Delivery, Free Pickup Today, One Tim...",[Fancy Feast],[Cat]
4,162384,5022976,6.268663,5,Sheba Perfect Portions Multipack Cuts in Gravy...,"[cat-food-variety-packs, cat-food-trays-cups-a...","[Same Day Delivery, Free Pickup Today, One Tim...",[Sheba],[Cat]
5,162384,5216061,6.232083,6,Sheba PERFECT PORTIONS Adult Cat Food Cuts in ...,"[cat-food-variety-packs, cat-repeat-delivery-p...","[Same Day Delivery, Free Pickup Today, One Tim...",[Sheba],[Cat]
6,162384,5022974,6.115338,7,Sheba Perfect Portions Multipack Delicate Salm...,"[cat-food-variety-packs, cat-food-trays-cups-a...","[Same Day Delivery, Free Pickup Today, One Tim...",[Sheba],[Cat]
7,162384,5206128,6.079864,8,Purina Friskies Seafood and Chicken Pate Favor...,"[cat-repeat-delivery-products, new-brands-low-...","[Same Day Delivery, Free Pickup Today, One Tim...",[Friskies],[Cat]
8,162384,5022977,5.879561,9,Sheba Perfect Portions Variety Pack Savory Chi...,"[same-day-delivery-cat-products, cat-food-with...","[Same Day Delivery, Free Pickup Today, One Tim...",[Sheba],[Cat]
9,162384,5123891,5.480339,10,"Fancy Feast Petites Gourmet, Gravy Collection,...","[cat-repeat-delivery-products, cat-food-with-g...","[One Time Delivery, Repeat Delivery]",[Fancy Feast],[Cat]


In [61]:
# Held-out (validation) interaction of the cluster-0 example user, joined with its catalog features.
conversed_df = val_df[val_df['user_id']==clusters_examples[0]].merge(catalog, on='item_id')
conversed_df

Unnamed: 0,user_id,item_id,datetime,weight,name,Category,Delivery,Brand,Pet
0,162384,5199423,2024-12-02 12:17:14,1,Temptations Classic Seafood Medley Flavor Crun...,"[cat-crunchy-treats, cat-repeat-delivery-produ...","[One Time Delivery, Repeat Delivery]",[Temptations],[Cat]
