In [1]:
import logging
from logging import getLogger

import pandas as pd
import pyarrow.dataset as ds
import torch
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger


2025-10-22 11:11:52.852801: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-22 11:11:52.852835: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-22 11:11:52.853763: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-22 11:11:52.859387: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Longest item history (most recent events) fed to the sequential model.
MAX_ITEM = 30

# RecBole configuration for training GRU4Rec on the `eventos_vix` interactions.
parameter_dict = {
    'model': 'GRU4Rec',
    'dataset': 'eventos_vix',
    'data_path': 'dataset/',
    'show_progress': True,

    # Sessions play the role of "users" for this session-based recommender.
    'USER_ID_FIELD': 'session_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'load_col': {
        'inter': ['session_id', 'item_id', 'timestamp']
    },
    'train_neg_sample_args': None,                # no negative sampling (use full item ranking)
    'epochs': 40,                                 # number of training epochs
    'stopping_step': 10,                          # stop early if no improvement after 10 valid steps

    'eval_batch_size': 1024,                      # batch size during evaluation
    'train_batch_size': 1024,                     # (optional) batch size for training
    'enable_amp': True,                           # (optional) enable mixed-precision training
    'MAX_ITEM_LIST_LENGTH': MAX_ITEM,             # max number of past items used in sequence
    'eval_args': {
        'split': {'RS': [8, 1, 1]},               # ratio split 8:1:1 -> 80% train, 10% valid, 10% test
        'group_by': 'user',                       # group data per user/session
        'order': 'TO',                            # respect temporal order
        'mode': 'full'                            # use full item list for evaluation
    }
}

config = Config(config_dict=parameter_dict)
init_seed(config['seed'], config['reproducibility'])

init_logger(config)
logger = getLogger()

# The recorded output shows every message twice (once timestamped, once plain),
# i.e. the root logger already had a console handler when an extra one was
# added unconditionally. Only attach a console handler when none is present,
# which also keeps re-running this cell from stacking further duplicates.
# The exact-type check is deliberate: FileHandler subclasses StreamHandler.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    c_handler = logging.StreamHandler()
    c_handler.setLevel(logging.INFO)
    logger.addHandler(c_handler)

logger.info(config)

22 Oct 11:11    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/eventos_vix
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 40
train_batch_size = 1024
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [8, 1, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = True
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 1024
metric_decimal_place = 4

Dataset Hyper Parameters:


In [3]:
# Build the dataset described by `config` and log its summary
# (user/item counts appear in the output below).
dataset = create_dataset(config)
logger.info(dataset)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=feat[field].mean(), inplace=True)
22 Oct 11:12    INFO  eventos_vix
The number of users: 89980
Average actions of users: 43.5800353415797
The number of items: 7403
A

### Training

In [4]:
# Produce the train/validation/test data objects from the dataset.
# NOTE(review): the split is presumably driven by `eval_args` in `config` — confirm.
train_data, valid_data, test_data = data_preparation(config, dataset)

22 Oct 11:13    INFO  [Training]: train_batch_size = [1024] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
[Training]: train_batch_size = [1024] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
22 Oct 11:13    INFO  [Evaluation]: eval_batch_size = [1024] eval_args: [{'split': {'RS': [8, 1, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]
[Evaluation]: eval_batch_size = [1024] eval_args: [{'split': {'RS': [8, 1, 1]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]


In [5]:
# Instantiate GRU4Rec from the training dataset and move it to the configured device.
model = GRU4Rec(config, train_data.dataset).to(config['device'])
logger.info(model)

# The trainer runs the training loop with validation on `valid_data`.
trainer = Trainer(config, model)

# Returns the best validation score and the metric results that go with it.
best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

22 Oct 11:13    INFO  GRU4Rec(
  (item_embedding): Embedding(7403, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 555776
GRU4Rec(
  (item_embedding): Embedding(7403, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 555776
  scaler = amp.GradScaler(enabled=self.enable_scaler)
22 Oct 11:13    INFO  epoch 0 training [time: 36.75s, train loss: 15648.0575]
epoch 0 training [time: 36.75s, train loss: 15648.0575]
22 Oct 11:14    INFO  epoch 0 evaluating [time: 2.25s, valid_score: 0.417800]
epoch 0 evaluating [time: 2.25s, valid_score: 0.417800]
22 Oct 11:14    INFO  valid result: 
recall@10 : 0.5556    mrr

In [13]:
def recommend_for_session(external_session_id, model, dataset, top_k=10):
    """Print and return the top-k next-item recommendations for one session.

    The session's interactions are looked up in ``dataset.inter_feat``, ordered
    by time, and fed to ``model.full_sort_predict`` as a single-row batch.

    Args:
        external_session_id: Raw (external) session token, e.g. ``'999'``.
        model: Trained sequential recommender exposing ``device`` and
            ``full_sort_predict``.
        dataset: Dataset providing ``uid_field``/``iid_field``/``time_field``,
            ``inter_feat``, ``token2id`` and ``id2token``.
        top_k: Number of items to recommend. Capped at the number of
            non-padding items.

    Returns:
        Tuple ``(external_item_ids, scores)``: the recommended raw item tokens
        and their model scores as a NumPy array, best first.

    Raises:
        ValueError: If the session has no interactions in ``dataset.inter_feat``.
            (``dataset.token2id`` may raise on its own for unknown tokens.)
    """
    session_field = dataset.uid_field
    item_field = dataset.iid_field
    time_field = dataset.time_field

    internal_session_id = dataset.token2id(session_field, external_session_id)

    # NOTE(review): if `dataset` has already gone through data_preparation,
    # inter_feat may hold augmented training rows rather than raw events, in
    # which case a session's first event could be missing here — confirm.
    inter_feat = dataset.inter_feat
    inter_df = pd.DataFrame({
        session_field: inter_feat[session_field].tolist(),
        item_field: inter_feat[item_field].tolist(),
        time_field: inter_feat[time_field].tolist(),
    })

    # Stable sort so events sharing a timestamp keep their stored order.
    session_history = (
        inter_df
        .loc[inter_df[session_field] == internal_session_id]
        .sort_values(by=time_field, kind="mergesort")
    )
    internal_item_ids = session_history[item_field].tolist()

    if not internal_item_ids:
        # A zero-length sequence cannot be scored; fail with a clear message.
        raise ValueError(
            f"No interactions found for session '{external_session_id}'."
        )

    # Keep only the most recent items, matching the maximum history length the
    # model was configured with (no truncation if the model does not expose it).
    max_seq_length = getattr(model, 'max_seq_length', None)
    if max_seq_length:
        internal_item_ids = internal_item_ids[-max_seq_length:]

    # Prefer the field names the model declares; fall back to the defaults
    # this function has always used.
    item_list_field = getattr(model, 'ITEM_SEQ', item_field + '_list')
    item_length_field = getattr(model, 'ITEM_SEQ_LEN', 'item_length')

    interaction = Interaction({
        session_field: torch.tensor([internal_session_id]),
        item_list_field: torch.tensor([internal_item_ids]),
        item_length_field: torch.tensor([len(internal_item_ids)]),
    })
    interaction = interaction.to(model.device)

    model.eval()
    with torch.no_grad():
        scores = model.full_sort_predict(interaction)

    scores = scores.view(-1)
    # Index 0 is the padding entry of the item embedding (padding_idx=0), not
    # a real item, so it must never be recommended.
    scores[0] = float('-inf')

    # torch.topk raises when k exceeds the number of candidates.
    k = min(top_k, scores.numel() - 1)
    top_k_scores, top_k_indices = torch.topk(scores, k=k)

    external_item_ids = dataset.id2token(item_field, top_k_indices.cpu().numpy())

    print(f"Top {top_k} itens recomendados para a sessão '{external_session_id}':\n")
    for i, item_id in enumerate(external_item_ids):
        score = top_k_scores[i].item()
        print(f"  {i+1}. Item ID: {item_id} (Score: {score:.4f})")

    print(",".join([str(item_id) for item_id in external_item_ids]))

    return external_item_ids, top_k_scores.cpu().numpy()

In [7]:
def recommend_for_sessions(external_session_id, model, dataset, top_k=10):
    """Print the internal/external ids of the top-k items for one session.

    Despite the plural name, this handles a single session; the name is kept
    so existing callers continue to work.

    Args:
        external_session_id: Raw (external) session token.
        model: Trained sequential recommender exposing ``device`` and
            ``full_sort_predict``.
        dataset: Dataset providing ``uid_field``/``iid_field``/``time_field``,
            ``inter_feat``, ``token2id`` and ``id2token``.
        top_k: Number of items to recommend. Capped at the number of
            non-padding items.

    Returns:
        The recommended raw (external) item tokens, best first.

    Raises:
        ValueError: If the session has no interactions in ``dataset.inter_feat``.
            (``dataset.token2id`` may raise on its own for unknown tokens.)
    """
    session_field = dataset.uid_field
    item_field = dataset.iid_field
    time_field = dataset.time_field

    internal_session_id = dataset.token2id(session_field, external_session_id)
    inter_feat = dataset.inter_feat

    inter_df = pd.DataFrame({
        session_field: inter_feat[session_field].tolist(),
        item_field: inter_feat[item_field].tolist(),
        time_field: inter_feat[time_field].tolist(),
    })

    # Stable sort so events sharing a timestamp keep their stored order.
    session_history = (
        inter_df[inter_df[session_field] == internal_session_id]
        .sort_values(by=time_field, kind="mergesort")
    )
    internal_item_ids = session_history[item_field].tolist()

    if not internal_item_ids:
        # A zero-length sequence cannot be scored; fail with a clear message.
        raise ValueError(
            f"No interactions found for session '{external_session_id}'."
        )

    # Keep only the most recent items, matching the maximum history length the
    # model was configured with (no truncation if the model does not expose it).
    max_seq_length = getattr(model, 'max_seq_length', None)
    if max_seq_length:
        internal_item_ids = internal_item_ids[-max_seq_length:]

    # Prefer the field names the model declares; fall back to the defaults
    # this function has always used.
    item_list_field = getattr(model, 'ITEM_SEQ', dataset.iid_field + '_list')
    item_length_field = getattr(model, 'ITEM_SEQ_LEN', 'item_length')
    user_field = dataset.uid_field

    interaction = Interaction({
        user_field: torch.tensor([internal_session_id]),
        item_list_field: torch.tensor([internal_item_ids]),
        item_length_field: torch.tensor([len(internal_item_ids)]),
    })

    model.eval()
    with torch.no_grad():
        scores = model.full_sort_predict(interaction.to(model.device))

    session_scores = scores[0]
    # Index 0 is the padding entry of the item embedding (padding_idx=0), not
    # a real item, so it must never be recommended.
    session_scores[0] = float('-inf')

    # torch.topk raises when k exceeds the number of candidates.
    k = min(top_k, session_scores.numel() - 1)
    top_k_indices = torch.topk(session_scores, k=k).indices.tolist()
    external_item_ids = dataset.id2token(dataset.iid_field, top_k_indices)

    print(f"Top {top_k} itens recomendados para a sessão {external_session_id}:")
    for internal_id, external_id in zip(top_k_indices, external_item_ids):
        print(f"  Internal ID: {internal_id}  →  External ID: {external_id}")

    return external_item_ids

In [14]:
# Example: print and return the top-10 recommendations for external session '999'.
recommend_for_session('999', model, dataset, top_k=10)

Top 10 itens recomendados para a sessão '999':

  1. Item ID: 54358 (Score: 28.9674)
  2. Item ID: 48208 (Score: 27.3849)
  3. Item ID: 6318 (Score: 26.3049)
  4. Item ID: 27709 (Score: 26.1470)
  5. Item ID: 54058 (Score: 26.1249)
  6. Item ID: 22540 (Score: 25.7301)
  7. Item ID: 10039 (Score: 25.3491)
  8. Item ID: 26566 (Score: 25.3441)
  9. Item ID: 29556 (Score: 25.3304)
  10. Item ID: 34513 (Score: 25.2866)
54358,48208,6318,27709,54058,22540,10039,26566,29556,34513


(array(['54358', '48208', '6318', '27709', '54058', '22540', '10039',
        '26566', '29556', '34513'], dtype='<U5'),
 array([28.967402, 27.384941, 26.304882, 26.146965, 26.124907, 25.73013 ,
        25.349087, 25.34414 , 25.330423, 25.28663 ], dtype=float32))

In [None]:
# Load the processed listings (hive-partitioned parquet files) into pandas.
# NOTE: this is a machine-specific absolute path; adjust it for your environment.
base_path = "/home/hygo2025/Documents/data/processed_data/listings"

# Use a dedicated name here: `dataset` is the RecBole dataset that the
# recommendation helpers and other cells (e.g. `dataset.uid_field`) rely on,
# and rebinding it to a pyarrow dataset breaks them on a top-to-bottom run.
listings_dataset = ds.dataset(base_path, format="parquet", partitioning="hive")
table = listings_dataset.to_table()

df = table.to_pandas()

print(df.shape)
print(df.columns.tolist())
df.head()




In [9]:
# Look up the external token of the first session stored in the interactions,
# e.g. to obtain a valid id for trying out the recommendation helpers.
session_field = dataset.uid_field
try:
    first_internal_id = dataset.inter_feat[session_field][0].item()
    external_id_list = dataset.id2token(session_field, [first_internal_id])
    # Explicit length check: the truth value of a NumPy array is ambiguous
    # (and raises) for anything other than exactly one element.
    if len(external_id_list) > 0:
        external_id_to_print = external_id_list[0]
        print(f"First external session ID: {external_id_to_print}")
    else:
        print("No external session ID found for the first internal ID.")
except KeyError:
    print(f"Session field '{session_field}' not found in dataset.inter_feat.")
except IndexError:
    print("No data found in dataset.inter_feat.")

First external session ID: 999


In [None]:
# Restore a trained GRU4Rec model from a saved checkpoint.
saved_model_path = "saved/Gru"

# SECURITY: weights_only=False unpickles arbitrary Python objects (the
# checkpoint carries a config object in addition to tensors), so only load
# checkpoint files you trust.
# map_location="cpu" lets a checkpoint written on a GPU be read on any host;
# load_state_dict below copies the values into the model's own device.
checkpoint = torch.load(saved_model_path, map_location="cpu", weights_only=False)

# Rebuild the seed, logging, and dataset from the config stored with the model
# so the model is constructed with the same vocabulary sizes it was trained on.
config = checkpoint["config"]
init_seed(config['seed'], config['reproducibility'])
init_logger(config)
dataset = create_dataset(config)

model = GRU4Rec(config, dataset).to(config['device'])

model.load_state_dict(checkpoint["state_dict"])

# Switch to inference mode (disables dropout); as the last expression this
# also displays the model summary.
model.eval()