In [1]:
import logging
from logging import getLogger

import pandas as pd
import pyarrow.dataset as ds
import torch
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger


2025-10-21 09:44:54.184750: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-10-21 09:44:54.184781: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-10-21 09:44:54.185637: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-10-21 09:44:54.191133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Longest item history (in interactions) fed to the sequential model.
MAX_ITEM = 30

# RecBole configuration overrides for training GRU4Rec on the session data.
parameter_dict = {
    'model': 'GRU4Rec',
    'dataset': 'eventos_vix',
    'data_path': 'dataset/',
    'show_progress': True,

    # Sessions play the role of RecBole "users".
    'USER_ID_FIELD': 'session_id',
    'ITEM_ID_FIELD': 'item_id',
    'TIME_FIELD': 'timestamp',
    'load_col': {
        'inter': ['session_id', 'item_id', 'timestamp']
    },
    'train_neg_sample_args': None,                # no negative sampling (use full item ranking)
    'epochs': 10,                                 # number of training epochs
    'stopping_step': 3,                           # stop early if no improvement after 3 valid steps

    'eval_batch_size': 1024,                      # batch size during evaluation
    'train_batch_size': 1024,                     # (optional) batch size for training
    'enable_amp': True,                           # (optional) enable mixed-precision training
    'MAX_ITEM_LIST_LENGTH': MAX_ITEM,             # max number of past items used in sequence
    'eval_args': {
        'split': {'RS': [9, 1, 0]},               # 9:1:0 ratio split: 90% train, 10% valid, no test set
        'group_by': 'user',                       # group data per user/session
        'order': 'TO',                            # respect temporal order
        'mode': 'full'                            # use full item list for evaluation
    }
}

config = Config(config_dict=parameter_dict)
init_seed(config['seed'], config['reproducibility'])

# init_logger() is the only logging setup needed here. Attaching an additional
# StreamHandler to the root logger made every record appear twice in the cell
# output (once formatted, once as a bare message), so no extra handler is added.
init_logger(config)
logger = getLogger()

logger.info(config)

21 Oct 09:44    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/eventos_vix
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 1024
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 3
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [9, 1, 0]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = True
metrics = ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
topk = [10]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 1024
metric_decimal_place = 4

Dataset Hyper Parameters:
f

In [3]:
# Build the dataset described by `config` and log its summary
# (number of users/sessions, items and average actions per user).
dataset = create_dataset(config)
logger.info(dataset)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=feat[field].mean(), inplace=True)
21 Oct 09:45    INFO  eventos_vix
The number of users: 206232
Average actions of users: 23.408803720100277
The number of items: 209

### Training

In [4]:
# Split the dataset according to config['eval_args']; with the 9:1:0 split
# configured earlier, the test portion receives no share of the data.
train_data, valid_data, test_data = data_preparation(config, dataset)

21 Oct 09:46    INFO  [Training]: train_batch_size = [1024] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
[Training]: train_batch_size = [1024] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
21 Oct 09:46    INFO  [Evaluation]: eval_batch_size = [1024] eval_args: [{'split': {'RS': [9, 1, 0]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]
[Evaluation]: eval_batch_size = [1024] eval_args: [{'split': {'RS': [9, 1, 0]}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]


In [5]:
# Instantiate GRU4Rec from the training split's dataset and move it to the
# device selected by the config, then log the module structure.
model = GRU4Rec(config, train_data.dataset).to(config['device'])
logger.info(model)

# The trainer is driven by the same config (epochs, early stopping, AMP, ...).
trainer = Trainer(config, model)

# Train with per-epoch validation; keeps the best validation score and result.
best_valid_score, best_valid_result = trainer.fit(train_data, valid_data)

21 Oct 09:46    INFO  GRU4Rec(
  (item_embedding): Embedding(20938, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 1422016
GRU4Rec(
  (item_embedding): Embedding(20938, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 1422016
  scaler = amp.GradScaler(enabled=self.enable_scaler)
21 Oct 09:47    INFO  epoch 0 training [time: 46.98s, train loss: 20898.1087]
epoch 0 training [time: 46.98s, train loss: 20898.1087]
21 Oct 09:47    INFO  epoch 0 evaluating [time: 3.08s, valid_score: 0.572100]
epoch 0 evaluating [time: 3.08s, valid_score: 0.572100]
21 Oct 09:47    INFO  valid result: 
recall@10 : 0.6788   

In [6]:
def recommend_for_session(external_session_id, model, dataset, top_k=10):
    """Print and return the top-k next-item recommendations for one session.

    The session's interactions are read from ``dataset.inter_feat``, ordered by
    timestamp, and scored with ``model.full_sort_predict`` as a one-row batch.

    Args:
        external_session_id: Session token as it appears in the raw data
            (e.g. ``'851_116'``); translated with ``dataset.token2id``.
        model: Trained sequential recommender exposing ``device``, ``eval()``
            and ``full_sort_predict()``.
        dataset: Dataset exposing ``uid_field``, ``iid_field``, ``time_field``,
            ``inter_feat``, ``token2id`` and ``id2token``.
        top_k: Number of items to recommend; capped at the number of
            non-padding items.

    Returns:
        Tuple ``(external_item_ids, scores)``: the item tokens returned by
        ``dataset.id2token`` and their scores as a NumPy array, best first.

    Raises:
        ValueError: If the session has no interactions in ``dataset.inter_feat``.
    """
    session_field = dataset.uid_field
    item_field = dataset.iid_field
    time_field = dataset.time_field

    internal_session_id = dataset.token2id(session_field, external_session_id)
    inter_feat = dataset.inter_feat

    def column(field):
        # Tensor columns are used directly; any other column type (e.g. a
        # pandas Series) is converted through ``tolist()``.
        values = inter_feat[field]
        return values if torch.is_tensor(values) else torch.as_tensor(values.tolist())

    # Filter with a boolean mask instead of copying every interaction of every
    # session into Python lists and a DataFrame on each call.
    # NOTE(review): some RecBole versions appear to rewrite ``inter_feat`` of
    # sequential datasets into (history, target) rows during data preparation,
    # in which case a session's first interaction may not be a row here —
    # TODO confirm against the installed version.
    session_mask = column(session_field) == int(internal_session_id)
    session_items = column(item_field)[session_mask]
    session_times = column(time_field)[session_mask]

    if session_items.numel() == 0:
        # Previously this surfaced as an obscure tensor/indexing error.
        raise ValueError(
            f"Session '{external_session_id}' has no interactions to build a history from."
        )

    # Oldest interaction first; stable so ties keep their stored order.
    chronological = torch.sort(session_times, stable=True).indices
    session_items = session_items[chronological]

    # Keep only the most recent items the model was trained to consume.
    max_len = getattr(model, 'max_seq_length', None)
    if max_len and session_items.numel() > max_len:
        session_items = session_items[-max_len:]

    interaction = Interaction({
        session_field: torch.tensor([internal_session_id]),
        item_field + '_list': session_items.unsqueeze(0),
        'item_length': torch.tensor([session_items.numel()]),
    })
    interaction = interaction.to(model.device)

    model.eval()
    with torch.no_grad():
        scores = model.full_sort_predict(interaction)

    scores = scores.view(-1).clone()
    # Index 0 is the padding item (the item embedding uses padding_idx=0);
    # it must never be recommended.
    scores[0] = float('-inf')

    # torch.topk raises when k exceeds the number of candidates.
    k = min(top_k, scores.numel() - 1)
    top_k_scores, top_k_indices = torch.topk(scores, k=k)

    external_item_ids = dataset.id2token(item_field, top_k_indices.cpu().numpy())

    print(f"Top {top_k} itens recomendados para a sessão '{external_session_id}':\n")
    for i, item_id in enumerate(external_item_ids):
        score = top_k_scores[i].item()
        print(f"  {i+1}. Item ID: {item_id} (Score: {score:.4f})")

    return external_item_ids, top_k_scores.cpu().numpy()

In [7]:
def recommend_for_sessions(external_session_id, model, dataset, top_k=10):
    """Print internal/external ids of the top-k recommendations for a session.

    Despite the plural name, this handles a single session: its time-ordered
    item history is scored with ``model.full_sort_predict`` as a one-row batch.

    Args:
        external_session_id: Session token as it appears in the raw data;
            translated with ``dataset.token2id``.
        model: Trained sequential recommender exposing ``device``, ``eval()``
            and ``full_sort_predict()``.
        dataset: Dataset exposing ``uid_field``, ``iid_field``, ``time_field``,
            ``inter_feat``, ``token2id`` and ``id2token``.
        top_k: Number of items to recommend; capped at the number of
            non-padding items.

    Returns:
        The recommended item tokens as returned by ``dataset.id2token``,
        best first.

    Raises:
        ValueError: If the session has no interactions in ``dataset.inter_feat``.
    """
    session_field = dataset.uid_field
    item_field = dataset.iid_field
    time_field = dataset.time_field

    internal_session_id = dataset.token2id(session_field, external_session_id)
    inter_feat = dataset.inter_feat

    def as_tensor(field):
        # Tensor columns pass through; other column types (e.g. a pandas
        # Series) are converted via ``tolist()``.
        values = inter_feat[field]
        if torch.is_tensor(values):
            return values
        return torch.as_tensor(values.tolist())

    # Select only this session's rows rather than materialising the whole
    # interaction table as Python lists on every call.
    # NOTE(review): depending on the RecBole version, ``inter_feat`` of a
    # prepared sequential dataset may hold (history, target) rows, so the very
    # first interaction of a session might be absent — TODO confirm.
    belongs_to_session = as_tensor(session_field) == int(internal_session_id)
    history_items = as_tensor(item_field)[belongs_to_session]
    history_times = as_tensor(time_field)[belongs_to_session]

    if history_items.numel() == 0:
        raise ValueError(
            f"Session '{external_session_id}' has no interactions to build a history from."
        )

    # Chronological order (stable for equal timestamps).
    history_items = history_items[torch.sort(history_times, stable=True).indices]

    # Feed at most as many recent items as the model was configured for.
    max_len = getattr(model, 'max_seq_length', None)
    if max_len and history_items.numel() > max_len:
        history_items = history_items[-max_len:]

    interaction = Interaction({
        session_field: torch.tensor([internal_session_id]),
        item_field + '_list': history_items.unsqueeze(0),
        'item_length': torch.tensor([history_items.numel()]),
    })

    model.eval()
    with torch.no_grad():
        scores = model.full_sort_predict(interaction.to(model.device))

    session_scores = scores[0].clone()
    # Index 0 is the padding item (padding_idx=0) and must not be recommended.
    session_scores[0] = float('-inf')

    # Cap k so torch.topk does not raise for small catalogues.
    k = min(top_k, session_scores.numel() - 1)
    top_k_indices = torch.topk(session_scores, k=k).indices.tolist()
    external_item_ids = dataset.id2token(item_field, top_k_indices)

    print(f"Top {top_k} itens recomendados para a sessão {external_session_id}:")
    for internal_id, external_id in zip(top_k_indices, external_item_ids):
        print(f"  Internal ID: {internal_id}  →  External ID: {external_id}")

    return external_item_ids

In [8]:
# Example: top-10 recommendations for one session. The function prints the
# ranking; the returned (item ids, scores) tuple is also shown because the call
# is the cell's last expression.
recommend_for_session('851_116', model, dataset, top_k=10)

Top 10 itens recomendados para a sessão '851_116':

  1. Item ID: 1367510 (Score: 19.0394)
  2. Item ID: 878682 (Score: 13.4436)
  3. Item ID: 2429091 (Score: 12.6762)
  4. Item ID: 1778866 (Score: 12.6693)
  5. Item ID: 1959357 (Score: 12.6676)
  6. Item ID: 1609317 (Score: 12.5105)
  7. Item ID: 1392661 (Score: 12.4417)
  8. Item ID: 1772585 (Score: 12.2927)
  9. Item ID: 1329326 (Score: 12.2450)
  10. Item ID: 1099301 (Score: 12.0960)


(array(['1367510', '878682', '2429091', '1778866', '1959357', '1609317',
        '1392661', '1772585', '1329326', '1099301'], dtype='<U7'),
 array([19.039433, 13.443646, 12.67622 , 12.669264, 12.667649, 12.510486,
        12.441708, 12.292713, 12.244952, 12.096034], dtype=float32))

In [9]:
# NOTE: absolute, machine-specific location of the processed listings data;
# adjust it when running the notebook elsewhere.
base_path = "/home/hygo2025/Documents/data/processed_data/listings"

# Bound to its own name so the RecBole `dataset` created earlier (and still
# required by recommend_for_session) is not overwritten when this cell runs.
listings_dataset = ds.dataset(base_path, format="parquet", partitioning="hive")
table = listings_dataset.to_table()

# Materialise the Arrow table as a pandas DataFrame for inspection.
df = table.to_pandas()

print(df.shape)
print(df.columns.tolist())
df.head()




(2550799, 17)
['state', 'city', 'neighborhood', 'anonymized_listing_id', 'dt', 'created_at', 'updated_at', 'price', 'usable_areas', 'total_areas', 'bathrooms', 'bedrooms', 'suites', 'parking_spaces', 'amenities', 'listing_id_numeric', 'geopoint']


Unnamed: 0,state,city,neighborhood,anonymized_listing_id,dt,created_at,updated_at,price,usable_areas,total_areas,bathrooms,bedrooms,suites,parking_spaces,amenities,listing_id_numeric,geopoint
0,bahia,salvador,matatu,00130EA2C2FA2CC9E7E84057ECD7C885A93A6142886300...,2024-07-01,2023-12-10 20:34:37,2024-04-29 21:00:14,95000.0,150.0,150.0,2.0,3.0,1.0,1.0,"['PETS_ALLOWED', 'FIREPLACE', 'GOURMET_BALCONY']",1,"-38.4938709,-12.971598"
1,bahia,salvador,rio vermelho,00131334683CACCB666EB6E8E4CCCA267756368007DE7D...,2024-07-01,2022-11-01 11:34:24,2022-12-02 17:52:13,395000.0,120.0,,3.0,3.0,1.0,1.0,"['PLAYGROUND', 'PARTY_HALL']",2,"-38.4899605,-13.0100607"
2,bahia,salvador,stiep,0014636701765E6CD57131532032AFE604AA769819866F...,2024-07-01,2022-07-07 17:10:17,2022-07-12 18:43:40,575000.0,100.0,100.0,3.0,3.0,1.0,1.0,"['GYM', 'ELEVATOR', 'POOL', 'PARTY_HALL', 'GAT...",3,"-38.4465942,-12.9804811"
3,espírito santo,serra,são francisco,00206B5CC4949696833E7244C6E87CA19668671B9C1F81...,2024-07-01,2022-10-21 13:48:20,2022-12-15 12:20:05,53000.0,86.0,186.0,2.0,2.0,1.0,1.0,[],4,"-40.1807942,-20.118991"
4,bahia,salvador,jardim apipema,00273500B92CD16B11858FF3D0D7A088C0C556E1D5122F...,2024-07-01,2022-07-25 17:47:25,2024-05-23 13:29:36,1350000.0,220.0,220.0,6.0,4.0,4.0,3.0,"['GREEN_SPACE', 'BARBECUE_GRILL', 'POOL', 'CON...",5,"-38.5178178,-13.0048463"
