In [13]:
import random
import warnings
import zipfile
from pathlib import Path

import pandas as pd
import tensorflow as tf
import tqdm
warnings.filterwarnings("ignore")

import pyarrow.dataset as ds

In [14]:
import pyarrow.dataset as ds
import pyarrow.compute as pc
import pandas as pd

def load_data_efficiently():
    # Caminho para o seu dataset
    dataset_path = "/home/hygo2025/Documents/data/processed_data/enriched_events"

    # Parâmetros para amostragem
    sample_fraction = 0.2
    random_seed = 42

    # 1. Crie o objeto do dataset como antes
    dataset = ds.dataset(
        dataset_path,
        format='parquet',
        partitioning=['partition_date']
    )

    # 2. Crie a expressão de filtro
    # Isso define a condição `name_raw != 'GalleryClicked'`
    filter_expression = ds.field('city') == 'Vila Velha'

    # 3. Crie um Scanner com o filtro aplicado
    # O Scanner "planeja" a leitura dos dados, mas ainda não os carrega
    scanner = dataset.scanner(filter=filter_expression)

    # 4. Itere sobre os "batches" (pedaços) de dados e faça a amostragem de cada um
    sampled_batches = []
    # O scanner.to_reader() lê os dados filtrados de forma eficiente
    for batch in scanner.to_reader():
        # Converte o pedaço para pandas
        batch_df = batch.to_pandas()
        # Faz a amostragem do pedaço
        sampled_part = batch_df.sample(frac=sample_fraction, random_state=random_seed)
        sampled_batches.append(sampled_part)

    # 5. Concatene todos os pedaços amostrados em um único DataFrame
    if not sampled_batches:
        # Retorna um DataFrame vazio se o filtro não encontrou nada
        return pd.DataFrame()
    else:
        final_data = pd.concat(sampled_batches, ignore_index=True)
        return final_data


In [15]:
data = load_data_efficiently()
print("data shape:", data.shape)

data shape: (1986261, 31)


In [16]:
data.head()

Unnamed: 0,listing_id,user_id,anonymous_id,name_raw,event_type,collector_timestamp,event_ts,dt,weight,days_until_today,...,bathrooms,bedrooms,suites,parking_spaces,floors,ceiling_height,amenities,user_anonymous_id,user_user_id,partition_date
0,33E7B9776EA5039C25732680CA83967FEB661FEF512DFF...,3F3601CFCDEB2813C81360B11BEA5F136D542F62F3F55F...,3F3601CFCDEB2813C81360B11BEA5F136D542F62F3F55F...,GalleryClicked,OTHER,1706759917368,2024-02-01 03:58:37.368,2024-02-01 03:58:37,0.075,603,...,1.0,2.0,0.0,1.0,,,[],3F3601CFCDEB2813C81360B11BEA5F136D542F62F3F55F...,,partition_date=2024-02-01
1,FADCDF9C44DE48E94CA9F861572640B477D9CF0E406515...,4452C0329CEF4B578C059032554E997A598192C241C9CD...,4452C0329CEF4B578C059032554E997A598192C241C9CD...,ListingRendered,VISIT,1706819413205,2024-02-01 20:30:13.205,2024-02-01 20:30:13,0.1,603,...,0.0,0.0,0.0,3.0,,,[],4452C0329CEF4B578C059032554E997A598192C241C9CD...,,partition_date=2024-02-01
2,D09FAAB549B6B26F56DB5E6BF4BCFCEE39D2A0AE9D60E5...,08FBE3F2210ADA497D58995A3D784C53E883372F46B094...,08FBE3F2210ADA497D58995A3D784C53E883372F46B094...,GalleryClicked,OTHER,1706809683219,2024-02-01 17:48:03.219,2024-02-01 17:48:03,0.075,603,...,3.0,3.0,1.0,2.0,,,"['AIR_CONDITIONING', 'BICYCLES_PLACE', 'KITCHE...",08FBE3F2210ADA497D58995A3D784C53E883372F46B094...,,partition_date=2024-02-01
3,25FF3710C88B633D5232E4EF487C553E50062E1A51D706...,F13D3A79A127675E527C63E9576D4CEA9DD36C42471B58...,F13D3A79A127675E527C63E9576D4CEA9DD36C42471B58...,GalleryClicked,OTHER,1706749700792,2024-02-01 01:08:20.792,2024-02-01 01:08:20,0.075,604,...,2.0,3.0,1.0,2.0,,,"['BACKYARD', 'BALCONY', 'SAFETY_CIRCUIT', 'SER...",F13D3A79A127675E527C63E9576D4CEA9DD36C42471B58...,,partition_date=2024-02-01
4,0D38223741B240E58E0616EDC8DF228203075857FECB1F...,2DCE8C039A9DCFF1DCDFF12FCE5B84D6466A71B3E21AEF...,2DCE8C039A9DCFF1DCDFF12FCE5B84D6466A71B3E21AEF...,GalleryClicked,OTHER,1706806358890,2024-02-01 16:52:38.890,2024-02-01 16:52:38,0.075,603,...,1.0,2.0,0.0,1.0,,,"['PETS_ALLOWED', 'GOURMET_BALCONY']",2DCE8C039A9DCFF1DCDFF12FCE5B84D6466A71B3E21AEF...,,partition_date=2024-02-01


In [17]:
data.columns

Index(['listing_id', 'user_id', 'anonymous_id', 'name_raw', 'event_type',
       'collector_timestamp', 'event_ts', 'dt', 'weight', 'days_until_today',
       'boost', 'rating', 'created_at', 'updated_at', 'status', 'price',
       'state', 'city', 'neighborhood', 'usable_areas', 'total_areas',
       'bathrooms', 'bedrooms', 'suites', 'parking_spaces', 'floors',
       'ceiling_height', 'amenities', 'user_anonymous_id', 'user_user_id',
       'partition_date'],
      dtype='object')

In [18]:
def standardize_column_names(df):
    cols_to_delete = [
        'user_anonymous_id','user_user_id','partition_date', 'collector_timestamp','event_ts', 'weight', 'days_until_today',
       'boost',  'updated_at', 'partition_date'
    ]

    rename_mapping = {
        'user_id': 'user',
        'listing_id': 'item',
        'rating': 'label',
    }


    return df.drop(columns=cols_to_delete).rename(columns=rename_mapping)

In [19]:
data = standardize_column_names(data)
data

Unnamed: 0,item,user,anonymous_id,name_raw,event_type,dt,label,created_at,status,price,...,neighborhood,usable_areas,total_areas,bathrooms,bedrooms,suites,parking_spaces,floors,ceiling_height,amenities
0,33E7B9776EA5039C25732680CA83967FEB661FEF512DFF...,3F3601CFCDEB2813C81360B11BEA5F136D542F62F3F55F...,3F3601CFCDEB2813C81360B11BEA5F136D542F62F3F55F...,GalleryClicked,OTHER,2024-02-01 03:58:37,0.008249,2023-12-06,INACTIVE,100000.0,...,Soteco,200.0,400.0,1.0,2.0,0.0,1.0,,,[]
1,FADCDF9C44DE48E94CA9F861572640B477D9CF0E406515...,4452C0329CEF4B578C059032554E997A598192C241C9CD...,4452C0329CEF4B578C059032554E997A598192C241C9CD...,ListingRendered,VISIT,2024-02-01 20:30:13,0.010999,2024-01-15,DELETED,65000.0,...,Morada da Barra,110.0,110.0,0.0,0.0,0.0,3.0,,,[]
2,D09FAAB549B6B26F56DB5E6BF4BCFCEE39D2A0AE9D60E5...,08FBE3F2210ADA497D58995A3D784C53E883372F46B094...,08FBE3F2210ADA497D58995A3D784C53E883372F46B094...,GalleryClicked,OTHER,2024-02-01 17:48:03,0.008249,2023-09-27,DELETED,1500000.0,...,Praia de Itaparica,130.0,130.0,3.0,3.0,1.0,2.0,,,"['AIR_CONDITIONING', 'BICYCLES_PLACE', 'KITCHE..."
3,25FF3710C88B633D5232E4EF487C553E50062E1A51D706...,F13D3A79A127675E527C63E9576D4CEA9DD36C42471B58...,F13D3A79A127675E527C63E9576D4CEA9DD36C42471B58...,GalleryClicked,OTHER,2024-02-01 01:08:20,0.008218,2023-11-04,INACTIVE,200000.0,...,Barra do Jucu,150.0,250.0,2.0,3.0,1.0,2.0,,,"['BACKYARD', 'BALCONY', 'SAFETY_CIRCUIT', 'SER..."
4,0D38223741B240E58E0616EDC8DF228203075857FECB1F...,2DCE8C039A9DCFF1DCDFF12FCE5B84D6466A71B3E21AEF...,2DCE8C039A9DCFF1DCDFF12FCE5B84D6466A71B3E21AEF...,GalleryClicked,OTHER,2024-02-01 16:52:38,0.008249,2024-01-08,INACTIVE,140000.0,...,Aribiri,85.0,95.0,1.0,2.0,0.0,1.0,,,"['PETS_ALLOWED', 'GOURMET_BALCONY']"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1986256,ECD3BC924873AD6C73EF39232BFA921E189FDC9FF60D8A...,9840CEF9090D927789C238AB03A462AF46D0FFC91375F7...,9840CEF9090D927789C238AB03A462AF46D0FFC91375F7...,GalleryClicked,OTHER,2024-08-01 10:56:12,0.016186,2023-08-12,ACTIVE,250000.0,...,Ponta da Fruta,360.0,360.0,0.0,0.0,0.0,0.0,0.0,,['KITCHEN']
1986257,54A8F1F6442C56889EB012DB8CDEBF35A8ECA800936FE9...,77188678561D19A2823085534F10800C42177AA1BA4A47...,77188678561D19A2823085534F10800C42177AA1BA4A47...,GalleryClicked,OTHER,2024-08-01 18:34:46,0.016186,2024-06-20,ACTIVE,550000.0,...,Praia de Itaparica,76.0,76.0,2.0,2.0,1.0,1.0,10.0,,"['RECREATION_AREA', 'GYM', 'BARBECUE_GRILL', '..."
1986258,B337889D1F7E754C6C89AF62B07508FBE1BDF828CD1BE5...,BCE6FCCF980DD84C38A9C7689C8C248EADD33331DDC08D...,BCE6FCCF980DD84C38A9C7689C8C248EADD33331DDC08D...,GalleryClicked,OTHER,2024-08-01 17:22:14,0.016186,2024-05-30,ACTIVE,750000.0,...,Itapuã,81.0,81.0,2.0,2.0,1.0,1.0,,,"['SEA_VIEW', 'KITCHEN', 'SECURITY_24_HOURS', '..."
1986259,CDE9ECDC59B6505F6833EF4A7A97E1F4B9E1DE94591DA0...,77188678561D19A2823085534F10800C42177AA1BA4A47...,77188678561D19A2823085534F10800C42177AA1BA4A47...,GalleryClicked,OTHER,2024-08-01 20:42:27,0.016186,2023-10-04,ACTIVE,499000.0,...,Praia de Itaparica,60.0,70.0,1.0,2.0,1.0,1.0,0.0,,"['BARBECUE_GRILL', 'KITCHEN', 'ELEVATOR', 'POO..."


In [20]:
data['dt_datetime'] = pd.to_datetime(data['dt'])

# Passo 2: Converter de datetime para Timestamp Unix (em segundos)
# .astype('int64') converte para nanossegundos, então dividimos por 1 bilhão (10**9)
data['time'] = data['dt_datetime'].astype('int64') // 10**9

In [21]:
order_columns = [
    'user', 'item','label', 'time', 'anonymous_id', 'name_raw', 'event_type', 'dt',
        'created_at', 'status', 'price', 'state', 'city',
       'neighborhood', 'usable_areas', 'total_areas', 'bathrooms', 'bedrooms',
       'suites', 'parking_spaces', 'floors', 'ceiling_height', 'amenities'
]

# Aplique a nova ordem
data = data[order_columns]

In [22]:
from libreco.data import DatasetFeat

from libreco.algorithms import DeepFM
from libreco.data import DatasetFeat, split_by_ratio_chrono

train_data, eval_data = split_by_ratio_chrono(data, test_size=0.2)

sparse_col = ['state', 'city', 'neighborhood', 'amenities']
dense_col = [
    'price', 'usable_areas', 'bathrooms', 'bedrooms',
    'suites', 'parking_spaces',
]
user_col = []
item_col = sparse_col + dense_col

for col in sparse_col:
    train_data[col] = train_data[col].fillna('desconhecido')
    eval_data[col] = eval_data[col].fillna('desconhecido')
for col in dense_col:
    median_val = train_data[col].median()
    train_data[col] = train_data[col].fillna(median_val)
    eval_data[col] = eval_data[col].fillna(median_val)

train_data, data_info = DatasetFeat.build_trainset(
    train_data=train_data,
    user_col=user_col,
    item_col=item_col,
    sparse_col=sparse_col,
    dense_col=dense_col,
    multi_sparse_col=[],
    pad_val=["missing"],  # specify padding value
)

eval_data = DatasetFeat.build_evalset(eval_data)

print("Dataset construído com sucesso, valores nulos foram tratados!")

Dataset construído com sucesso, valores nulos foram tratados!


In [23]:
print(data_info)

n_users: 85352, n_items: 40823, data density: 0.0459 %


In [24]:

deepfm = DeepFM(
    "ranking",
    data_info,
    embed_size=16,
    n_epochs=2,
    lr=1e-4,
    lr_decay=False,
    reg=None,
    batch_size=2048,
    num_neg=1,
    use_bn=False,
    dropout_rate=None,
    hidden_units=(128, 64, 32),
    tf_sess_config=None,
    multi_sparse_combiner="sqrtn",  # specify multi_sparse combiner
)

deepfm.fit(
    train_data,
    neg_sampling=True,
    verbose=2,
    shuffle=True,
    eval_data=eval_data,
    metrics=[
        "loss",
        "balanced_accuracy",
        "roc_auc",
        "pr_auc",
        "precision",
        "recall",
        "map",
        "ndcg",
    ],
)


2025-09-29 16:21:04.840602: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-29 16:21:04.842423: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-09-29 16:21:04.842579: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Training start time: [35m2025-09-29 16:21:04[0m
total params: [33m2,524,600[0m | embedding params: [33m2,489,723[0m | network params: [33m34,877[0m


2025-09-29 16:21:05.675147: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled
train:   0%|          | 0/1562 [00:00<?, ?it/s]2025-09-29 16:21:06.101409: W external/local_xla/xla/stream_executor/gpu/asm_compiler.cc:225] Falling back to the CUDA driver for PTX compilation; ptxas does not support CC 8.9
2025-09-29 16:21:06.101425: W external/local_xla/xla/stream_executor/gpu/asm_compiler.cc:228] Used ptxas at ptxas
2025-09-29 16:21:06.101486: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.
2025-09-29 16:21:06.142438: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile generated PTX with ptxas. Falling back to compilation by driver.
2025-09-29 16:21:06.262875: W tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc:191] Failed to compile 

Epoch 1 elapsed: 6.222s
	 [32mtrain_loss: 5506979.5[0m


eval_pointwise: 100%|██████████| 94/94 [00:00<00:00, 350.33it/s]
eval_listwise: 100%|██████████| 55752/55752 [02:22<00:00, 391.72it/s]


	 eval log_loss: 14.5637
	 eval balanced_accuracy: 0.5958
	 eval roc_auc: 0.5960
	 eval pr_auc: 0.6788
	 eval precision@10: 0.0000
	 eval recall@10: 0.0001
	 eval map@10: 0.0001
	 eval ndcg@10: 0.0001


train: 100%|██████████| 1562/1562 [00:05<00:00, 281.06it/s]


Epoch 2 elapsed: 5.560s
	 [32mtrain_loss: 90915.1875[0m


eval_pointwise: 100%|██████████| 94/94 [00:00<00:00, 498.36it/s]
eval_listwise: 100%|██████████| 55752/55752 [02:20<00:00, 396.07it/s]


	 eval log_loss: 15.7815
	 eval balanced_accuracy: 0.5621
	 eval roc_auc: 0.5621
	 eval pr_auc: 0.7475
	 eval precision@10: 0.0001
	 eval recall@10: 0.0002
	 eval map@10: 0.0002
	 eval ndcg@10: 0.0003


In [None]:
print("prediction: ", deepfm.predict(user=1, item=2333))
print("recommendation: ", deepfm.recommend_user(user=1, n_rec=7))