In [10]:
import logging
from logging import getLogger

import pandas as pd
import pyarrow.dataset as ds
import torch
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
from recbole.data.interaction import Interaction
from recbole.model.sequential_recommender import GRU4Rec
from recbole.trainer import Trainer
from recbole.utils import init_seed, init_logger


In [11]:
# Restore a trained GRU4Rec model from a saved checkpoint and switch it to
# evaluation mode.
saved_model_path = "saved/GRU4Rec-Oct-29-2025_19-40-34.pth"

# SECURITY NOTE: weights_only=False makes torch.load unpickle arbitrary Python
# objects (the checkpoint carries a config object next to the tensors), so only
# load checkpoint files that come from a trusted source.
# map_location="cpu" deserializes every tensor onto the CPU first, so loading
# does not fail on a host that lacks the device the checkpoint was saved from;
# load_state_dict below copies the values onto whatever device the model is on.
checkpoint = torch.load(saved_model_path, map_location="cpu", weights_only=False)

# Rebuild the run context from the configuration stored in the checkpoint.
config = checkpoint["config"]
init_seed(config['seed'], config['reproducibility'])
init_logger(config)
dataset = create_dataset(config)

# Instantiate the architecture on the configured device, then load the weights.
model = GRU4Rec(config, dataset).to(config['device'])

model.load_state_dict(checkpoint["state_dict"])

# Last expression of the cell: eval() returns the module, so its repr is shown.
model.eval()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=feat[field].mean(), inplace=True)


GRU4Rec(
  (item_embedding): Embedding(67542, 64, padding_idx=0)
  (emb_dropout): Dropout(p=0.3, inplace=False)
  (gru_layers): GRU(64, 128, bias=False, batch_first=True)
  (dense): Linear(in_features=128, out_features=64, bias=True)
  (loss_fct): CrossEntropyLoss()
)

In [12]:
# Read the hive-partitioned parquet event data into one pandas DataFrame.
base_path_events = "/home/hygo2025/Documents/data/processed_data/enriched_events_vix"
events_source = ds.dataset(base_path_events, format="parquet", partitioning="hive")
df_events = events_source.to_table().to_pandas()

# Sanity check: print dimensions and column names, then preview the first rows.
for summary in (df_events.shape, df_events.columns.tolist()):
    print(summary)
df_events.head()

(14519740, 22)
['listing_id', 'user_id', 'session_id', 'event_type', 'price', 'event_ts', 'event_id', 'platform', 'business_type', 'neighborhood', 'zip_code', 'usable_areas', 'total_areas', 'bathrooms', 'bedrooms', 'suites', 'parking_spaces', 'amenities', 'unit_type', 'usage_type', 'geopoint', 'dt']


Unnamed: 0,listing_id,user_id,session_id,event_type,price,event_ts,event_id,platform,business_type,neighborhood,...,total_areas,bathrooms,bedrooms,suites,parking_spaces,amenities,unit_type,usage_type,geopoint,dt
0,395548,21322,1697,RankingRendered,300000.0,2024-04-17 12:25:08.740,ranking-rendered-9999e6,DESKTOP,SALE,balneário ponta da fruta,...,70.0,2.0,2.0,0.0,1.0,"['KITCHEN', 'KITCHEN_CABINETS', 'SERVICE_AREA'...",HOME,RESIDENTIAL,"-40.3747719,-20.5101708",2024-04-18
1,358437,21322,1697,RankingRendered,540750.0,2024-04-17 12:25:08.740,ranking-rendered-9999e6,DESKTOP,SALE,praia de itaparica,...,60.0,2.0,2.0,1.0,2.0,"['KITCHEN', 'BALCONY', 'KITCHEN_CABINETS', 'BA...",APARTMENT,RESIDENTIAL,"-40.3022484,-20.3709523",2024-04-18
2,268831,21322,1697,RankingRendered,1250000.0,2024-04-17 12:25:08.740,ranking-rendered-9999e6,DESKTOP,SALE,barra do jucu,...,300.0,4.0,3.0,3.0,4.0,[],HOME,RESIDENTIAL,"-40.3251852,-20.4285117",2024-04-18
3,444009,21322,1697,RankingRendered,670000.0,2024-04-17 12:25:08.740,ranking-rendered-9999e6,DESKTOP,SALE,itapuã,...,80.0,2.0,2.0,1.0,0.0,[],APARTMENT,RESIDENTIAL,"-40.2896807,-20.3522924",2024-04-18
4,485278,21322,1697,RankingRendered,3500.0,2024-04-17 12:23:41.797,ranking-rendered-7qkl5o,DESKTOP,RENTAL,itapuã,...,64.0,1.0,2.0,0.0,1.0,"['POOL', 'SAUNA']",APARTMENT,RESIDENTIAL,"-40.2896807,-20.3522924",2024-04-18


In [13]:
# Distinct listing ids that appear in the events of session 999.
df_events.loc[df_events["session_id"] == 999, "listing_id"].unique()

array([16055], dtype=int32)

In [14]:
# Load the hive-partitioned parquet listings data into a pandas DataFrame.
base_path = "/home/hygo2025/Documents/data/processed_data/listings"

# Use a dedicated name for the pyarrow dataset handle: the global `dataset`
# already holds the RecBole dataset created in the model-loading cell, and
# rebinding it here would silently hand a pyarrow object to any code that
# uses `dataset` afterwards.
listings_dataset = ds.dataset(base_path, format="parquet", partitioning="hive")
table = listings_dataset.to_table()

df = table.to_pandas()

# Sanity check: dimensions and column names, followed by a preview of the rows.
print(df.shape)
print(df.columns.tolist())
df.head(100)




(494083, 20)
['state', 'city', 'neighborhood', 'anonymized_listing_id', 'dt', 'created_at', 'updated_at', 'price', 'zip_code', 'usable_areas', 'total_areas', 'bathrooms', 'bedrooms', 'suites', 'parking_spaces', 'amenities', 'unit_type', 'usage_type', 'listing_id_numeric', 'geopoint']


Unnamed: 0,state,city,neighborhood,anonymized_listing_id,dt,created_at,updated_at,price,zip_code,usable_areas,total_areas,bathrooms,bedrooms,suites,parking_spaces,amenities,unit_type,usage_type,listing_id_numeric,geopoint
0,espírito santo,vila velha,interlagos,00008113B5C71B3B284E139564237E6FCBF586BE438A11...,2024-07-01,2023-11-16 22:52:58,2023-11-16 22:54:47,1500.0,29129667,60.0,60.0,1.0,2.0,2.0,1.0,[],APARTMENT,RESIDENTIAL,1,"-40.3507383,-20.4804082"
1,espírito santo,vila velha,itapuã,002E26CA767AB3D375CFFC3D66F6ADCE6911CA15EB7C13...,2024-07-01,2019-09-06 14:06:52,2021-01-12 15:11:45,300000.0,29101692,90.0,,2.0,2.0,1.0,9.0,"['PLAYGROUND', 'PARTY_HALL', 'ELEVATOR']",APARTMENT,RESIDENTIAL,2,"-40.2896807,-20.3522924"
2,espírito santo,vila velha,centro de vila velha,002E41AB22F07BA51ADD50E37C63DCABEB0652778CA233...,2024-07-01,2023-03-20 17:05:14,2023-03-20 17:05:16,750000.0,29100440,360.0,360.0,1.0,3.0,0.0,2.0,"['BALCONY', 'SERVICE_AREA']",HOME,RESIDENTIAL,3,"-40.2935892,-20.3363548"
3,espírito santo,vila velha,itapuã,002FE0C3AB7F578A4E321AB0B39E5FA037D880303D5C6D...,2024-07-01,2022-12-02 12:40:17,2022-12-02 12:40:18,601900.0,29101670,85.0,85.0,1.0,3.0,0.0,2.0,"['GYM', 'BALCONY', 'SERVICE_AREA', 'BARBECUE_G...",APARTMENT,RESIDENTIAL,4,"-40.2896807,-20.3522924"
4,espírito santo,vila velha,praia de itaparica,0030DAB363A22B6893144330FDA68A3078B3A5DF092098...,2024-07-01,2020-12-06 08:35:23,2021-01-11 14:32:00,1699000.0,29102050,214.0,214.0,3.0,4.0,2.0,3.0,"['AIR_CONDITIONING', 'KITCHEN', 'DEPOSIT', 'GO...",PENTHOUSE,RESIDENTIAL,5,"-40.3022484,-20.3709523"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,espírito santo,vila velha,itapuã,0AC7A2027AEB9671C1326B185019D76ABC5CEAF4A3F0EA...,2024-07-01,2019-10-29 01:48:26,2021-01-12 14:26:43,460000.0,,75.0,75.0,2.0,2.0,1.0,2.0,"['BARBECUE_GRILL', 'POOL', 'PLAYGROUND', 'INTE...",APARTMENT,RESIDENTIAL,96,"-40.2896807,-20.3522924"
96,espírito santo,vila velha,praia da costa,0ACC3BE55BC64C97DBB873125478E47DE82B51DB2E7CCF...,2024-07-01,2021-02-21 05:55:12,2021-03-15 03:15:10,922608.0,29101315,120.0,120.0,2.0,3.0,1.0,2.0,"['ELEVATOR', 'SERVICE_AREA']",APARTMENT,RESIDENTIAL,97,"-40.2824025,-20.3350315"
97,espírito santo,vila velha,praia de itaparica,0AE298E1087EBCD2728BA061D429AF29A5FF7DB44696D1...,2024-07-01,2021-11-04 01:40:53,2021-11-04 01:40:55,380000.0,29102290,65.0,65.0,2.0,2.0,1.0,1.0,[],APARTMENT,RESIDENTIAL,98,"-40.3022484,-20.3709523"
98,espírito santo,vila velha,balneário ponta da fruta,0B24BF4CB215237989D8EFC32C707D302B6A8AA0155434...,2024-07-01,2023-03-28 14:34:57,2023-06-05 16:00:18,295000.0,29128410,35.0,70.0,1.0,2.0,1.0,3.0,[],HOME,RESIDENTIAL,99,"-40.3747719,-20.5101708"


In [15]:
from src.utils.similarity_calc import PropertySimilarityCalculator


In [16]:

# Feature groups for the similarity calculator. `categorical_cols` is declared
# here but not passed to the calculator in this cell (see categorical_features).
numerical_cols = [
    'price', 'usable_areas', 'total_areas',
    'bathrooms', 'bedrooms', 'suites', 'parking_spaces',
]
categorical_cols = ['state', 'city', 'neighborhood']

# Per-feature weights passed to analyze_recommendations.
# Location weights kept for reference but not applied:
#   neighborhood 5.0, city 4.0, state 2.0
weights = dict(
    price=3.0,
    usable_areas=2.0,
    total_areas=2.0,
    bathrooms=1.0,
    bedrooms=2.0,
    suites=1.0,
    amenities=1.5,
)

calculator = PropertySimilarityCalculator(
    dataframe=df,
    id_column='listing_id_numeric',
    numerical_features=numerical_cols,
    categorical_features=[],  # alternative kept for reference: categorical_cols
    amenities_column='amenities',
)

# Listings that occur in session 999, each compared against a fixed list of
# recommended listing ids.
session_mask = df_events["session_id"] == 999
viewed = df_events.loc[session_mask, "listing_id"].unique()
recs = [54358, 48208, 6318, 27709, 54058, 22540, 10039, 26566, 29556, 34513]

for listing_viewed in viewed:
    df_analise = calculator.analyze_recommendations(listing_viewed, recs, weights)
    print(f"--- Análise de Recomendações para o Imóvel Base ID {listing_viewed} ---")
    print(df_analise)
    print("--------------------------------------------------")

# Single-listing variant kept for reference (base listing id 51056, same recs):
#   calculator.analyze_recommendations(51056, recs, weights)

Calculador de Similaridade pronto!
--- Análise de Recomendações para o Imóvel Base ID 16055 ---
   recommended_id  similarity_score
0           54358          0.998518
1           48208          0.995065
2           27709          0.990064
3           26566          0.989098
4           29556          0.879046
5            6318          0.877565
6           22540          0.876546
7           54058          0.876016
8           34513          0.872564
9           10039          0.870064
--------------------------------------------------


In [17]:
# Redundant with the import in the first cell; harmless when re-run.
import pandas as pd

# Feature groups used both to build the calculator and to pick the columns
# shown in the comparison table.
numerical_cols = ['price', 'usable_areas', 'total_areas', 'bathrooms', 'bedrooms', 'suites', 'parking_spaces']
categorical_cols = ['state', 'city', 'neighborhood']

# Per-feature weights passed to analyze_recommendations.
weights = {
    'price': 3.0,
    'usable_areas': 2.0,
    'total_areas': 2.0,
    'bathrooms': 1.0,
    'bedrooms': 2.0,
    'suites': 1.0,
    'amenities': 1.5
}

# Unlike the previous cell, this calculator is built with the categorical
# (location) features enabled.
calculator = PropertySimilarityCalculator(
    dataframe=df,
    id_column='listing_id_numeric',
    numerical_features=numerical_cols,
    categorical_features=categorical_cols,
    amenities_column='amenities'
)

# Listings that occur in session 999, compared against a fixed list of
# recommended listing ids.
viewed = df_events.loc[df_events["session_id"] == 999, "listing_id"].unique()
recs = [54358, 48208, 6318, 27709, 54058, 22540, 10039, 26566, 29556, 34513]

columns_to_view = numerical_cols + categorical_cols

# Number of recommendations shown next to the viewed listing. One constant
# drives both the slice and the printed header, so the label can no longer
# disagree with the number of rows actually displayed.
TOP_N = 5

for listing_viewed in viewed:
    df_analysis = calculator.analyze_recommendations(listing_viewed, recs, weights)
    if df_analysis.empty:
        print(f"No valid recommendations found to compare with {listing_viewed}.")
        continue

    print(df_analysis)

    # df_analysis is non-empty at this point, so this list has at least one id.
    # NOTE(review): assumes analyze_recommendations returns rows ordered by
    # descending similarity_score, as the recorded output suggests - confirm.
    top_similar_ids = df_analysis.head(TOP_N)['recommended_id'].tolist()

    ids_to_compare = [listing_viewed] + top_similar_ids

    try:
        print(f"\n--- Feature Comparison: Base ({listing_viewed}) vs. Top {TOP_N} Similar ---")

        # Only request columns that exist in the calculator's frame.
        existing_columns = [col for col in columns_to_view if col in calculator.df.columns]

        # Label-based lookup of the viewed listing plus the selected recommendations.
        df_comparison = calculator.df.loc[ids_to_compare, existing_columns].copy()

        # Tag every row as a recommendation, then overwrite the viewed listing's tag.
        df_comparison['Type'] = 'Recommended (Similar)'
        df_comparison.loc[listing_viewed, 'Type'] = '>>> VIEWED (BASE) <<<'

        final_columns = ['Type'] + existing_columns

        print(df_comparison[final_columns].to_markdown(numalign="left", stralign="left"))

    except KeyError as e:
        print(f"Error fetching data: ID {e} was not found in the DataFrame.")
    except Exception as e:
        # Best-effort reporting: show the problem and move on to the next listing.
        print(f"An error occurred during feature comparison: {e}")

print("\n--- End of Analysis ---")

# Notes:
# - comparative study
# - improvement of an algorithm
# - look for a journal for this (A3 / A4)

Calculador de Similaridade pronto!
   recommended_id  similarity_score
0           54358          0.998518
1           48208          0.995065
2           27709          0.990064
3           26566          0.989098
4           29556          0.879046
5            6318          0.877565
6           22540          0.876546
7           54058          0.876016
8           34513          0.872564
9           10039          0.870064

--- Feature Comparison: Base (16055) vs. Top 3 Similar ---
| listing_id_numeric   | Type                  | price   | usable_areas   | total_areas   | bathrooms   | bedrooms   | suites   | parking_spaces   | state          | city       | neighborhood        |
|:---------------------|:----------------------|:--------|:---------------|:--------------|:------------|:-----------|:---------|:-----------------|:---------------|:-----------|:--------------------|
| 16055                | >>> VIEWED (BASE) <<< | 2800    | 110            | 110           | 2           | 3