In [6]:
import pyarrow.dataset as ds
import pyarrow.parquet as pq
import pandas as pd

import os

# Root directory of the hive-partitioned parquet dataset with enriched events.
# The location can be overridden through the ENRICHED_EVENTS_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original local path is used, so existing runs behave as before.
base_path = os.environ.get(
    "ENRICHED_EVENTS_PATH",
    "/home/hygo2025/Documents/data/processed_data/enriched_events",
)



In [7]:
# Open the hive-partitioned parquet dataset rooted at base_path.
dataset = ds.dataset(base_path, format="parquet", partitioning="hive")

# Raw identifier columns are excluded from the read; their numeric
# counterparts are renamed below to take over the plain names.
cols_to_drop = ["unified_user_id", "listing_id", "user_id", "anonymous_id"]
all_cols = dataset.schema.names
cols_to_keep = [name for name in all_cols if name not in cols_to_drop]

table = dataset.to_table(columns=cols_to_keep)

# Rename in a single pass: columns missing from the map keep their name.
rename_map = {
    "listing_id_numeric": "listing_id",
    "user_numeric_id": "user_id"
}
table = table.rename_columns(
    [rename_map.get(name, name) for name in table.column_names]
)

# Put the key columns first, followed by everything else in its current order.
first_cols = ["user_id", "listing_id", "name_raw", "event_type"]
rest_cols = list(filter(lambda name: name not in first_cols, table.column_names))
new_order = [*first_cols, *rest_cols]
table = table.select(new_order)

df = table.to_pandas()

print(df.shape)
print(df.columns.tolist())
df.head()


(19126387, 22)
['user_id', 'listing_id', 'name_raw', 'event_type', 'browser_family', 'os_family', 'collector_timestamp', 'business_type', 'event_ts', 'state', 'city', 'neighborhood', 'price', 'usable_areas', 'total_areas', 'bathrooms', 'bedrooms', 'suites', 'parking_spaces', 'amenities', 'geopoint', 'dt']


Unnamed: 0,user_id,listing_id,name_raw,event_type,browser_family,os_family,collector_timestamp,business_type,event_ts,state,...,price,usable_areas,total_areas,bathrooms,bedrooms,suites,parking_spaces,amenities,geopoint,dt
0,103,1450584,ListingRendered,VISIT,Chrome Mobile,Android,1706800846835,SALE,2024-02-01 15:20:46.835,bahia,...,387000.0,57.0,57.0,2.0,2.0,0.0,1.0,"['KITCHEN', 'ELEVATOR', 'GATED_COMMUNITY']","-38.4896543,-12.9881887",2024-02-01
1,103,1450584,ListingRendered,VISIT,Chrome Mobile,Android,1706797670251,SALE,2024-02-01 14:27:50.251,bahia,...,387000.0,57.0,57.0,2.0,2.0,0.0,1.0,"['KITCHEN', 'ELEVATOR', 'GATED_COMMUNITY']","-38.4896543,-12.9881887",2024-02-01
2,304,1987731,RankingClicked,LEAD_INTENTION,Chrome Mobile,Android,1706794623317,SALE,2024-02-01 13:37:03.317,bahia,...,290000.0,2000.0,6200.0,,,,,[],"-38.0069504,-12.4844471",2024-02-01
3,304,1529573,RankingClicked,LEAD_INTENTION,Chrome Mobile,Android,1706794579394,SALE,2024-02-01 13:36:19.394,bahia,...,98000.0,868.0,868.0,,0.0,,,['SEA_VIEW'],"-38.0069504,-12.4844471",2024-02-01
4,304,1529573,ListingRendered,VISIT,Chrome Mobile,Android,1706794583642,SALE,2024-02-01 13:36:23.642,bahia,...,98000.0,868.0,868.0,,0.0,,,['SEA_VIEW'],"-38.0069504,-12.4844471",2024-02-01


In [8]:
# Print the frame summary (index range, column names and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19126387 entries, 0 to 19126386
Data columns (total 22 columns):
 #   Column               Dtype         
---  ------               -----         
 0   user_id              int32         
 1   listing_id           int32         
 2   name_raw             object        
 3   event_type           object        
 4   browser_family       object        
 5   os_family            object        
 6   collector_timestamp  object        
 7   business_type        object        
 8   event_ts             datetime64[ns]
 9   state                object        
 10  city                 object        
 11  neighborhood         object        
 12  price                float64       
 13  usable_areas         float64       
 14  total_areas          float64       
 15  bathrooms            float64       
 16  bedrooms             float64       
 17  suites               float64       
 18  parking_spaces       float64       
 19  amenities          

In [9]:
# Keep only events whose listing is in the city of Vitória, Espírito Santo.
# Both conditions are combined into one mask and applied in a single step.
is_target_state = df["state"].eq("espírito santo")
is_target_city = df["city"].eq("vitória")
df = df.loc[is_target_state & is_target_city]
df.head()

Unnamed: 0,user_id,listing_id,name_raw,event_type,browser_family,os_family,collector_timestamp,business_type,event_ts,state,...,price,usable_areas,total_areas,bathrooms,bedrooms,suites,parking_spaces,amenities,geopoint,dt
143,1955,10211,RankingClicked,LEAD_INTENTION,,,1706745772248,SALE,2024-02-01 00:02:52.248,espírito santo,...,1450000.0,200.0,225.0,5.0,4.0,2.0,4.0,"['BALCONY', 'PAVED_STREET', 'BARBECUE_GRILL', ...","-40.267894,-20.2599804",2024-02-01
144,1955,10211,ListingRendered,VISIT,Chrome Mobile,Android,1706745775165,SALE,2024-02-01 00:02:55.165,espírito santo,...,1450000.0,200.0,225.0,5.0,4.0,2.0,4.0,"['BALCONY', 'PAVED_STREET', 'BARBECUE_GRILL', ...","-40.267894,-20.2599804",2024-02-01
145,1955,1292801,RankingClicked,LEAD_INTENTION,Chrome Mobile,Android,1706746103186,SALE,2024-02-01 00:08:23.186,espírito santo,...,550000.0,650000.0,,1.0,2.0,0.0,4.0,"['BARBECUE_GRILL', 'GARDEN', 'ELECTRONIC_GATE'...","-40.3339751,-20.2833869",2024-02-01
146,1955,1292801,ListingRendered,VISIT,,,1706746105978,SALE,2024-02-01 00:08:25.978,espírito santo,...,550000.0,650000.0,,1.0,2.0,0.0,4.0,"['BARBECUE_GRILL', 'GARDEN', 'ELECTRONIC_GATE'...","-40.3339751,-20.2833869",2024-02-01
147,1955,500391,ListingRendered,VISIT,,,1706746009609,SALE,2024-02-01 00:06:49.609,espírito santo,...,2700000.0,850.0,1000.0,6.0,5.0,2.0,8.0,"['DINNER_ROOM', 'PAVED_STREET', 'COPA', 'KITCH...","-40.3269799,-20.3071309",2024-02-01


In [10]:
# Order events chronologically within each user and renumber rows from 0.
df = df.sort_values(['user_id', 'event_ts'], ignore_index=True)

In [11]:
# Spot-check one user's event history (first 50 rows at most).
df.loc[df["user_id"].eq(851)].head(50)

Unnamed: 0,user_id,listing_id,name_raw,event_type,browser_family,os_family,collector_timestamp,business_type,event_ts,state,...,price,usable_areas,total_areas,bathrooms,bedrooms,suites,parking_spaces,amenities,geopoint,dt
3075,851,578671,RankingClicked,LEAD_INTENTION,,,1708264050586,SALE,2024-02-18 13:47:30.586,espírito santo,...,275000.0,49.0,49.0,1.0,1.0,0.0,0.0,[],"-40.3006106,-20.3028247",2024-02-18
3076,851,578671,RankingClicked,LEAD_INTENTION,,,1708264050586,RENTAL,2024-02-18 13:47:30.586,espírito santo,...,275000.0,49.0,49.0,1.0,1.0,0.0,0.0,[],"-40.3006106,-20.3028247",2024-02-18
3077,851,578671,ListingRendered,VISIT,Mobile Safari UI/WKWebView,iOS,1708264050588,SALE,2024-02-18 13:47:30.588,espírito santo,...,275000.0,49.0,49.0,1.0,1.0,0.0,0.0,[],"-40.3006106,-20.3028247",2024-02-18
3078,851,578671,ListingRendered,VISIT,Mobile Safari UI/WKWebView,iOS,1708264050588,RENTAL,2024-02-18 13:47:30.588,espírito santo,...,275000.0,49.0,49.0,1.0,1.0,0.0,0.0,[],"-40.3006106,-20.3028247",2024-02-18
3079,851,1367510,RankingClicked,LEAD_INTENTION,,,1709235188207,SALE,2024-02-29 19:33:08.207,espírito santo,...,480000.0,46.0,46.0,1.0,1.0,0.0,1.0,"['ELEVATOR', 'FURNISHED', 'GARAGE']","-40.267894,-20.2599804",2024-02-29
3080,851,1367510,ListingRendered,VISIT,,,1709235188207,SALE,2024-02-29 19:33:08.207,espírito santo,...,480000.0,46.0,46.0,1.0,1.0,0.0,1.0,"['ELEVATOR', 'FURNISHED', 'GARAGE']","-40.267894,-20.2599804",2024-02-29
3081,851,1367510,LeadClicked,LEAD,,,1709235337956,SALE,2024-02-29 19:35:37.956,espírito santo,...,480000.0,46.0,46.0,1.0,1.0,0.0,1.0,"['ELEVATOR', 'FURNISHED', 'GARAGE']","-40.267894,-20.2599804",2024-02-29
3082,851,776478,RankingClicked,LEAD_INTENTION,Mobile Safari UI/WKWebView,iOS,1720395951815,SALE,2024-07-07 23:45:51.815,espírito santo,...,170000.0,17.0,17.0,1.0,1.0,1.0,1.0,"['AIR_CONDITIONING', 'KITCHEN_CABINETS', 'GARA...","-40.3027037,-20.2913934",2024-07-07
3083,851,776478,ListingRendered,VISIT,Mobile Safari UI/WKWebView,iOS,1720395951817,SALE,2024-07-07 23:45:51.817,espírito santo,...,170000.0,17.0,17.0,1.0,1.0,1.0,1.0,"['AIR_CONDITIONING', 'KITCHEN_CABINETS', 'GARA...","-40.3027037,-20.2913934",2024-07-07
3084,851,2552791,RankingClicked,LEAD_INTENTION,,,1720396191820,SALE,2024-07-07 23:49:51.820,espírito santo,...,205000.0,12.0,15.0,1.0,1.0,1.0,1.0,"['SECURITY_CAMERA', 'SAFETY_CIRCUIT', 'COFFEE_...","-40.3006106,-20.3028247",2024-07-07


In [12]:
# Sessionisation: consecutive events of a user stay in the same session until
# the gap between two of them exceeds the inactivity threshold.
session_time_in_hours = 24
inactivity_threshold = pd.Timedelta(hours=session_time_in_hours)

# Gap to the previous row of the same user (NaT on each user's first row).
# This relies on df being ordered by event_ts within each user, which the
# sort cell above takes care of.
time_diff = df.groupby('user_id')['event_ts'].diff()

# A session starts on a user's first row or after a gap above the threshold.
is_new_session = time_diff.isna() | time_diff.gt(inactivity_threshold)

# The running count spans the whole frame, so the numeric suffix is a global
# session counter rather than a per-user one.
session_marker = is_new_session.cumsum()
user_part = df['user_id'].astype(str)
marker_part = session_marker.astype(str)
df['session_id'] = user_part + '_' + marker_part

session_columns = ['event_ts', 'user_id', 'listing_id', 'name_raw', 'event_type', 'session_id']
df_session = df.loc[:, session_columns]

# Spot-check the session boundaries for one user.
df_session.loc[df["user_id"] == 851].head(50)

Unnamed: 0,event_ts,user_id,listing_id,name_raw,event_type,session_id
3075,2024-02-18 13:47:30.586,851,578671,RankingClicked,LEAD_INTENTION,851_115
3076,2024-02-18 13:47:30.586,851,578671,RankingClicked,LEAD_INTENTION,851_115
3077,2024-02-18 13:47:30.588,851,578671,ListingRendered,VISIT,851_115
3078,2024-02-18 13:47:30.588,851,578671,ListingRendered,VISIT,851_115
3079,2024-02-29 19:33:08.207,851,1367510,RankingClicked,LEAD_INTENTION,851_116
3080,2024-02-29 19:33:08.207,851,1367510,ListingRendered,VISIT,851_116
3081,2024-02-29 19:35:37.956,851,1367510,LeadClicked,LEAD,851_116
3082,2024-07-07 23:45:51.815,851,776478,RankingClicked,LEAD_INTENTION,851_117
3083,2024-07-07 23:45:51.817,851,776478,ListingRendered,VISIT,851_117
3084,2024-07-07 23:49:51.820,851,2552791,RankingClicked,LEAD_INTENTION,851_117


In [13]:
# Display the first 100 rows, now including the session_id column.
df.head(100)

Unnamed: 0,user_id,listing_id,name_raw,event_type,browser_family,os_family,collector_timestamp,business_type,event_ts,state,...,usable_areas,total_areas,bathrooms,bedrooms,suites,parking_spaces,amenities,geopoint,dt,session_id
0,5,233878,RankingClicked,LEAD_INTENTION,Chrome,Windows,1713635883412,SALE,2024-04-20 17:58:03.412,espírito santo,...,78.0,78.0,2.0,2.0,0.0,1.0,"['PETS_ALLOWED', 'INTEGRATED_ENVIRONMENTS', 'A...","-40.2946647,-20.298286",2024-04-20,5_1
1,5,233878,ListingRendered,VISIT,Chrome,Windows,1713635884767,SALE,2024-04-20 17:58:04.767,espírito santo,...,78.0,78.0,2.0,2.0,0.0,1.0,"['PETS_ALLOWED', 'INTEGRATED_ENVIRONMENTS', 'A...","-40.2946647,-20.298286",2024-04-20,5_1
2,5,643235,RankingClicked,LEAD_INTENTION,Chrome,Windows,1713635923016,SALE,2024-04-20 17:58:43.016,espírito santo,...,90.0,120.0,2.0,2.0,2.0,1.0,"['KITCHEN', 'ELEVATOR', 'INTERCOM', 'LAUNDRY',...","-40.2946647,-20.298286",2024-04-20,5_1
3,5,643235,ListingRendered,VISIT,Chrome,Windows,1713635924189,SALE,2024-04-20 17:58:44.189,espírito santo,...,90.0,120.0,2.0,2.0,2.0,1.0,"['KITCHEN', 'ELEVATOR', 'INTERCOM', 'LAUNDRY',...","-40.2946647,-20.298286",2024-04-20,5_1
4,5,643235,LeadPanelClicked,LEAD_INTENTION,Chrome,Windows,1713636316478,SALE,2024-04-20 18:05:16.478,espírito santo,...,90.0,120.0,2.0,2.0,2.0,1.0,"['KITCHEN', 'ELEVATOR', 'INTERCOM', 'LAUNDRY',...","-40.2946647,-20.298286",2024-04-20,5_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,130,236039,LeadPanelClicked,LEAD_INTENTION,,,1711735778758,SALE,2024-03-29 18:09:38.758,espírito santo,...,100.0,,3.0,3.0,1.0,2.0,[],"-40.267894,-20.2599804",2024-03-29,130_11
96,130,236039,LeadPanelClicked,LEAD_INTENTION,Edge,Windows,1711735779455,SALE,2024-03-29 18:09:39.455,espírito santo,...,100.0,,3.0,3.0,1.0,2.0,[],"-40.267894,-20.2599804",2024-03-29,130_11
97,130,881611,LeadPanelClicked,LEAD_INTENTION,Edge,Windows,1711735779865,SALE,2024-03-29 18:09:39.865,espírito santo,...,264.0,360.0,5.0,4.0,2.0,2.0,"['BARBECUE_GRILL', 'GARDEN', 'ELEVATOR']","-40.2966929,-20.2712388",2024-03-29,130_11
98,130,881611,LeadPanelClicked,LEAD_INTENTION,Edge,Windows,1711735780472,SALE,2024-03-29 18:09:40.472,espírito santo,...,264.0,360.0,5.0,4.0,2.0,2.0,"['BARBECUE_GRILL', 'GARDEN', 'ELEVATOR']","-40.2966929,-20.2712388",2024-03-29,130_11


In [32]:
# Start from an empty frame; the interaction columns are assigned in the next cell.
df_recbole = pd.DataFrame()

In [33]:
# Interaction columns, named with the "<field>:<type>" header convention
# used for the exported .inter file.
df_recbole['session_id:token'] = df['session_id']
df_recbole['item_id:token'] = df['listing_id']

# Whole seconds since the Unix epoch. Subtracting the epoch and floor-dividing
# by one second gives the same integers as `astype('int64') // 10**9` for
# datetime64[ns] data, but stays correct when event_ts has another resolution
# (ms/us), where the raw integer view would no longer be nanoseconds.
# Assumes event_ts is timezone-naive, as reported by df.info() above.
unix_epoch = pd.Timestamp("1970-01-01")
df_recbole['timestamp:float'] = (df['event_ts'] - unix_epoch) // pd.Timedelta(seconds=1)


In [34]:
# Preview the first 10 interaction rows before exporting.
df_recbole.head(10)

Unnamed: 0,session_id:token,item_id:token,timestamp:float
0,5_1,233878,1713635883
1,5_1,233878,1713635884
2,5_1,643235,1713635923
3,5_1,643235,1713635924
4,5_1,643235,1713636316
5,5_1,643235,1713636319
6,6_2,65574,1709043626
7,6_2,65574,1709043635
8,13_3,1275761,1713798664
9,13_3,943929,1713798722


In [35]:
import os

# Export the interactions as a tab-separated "<dataset_name>.inter" file
# inside a directory named after the dataset.
dataset_name = 'eventos_vix'
data_path = os.path.join('../gru/dataset', dataset_name)
file_path = os.path.join(data_path, dataset_name + '.inter')

# Create the target directory if it does not exist yet, then write the file.
os.makedirs(data_path, exist_ok=True)
df_recbole.to_csv(file_path, sep='\t', index=False)