### Recency Weights (Interaction-level)
Goal: attach a per-review `recency_weight = exp(-Δt / λ)` so newer reviews count more in downstream features.
- `time` parsed to datetime (ms epoch assumed).
- Reference time `t_ref = max(time)`, i.e. the latest review in the dataset (keeps the weights reproducible across runs, unlike "now").
- Δt in days, λ ∈ {90, 180, 365} (default 365).
- Output: `data/processed/features/recency/ny_reviews_with_recency.parquet` with columns `[user_id, gmap_id, time, time_dt, recency_weight]`.


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import json, re
from typing import Dict, List

# Project layout: this notebook is expected to run two levels below the repo root.
ROOT = Path("../..").resolve()
DATA_PROCESSED = ROOT / "data" / "processed"
OUT_DIR = DATA_PROCESSED / "features" / "recency"
OUT_DIR.mkdir(parents=True, exist_ok=True)  # safe to re-run

# Stack the train/val/test review splits into a single frame.
splits = ["train", "val", "test"]
split_paths = [DATA_PROCESSED / f"{name}.parquet" for name in splits]
reviews_df = pd.concat(
    [pd.read_parquet(path) for path in split_paths],
    ignore_index=True,
)

# Columns the recency computation depends on; fail fast if any is absent.
need = ["user_id", "gmap_id", "time"]
missing = [col for col in need if col not in reviews_df.columns]
if missing:
    raise ValueError(f"Missing columns in reviews_df: {missing}")

# Drop rows lacking any required key, then store both ids as strings.
# NOTE(review): the sample output shows user_id rendered in scientific notation,
# which suggests it arrives as a float upstream — confirm ids are not losing digits.
reviews_df = reviews_df.dropna(subset=need).copy()
for id_col in ("user_id", "gmap_id"):
    reviews_df[id_col] = reviews_df[id_col].astype(str)

print(reviews_df.shape)
print(reviews_df[need].head(3))

(9959341, 12)
                 user_id                                gmap_id  \
0  1.000000127957533e+20  0x89c25cff28f955e3:0x2a9e7e560d0c3dbd   
1  1.000000127957533e+20  0x89c258e28c304997:0xfcafe4e7ce35ee8c   
2  1.000000127957533e+20  0x89c25902934779a5:0x9a7927b826097a9a   

                     time  
0 2015-06-29 23:39:04.836  
1 2018-02-16 08:15:33.898  
2 2019-09-28 20:11:18.064  


In [3]:
## recency weight computation
# Decay constant λ (in days) for exp(-Δt / λ); 90 and 180 are the other candidates.
LAMBDA_DAYS = 365.0

# Turn `time` into tz-aware UTC timestamps. `unit="ms"` applies when the column
# holds epoch milliseconds; values that cannot be parsed become NaT (errors="coerce").
t = pd.to_datetime(reviews_df["time"], unit="ms", utc=True, errors="coerce")

# Keep only rows with a usable timestamp and attach the parsed column.
is_valid = t.notna()
reviews_df = reviews_df.loc[is_valid].copy()
reviews_df["time_dt"] = t.loc[is_valid]

# Anchor ages to the most recent review in the data.
t_ref = reviews_df["time_dt"].max()

# Age of every review relative to t_ref, expressed in (fractional) days.
age = t_ref - reviews_df["time_dt"]
delta_days = age.dt.total_seconds() / 86400.0

# Exponential decay, bounded to [1e-8, 1.0] so no weight is ever exactly zero.
w = np.exp(-delta_days / LAMBDA_DAYS).clip(lower=1e-8, upper=1.0)
reviews_df["recency_weight"] = w

reviews_df[["user_id","gmap_id","time_dt","recency_weight"]].head()


Unnamed: 0,user_id,gmap_id,time_dt,recency_weight
0,1.000000127957533e+20,0x89c25cff28f955e3:0x2a9e7e560d0c3dbd,2015-06-29 23:39:04.836000+00:00,0.002038
1,1.000000127957533e+20,0x89c258e28c304997:0xfcafe4e7ce35ee8c,2018-02-16 08:15:33.898000+00:00,0.028461
2,1.000000127957533e+20,0x89c25902934779a5:0x9a7927b826097a9a,2019-09-28 20:11:18.064000+00:00,0.143107
3,1.000000127957533e+20,0x89c25999b9c6eff9:0x4b98becb736928d8,2019-09-28 20:12:21.136000+00:00,0.143108
4,1.000000127957533e+20,0x89c25999b9c6eff9:0x4b98becb736928d8,2019-09-28 20:12:21.136000+00:00,0.143108


In [4]:
import pyarrow as pa, pyarrow.parquet as pq

# Columns persisted for downstream feature builders.
cols_out = ["user_id","gmap_id","time","time_dt","recency_weight"]
out_reviews = reviews_df[cols_out].copy()

# enforce types
# `time` is written as epoch milliseconds, derived from the tz-aware `time_dt`.
# pd.to_numeric(time) is avoided on purpose: when `time` is already a datetime
# column it returns the raw integer representation, whose unit (e.g. nanoseconds
# for datetime64[ns]) depends on the column's resolution rather than being ms.
epoch_utc = pd.Timestamp("1970-01-01", tz="UTC")
out_reviews["time"] = (
    (out_reviews["time_dt"] - epoch_utc) // pd.Timedelta(milliseconds=1)
).astype("int64")
out_reviews["recency_weight"] = out_reviews["recency_weight"].astype("float32")

# Explicit Arrow schema so the parquet column types do not depend on inference.
schema = pa.schema([
    pa.field("user_id", pa.string()),
    pa.field("gmap_id", pa.string()),
    pa.field("time", pa.int64()),
    pa.field("time_dt", pa.timestamp("ms", tz="UTC")),
    pa.field("recency_weight", pa.float32()),
])

table = pa.Table.from_pandas(out_reviews, schema=schema, preserve_index=False)
REVIEWS_OUT = OUT_DIR / "ny_reviews_with_recency.parquet"
pq.write_table(table, REVIEWS_OUT)
print("Saved review-level recency ->", REVIEWS_OUT)

Saved review-level recency -> /Users/kienanana/Documents/SCHOOL/Y3S1/BT4222/PROJECT/data/processed/features/recency/ny_reviews_with_recency.parquet
