### Recency Weights (Interaction-level)
Goal: attach a per-review `recency_weight = exp(-Δt / λ)` so newer reviews count more in downstream features.
- `time` parsed to datetime (ms epoch assumed).
- Reference time `t_ref = max(time)`, i.e. the latest review in the data, so weights are reproducible regardless of when the notebook is run.
- Δt in days, λ ∈ {90, 180, 365} (default 365).
- Output: `data/processed/features/recency/reviews_with_recency.parquet` with columns `[user_id, gmap_id, time, time_dt, recency_weight]`.


In [1]:
from pathlib import Path
import json, re
import pandas as pd
import numpy as np
from typing import Dict, List

# Filesystem layout, resolved relative to the notebook's working directory
# (change the "../.." hop if the notebook runs from somewhere else).
ROOT = Path("../..").resolve()
DATA_PROCESSED = ROOT.joinpath("data", "processed")
REVIEWS_PATH = DATA_PROCESSED.joinpath("sf-reviews.json")                  # full data
REVIEWS_SAMPLED_PATH = DATA_PROCESSED.joinpath("sf-sampled-reviews.json")  # optional subset

# Destination directory for the recency features; created up front if absent.
OUT_DIR = DATA_PROCESSED.joinpath("features", "recency")
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
def load_reviews_json(path: Path, max_rows: int | None = None) -> pd.DataFrame:
    """
    Expects objects with {'user_id','gmap_id','text','time',...}.
    Handles JSONL (one object per line) or a single large JSON array.
    """
    if not path.exists():
        raise FileNotFoundError(path)
    rows = []
    with path.open("r", encoding="utf-8") as f:
        first = f.read(1)
        f.seek(0)
        if first == "[":
            data = json.load(f)
            rows = data if max_rows is None else data[:max_rows]
        else:
            for i, line in enumerate(f):
                if max_rows is not None and i >= max_rows:
                    break
                if line.strip():
                    rows.append(json.loads(line))
    df = pd.DataFrame(rows)
    need = ["user_id", "gmap_id", "time"]
    miss = [c for c in need if c not in df.columns]
    if miss:
        raise ValueError(f"Missing columns {miss} in {path}")
    df = df[df["user_id"].notna() & df["gmap_id"].notna() & df["time"].notna()].copy()
    df["user_id"] = df["user_id"].astype(str)
    df["gmap_id"] = df["gmap_id"].astype(str)
    return df

# Prefer the optional sampled subset; fall back to the full dataset when the
# sample is missing or unreadable, and report the fallback instead of
# switching datasets silently.
try:
    df = load_reviews_json(REVIEWS_SAMPLED_PATH)
    loaded_from = REVIEWS_SAMPLED_PATH
except Exception as exc:
    print(
        f"Could not load {REVIEWS_SAMPLED_PATH.name} "
        f"({type(exc).__name__}: {exc}); using {REVIEWS_PATH.name} instead"
    )
    df = load_reviews_json(REVIEWS_PATH)
    loaded_from = REVIEWS_PATH

# A bare `len(df)` in the middle of a cell is never displayed, so print it.
print(f"Loaded {len(df):,} reviews from {loaded_from}")
df.head(3)

Unnamed: 0,user_id,name,time,rating,text,gmap_id
0,1.111581421e+20,Tiffany Duong,1412890409813,5,Lazing your day away in cafes is one of the be...,0x1532f63595a49fdd:0xac59d1857e129a53
1,1.030322479e+20,JaeKwon Son,1552143019447,4,,0x1532f63595a49fdd:0xac59d1857e129a53
2,1.039197287e+20,Samantha Andonian,1575754575457,4,,0x1532f63595a49fdd:0xac59d1857e129a53


In [5]:
## recency weight computation
# params
LAMBDA_DAYS = 365  # decay constant λ in days; can also try 90/180
if LAMBDA_DAYS <= 0:
    # A zero or negative λ would divide by zero or make older reviews weigh more.
    raise ValueError(f"LAMBDA_DAYS must be positive, got {LAMBDA_DAYS}")

# Parse timestamps (unix time in ms). Coerce to numeric first so string-typed
# epochs are handled explicitly instead of relying on to_datetime's implicit
# casting of strings when `unit` is given; unparseable values become NaT.
time_ms = pd.to_numeric(df["time"], errors="coerce")
t = pd.to_datetime(time_ms, unit="ms", utc=True, errors="coerce")
valid_time = t.notna()
df = df.loc[valid_time].copy()
df["time_dt"] = t.loc[valid_time]  # already tz-aware UTC because utc=True

# reference time: most recent review time in data
t_ref = df["time_dt"].max()

# Δt in days and exponential decay weight
delta_days = (t_ref - df["time_dt"]).dt.total_seconds() / 86400.0
w = np.exp(-delta_days / float(LAMBDA_DAYS))
w = np.clip(w, 1e-8, 1.0)  # 1e-8 instead of 0 to prevent divide by 0 error when aggregating

df["recency_weight"] = w
df[["time_dt", "recency_weight"]].head(5)

Unnamed: 0,time_dt,recency_weight
0,2014-10-09 21:33:29.813000+00:00,0.001012
1,2019-03-09 14:50:19.447000+00:00,0.083759
2,2019-12-07 21:36:15.457000+00:00,0.177091
3,2016-12-10 04:01:13.506000+00:00,0.008872
4,2017-08-19 14:23:23.542000+00:00,0.017716


In [None]:
import pyarrow as pa, pyarrow.parquet as pq

# Columns persisted for downstream feature builders, in output order.
cols_out = ['user_id', 'gmap_id', 'time', 'time_dt', 'recency_weight']
out_df = df.loc[:, cols_out].copy()

# Store the raw epoch as a nullable integer; non-numeric values become <NA>.
time_numeric = pd.to_numeric(out_df["time"], errors="coerce")
out_df["time"] = time_numeric.astype("Int64")

# Explicit Arrow schema so the parquet column types do not depend on inference.
schema = pa.schema([
    ("user_id", pa.string()),
    ("gmap_id", pa.string()),
    ("time", pa.int64()),
    ("time_dt", pa.timestamp("us", tz="UTC")),
    ("recency_weight", pa.float32()),
])

# Downcast the weight to float32 to match the schema before conversion.
out_float32 = out_df.astype({"recency_weight": "float32"})
table = pa.Table.from_pandas(out_float32, schema=schema, preserve_index=False)

OUT_FILE = OUT_DIR / "reviews_with_recency.parquet"
pq.write_table(table, OUT_FILE)
print("Saved:", OUT_FILE)