In [None]:
from google.colab import drive
import json
import pandas as pd
import duckdb
import pathlib as pl
import numpy as np

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ============================================================
# ABSA -> 8 Aspect Buckets -> User & Item Vectors (Colab)
# ============================================================

# ---- 0) Setup / Imports ----
!pip -q install pandas pyarrow fastparquet sentence-transformers

import re
import json
import math
import numpy as np
import pandas as pd
from collections import defaultdict
from pathlib import Path
from typing import Optional, Dict, List
import time
!pip -q install tqdm
from tqdm.auto import tqdm
tqdm.pandas()  # enables df.progress_* helpers

from sentence_transformers import SentenceTransformer
from numpy.linalg import norm

# ---- 1) Paths ----
BASE = Path("/content/drive/MyDrive/processed/slice_4k")
USER_ABSA_PATH  = BASE / "aspects_users_train_full.parquet"     # assumed to contain ABSA outputs (train)
ITEM_ABSA_PATH  = BASE / "aspects_items_train_full.parquet"     # assumed to contain ABSA outputs (train)

OUT_USER_VEC = BASE / "user_aspect_vectors.parquet"
OUT_ITEM_VEC = BASE / "item_aspect_vectors.parquet"
OUT_COVERAGE = BASE / "aspect_mapping_coverage.csv"

# ---- 2) Canonical 8 Aspect Buckets (editable) ----
ASPECT_BUCKETS: Dict[str, List[str]] = {
    "food":       ["food","taste","flavor","dish","noodle","sushi","pizza","burger","taco",
                   "spicy","fresh","broth","steak","dessert","menu","ramen","bbq"],
    "service":    ["service","staff","waiter","waitress","server","attentive","rude","friendly",
                   "manager","host","hosts","hostess"],
    "price":      ["price","cost","value","cheap","expensive","affordable","overpriced","worth",
                   "deal","pricing"],
    "ambience":   ["ambience","atmosphere","vibe","noise","music","decor","seating","lighting",
                   "crowded","cozy","environment"],
    "cleanliness":["clean","dirty","bathroom","restroom","sanitary","hygiene","messy","filthy"],
    "portion":    ["portion","size","serving","amount","quantity","share","big","small"],
    "wait_time":  ["wait","queue","delay","slow","fast","quick","prompt","line","service time"],
    "location":   ["location","parking","nearby","distance","convenient","access","walk","drive"]
}

# Optional regex shortcuts for speed/precision
SPECIAL_REGEX = [
    (re.compile(r"\b(wait|queue|delay|line|slow|fast|quick|prompt)\b"), "wait_time"),
    (re.compile(r"\b(clean|dirty|bathroom|restroom|hygiene|sanitary|filthy|messy)\b"), "cleanliness"),
]

# ---- 3) Lightweight text normalization (fast & dependency-free) ----
STOP = set(["the","a","an","and","or","in","at","of","to","for","on","is","are","was","were","very","really","so"])
PUNCT_RX = re.compile(r"[^\w\s\-]")

def normalize_term(term: str) -> str:
    """Return a canonical lowercase form of an aspect term.

    Steps: stringify + lowercase, replace every character matched by
    ``PUNCT_RX`` (anything that is not a word char, whitespace or hyphen)
    with a space, collapse whitespace runs, and trim the ends.

    The final trim matters: punctuation at the edges becomes a space *after*
    substitution, so stripping only up front would leave "food!" as "food "
    and create distinct cache keys / embedding inputs for the same term.
    """
    lowered = str(term).lower()
    no_punct = PUNCT_RX.sub(" ", lowered)
    return re.sub(r"\s+", " ", no_punct).strip()

def tokenize_keep_heads(s: str) -> List[str]:
    """Split a normalized term into at most two content tokens.

    Tokens are whitespace-separated, stripped of leading/trailing hyphens,
    and dropped when empty or listed in ``STOP``. Hyphens are stripped
    *before* filtering so that a bare "-" does not survive as an empty token
    (and use up one of the two slots), and "-the" is still recognised as a
    stop word.

    Example: "rude staff" -> ["rude", "staff"]; "pizza - crust" -> ["pizza", "crust"].
    """
    toks = []
    for raw in s.split():
        w = raw.strip("-")
        if w and w not in STOP:
            toks.append(w)
    # keep the first 1-2 tokens of a multiword term
    return toks[:2]

# ---- 4) Load a small embedding model for backoff mapping ----
# (MiniLM is fast and usually available in Colab)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def l2norm_vec(x: np.ndarray) -> np.ndarray:
    """Scale ``x`` to unit L2 length; a zero-norm input is returned unchanged."""
    length = norm(x)
    if length > 0:
        return x / length
    return x

# Build bucket centroids once from seeds
BUCKET_CENTROID: Dict[str, np.ndarray] = {}
for bkt, seeds in ASPECT_BUCKETS.items():
    embs = model.encode(seeds, normalize_embeddings=True)
    BUCKET_CENTROID[bkt] = l2norm_vec(np.mean(embs, axis=0))

# Fast lexicon lookup (exact token matches after normalization)
LEXICON = {w: b for b, words in ASPECT_BUCKETS.items() for w in words}

# Similarity threshold for embedding backoff
SIM_THRESH = 0.50  # tweak between 0.45 and 0.60 based on coverage report



# ---------- Speedy mapping with cache + batched embedding backoff + progress ----------

TERM_CACHE: Dict[str, Optional[str]] = {}   # normalized term -> bucket or None
CENTROIDS = np.stack([BUCKET_CENTROID[b] for b in ASPECT_BUCKETS.keys()], axis=0)  # [8, D]
BUCKET_NAMES = list(ASPECT_BUCKETS.keys())

def fast_lexicon_regex_bucket(norm_term: str) -> Optional[str]:
    """Cheap bucket lookup that never touches the embedding model.

    Order of precedence:
      1. the first pattern in ``SPECIAL_REGEX`` that matches anywhere in the term;
      2. the first head token (see ``tokenize_keep_heads``) found in ``LEXICON``.

    Returns the bucket name, or None when the term is empty or nothing matches.
    """
    if not norm_term:
        return None

    # 1) regex shortcuts take priority over the lexicon
    for pattern, bucket in SPECIAL_REGEX:
        if pattern.search(norm_term) is not None:
            return bucket

    # 2) token-wise lexicon lookup
    heads = tokenize_keep_heads(norm_term)
    return next((LEXICON[tok] for tok in heads if tok in LEXICON), None)

def map_terms_to_buckets_with_progress(df: pd.DataFrame, term_col: str = "aspect_term",
                                       batch_size: int = 4096, sim_thresh: float = SIM_THRESH) -> tuple[pd.DataFrame, float]:
    """
    Cached mapping of aspect terms to buckets, with progress bars.

    1) Normalize terms
    2) Fast path (cache, then regex/lexicon)
    3) Batch-embed the *unique* terms that are still unmatched and assign each
       to the most similar bucket centroid if the similarity >= ``sim_thresh``

    Each distinct unmatched term is embedded once per call, no matter how many
    rows contain it; the result is fanned out to every such row.

    Side effect: results (including None for "no bucket") are stored in the
    module-level ``TERM_CACHE``. Cached entries are reused on later calls even
    if a different ``sim_thresh`` is passed, so clear the cache when tuning it.

    Returns: (copy of df with a 'bucket' column, coverage as a float in [0, 1];
    coverage is 0.0 for an empty frame).
    """
    t0 = time.time()

    # 1) Normalize all terms (progress)
    terms = df[term_col].astype(str)
    norm_terms = terms.progress_apply(normalize_term)

    # 2) Try cache/fast path first
    buckets: List[Optional[str]] = []
    pending: Dict[str, List[int]] = {}   # normalized term -> row positions waiting for the embedding result
    for i, nt in enumerate(tqdm(norm_terms, desc="Fast path (regex+lexicon+cache)")):
        if nt in TERM_CACHE:
            buckets.append(TERM_CACHE[nt])
            continue
        if nt in pending:
            # already queued for embedding; just remember this row too
            buckets.append(None)
            pending[nt].append(i)
            continue
        b = fast_lexicon_regex_bucket(nt)
        if b is None:
            buckets.append(None)
            pending[nt] = [i]
        else:
            buckets.append(b)
            TERM_CACHE[nt] = b

    # Row count that needed the embedding backoff (same quantity the log line always reported)
    n_backoff_rows = sum(len(rows) for rows in pending.values())

    # 3) Batched embedding backoff for remaining unique terms
    if pending:
        uniq_terms = list(pending)
        vecs = []
        for start in tqdm(range(0, len(uniq_terms), batch_size), desc="Embedding backoff",
                          total=math.ceil(len(uniq_terms) / batch_size)):
            chunk = uniq_terms[start:start + batch_size]
            vecs.append(model.encode(chunk, normalize_embeddings=True))  # [B, D]
        V = np.vstack(vecs)                        # [N_unique, D]
        # cosine sim since both sides are L2-normalized: plain dot product
        sims = V @ CENTROIDS.T                     # [N_unique, n_buckets]
        best_j = sims.argmax(axis=1)
        best_s = sims.max(axis=1)

        # assign back to every row that carried the term
        for term, j, s in zip(uniq_terms, best_j, best_s):
            b = BUCKET_NAMES[int(j)] if float(s) >= sim_thresh else None
            TERM_CACHE[term] = b
            for idx in pending[term]:
                buckets[idx] = b

    # 4) Build output + coverage
    out = df.copy()
    out["bucket"] = buckets
    mapped = pd.Series(buckets, dtype=object).notna()
    coverage = float(mapped.mean()) if len(mapped) else 0.0
    dt = time.time() - t0
    print(f"[map] coverage={coverage:.2%} | N={len(df):,} | unmatched={n_backoff_rows:,} | time={dt:.1f}s")
    return out, coverage


# Map PyABSA sentiment strings to numeric
def to_numeric_sentiment(x):
    """Convert an ABSA sentiment label to a float in [-1, 1].

    - missing scalar (None / NaN / pd.NA) -> NaN
    - numeric input -> the value clipped to [-1, 1]
    - string labels (case/whitespace-insensitive): positive/pos/+1/1 -> 1.0,
      negative/neg/-1 -> -1.0, neutral/neu/0 -> 0.0
    - anything else -> NaN
    """
    # np.ndim guard: pd.isna() on a list/array returns an array, which cannot be used in `if`
    if np.ndim(x) == 0 and pd.isna(x):
        return np.nan
    if isinstance(x, (int, float, np.number)):
        # already numeric (rare), clip to [-1, 1]
        try:
            return max(-1.0, min(1.0, float(x)))
        except (TypeError, ValueError):
            # e.g. a numpy scalar that cannot be represented as a float
            return np.nan
    s = str(x).strip().lower()
    table = {
        "positive": 1.0, "pos": 1.0, "+1": 1.0, "1": 1.0,
        "negative": -1.0, "neg": -1.0, "-1": -1.0,
        "neutral": 0.0, "neu": 0.0, "0": 0.0,
    }
    return table.get(s, np.nan)



def load_absa_parquet(p: Path, mode: str) -> pd.DataFrame:
    """
    Load an ABSA parquet file and standardize its columns.

    mode: "user" or "item" (any value other than "user" is treated as "item")
    - user: requires user_id + gmap_id + aspect + sentiment
    - item: requires gmap_id + aspect + sentiment (user_id absent/ignored)

    Column names are matched case-insensitively against a list of aliases.
    Aliases are tried in priority order, so the canonical name (e.g.
    "sentiment") wins over a fallback (e.g. "score") regardless of the order
    of columns in the file.

    Returns a frame with: gmap_id, aspect_term, polarity (numeric), user_id
    ("" in item mode), confidence (clipped to [0, 1]; 1.0 if the file has no
    confidence column). Rows whose sentiment cannot be mapped to a number are
    dropped.

    Raises ValueError if a required column cannot be found.
    """
    df = pd.read_parquet(p)

    # lower-cased label -> original label; str() so non-string labels do not crash .lower()
    by_lower = {}
    for c in df.columns:
        by_lower.setdefault(str(c).lower(), c)

    def _pick(candidates):
        """First alias (in priority order) that exists in the file, else None."""
        return next((by_lower[name] for name in candidates if name in by_lower), None)

    uid_col  = _pick(["user_id","user","uid"])
    item_col = _pick(["gmap_id","place_id","item_id","business_id"])
    asp_col  = _pick(["aspect","aspect_term","term","category"])
    sent_col = _pick(["sentiment","polarity","label","score"])
    conf_col = _pick(["confidence","prob","probability"])

    if mode == "user":
        req_names = [("user_id", uid_col), ("gmap_id", item_col), ("aspect", asp_col), ("sentiment", sent_col)]
    else:
        req_names = [("gmap_id", item_col), ("aspect", asp_col), ("sentiment", sent_col)]

    for name, col in req_names:
        if col is None:
            raise ValueError(f"Required field '{name}' not found in {p.name}. "
                             f"Found columns: {list(df.columns)[:20]}")

    out = pd.DataFrame({
        "gmap_id":     df[item_col].astype(str),
        "aspect_term": df[asp_col].astype(str),
        "polarity":    df[sent_col].map(to_numeric_sentiment)
    })
    if mode == "user":
        out["user_id"] = df[uid_col].astype(str)
    else:
        out["user_id"] = ""  # placeholder; items file has no user_id

    out["confidence"] = (
        pd.to_numeric(df[conf_col], errors="coerce").clip(0,1)
        if conf_col is not None else 1.0
    )
    # drop rows whose sentiment label could not be converted
    out = out[out["polarity"].notna()].reset_index(drop=True)
    return out


user_absa  = load_absa_parquet(USER_ABSA_PATH, mode="user")
item_absa  = load_absa_parquet(ITEM_ABSA_PATH, mode="item")





# Map every aspect term to one of the canonical buckets. The second call reuses
# TERM_CACHE filled by the first, so terms shared by both files are not re-embedded.
absa_user_mapped, cov_u  = map_terms_to_buckets_with_progress(user_absa,  term_col="aspect_term")
absa_items_mapped, cov_i = map_terms_to_buckets_with_progress(item_absa, term_col="aspect_term")


# Keep only mapped rows (bucket is None when neither the fast path nor the embedding backoff matched)
absa_user_mapped  = absa_user_mapped[absa_user_mapped["bucket"].notna()].copy()
absa_items_mapped = absa_items_mapped[absa_items_mapped["bucket"].notna()].copy()


print(f"[INFO] Coverage user: {cov_u:.2%} | item: {cov_i:.2%}")
# Clip polarity & fix confidence
# (missing confidence is treated as full confidence, 1.0; the frames are .copy()'d above,
#  so assigning columns through the loop variable updates them in place)
for _df in (absa_user_mapped, absa_items_mapped):
    _df["polarity"]   = _df["polarity"].clip(-1, 1)
    _df["confidence"] = _df["confidence"].fillna(1.0).clip(0, 1)




def aggregate_aspect_vectors(df: pd.DataFrame, by: str,
                             buckets: Optional[List[str]] = None) -> pd.DataFrame:
    """Aggregate row-level ABSA mentions into one aspect vector per entity.

    Parameters
    ----------
    df : frame with columns ``by``, "bucket", "polarity", "confidence".
    by : name of the grouping column (e.g. "user_id" or "gmap_id").
    buckets : bucket names to emit; defaults to the keys of ``ASPECT_BUCKETS``.

    Returns
    -------
    One row per distinct ``by`` value with, for each bucket ``b``:
      - ``{b}_sent``: confidence-weighted mean polarity (0.0 when the total
        confidence weight is 0, i.e. no usable mentions)
      - ``{b}_cnt``:  number of mentions
    plus ``total_mentions`` (sum of the ``_cnt`` columns).

    An empty ``df`` yields an empty frame that still has all expected columns
    (previously this raised KeyError on the ``_cnt`` columns).
    Rows whose bucket is not in ``buckets`` are ignored.
    """
    buckets = list(ASPECT_BUCKETS.keys()) if buckets is None else list(buckets)

    stats = defaultdict(lambda: defaultdict(lambda: {"w": 0.0, "sum": 0.0, "n": 0}))
    for key, bucket, pol, conf in zip(df[by], df["bucket"], df["polarity"], df["confidence"]):
        d = stats[key][bucket]
        d["w"]   += float(conf)
        d["sum"] += float(conf) * float(pol)
        d["n"]   += 1

    no_mentions = {"w": 0.0, "sum": 0.0, "n": 0}
    rows = []
    for key, bd in stats.items():
        row = {"key": key}
        for b in buckets:
            # .get() so that reading a missing bucket does not insert it into the defaultdict
            d = bd.get(b, no_mentions)
            w = d["w"]
            row[f"{b}_sent"] = (d["sum"] / w) if w > 0 else 0.0
            row[f"{b}_cnt"]  = d["n"]
        rows.append(row)

    # Explicit column list keeps the schema stable even when there are no rows
    columns = ["key"] + [c for b in buckets for c in (f"{b}_sent", f"{b}_cnt")]
    out = pd.DataFrame(rows, columns=columns).rename(columns={"key": by})
    out["total_mentions"] = out[[f"{b}_cnt" for b in buckets]].sum(axis=1).astype(int)
    return out

# Aggregate mapped ABSA rows into per-user and per-item aspect vectors and persist them.
# Each aggregation is computed exactly once (the earlier version ran the user
# aggregation twice and the item aggregation twice with identical inputs).

print("[stage] Cleaning & clipping …")
# Keep only mapped rows
absa_user_mapped  = absa_user_mapped[absa_user_mapped["bucket"].notna()].copy()
absa_items_mapped = absa_items_mapped[absa_items_mapped["bucket"].notna()].copy()

# Clip polarity & fix confidence (missing confidence counts as 1.0)
for _df in (absa_user_mapped, absa_items_mapped):
    _df["polarity"]   = _df["polarity"].clip(-1, 1)
    _df["confidence"] = _df["confidence"].fillna(1.0).clip(0, 1)

print("[stage] Aggregating users …")
# USERS: only rows with real user_id
absa_user_mapped = absa_user_mapped[absa_user_mapped["user_id"].str.len() > 0]
user_vecs = aggregate_aspect_vectors(absa_user_mapped, by="user_id")

print("[stage] Aggregating items (pooled) …")
# ITEMS: pool both sources by gmap_id for maximum coverage
# NOTE(review): if the user and item ABSA files were extracted from overlapping
# reviews, pooling counts those mentions twice — confirm against the extraction step.
absa_item_pool = pd.concat(
    [
        absa_user_mapped[["gmap_id","bucket","polarity","confidence"]],
        absa_items_mapped[["gmap_id","bucket","polarity","confidence"]],
    ],
    ignore_index=True
)
item_vecs = aggregate_aspect_vectors(absa_item_pool, by="gmap_id")


user_vecs.to_parquet(OUT_USER_VEC, index=False)
item_vecs.to_parquet(OUT_ITEM_VEC, index=False)

# Per-bucket share among *mapped* rows (user + item files combined)
bucket_counts = (pd.concat([absa_user_mapped[["bucket"]], absa_items_mapped[["bucket"]]])
                 ["bucket"].value_counts().rename_axis("bucket").reset_index(name="n_terms"))
bucket_counts["share"] = bucket_counts["n_terms"] / bucket_counts["n_terms"].sum()
bucket_counts.to_csv(OUT_COVERAGE, index=False)

print(f"[OK] Saved user vectors → {OUT_USER_VEC}")
print(f"[OK] Saved item vectors → {OUT_ITEM_VEC}")
print(f"[OK] Coverage breakdown → {OUT_COVERAGE}")

display(user_vecs.head(3))
display(item_vecs.head(3))
display(bucket_counts)









  0%|          | 0/58901 [00:00<?, ?it/s]

Fast path (regex+lexicon+cache):   0%|          | 0/58901 [00:00<?, ?it/s]

Embedding backoff:   0%|          | 0/8 [00:00<?, ?it/s]

[map] coverage=80.30% | N=58,901 | unmatched=32,632 | time=7.0s


  0%|          | 0/58606 [00:00<?, ?it/s]

Fast path (regex+lexicon+cache):   0%|          | 0/58606 [00:00<?, ?it/s]

Embedding backoff:   0%|          | 0/1 [00:00<?, ?it/s]

[map] coverage=80.06% | N=58,606 | unmatched=2,784 | time=0.8s
[INFO] Coverage user: 80.30% | item: 80.06%
[stage] Cleaning & clipping …
[stage] Aggregating users …
[stage] Aggregating items (pooled) …
[OK] Saved user vectors → /content/drive/MyDrive/processed/slice_4k/user_aspect_vectors.parquet
[OK] Saved item vectors → /content/drive/MyDrive/processed/slice_4k/item_aspect_vectors.parquet
[OK] Coverage breakdown → /content/drive/MyDrive/processed/slice_4k/aspect_mapping_coverage.csv


Unnamed: 0,user_id,food_sent,food_cnt,service_sent,service_cnt,price_sent,price_cnt,ambience_sent,ambience_cnt,cleanliness_sent,cleanliness_cnt,portion_sent,portion_cnt,wait_time_sent,wait_time_cnt,location_sent,location_cnt,total_mentions
0,100000587891567535744,-0.646803,2,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,2
1,100004922652291933917,1.0,5,1.0,9,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,14
2,100030186921210979702,-0.491418,2,0.0,0,1.0,1,0.0,0,0.0,0,1.0,1,-1.0,1,0.0,0,5


Unnamed: 0,gmap_id,food_sent,food_cnt,service_sent,service_cnt,price_sent,price_cnt,ambience_sent,ambience_cnt,cleanliness_sent,cleanliness_cnt,portion_sent,portion_cnt,wait_time_sent,wait_time_cnt,location_sent,location_cnt,total_mentions
0,0x80dc8208f8ad8269:0x930d0b4346b897e1,-0.646803,2,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,2
1,0x80dd315f7ecfdb09:0x80c63d40f1f66536,0.0,0,1.0,2,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,2
2,0x80dd2fd893f5d4cb:0xed76652ba1cd0321,0.738732,11,1.0,6,0.0,0,0.0,0,0.0,0,1.0,1,-1.0,1,1.0,3,22


Unnamed: 0,bucket,n_terms,share
0,food,52297,0.55507
1,service,20012,0.212403
2,location,6788,0.072046
3,ambience,5411,0.057431
4,price,4545,0.04824
5,wait_time,3217,0.034145
6,portion,1504,0.015963
7,cleanliness,443,0.004702


In [None]:
from pathlib import Path
import pyarrow.parquet as pq
import pandas as pd

BASE = Path("/content/drive/MyDrive/processed/slice_4k")
USER_ASPECT_VEC = BASE / "user_aspect_vectors.parquet"
USER_TRAIN_REV  = BASE / "user_reviews_train.parquet"
USER_ABSA_PATH  = BASE / "aspects_users_train_full.parquet"

# Compare the user IDs stored in the aspect-vector file with those in the train reviews.
# Each ID column is read from Drive once and reused (it used to be read twice per file).
vec_ids_str   = pd.read_parquet(USER_ASPECT_VEC, columns=["user_id"])["user_id"].astype(str)
train_ids_str = pd.read_parquet(USER_TRAIN_REV,  columns=["user_id"])["user_id"].astype(str)

print("user_vecs unique users:", vec_ids_str.nunique())
print("train unique users:", train_ids_str.nunique())

print("\nSample user_vecs IDs:", vec_ids_str.head(5).tolist())
print("Sample train IDs:", train_ids_str.dropna().head(5).tolist())

# Schema is read from the parquet footer only; the head(2) below loads the full file.
print("\nABSA users file columns:", pq.read_schema(str(USER_ABSA_PATH)).names[:20])
print(pd.read_parquet(USER_ABSA_PATH).head(2))


user_vecs unique users: 3741
train unique users: 4000

Sample user_vecs IDs: ['100000587891567535744', '100004922652291933917', '100030186921210979702', '100036874830363594134', '100046238554630965326']
Sample train IDs: ['100003566959486344368', '100003566959486344368', '100003566959486344368', '100003566959486344368', '100003566959486344368']

ABSA users file columns: ['global_row', 'review_text', 'aspect', 'sentiment', 'confidence', 'evidence', 'position', 'user_id', 'gmap_id', 'name', 'rating', 'time_s']
   global_row                                        review_text aspect  \
0           0  ( Translated by Google ) The food has no taste...   food   
1           0  ( Translated by Google ) The food has no taste...  taste   

  sentiment  confidence                                           evidence  \
0   Neutral      0.5441  ( Translated by Google ) The food has no taste...   
1  Negative      0.9964  Google ) The food has no taste ( Original ) No...   

  position               

In [None]:
# Row-count funnel from raw ABSA rows down to dense users, plus overlap with train users.
# NOTE: `user_dense` is not defined in any cell above this one; in this notebook it is
# created by the later "medium+heavy" filtering cells, which must have been run first.
print("ABSA user rows:", len(user_absa))
print("Mapped user rows:", len(absa_user_mapped))
print("User_vecs rows:", len(user_vecs))
print("Medium+heavy users:", len(user_dense))

# Read only the ID column, once, and reuse it for both checks below.
train_user_id_col = pd.read_parquet(USER_TRAIN_REV, columns=["user_id"])["user_id"]

train_users = train_user_id_col.nunique()
print("Users in train reviews file:", train_users)

# Compare as strings on both sides so a dtype difference cannot hide real matches.
print("Users in both after filtering:",
      len(set(user_dense["user_id"].astype(str)) & set(train_user_id_col.astype(str))))


ABSA user rows: 58901
Mapped user rows: 47295
User_vecs rows: 3393
Medium+heavy users: 1385
Users in train reviews file: 4000
Users in both after filtering: 5


In [None]:
# --- Minimal rebuild of user_aspect_vectors.parquet (correct IDs) ---
import re, numpy as np, pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer  # only for mapping fallback if needed

BASE = Path("/content/drive/MyDrive/processed/slice_4k")
USER_ABSA_PATH  = BASE / "aspects_users_train_full.parquet"
ITEM_ABSA_PATH  = BASE / "aspects_items_train_full.parquet"

# 8 buckets you used before
ASPECT_BUCKETS = {
    "food":       ["food","taste","flavor","dish","noodle","sushi","pizza","burger","taco","spicy","fresh","broth","steak","dessert","menu","ramen","bbq"],
    "service":    ["service","staff","waiter","waitress","server","attentive","rude","friendly","manager","host","hosts","hostess"],
    "price":      ["price","cost","value","cheap","expensive","affordable","overpriced","worth","deal","pricing"],
    "ambience":   ["ambience","atmosphere","vibe","noise","music","decor","seating","lighting","crowded","cozy","environment"],
    "cleanliness":["clean","dirty","bathroom","restroom","sanitary","hygiene","messy","filthy"],
    "portion":    ["portion","size","serving","amount","quantity","share","big","small"],
    "wait_time":  ["wait","queue","delay","slow","fast","quick","prompt","line","service time"],
    "location":   ["location","parking","nearby","distance","convenient","access","walk","drive"]
}
aspect_buckets = list(ASPECT_BUCKETS.keys())
LEXICON = {w: b for b, words in ASPECT_BUCKETS.items() for w in words}
PUNCT_RX = re.compile(r"[^\w\s\-]")
STOP = set(["the","a","an","and","or","in","at","of","to","for","on","is","are","was","were","very","really","so"])

def normalize_term(term: str) -> str:
    """Return a canonical lowercase form of an aspect term.

    Lowercases, replaces every character matched by ``PUNCT_RX`` with a space,
    collapses whitespace runs and trims the ends. Trimming last (not first)
    ensures edge punctuation such as "food!" does not leave a trailing space.
    """
    lowered = str(term).lower()
    no_punct = PUNCT_RX.sub(" ", lowered)
    return re.sub(r"\s+", " ", no_punct).strip()

def tokenize_keep_heads(s: str):
    """Split a normalized term into at most two content tokens.

    Hyphens are stripped from each token *before* the empty/stop-word filter,
    so a bare "-" is dropped instead of surviving as an empty token and
    "-the" is still treated as a stop word.
    """
    toks = []
    for raw in s.split():
        w = raw.strip("-")
        if w and w not in STOP:
            toks.append(w)
    return toks[:2]

def to_numeric_sentiment(x):
    """Convert an ABSA sentiment label to a float in [-1, 1].

    - missing scalar (None / NaN / pd.NA) -> NaN
    - numeric input -> the value clipped to [-1, 1]; previously a float such
      as 1.0 was stringified to "1.0", missed the lookup table and became NaN
    - string labels (case/whitespace-insensitive): positive/pos/1/+1 -> 1.0,
      negative/neg/-1 -> -1.0, neutral/neu/0 -> 0.0
    - anything else -> NaN
    """
    # np.ndim guard: pd.isna() on a list/array returns an array, which cannot be used in `if`
    if np.ndim(x) == 0 and pd.isna(x):
        return np.nan
    if isinstance(x, (int, float, np.number)):
        try:
            return max(-1.0, min(1.0, float(x)))
        except (TypeError, ValueError):
            return np.nan
    s = str(x).strip().lower()
    table = {"positive":1.0,"pos":1.0,"1":1.0,"+1":1.0,
             "negative":-1.0,"neg":-1.0,"-1":-1.0,
             "neutral":0.0,"neu":0.0,"0":0.0}
    return table.get(s, np.nan)

def load_absa_user(p: Path) -> pd.DataFrame:
    """Read the user ABSA parquet and return a standardized frame.

    Expects columns named (case-insensitively) user_id, gmap_id, aspect and
    sentiment; confidence is optional and defaults to 1.0 when absent.
    Output columns: user_id, gmap_id, aspect_term, polarity, confidence.
    Rows whose sentiment does not convert to a number are removed.

    Raises ValueError when one of the four required columns is missing.
    """
    df = pd.read_parquet(p)

    def _col(wanted):
        # first column whose lower-cased name equals `wanted`, else None
        for c in df.columns:
            if c.lower() == wanted:
                return c
        return None

    # Expect these columns present from your run_extract(id_cols=...)
    uid, gid, asp, pol, conf = [
        _col(n) for n in ("user_id", "gmap_id", "aspect", "sentiment", "confidence")
    ]

    if not (uid and gid and asp and pol):
        raise ValueError(f"Missing required columns in {p.name}. Found: {df.columns.tolist()[:20]}")

    polarity = pd.to_numeric(df[pol].map(to_numeric_sentiment), errors="coerce")
    out = pd.DataFrame({
        "user_id": df[uid].astype(str),
        "gmap_id": df[gid].astype(str),
        "aspect_term": df[asp].astype(str),
        "polarity": polarity,
    })

    if conf in df.columns:
        out["confidence"] = pd.to_numeric(df[conf], errors="coerce").clip(0,1)
    else:
        out["confidence"] = 1.0

    keep = out["polarity"].notna()
    return out[keep].reset_index(drop=True)

# simple lexicon map (you already achieved ~80% coverage)
def map_term_to_bucket(term: str):
    """Lexicon-only bucket lookup: normalize the term, take its head tokens,
    and return the bucket of the first token present in ``LEXICON`` (else None).

    No regex shortcuts and no embedding backoff are applied here.
    """
    heads = tokenize_keep_heads(normalize_term(term))
    return next((LEXICON[tok] for tok in heads if tok in LEXICON), None)

def aggregate_aspect_vectors(df: pd.DataFrame, by: str) -> pd.DataFrame:
    """Build one aspect vector per ``by`` value.

    ``df`` must have columns: ``by``, bucket, polarity, confidence.
    For every bucket in ``aspect_buckets`` the result holds ``{b}_sent``
    (confidence-weighted mean polarity, 0.0 if the weight sum is 0) and
    ``{b}_cnt`` (row count), followed by ``total_mentions``.
    """
    rows = []
    for key, grp in df.groupby(by):
        row = {"key": key}
        total = 0
        for b in aspect_buckets:
            hits = grp.loc[grp["bucket"] == b]
            weight = float(hits["confidence"].sum())
            if weight > 0:
                row[f"{b}_sent"] = float((hits["confidence"] * hits["polarity"]).sum() / weight)
            else:
                row[f"{b}_sent"] = 0.0
            n_hits = int(len(hits))
            row[f"{b}_cnt"] = n_hits
            total += n_hits
        row["total_mentions"] = int(total)
        rows.append(row)
    return pd.DataFrame(rows).rename(columns={"key": by})

# Load user ABSA with TRUE user_id
absa_u = load_absa_user(USER_ABSA_PATH)

# Map terms to buckets (reuse your fast mapper if you prefer)
# NOTE: map_term_to_bucket is lexicon-only (no regex shortcuts, no embedding backoff),
# so fewer rows are mapped here than by map_terms_to_buckets_with_progress.
absa_u["bucket"] = absa_u["aspect_term"].map(map_term_to_bucket)
absa_u = absa_u[absa_u["bucket"].notna()].copy()
absa_u["polarity"]   = absa_u["polarity"].clip(-1,1)
absa_u["confidence"] = absa_u["confidence"].fillna(1.0).clip(0,1)

# Re-aggregate USER vectors by user_id (correct IDs)
user_vecs_fixed = aggregate_aspect_vectors(absa_u, by="user_id")

# Rebuild tiers and save (items unchanged)
# pd.cut bins are right-closed: (0,2]=cold, (2,5]=light, (5,10]=medium, (10,99999]=heavy;
# values outside (0, 99999] become NaN and then the string "nan" after astype(str).
user_vecs_fixed['support_tier'] = pd.cut(
    user_vecs_fixed['total_mentions'], bins=[0,2,5,10,99999],
    labels=["cold","light","medium","heavy"]
).astype(str)

# WARNING: this writes to the same path as OUT_USER_VEC from the main pipeline cell,
# replacing that file; the saved frame additionally contains the support_tier column.
user_vecs_fixed.to_parquet(BASE/"user_aspect_vectors.parquet", index=False)
print("Rebuilt user_aspect_vectors.parquet with correct user_id. Shape:", user_vecs_fixed.shape)


Rebuilt user_aspect_vectors.parquet with correct user_id. Shape: (3393, 19)


In [None]:
# Zero out the sentiment of any bucket backed by fewer than MIN_CNT mentions.
# This mutates user_vecs / item_vecs in place (counts are left untouched), and acts on
# whichever frames currently hold those names — later cells reassign both from parquet.
MIN_CNT = 3
for df in (user_vecs, item_vecs):
    for b in ASPECT_BUCKETS.keys():
        m = df[f"{b}_cnt"] < MIN_CNT  # boolean mask of under-supported rows for this bucket
        df.loc[m, f"{b}_sent"] = 0.0  # or np.nan if you prefer


In [None]:
def idf_weights(vecs: pd.DataFrame) -> dict:
    """Smoothed IDF weight per aspect bucket.

    For each bucket ``b`` in ``ASPECT_BUCKETS``:
        weight = log((N + 1) / (n_b + 1))
    where N is the number of rows in ``vecs`` and n_b the number of rows with
    ``{b}_cnt`` > 0. A bucket present in every row gets weight 0.
    """
    n_plus_one = len(vecs) + 1
    weights = {}
    for b in ASPECT_BUCKETS:
        present = (vecs[f"{b}_cnt"] > 0).sum()
        weights[b] = math.log(n_plus_one / (present + 1))
    return weights

# Compute per-bucket IDF weights separately for users and items.
w_users = idf_weights(user_vecs)
w_items = idf_weights(item_vecs)

# Scale each *_sent column by its bucket weight, in place.
# WARNING: not idempotent — re-running this cell multiplies by the weights again.
# The weighted values live only in memory: the parquet files were written before this
# step, and later cells reload user_vecs / item_vecs from those files.
for df, w in [(user_vecs, w_users), (item_vecs, w_items)]:
    for b, ww in w.items():
        df[f"{b}_sent"] = df[f"{b}_sent"] * ww


In [None]:
from IPython.display import display

print("User vectors shape:", user_vecs.shape)
print("Item vectors shape:", item_vecs.shape)

# Basic nulls
uv_null = user_vecs.isna().sum().sort_values(ascending=False).head(10)
iv_null = item_vecs.isna().sum().sort_values(ascending=False).head(10)
print("\n[Nulls] Users (top 10):"); display(uv_null)
print("\n[Nulls] Items (top 10):"); display(iv_null)

# Duplicates
print("\n[Dupes] dup users:", user_vecs["user_id"].duplicated().sum())
print("[Dupes] dup items:", item_vecs["gmap_id"].duplicated().sum())

# Basic head
display(user_vecs.head(3))
display(item_vecs.head(3))


User vectors shape: (3741, 18)
Item vectors shape: (25253, 18)

[Nulls] Users (top 10):


Unnamed: 0,0
user_id,0
food_sent,0
food_cnt,0
service_sent,0
service_cnt,0
price_sent,0
price_cnt,0
ambience_sent,0
ambience_cnt,0
cleanliness_sent,0



[Nulls] Items (top 10):


Unnamed: 0,0
gmap_id,0
food_sent,0
food_cnt,0
service_sent,0
service_cnt,0
price_sent,0
price_cnt,0
ambience_sent,0
ambience_cnt,0
cleanliness_sent,0



[Dupes] dup users: 0
[Dupes] dup items: 0


Unnamed: 0,user_id,food_sent,food_cnt,service_sent,service_cnt,price_sent,price_cnt,ambience_sent,ambience_cnt,cleanliness_sent,cleanliness_cnt,portion_sent,portion_cnt,wait_time_sent,wait_time_cnt,location_sent,location_cnt,total_mentions
0,100000587891567535744,0.0,2,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,2
1,100004922652291933917,0.078063,5,0.377662,9,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,14
2,100030186921210979702,0.0,2,0.0,0,0.0,1,0.0,0,0.0,0,0.0,1,0.0,1,0.0,0,5


Unnamed: 0,gmap_id,food_sent,food_cnt,service_sent,service_cnt,price_sent,price_cnt,ambience_sent,ambience_cnt,cleanliness_sent,cleanliness_cnt,portion_sent,portion_cnt,wait_time_sent,wait_time_cnt,location_sent,location_cnt,total_mentions
0,0x80dc8208f8ad8269:0x930d0b4346b897e1,0.0,2,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,2
1,0x80dd315f7ecfdb09:0x80c63d40f1f66536,0.0,0,0.0,2,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,2
2,0x80dd2fd893f5d4cb:0xed76652ba1cd0321,0.124265,11,0.753876,6,0.0,0,0.0,0,0.0,0,0.0,1,0.0,1,1.638118,3,22


In [None]:
# Sparsity & support
def sparsity_summary(df, key_col):
    """Summarize how much aspect evidence each entity has.

    Returns a 3-tuple:
      - dict of scalar statistics (entity count, share of entities with >=5
        and >=10 total mentions, mean/median total mentions, mean/median
        number of buckets with at least one mention)
      - Series: number of non-empty buckets per row
      - Series: the ``total_mentions`` column

    ``key_col`` is accepted for call-site symmetry but is not used.
    """
    cnt_cols = [f"{b}_cnt" for b in ASPECT_BUCKETS]
    nonzero_buckets = df[cnt_cols].gt(0).sum(axis=1)
    total_mentions = df["total_mentions"]

    out = dict(
        N_entities=len(df),
        pct_total_mentions_ge_5=float(total_mentions.ge(5).mean()),
        pct_total_mentions_ge_10=float(total_mentions.ge(10).mean()),
        mean_total_mentions=float(total_mentions.mean()),
        median_total_mentions=float(total_mentions.median()),
        mean_nonzero_buckets=float(nonzero_buckets.mean()),
        median_nonzero_buckets=float(nonzero_buckets.median()),
    )
    return out, nonzero_buckets, total_mentions

u_sum, u_nzb, u_tm = sparsity_summary(user_vecs, "user_id")
i_sum, i_nzb, i_tm = sparsity_summary(item_vecs, "gmap_id")

print("[Users] ", u_sum)
print("[Items] ", i_sum)

print("\nUsers – % with ≥2 buckets nonzero:", float((u_nzb >= 2).mean()))
print("Items – % with ≥2 buckets nonzero:", float((i_nzb >= 2).mean()))


[Users]  {'N_entities': 3741, 'pct_total_mentions_ge_5': 0.6260358192996525, 'pct_total_mentions_ge_10': 0.3694199411921946, 'mean_total_mentions': 12.64234161988773, 'median_total_mentions': 7.0, 'mean_nonzero_buckets': 3.0761828388131516, 'median_nonzero_buckets': 3.0}
[Items]  {'N_entities': 25253, 'pct_total_mentions_ge_5': 0.25363323169524415, 'pct_total_mentions_ge_10': 0.05872569595691601, 'mean_total_mentions': 3.7309230586464976, 'median_total_mentions': 3.0, 'mean_nonzero_buckets': 1.9449966340632796, 'median_nonzero_buckets': 2.0}

Users – % with ≥2 buckets nonzero: 0.8126169473402833
Items – % with ≥2 buckets nonzero: 0.5968399794083871


In [None]:
item_vecs['support_tier'] = pd.cut(
    item_vecs['total_mentions'],
    bins=[0, 2, 5, 10, 99999],
    labels=["cold", "light", "medium", "heavy"]
)

user_vecs['support_tier'] = pd.cut(
    user_vecs['total_mentions'],
    bins=[0, 2, 5, 10, 99999],
    labels=["cold", "light", "medium", "heavy"]
)


In [None]:
print("User tiers:\n", user_vecs['support_tier'].value_counts(normalize=True))
print("\nItem tiers:\n", item_vecs['support_tier'].value_counts(normalize=True))


User tiers:
 support_tier
heavy     0.342155
cold      0.223470
medium    0.220797
light     0.213579
Name: proportion, dtype: float64

Item tiers:
 support_tier
cold      0.473172
light     0.324753
medium    0.161248
heavy     0.040827
Name: proportion, dtype: float64


In [None]:
# !pip -q install pandas pyarrow fastparquet sentence-transformers torch torchvision tqdm

import os, math, json, random, gc, time
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sentence_transformers import SentenceTransformer


In [None]:
BASE = Path("/content/drive/MyDrive/processed/slice_4k")

# Frozen ABSA vectors from your previous step
USER_ASPECT_VEC = BASE / "user_aspect_vectors.parquet"
ITEM_ASPECT_VEC = BASE / "item_aspect_vectors.parquet"

# Train interactions + text sources
USER_TRAIN_REV  = BASE / "user_reviews_train.parquet"   # has [user_id, gmap_id, text, time, ...]
ITEM_TRAIN_REV  = BASE / "item_reviews_train.parquet"   # has [gmap_id, text, time, ...] (warm items)
COLD_META       = BASE / "cold_item_metadata.parquet"   # (optional fallback; not used in phase-1 training)

# Output dirs
OUT_DIR = BASE / "cl_outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# SBERT model
SBERT_NAME = "sentence-transformers/all-MiniLM-L6-v2"
MAX_TEXT_PER_ENTITY = 30           # cap #reviews concatenated per entity (to control memory/time)
MAX_CHARS_PER_REVIEW = 500         # truncate per review (SBERT has its own tokenizer length too)
BATCH_ENC = 256                    # SBERT encode batch size
SEED = 42

random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cuda'

In [None]:
user_vecs = pd.read_parquet(USER_ASPECT_VEC)
item_vecs = pd.read_parquet(ITEM_ASPECT_VEC)

# Ensure expected columns exist
aspect_buckets = ["food","service","price","ambience","cleanliness","portion","wait_time","location"]
sent_cols = [f"{b}_sent" for b in aspect_buckets]
cnt_cols  = [f"{b}_cnt"  for b in aspect_buckets]

# Support tiers (as you computed)
user_vecs['support_tier'] = pd.cut(
    user_vecs['total_mentions'], bins=[0,2,5,10,99999],
    labels=["cold","light","medium","heavy"]
).astype(str)

item_vecs['support_tier'] = pd.cut(
    item_vecs['total_mentions'], bins=[0,2,5,10,99999],
    labels=["cold","light","medium","heavy"]
).astype(str)

# Keep only medium+heavy for Phase-1 CL
user_dense = user_vecs[user_vecs['support_tier'].isin(['medium','heavy'])].copy()
item_dense = item_vecs[item_vecs['support_tier'].isin(['medium','heavy'])].copy()

print(user_vecs.shape, user_dense.shape)
print(item_vecs.shape, item_dense.shape)


(3393, 19) (1385, 19)
(25253, 19) (5103, 19)


In [None]:
from pathlib import Path
import pandas as pd

# Diagnostic: how many of the "dense" aspect-vector IDs actually occur in the
# train review tables? Paths are restated so the cell can be run on its own.
BASE = Path("/content/drive/MyDrive/processed/slice_4k")
USER_ASPECT_VEC = BASE/"user_aspect_vectors.parquet"
ITEM_ASPECT_VEC = BASE/"item_aspect_vectors.parquet"
USER_TRAIN_REV  = BASE/"user_reviews_train.parquet"
ITEM_TRAIN_REV  = BASE/"item_reviews_train.parquet"

user_vecs = pd.read_parquet(USER_ASPECT_VEC)
item_vecs = pd.read_parquet(ITEM_ASPECT_VEC)

# current dense filters you used (same bins/labels for users and items)
for _vecs in (user_vecs, item_vecs):
    _vecs['support_tier'] = pd.cut(_vecs['total_mentions'], [0,2,5,10,99999],
                                   labels=['cold','light','medium','heavy']).astype(str)
user_dense = user_vecs[user_vecs['support_tier'].isin(['medium','heavy'])].copy()
item_dense = item_vecs[item_vecs['support_tier'].isin(['medium','heavy'])].copy()

# Read each train ID column from disk once and reuse it for both the set
# comparison and the ID-length sanity check below.
train_user_ids = pd.read_parquet(USER_TRAIN_REV, columns=['user_id'])['user_id'].astype(str)
train_item_ids = pd.read_parquet(ITEM_TRAIN_REV, columns=['gmap_id'])['gmap_id'].astype(str)

# sets: A = dense aspect-vector IDs, B = IDs present in the train reviews
A_users = set(user_dense['user_id'].astype(str))
B_users = set(train_user_ids)

A_items = set(item_dense['gmap_id'].astype(str))
B_items = set(train_item_ids)

print("users: |A|=", len(A_users), " |B|=", len(B_users), " |A∩B|=", len(A_users & B_users))
print("items: |A|=", len(A_items), " |B|=", len(B_items), " |A∩B|=", len(A_items & B_items))

# show a few mismatches so we see the pattern; sorted so the sample is the
# same on every run (set iteration order is arbitrary)
only_in_A_users = sorted(A_users - B_users)[:10]
only_in_B_users = sorted(B_users - A_users)[:10]
print("\nSample user IDs only in dense vectors (A \\ B):", only_in_A_users)
print("Sample user IDs only in train reviews (B \\ A):", only_in_B_users)

# sanity on id shapes: differing string lengths would point at a formatting
# mismatch rather than genuinely different user populations
uv_len = user_dense['user_id'].astype(str).str.len().describe()
tr_len = train_user_ids.str.len().describe()
print("\nuser_dense ID length stats:\n", uv_len)
print("\ntrain user_id length stats:\n", tr_len)


users: |A|= 1385  |B|= 4000  |A∩B|= 5
items: |A|= 5103  |B|= 19585  |A∩B|= 2742

Sample user IDs only in dense vectors (A \ B): ['114890819646094868814', '105713067283914339925', '118226089818248210476', '106098535228623242264', '110496730177959030126', '105722741905808572190', '103788980017945817351', '105016540058029034725', '115107110436126303631', '106865755669803094296']
Sample user IDs only in train reviews (B \ A): ['109905971731260418478', '101802480527462412244', '108972746353496572186', '109584233733405119213', '102698472720787296472', '103043558015232804770', '108174026644789202565', '100426635678076567459', '102065334771479290046', '102172587313882458109']

user_dense ID length stats:
 count    1385.0
mean       21.0
std         0.0
min        21.0
25%        21.0
50%        21.0
75%        21.0
max        21.0
Name: user_id, dtype: float64

train user_id length stats:
 count    29601.0
mean        21.0
std          0.0
min         21.0
25%         21.0
50%         21.0
75%

In [None]:
# Inputs/outputs for rebuilding the user aspect vectors on the 4K slice.
BASE = Path("/content/drive/MyDrive/processed/slice_4k")

USER_TRAIN_REV = BASE.joinpath("user_reviews_train.parquet")
ABSA_USER_RAW = BASE.joinpath("aspects_users_train_full.parquet")  # raw pyabsa output
OUT_NEW = BASE.joinpath("user_aspect_vectors_rebuilt.parquet")

# Step 1: the user ids that really belong to this slice (taken from the train reviews).
true_users = set(
    pd.read_parquet(USER_TRAIN_REV, columns=["user_id"])["user_id"].astype(str)
)

# Step 2: raw ABSA rows, with user_id normalised to string so it compares with true_users.
df = pd.read_parquet(ABSA_USER_RAW)
df = df.assign(user_id=df["user_id"].astype(str))

# Step 3: discard ABSA rows whose user is not part of the slice.
df = df.loc[df["user_id"].isin(true_users)].copy()

print("ABSA rows kept:", df.shape[0])

# 4) numeric sentiment map
def to_num(s):
    """Map a sentiment label to a numeric polarity.

    Args:
        s: Sentiment value of any type; it is stringified, stripped of
            surrounding whitespace and lower-cased before matching.

    Returns:
        1.0 for a positive label ("positive", "pos", or a number equal to 1,
        e.g. "1", "+1", 1.0), -1.0 for a negative label ("negative", "neg",
        or a number equal to -1), and 0.0 for everything else (neutral,
        missing, unknown labels, other numbers).
    """
    label = str(s).strip().lower()
    if label in ("positive", "pos"):
        return 1.0
    if label in ("negative", "neg"):
        return -1.0
    # Numeric encodings: parse instead of string-matching so that float-typed
    # columns ("1.0", "-1.0") are not silently treated as neutral.
    try:
        value = float(label)
    except ValueError:
        return 0.0
    if value == 1.0:
        return 1.0
    if value == -1.0:
        return -1.0
    return 0.0

# Numeric polarity per ABSA row (+1 / -1 / 0 via to_num).
df["polarity"] = df["sentiment"].map(to_num)

# Step 5: map aspect terms to buckets, then aggregate per user
# (both helpers are defined earlier in the notebook).
mapped, _ = map_terms_to_buckets_with_progress(df, term_col="aspect")
# Rows whose term matched no bucket carry a null bucket and are dropped.
mapped = mapped.dropna(subset=["bucket"]).copy()

user_vecs_new = aggregate_aspect_vectors(mapped, by="user_id")

# Persist the rebuilt vectors, then show their shape and a short preview.
user_vecs_new.to_parquet(OUT_NEW, index=False)
print("✅ Saved:", OUT_NEW)
print(user_vecs_new.shape)
display(user_vecs_new.iloc[:5])


ABSA rows kept: 195


  0%|          | 0/195 [00:00<?, ?it/s]

Fast path (regex+lexicon+cache):   0%|          | 0/195 [00:00<?, ?it/s]

[map] coverage=81.54% | N=195 | unmatched=0 | time=0.0s
✅ Saved: /content/drive/MyDrive/processed/slice_4k/user_aspect_vectors_rebuilt.parquet
(15, 18)


Unnamed: 0,user_id,food_sent,food_cnt,service_sent,service_cnt,price_sent,price_cnt,ambience_sent,ambience_cnt,cleanliness_sent,cleanliness_cnt,portion_sent,portion_cnt,wait_time_sent,wait_time_cnt,location_sent,location_cnt,total_mentions
0,104054999651113871985,0.573529,12,1.0,4,-0.235436,2,0.510007,2,0.0,0,-1.0,1,-1.0,1,1.0,2,24
1,104831766389823738186,0.175961,23,0.341501,3,0.016375,5,0.0,0,1.0,1,0.0,0,0.0,0,1.0,2,34
2,106169851816536441586,1.0,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,1
3,106251071765783816886,0.516451,8,0.0,1,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,1.0,2,11
4,106895878343305046660,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,0,1.0,1,1


In [None]:
# --- helpers: read only existing columns (no nrows) ---
import pyarrow.parquet as pq

def _existing_cols(path: Path, want):
    """Return the entries of ``want`` that are columns of the parquet file at ``path``.

    The order of ``want`` is preserved. The file schema is inspected via
    pyarrow when possible; if that fails for any reason the whole file is read
    once with pandas and its columns are used instead.
    """
    try:
        # Imported lazily so a missing pyarrow lands in the fallback below
        # instead of breaking the helper outright.
        import pyarrow.parquet as pq
        available = set(pq.read_schema(str(path)).names)
    except Exception:
        # Deliberate best-effort fallback: read whole file once. A genuinely
        # unreadable file still raises from pd.read_parquet here.
        available = set(pd.read_parquet(path).columns)
    return [c for c in want if c in available]


def _load_entity_texts(path: Path, key_col: str, keep) -> pd.DataFrame:
    """Shared implementation behind ``load_user_texts`` / ``load_item_texts``.

    Reads ``key_col``, ``text`` and (if present) ``time`` from ``path``, drops
    rows with a null key or null text, casts the key to ``str``, keeps only
    keys contained in ``keep`` and, when ``time`` exists, parses it with
    ``pd.to_datetime(errors="coerce")`` and sorts by ``[key_col, "time"]``.
    """
    cols = _existing_cols(path, [key_col, "text", "time"])
    # Validate before loading any data so a wrong file fails fast.
    if key_col not in cols or "text" not in cols:
        raise ValueError(f"{path.name} must have {key_col} and text. Found: {cols}")

    df = pd.read_parquet(path, columns=cols)
    wanted = (
        df[key_col].notna()
        & df["text"].notna()
        & df[key_col].astype(str).isin(keep)
    )
    # Single filter + explicit copy: later column assignments then operate on
    # an independent frame and cannot trigger SettingWithCopyWarning.
    out = df.loc[wanted].copy()
    out[key_col] = out[key_col].astype(str)
    if "time" in out.columns:
        # NOTE(review): integer epoch values are interpreted with pandas'
        # default unit; ordering is preserved either way — confirm the unit
        # before using the parsed timestamps for anything but sorting.
        out["time"] = pd.to_datetime(out["time"], errors="coerce")
        out = out.sort_values([key_col, "time"])
    return out


def load_user_texts(path: Path, users_keep: set) -> pd.DataFrame:
    """Load review texts for the users in ``users_keep``.

    Args:
        path: Parquet file that must contain ``user_id`` and ``text``
            (``time`` is optional).
        users_keep: User ids (as strings) to retain.

    Returns:
        DataFrame with ``user_id`` (str), ``text`` and, if available, ``time``;
        sorted by ``user_id`` then ``time`` when ``time`` is present.

    Raises:
        ValueError: If ``user_id`` or ``text`` is missing from the file.
    """
    return _load_entity_texts(path, "user_id", users_keep)


def load_item_texts(path: Path, items_keep: set) -> pd.DataFrame:
    """Load review texts for the items in ``items_keep``.

    Args:
        path: Parquet file that must contain ``gmap_id`` and ``text``
            (``time`` is optional).
        items_keep: Item ids (as strings) to retain.

    Returns:
        DataFrame with ``gmap_id`` (str), ``text`` and, if available, ``time``;
        sorted by ``gmap_id`` then ``time`` when ``time`` is present.

    Raises:
        ValueError: If ``gmap_id`` or ``text`` is missing from the file.
    """
    return _load_entity_texts(path, "gmap_id", items_keep)


# Restrict text loading to the dense (medium/heavy support) users and items.
users_keep = set(user_dense["user_id"].astype(str))
items_keep = set(item_dense["gmap_id"].astype(str))

u_texts = load_user_texts(USER_TRAIN_REV, users_keep)
i_texts = load_item_texts(ITEM_TRAIN_REV, items_keep)

# Report how many requested ids actually have train text. Ids without any
# review are silently dropped by the loaders, so an ID mismatch between the
# aspect vectors and the review tables would otherwise go unnoticed.
for _label, _keep, _texts, _key in (
    ("users", users_keep, u_texts, "user_id"),
    ("items", items_keep, i_texts, "gmap_id"),
):
    _found = _texts[_key].nunique()
    print(f"[texts] {_label}: {_found}/{len(_keep)} dense ids have train reviews")
    if _found < len(_keep):
        print(f"[texts] WARNING: {len(_keep) - _found} dense {_label} have no train text "
              f"and will not get a document")

# Aggregate → 1 document per entity (truncate per-review; cap #reviews)
def aggregate_docs(df: pd.DataFrame, key_col: str,
                   max_reviews: Optional[int] = None,
                   max_chars: Optional[int] = None) -> pd.DataFrame:
    """Collapse the reviews of each entity into one newline-joined document.

    Args:
        df: Frame with at least ``key_col`` and ``text`` columns.
        key_col: Column to group by (e.g. ``"user_id"`` or ``"gmap_id"``).
        max_reviews: Maximum number of reviews used per entity; the first
            ones in the frame's existing row order are kept. Defaults to the
            notebook-level ``MAX_TEXT_PER_ENTITY``.
        max_chars: Each review is stringified and truncated to this many
            characters. Defaults to the notebook-level ``MAX_CHARS_PER_REVIEW``.

    Returns:
        DataFrame with columns ``[key_col, "doc"]``, one row per group in the
        order produced by ``df.groupby(key_col)``.
    """
    # Resolve defaults at call time so later edits to the config constants
    # are picked up without redefining the function.
    if max_reviews is None:
        max_reviews = MAX_TEXT_PER_ENTITY
    if max_chars is None:
        max_chars = MAX_CHARS_PER_REVIEW

    rows = []
    for key, group in tqdm(df.groupby(key_col), desc=f"Aggregate {key_col} docs"):
        # Slice the Series first so only the capped number of reviews is
        # materialised, rather than converting the whole group to a list.
        reviews = group["text"].iloc[:max_reviews]
        rows.append((key, "\n".join(str(t)[:max_chars] for t in reviews)))
    return pd.DataFrame(rows, columns=[key_col, "doc"])

# One aggregated document per dense user and per dense item.
u_docs = aggregate_docs(df=u_texts, key_col="user_id")
i_docs = aggregate_docs(df=i_texts, key_col="gmap_id")

# (n_entities, 2) for users, then items.
print(*(docs.shape for docs in (u_docs, i_docs)))


Aggregate user_id docs:   0%|          | 0/5 [00:00<?, ?it/s]

Aggregate gmap_id docs:   0%|          | 0/2742 [00:00<?, ?it/s]

(5, 2) (2742, 2)
