In [22]:
# --- ABSA pipeline (San Francisco edition) ---

from pathlib import Path
import re, unicodedata, math, json, time
import numpy as np
import pandas as pd

%pip install pyabsa pyarrow fastparquet

# [CHANGE vs Sowmya] every path hangs off the repo's data/ tree.
# As a script, ROOT is two levels above this file; in a notebook (no __file__)
# it is the parent of the current working directory.
ROOT = Path.cwd().parent if "__file__" not in globals() else Path(__file__).resolve().parents[1]
PROC = ROOT.joinpath("data", "processed")

# Inputs: sampled SF reviews and SF places metadata
REVIEWS_JSON     = PROC.joinpath("sf-sampled-reviews.json")
RESTAURANTS_JSON = PROC.joinpath("sf-restaurants.json")

# Outputs live under processed/ so other notebooks can consume them
OUT_BASE = PROC.joinpath("sf_absa_outputs")
OUT_BASE.mkdir(parents=True, exist_ok=True)

# Raw ABSA extractions: chunked parquet parts plus the concatenated file, per entity
OUT_USERS_ABSA_PARTS = OUT_BASE.joinpath("aspects_users_parts")
OUT_ITEMS_ABSA_PARTS = OUT_BASE.joinpath("aspects_items_parts")
OUT_USERS_ABSA_FULL  = OUT_BASE.joinpath("aspects_users_full.parquet")
OUT_ITEMS_ABSA_FULL  = OUT_BASE.joinpath("aspects_items_full.parquet")

# Aggregated aspect vectors and the bucket coverage report
OUT_USER_VEC = OUT_BASE.joinpath("user_aspect_vectors.parquet")
OUT_ITEM_VEC = OUT_BASE.joinpath("item_aspect_vectors.parquet")
OUT_COVER    = OUT_BASE.joinpath("aspect_bucket_coverage.csv")

# Wider console rendering for DataFrame previews
pd.set_option("display.max_colwidth", 120)
pd.set_option("display.width", 140)

# [CHANGE vs Sowmya] one canonical ID normaliser used everywhere
def norm_id(x) -> str:
    """Return the canonical form of an ID.

    Missing values (as judged by ``pd.isna``) become the empty string. Anything
    else is stringified, NFKC-normalised, stripped of zero-width characters
    (U+200B, U+200C, U+200D), has whitespace runs collapsed to one space,
    is trimmed, and is lower-cased.
    """
    if pd.isna(x):
        return ""
    cleaned = unicodedata.normalize("NFKC", str(x))
    for zero_width in ("\u200b", "\u200c", "\u200d"):
        cleaned = cleaned.replace(zero_width, "")
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip().lower()

# Echo the resolved configuration so a wrong ROOT is obvious immediately
print("Configured for SF JSONs")
for label, target in (("Reviews:", REVIEWS_JSON), ("Restaurants:", RESTAURANTS_JSON), ("Outputs →", OUT_BASE)):
    print(label, target)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.
Configured for SF JSONs
Reviews: /Users/kienanana/Documents/SCHOOL/Y3S1/BT4222/PROJECT/data/processed/sf-sampled-reviews.json
Restaurants: /Users/kienanana/Documents/SCHOOL/Y3S1/BT4222/PROJECT/data/processed/sf-restaurants.json
Outputs → /Users/kienanana/Documents/SCHOOL/Y3S1/BT4222/PROJECT/data/processed/sf_absa_outputs


In [23]:
# [CHANGE vs Sowmya] no Colab drive / duckdb here; read local JSONL safely

def read_json_lines(path: Path) -> pd.DataFrame:
    """Read a JSON Lines file into a DataFrame, one row per non-blank line.

    Parameters
    ----------
    path : Path
        File containing one JSON document per line; blank lines are skipped.

    Returns
    -------
    pd.DataFrame
        One row per parsed line (an empty frame if the file has no records).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # (e.g. cp1252) fails or mis-decodes non-ASCII review text.
    with open(path, "r", encoding="utf-8") as f:
        rows = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(rows)

reviews = read_json_lines(REVIEWS_JSON)
places  = read_json_lines(RESTAURANTS_JSON)

# Source-key → canonical-column mapping. Edit the KEYS on the left if your JSON
# uses slightly different names; the values are the names the pipeline expects.
rename_map = {
    "user_id": "user_id",
    "gmap_id": "gmap_id",
    "name": "name",
    "text": "text",
    "rating": "rating",
    "time": "time"
}
# Apply the mapping. It was previously defined but never used, so editing it had
# no effect. With the identity mapping above this is a no-op.
reviews = reviews.rename(columns=rename_map)

# enforce required columns
missing = [k for k in ["user_id","gmap_id","text","rating"] if k not in reviews.columns]
assert not missing, f"reviews JSON missing columns: {missing}"

# normalise IDs early (critical fix for Sowmya’s “6 warm users” issue)
reviews["user_id"] = reviews["user_id"].map(norm_id)
reviews["gmap_id"] = reviews["gmap_id"].map(norm_id)

# places metadata is only normalised when it actually carries the join key
if "gmap_id" in places.columns:
    places["gmap_id"] = places["gmap_id"].map(norm_id)

print(f"reviews: {len(reviews):,} rows | users={reviews.user_id.nunique():,} | items={reviews.gmap_id.nunique():,}")

reviews: 411,496 rows | users=131,972 | items=3,721


In [32]:
from pathlib import Path
import os, json
import numpy as np
import pandas as pd
import pyarrow as pa, pyarrow.parquet as pq
from pyabsa import ATEPCCheckpointManager
import torch
import pyabsa.tasks.AspectTermExtraction.models.__lcf__.fast_lcf_atepc as fast_lcf

# Point PyABSA's home at the outputs tree unless PYABSA_HOME is already exported.
# NOTE(review): this runs after `from pyabsa import ...`, and the logged load path
# is the relative `checkpoints/...` — confirm the variable is honoured at all.
if "PYABSA_HOME" not in os.environ:
    os.environ["PYABSA_HOME"] = str(OUT_BASE / "pyabsa_ckpt")

# Checkpoint name passed to the PyABSA checkpoint manager
CKPT = "english"
# Inference batch size; kept small to reduce odd shapes in PyABSA internals
BATCH_SIZE = 16
# Reviews per parquet part written to disk (tweak for your RAM/CPU)
CHUNK = 4000

# --- PyABSA robustness patch v3: tolerate None / 0-D / ragged for labels & ids ---
def _to_2d_int64(x):
    """Coerce x into a padded 2-D int64 numpy array. Handles torch tensors, lists,
    ndarrays (incl. object dtype), 0-D scalars, and None."""
    if x is None:
        return np.zeros((1, 1), dtype=np.int64)
    try:
        arr = x.detach().cpu().numpy()  # torch path
    except Exception:
        arr = np.asarray(x, dtype=object)

    if arr is None:
        return np.zeros((1, 1), dtype=np.int64)
    if np.ndim(arr) == 0:
        try:
            v = arr.item()
        except Exception:
            v = None
        if v is None or (isinstance(v, float) and np.isnan(v)):
            v = 0
        else:
            try:
                v = int(v)
            except Exception:
                v = 0
        arr = np.array([[v]], dtype=object)
    elif np.ndim(arr) == 1:
        arr = np.array([arr], dtype=object)

    rows = []
    for row in arr:
        r = np.asarray(row, dtype=object)
        if r.ndim == 0:
            rows.append(np.array([0], dtype=np.int64))
            continue
        cleaned = []
        for el in np.ravel(r):
            if el is None or (isinstance(el, float) and np.isnan(el)):
                cleaned.append(0)
            else:
                try:
                    cleaned.append(int(el))
                except Exception:
                    cleaned.append(0)
        rows.append(np.array(cleaned, dtype=np.int64))

    maxlen = max((len(r) for r in rows), default=1)
    out = np.zeros((len(rows), maxlen), dtype=np.int64)
    for i, r in enumerate(rows):
        out[i, :len(r)] = r
    return out

def _safe_get_batch_token_labels_bert_base_indices(self, labels):
    """Tolerant replacement for the FAST_LCF_ATEPC method of the same name.

    Coerces ``labels`` to a padded 2-D int64 matrix, then zeroes every position
    after the first occurrence of the separator label (``self.num_labels - 1``)
    in each row. Rows without a separator are left unchanged. The result is a
    ``torch.long`` tensor on the device of ``self.bert4global``.
    """
    label_matrix = _to_2d_int64(labels)
    sep_id = self.num_labels - 1
    for label_row in label_matrix:  # iterating a 2-D ndarray yields views, so edits land in label_matrix
        sep_positions = np.where(label_row == sep_id)[0]
        if sep_positions.size:
            label_row[int(sep_positions[0]) + 1:] = 0
    as_tensor = torch.tensor(label_matrix, dtype=torch.long)
    return as_tensor.to(self.bert4global.device)

def _safe_get_ids_for_local_context_extractor(self, text_indices):
    """Tolerant replacement for the FAST_LCF_ATEPC method of the same name.

    Pads/coerces ``text_indices`` to a 2-D int64 matrix and returns it as a
    ``torch.long`` tensor on the device of ``self.bert4global``.
    """
    target_device = self.bert4global.device
    return torch.tensor(_to_2d_int64(text_indices), dtype=torch.long).to(target_device)

# Swap the two fragile FAST_LCF_ATEPC methods for the tolerant versions above
setattr(fast_lcf.FAST_LCF_ATEPC, "get_batch_token_labels_bert_base_indices", _safe_get_batch_token_labels_bert_base_indices)
setattr(fast_lcf.FAST_LCF_ATEPC, "get_ids_for_local_context_extractor", _safe_get_ids_for_local_context_extractor)

# The extractor must be created only after the class has been patched
extractor = ATEPCCheckpointManager.get_aspect_extractor(
    checkpoint=CKPT,
    batch_size=BATCH_SIZE,
    auto_device=True,
    force_download=False,
)

# Per-item fallback so one bad sample doesn’t nuke a chunk
def safe_extract(texts: list[str]):
    """Run aspect extraction on ``texts``, degrading to per-item calls on failure.

    The whole list is tried in one call first. If that raises, each text is
    retried on its own so one bad sample does not lose the batch. A text that
    still fails contributes the placeholder ``[{}]``, which keeps the fallback
    output index-aligned with ``texts``.

    Failures are reported through ``warnings.warn`` instead of being swallowed
    silently, so the cause and the number of dropped texts are visible.

    Returns
    -------
    Whatever ``extractor.extract_aspect`` returns on the fast path; on the
    fallback path, a list with one list-wrapped result per input text.
    """
    import warnings

    try:
        return extractor.extract_aspect(texts, pred_sentiment=True, print_result=False, save_result=False)
    except Exception as batch_err:
        warnings.warn(
            f"safe_extract: batch of {len(texts)} texts failed "
            f"({type(batch_err).__name__}: {batch_err}); retrying one text at a time",
            RuntimeWarning,
            stacklevel=2,
        )

    out = []
    failed = 0
    for t in texts:
        try:
            r = extractor.extract_aspect([t], pred_sentiment=True, print_result=False, save_result=False)
            out.append(r if isinstance(r, list) else [r])
        except Exception:
            failed += 1
            out.append([{}])  # placeholder keeps positions aligned with `texts`
    if failed:
        warnings.warn(
            f"safe_extract: {failed} of {len(texts)} texts could not be processed and were skipped",
            RuntimeWarning,
            stacklevel=2,
        )
    return out

def _unwrap_absa_result(r):
    """Return the per-sentence result dict from one PyABSA output item, or None."""
    if isinstance(r, dict) and "sentence" in r:
        return r
    if isinstance(r, dict) and r.get("result"):
        return r["result"][0]
    if isinstance(r, list) and r:
        cand = r[0]
        if isinstance(cand, dict) and ("sentence" in cand or "result" in cand or "tokens" in cand):
            return cand
    return None


def _as_list(value):
    """Normalise a field to a list: None → [], list/tuple → list, scalar → [scalar]."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _span_indices(raw):
    """Return the integer token indices in a position entry, skipping unusable values."""
    indices = []
    for x in _as_list(raw):
        try:
            indices.append(int(x))
        except (TypeError, ValueError):
            continue
    return indices


def _float_or_none(value):
    """Convert to float, or None when missing / not numeric."""
    try:
        return None if value is None else float(value)
    except (TypeError, ValueError):
        return None


def extract_to_parts(df: pd.DataFrame, id_cols: list[str], out_dir: Path):
    """Run ABSA over ``df["text"]`` in chunks of ``CHUNK`` and write one parquet part per chunk.

    Parameters
    ----------
    df : DataFrame with a ``text`` column plus every column in ``id_cols``.
    id_cols : columns copied onto every extracted aspect row.
    out_dir : directory receiving ``part-<start>.parquet`` files (created if needed).

    Returns
    -------
    int
        Number of aspect rows written during this call. Parts that already
        exist are skipped and not counted.

    Notes
    -----
    * Missing texts are treated as empty. The old ``astype(str)`` turned NaN
      into the literal "nan", which passed the length filter and was sent to
      the model as a review.
    * Parts are named by their start offset in the filtered frame. Resuming is
      only valid against parts produced from the same input and filter; delete
      stale parts otherwise.
    * Every part uses one explicit schema, so empty chunks and all-null columns
      cannot produce parts with conflicting inferred schemas.
    * Parts are written to a temporary name and renamed, so an interrupted
      write never leaves a truncated file for the resume logic to skip.
    * When an aspect has no usable token span, the model's aspect string is
      used for both ``aspect`` and ``evidence`` and ``position`` is ``[]``.
      The old fallback span ``[0, 0]`` mislabelled the aspect as the first token.
    """
    import warnings

    out_dir.mkdir(parents=True, exist_ok=True)

    # Prefilter pathological texts (missing/blank/ultra-short)
    cleaned = df["text"].fillna("").astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
    df = df.assign(text=cleaned)
    df = df[df["text"].str.len() >= 3].reset_index(drop=True)
    total = len(df)

    # Fixed column order/types; ID column types are taken from the input frame.
    id_types = pa.Schema.from_pandas(df[id_cols], preserve_index=False)
    schema = pa.schema(
        [
            pa.field("aspect", pa.string()),
            pa.field("sentiment", pa.string()),
            pa.field("confidence", pa.float64()),
            pa.field("evidence", pa.string()),
            pa.field("position", pa.string()),
            pa.field("text", pa.string()),
        ]
        + [pa.field(c, id_types.field(c).type) for c in id_cols]
    )

    wrote = 0
    for start in range(0, total, CHUNK):
        stop = min(total, start + CHUNK)
        part = out_dir / f"part-{start:09d}.parquet"
        if part.exists():
            continue

        batch = df.iloc[start:stop].reset_index(drop=True)
        res = safe_extract(batch["text"].tolist())
        if len(res) != len(batch):
            # Results are matched to reviews by position; a length mismatch means
            # that mapping cannot be trusted for this chunk.
            warnings.warn(
                f"extract_to_parts: chunk at {start} returned {len(res)} results for "
                f"{len(batch)} texts; rows are matched by position and may be misaligned",
                RuntimeWarning,
                stacklevel=2,
            )

        rows = []
        for i, r in zip(range(len(batch)), res):
            d = _unwrap_absa_result(r)
            if not isinstance(d, dict) or not d:
                continue

            tokens  = [str(t) for t in _as_list(d.get("tokens"))]
            aspects = _as_list(d.get("aspect"))
            pos     = _as_list(d.get("position"))
            sents   = _as_list(d.get("sentiment"))
            confs   = _as_list(d.get("confidence"))

            review_text = batch.at[i, "text"]
            ids = {}
            for c in id_cols:
                v = batch.at[i, c]
                ids[c] = None if pd.isna(v) else v

            for k in range(min(len(aspects), len(pos), len(sents))):
                span = _span_indices(pos[k])
                if tokens and span and 0 <= min(span) and max(span) < len(tokens):
                    i0, i1 = min(span), max(span)
                    aspect_text = " ".join(tokens[i0:i1 + 1])
                    # evidence = the aspect plus up to 6 tokens of context on each side
                    evidence = " ".join(tokens[max(0, i0 - 6):min(len(tokens), i1 + 7)])
                else:
                    aspect_text = str(aspects[k])
                    evidence = aspect_text

                row = {
                    "aspect": aspect_text,
                    "sentiment": None if sents[k] is None else str(sents[k]),
                    "confidence": _float_or_none(confs[k]) if k < len(confs) else None,
                    "evidence": evidence,
                    "position": json.dumps(span),
                    "text": review_text,
                }
                row.update(ids)
                rows.append(row)

        # An empty chunk still gets a (schema-bearing) part so resume can skip it.
        table = pa.Table.from_pylist(rows, schema=schema)
        tmp = part.with_name(part.name + ".tmp")
        pq.write_table(table, tmp)
        tmp.replace(part)
        wrote += len(rows)
    return wrote

# Users: keep user_id + gmap_id to allow (u,i) grouping later if needed
u_cols = ["user_id", "gmap_id", "text"]
i_cols = ["gmap_id", "text"]

print("Extracting user-aspects…")
wrote_u = extract_to_parts(reviews[u_cols], id_cols=["user_id","gmap_id"], out_dir=OUT_USERS_ABSA_PARTS)

# The item pass used to re-run the model over exactly the same review texts,
# doubling inference time only to produce the user rows without `user_id`.
# Derive each item part from the matching user part instead. Existing item
# parts are left alone, as with the resume logic for user parts.
print("Extracting item-aspects…")
OUT_ITEMS_ABSA_PARTS.mkdir(parents=True, exist_ok=True)
wrote_i = 0
for user_part in sorted(OUT_USERS_ABSA_PARTS.glob("part-*.parquet")):
    item_part = OUT_ITEMS_ABSA_PARTS / user_part.name
    if item_part.exists():
        continue
    user_tbl = pq.read_table(user_part)
    item_tbl = user_tbl.select([c for c in user_tbl.column_names if c != "user_id"])
    pq.write_table(item_tbl, item_part)
    wrote_i += item_tbl.num_rows

# combine parts → *_full.parquet
def concat_parts(src_dir: Path, out_file: Path):
    """Concatenate every ``part-*.parquet`` in ``src_dir`` into ``out_file``.

    Parts are streamed in sorted name order. The output schema is the unified
    schema of all parts, not the schema of whichever part sorts first. A part
    that is empty, or whose column was inferred as all-null, therefore no
    longer causes a schema-mismatch error. The writer is closed even if a part
    fails to read.

    Returns
    -------
    int
        Total rows written. If ``src_dir`` has no parts, nothing is written and
        0 is returned.
    """
    parts = sorted(src_dir.glob("part-*.parquet"))
    if not parts:
        return 0

    # By default unify_schemas promotes null-typed fields to the concrete type
    # seen in other parts; genuinely conflicting types raise.
    schema = pa.unify_schemas([pq.read_schema(p) for p in parts])

    total = 0
    with pq.ParquetWriter(out_file, schema) as writer:
        for p in parts:
            tbl = pq.read_table(p)
            if tbl.num_rows == 0:
                continue  # nothing to append; may also lack columns entirely
            writer.write_table(tbl.select(schema.names).cast(schema))
            total += tbl.num_rows
    return total

# Merge the chunked parts into the two *_full.parquet files (users first, then items)
tot_u, tot_i = (
    concat_parts(parts_dir, full_file)
    for parts_dir, full_file in (
        (OUT_USERS_ABSA_PARTS, OUT_USERS_ABSA_FULL),
        (OUT_ITEMS_ABSA_PARTS, OUT_ITEMS_ABSA_FULL),
    )
)
print(f"users aspects rows: {tot_u:,}  | items aspects rows: {tot_i:,}")

[2025-11-05 20:09:18] (2.4.1) ********** Available ATEPC model checkpoints for Version:2.4.1 (this version) **********
[2025-11-05 20:09:18] (2.4.1) ********** Available ATEPC model checkpoints for Version:2.4.1 (this version) **********
[2025-11-05 20:09:18] (2.4.1) Downloading checkpoint:english 
[2025-11-05 20:09:18] (2.4.1) Notice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets
[2025-11-05 20:09:18] (2.4.1) Checkpoint already downloaded, skip
[2025-11-05 20:09:18] (2.4.1) Load aspect extractor from checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43
[2025-11-05 20:09:18] (2.4.1) config: checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43/fast_lcf_atepc.config
[2025-11-05 20:09:18] (2.4.1) state_dict: checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43/fast_lcf_atepc.state_dict




Extracting user-aspects…


preparing ate inference dataloader: 100%|██████████| 4000/4000 [00:02<00:00, 1911.95it/s]
extracting aspect terms:   0%|          | 0/125 [00:00<?, ?it/s]


TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [None]:
# Canonical 8 aspect buckets (editable): bucket name → trigger words
ASPECT_BUCKETS = dict(
    food=["food","taste","flavor","dish","noodle","sushi","pizza","burger","taco","spicy","fresh","broth","steak","dessert","menu","ramen","bbq"],
    service=["service","staff","waiter","waitress","server","attentive","rude","friendly","manager","host","hosts","hostess"],
    price=["price","cost","value","cheap","expensive","affordable","overpriced","worth","deal","pricing"],
    ambience=["ambience","atmosphere","vibe","noise","music","decor","seating","lighting","crowded","cozy","environment"],
    cleanliness=["clean","dirty","bathroom","restroom","sanitary","hygiene","messy","filthy"],
    portion=["portion","size","serving","amount","quantity","share","big","small"],
    wait_time=["wait","queue","delay","slow","fast","quick","prompt","line","service time"],
    location=["location","parking","nearby","distance","convenient","access","walk","drive"],
)

# Lightweight regex/lexicon mapper (fast, no extra dependencies)
# Matches anything that is not a word character, whitespace or a hyphen
PUNCT_RX = re.compile(r"[^\w\s\-]")
# Function words ignored when picking tokens from an aspect term
STOP = {"the","a","an","and","or","in","at","of","to","for","on","is","are","was","were","very","really","so"}
# Reverse index: trigger word → bucket (a later bucket wins if a word repeats)
LEXICON = dict((word, bucket) for bucket, words in ASPECT_BUCKETS.items() for word in words)

def normalize_term(term: str) -> str:
    """Lower-case ``term``, replace punctuation (per ``PUNCT_RX``) with spaces,
    collapse whitespace runs, and trim.

    Trimming happens last. The original stripped before punctuation was turned
    into spaces, so an input like "food!" came back as "food " with a trailing
    space.
    """
    t = PUNCT_RX.sub(" ", str(term).lower())
    return re.sub(r"\s+", " ", t).strip()

def tokenize_keep_heads(s: str):
    """Split ``s`` on whitespace and return at most the first two usable tokens.

    Leading/trailing hyphens are stripped from each token before filtering.
    Tokens that become empty (a bare "-") or that are stop words once stripped
    ("-the") are dropped. The original filtered first and stripped afterwards,
    so such tokens could take one of the two slots.
    """
    toks = []
    for raw in s.split():
        tok = raw.strip("-")
        if tok and tok not in STOP:
            toks.append(tok)
    return toks[:2]

def map_term_to_bucket(term: str):
    """Map an aspect term to a bucket name from ``LEXICON``, or None if nothing matches.

    The term is normalised and reduced to its leading tokens. The joined
    multi-token phrase is looked up first, so multi-word lexicon entries such
    as "service time" are reachable. With token-only lookup they never matched.
    Otherwise the first token found in the lexicon decides the bucket.
    """
    toks = tokenize_keep_heads(normalize_term(term))
    if len(toks) > 1:
        phrase = " ".join(toks)
        if phrase in LEXICON:
            return LEXICON[phrase]
    for tok in toks:
        if tok in LEXICON:
            return LEXICON[tok]
    return None

In [None]:
# Load extracted aspects (from Cell 4 or precomputed)
assert all(p.exists() for p in (OUT_USERS_ABSA_FULL, OUT_ITEMS_ABSA_FULL)), \
    "Missing ABSA outputs. Run Cell 4 or drop existing *_full.parquet into OUT_BASE."

U_ABSA = pd.read_parquet(OUT_USERS_ABSA_FULL)
I_ABSA = pd.read_parquet(OUT_ITEMS_ABSA_FULL)

# [CHANGE vs Sowmya] re-apply the canonical ID normaliser to whichever ID columns exist
for col in ("user_id", "gmap_id"):
    if col in U_ABSA.columns:
        U_ABSA[col] = U_ABSA[col].map(norm_id)
if "gmap_id" in I_ABSA.columns:
    I_ABSA["gmap_id"] = I_ABSA["gmap_id"].map(norm_id)

# Normalise sentiment labels to Title Case strings (e.g. "positive" → "Positive").
# No rows are filtered here; unrecognised labels are kept as-is.
I_ABSA["sentiment"] = I_ABSA["sentiment"].astype(str).str.title()
U_ABSA["sentiment"] = U_ABSA["sentiment"].astype(str).str.title()

print("U_ABSA rows:", len(U_ABSA), "| I_ABSA rows:", len(I_ABSA))

In [None]:
# sentiment → numeric
def to_numeric_sentiment(x):
    """Map a sentiment label to a polarity score.

    The label is stringified, trimmed and lower-cased. "positive"/"pos"/"+1"/"1"
    map to 1.0, "negative"/"neg"/"-1" map to -1.0, and everything else
    (including neutral and unknown labels) maps to 0.0.
    """
    polarity_by_label = {
        "positive": 1.0, "pos": 1.0, "+1": 1.0, "1": 1.0,
        "negative": -1.0, "neg": -1.0, "-1": -1.0,
    }
    label = str(x).strip().lower()
    return polarity_by_label.get(label, 0.0)

def aggregate_aspect_vectors(df: pd.DataFrame, by: str) -> pd.DataFrame:
    """Aggregate aspect mentions into one row per ``by`` key.

    Parameters
    ----------
    df : frame with columns ``by``, ``bucket``, ``polarity`` and ``confidence``.
    by : name of the grouping column (e.g. "user_id" or "gmap_id").

    Returns
    -------
    pd.DataFrame
        Columns, in order: ``by``, then ``<bucket>_sent`` and ``<bucket>_cnt``
        for each bucket in ``ASPECT_BUCKETS``, then ``total_mentions``.
        ``<bucket>_sent`` is the confidence-weighted mean polarity (missing
        confidence counts as 1.0), or 0.0 when the total weight is not
        positive. ``<bucket>_cnt`` is the number of mentions. Rows are ordered
        by key as produced by ``groupby``.

    Notes
    -----
    Uses one vectorised groupby per bucket rather than a Python loop over every
    (key, bucket) pair. Empty input now returns an empty frame that still has
    all the expected columns; the old version returned a frame with none.
    """
    buckets = list(ASPECT_BUCKETS.keys())

    weight = df["confidence"].fillna(1.0)
    work = df[[by, "bucket"]].assign(_w=weight, _wp=weight * df["polarity"])

    # Every key present in the input gets a row, even with no mention in some bucket.
    keys = df.groupby(by)["bucket"].size().index

    out = pd.DataFrame(index=keys)
    for b in buckets:
        grouped = work[work["bucket"] == b].groupby(by)
        w = grouped["_w"].sum().reindex(keys, fill_value=0.0)
        wp = grouped["_wp"].sum().reindex(keys, fill_value=0.0)
        out[f"{b}_sent"] = (wp / w).where(w > 0, 0.0).astype("float64")
        out[f"{b}_cnt"] = grouped["_w"].size().reindex(keys, fill_value=0).astype("int64")

    out["total_mentions"] = out[[f"{b}_cnt" for b in buckets]].sum(axis=1).astype("int64")
    return out.rename_axis(by).reset_index()

# map terms to buckets
def prepare_absa(df: pd.DataFrame, require_user: bool) -> pd.DataFrame:
    """Turn raw ABSA rows into bucketed rows ready for aggregation.

    Parameters
    ----------
    df : raw ABSA frame; must contain ``aspect``, ``sentiment``, ``gmap_id``
        and, when ``require_user`` is true, ``user_id``. ``confidence`` is optional.
    require_user : whether ``user_id`` is required and kept.

    Returns
    -------
    pd.DataFrame
        The required columns plus ``bucket`` (from ``map_term_to_bucket``),
        ``polarity`` (from ``to_numeric_sentiment``) and ``confidence``
        (numeric, missing/unparseable → 1.0, clipped to [0, 1]). Only rows
        whose aspect mapped to a bucket are kept.

    Raises
    ------
    AssertionError
        If a required column is missing.
    """
    need = ["aspect", "sentiment"] + (["user_id", "gmap_id"] if require_user else ["gmap_id"])
    missing = [c for c in need if c not in df.columns]
    assert not missing, f"ABSA file missing: {missing}"

    has_confidence = "confidence" in df.columns
    out = df[need + (["confidence"] if has_confidence else [])].copy()
    out["bucket"]   = out["aspect"].map(map_term_to_bucket)
    out["polarity"] = out["sentiment"].map(to_numeric_sentiment)
    if has_confidence:
        out["confidence"] = pd.to_numeric(out["confidence"], errors="coerce").fillna(1.0).clip(0, 1)
    else:
        # Without the column, the old `out.get("confidence", 1.0)` handed a bare
        # float to pd.to_numeric and `.fillna` then raised AttributeError.
        out["confidence"] = 1.0
    return out[out["bucket"].notna()].reset_index(drop=True)

# Bucket and score the raw extractions for users and for items
U_map = prepare_absa(U_ABSA, require_user=True)
I_map = prepare_absa(I_ABSA, require_user=False)

# [CHANGE vs Sowmya] neutral mentions stay in the pool (polarity=0) so they still count as support
user_vecs = aggregate_aspect_vectors(U_map, by="user_id")

# Items: pool the user-side mentions with the item-side mentions, keyed by place.
# NOTE(review): when both *_full.parquet files come from the same reviews (as the
# extraction cell in this notebook produces them), every mention appears twice
# here, doubling the item *_cnt / total_mentions values — confirm this is intended.
I_pool = pd.concat(
    [frame[["gmap_id","bucket","polarity","confidence"]] for frame in (U_map, I_map)],
    ignore_index=True,
)
item_vecs = aggregate_aspect_vectors(I_pool, by="gmap_id")

# Persist both vector tables
user_vecs.to_parquet(OUT_USER_VEC, index=False)
item_vecs.to_parquet(OUT_ITEM_VEC, index=False)

print("Saved:")
print(" -", OUT_USER_VEC)
print(" -", OUT_ITEM_VEC)

# Coverage report: how many mapped terms land in each bucket, and their share
cov = (
    pd.concat([U_map[["bucket"]], I_map[["bucket"]]])
    .value_counts()
    .rename_axis(["bucket"])
    .reset_index(name="n_terms")
)
cov["share"] = cov["n_terms"].div(cov["n_terms"].sum())
cov.to_csv(OUT_COVER, index=False)
cov.head(10)

In [None]:
# Support tiers (how much ABSA signal each entity has):
#   cold = 0-2 mentions, light = 3-5, medium = 6-10, heavy = 11+
# include_lowest + an open upper bound mean 0 mentions is "cold" and very large
# counts are "heavy". The old bins (0, 2] ... (10, 999999] turned both into "nan".
for df, key in [(user_vecs, "user_id"), (item_vecs, "gmap_id")]:
    df["support_tier"] = pd.cut(
        df["total_mentions"],
        bins=[0, 2, 5, 10, float("inf")],
        labels=["cold","light","medium","heavy"],
        include_lowest=True,
    ).astype(str)

print("Users by tier:\n", user_vecs["support_tier"].value_counts(normalize=True))
print("\nItems by tier:\n", item_vecs["support_tier"].value_counts(normalize=True))

# ID alignment with reviews universe — should be large, not “6”
U_keep = set(reviews["user_id"])
I_keep = set(reviews["gmap_id"])
print("\nAlignment:")
print("users in vectors ∩ reviews:", len(set(user_vecs["user_id"]) & U_keep))
print("items in vectors ∩ reviews:", len(set(item_vecs["gmap_id"]) & I_keep))

# quick heads
display(user_vecs.head(3))
display(item_vecs.head(3))