### Aspect feature generator
- Goal: represent each user and restaurant’s preference for a certain aspect
    - price
    - food
    - service
- Process
    1. get aspects (neutral sentiment)
        - run every review through ABSA to get aspects
        - pre-process aspects (lemmatisation)
        - filter for pre-defined keywords
    2. at this stage, every review will have a score for our predefined keyword, e.g 
        1. service: 1
        2. food: 0
        3. price: 1
        
        interpretation here is, if 1 → this aspect was mentioned in the text
        
    3. aggregate user aspects
        1. every user will have a score of
            1. price: (1+1+0+1+0)/5  0.3
            2. food: → 0.4
            3. price → 0.9
        2. 1 aspector [0.3, 0.4, 0.9]
    4. aggregate restaurants aspects

In [5]:
from pathlib import Path
import json, re
import pandas as pd
import numpy as np
from typing import Dict, List

# Project paths (adjust if your notebook runs elsewhere)
ROOT = Path("../..").resolve()
DATA_PROCESSED = ROOT / "data" / "processed"
REVIEWS_PATH = DATA_PROCESSED / "sf-reviews.json"           # full data
REVIEWS_SAMPLED_PATH = DATA_PROCESSED / "sf-sampled-reviews.json"  # optional subset

OUT_DIR = DATA_PROCESSED / "features" / "aspects"
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [8]:
# Canonical buckets (editable)
ASPECT_BUCKETS: Dict[str, List[str]] = {
    "food":       ["food","taste","flavor","dish","noodle","sushi","pizza","burger","taco","spicy",
                   "fresh","broth","steak","dessert","menu","ramen","bbq"],
    "service":    ["service","staff","waiter","waitress","server","attentive","rude","friendly",
                   "manager","host","hosts","hostess"],
    "price":      ["price","cost","value","cheap","expensive","affordable","overpriced","worth",
                   "deal","pricing"],
    "ambience":   ["ambience","atmosphere","vibe","noise","music","decor","seating","lighting",
                   "crowded","cozy","environment"],
    "cleanliness":["clean","dirty","bathroom","restroom","sanitary","hygiene","messy","filthy"],
    "portion":    ["portion","size","serving","amount","quantity","share","big","small"],
    "wait_time":  ["wait","queue","delay","slow","fast","quick","prompt","line","service time"],
    "location":   ["location","parking","nearby","distance","convenient","access","walk","drive"]
}

PUNCT_RX = re.compile(r"[^\w\s\-]")
WS_RX = re.compile(r"\s+")
STOP = set(["the","a","an","and","or","in","at","of","to","for","on","is","are","was","were","very","really","so"])

def normalize_text(s: str):
    s = str(s).lower()
    s = PUNCT_RX.sub(" ", s)
    s = WS_RX.sub(" ", s).strip()
    return s

# Build one compiled regex per aspect. We use word boundaries and allow simple hyphen spacing.
def compile_aspect_regex(buckets: Dict[str, List[str]]) -> Dict[str, re.Pattern]:
    rx = {}
    for aspect, words in buckets.items():
        # escape and make "service time" work too
        alts = []
        for w in words:
            w = re.escape(w)
            w = w.replace("\\ ", r"\s+")
            alts.append(w)
        pattern = r"\b(" + "|".join(alts) + r")\b"
        rx[aspect] = re.compile(pattern)
    return rx

ASPECT_RX = compile_aspect_regex(ASPECT_BUCKETS)
ASPECT_LIST = list(ASPECT_BUCKETS.keys())

In [14]:
def load_reviews_json(path: Path, max_rows: int|None=None):
    """
    Expects objects with {'user_id','gmap_id','text','time',...}.
    Handles JSONL (one object per line) or a single large JSON array.
    """
    if not path.exists():
        raise FileNotFoundError(path)
    rows = []
    with path.open("r", encoding="utf-8") as f:
        sample = f.read(1)
        f.seek(0)
        if sample == "[":
            data = json.load(f)
            rows = data if max_rows is None else data[:max_rows]
        else:
            for i, line in enumerate(f):
                if max_rows is not None and i >= max_rows:
                    break
                if line.strip():
                    rows.append(json.loads(line))
    df = pd.DataFrame(rows)
    need = ["user_id","gmap_id","text"]
    miss = [c for c in need if c not in df.columns]
    if miss:
        raise ValueError(f"Missing columns {miss} in {path}")
    df = df[df["user_id"].notna() & df["gmap_id"].notna() & df["text"].notna()].copy()
    df["user_id"] = df["user_id"].astype(str)
    df["gmap_id"] = df["gmap_id"].astype(str)
    return df

try:
    df_reviews = load_reviews_json(REVIEWS_SAMPLED_PATH)  
except Exception:
    df_reviews = load_reviews_json(REVIEWS_PATH)

print(len(df_reviews))
df_reviews.head()

220550


Unnamed: 0,user_id,name,time,rating,text,gmap_id
0,1.111581421e+20,Tiffany Duong,1412890409813,5,Lazing your day away in cafes is one of the be...,0x1532f63595a49fdd:0xac59d1857e129a53
3,1.120315816e+20,Ev,1481342473506,4,A longtime favorite neighborhood spot. Don't t...,0x1532f63595a49fdd:0xac59d1857e129a53
4,1.183536215e+20,Daniel Ramsay,1503152603542,4,Wonderful french brasserie in a great location...,0x1532f63595a49fdd:0xac59d1857e129a53
7,1.056088314e+20,Matt Sheets,1555537028651,5,One of my favorite restaurants in all of SF!,0x1532f63595a49fdd:0xac59d1857e129a53
11,1.04019159e+20,Kelly Kemp,1556947452894,5,"Were I a hacker, I'd apply my skill base to ad...",0x1532f63595a49fdd:0xac59d1857e129a53


### Review-level aspect heads, (1 if mentioned, else 0)

In [12]:
def aspect_flags_for_text(s: str, aspect_rx: Dict[str, re.Pattern]):
    s = normalize_text(s)
    flags = {}
    for a, rx in aspect_rx.items():
        flags[a] = 1 if rx.search(s) else 0
    return flags

# Vectorized apply → one row per review with aspect 0/1 columns
flag_cols = [f"asp_{a}" for a in ASPECT_LIST]

def compute_review_flags(df: pd.DataFrame):
    out = df[["user_id","gmap_id","text"]].copy()
    flags = df["text"].map(lambda t: aspect_flags_for_text(t, ASPECT_RX))
    # explode flags dicts into columns
    flags_df = pd.DataFrame(list(flags))
    flags_df.columns = [f"asp_{c}" for c in flags_df.columns]
    out = pd.concat([out, flags_df], axis=1).fillna(0).astype({c:int for c in flags_df.columns})
    out["any_aspect_mentioned"] = out[flags_df.columns].sum(axis=1) > 0
    return out

df_flags = compute_review_flags(df_reviews)
df_flags.head()

Unnamed: 0,user_id,gmap_id,text,asp_food,asp_service,asp_price,asp_ambience,asp_cleanliness,asp_portion,asp_wait_time,asp_location,any_aspect_mentioned
0,1.111581421e+20,0x1532f63595a49fdd:0xac59d1857e129a53,Lazing your day away in cafes is one of the be...,1,0,0,0,0,1,1,0,True
3,1.120315816e+20,0x1532f63595a49fdd:0xac59d1857e129a53,A longtime favorite neighborhood spot. Don't t...,0,0,0,0,0,0,0,0,False
4,1.183536215e+20,0x1532f63595a49fdd:0xac59d1857e129a53,Wonderful french brasserie in a great location...,1,1,0,0,0,0,0,1,True
7,1.056088314e+20,0x1532f63595a49fdd:0xac59d1857e129a53,One of my favorite restaurants in all of SF!,1,0,0,0,0,1,0,0,True
11,1.04019159e+20,0x1532f63595a49fdd:0xac59d1857e129a53,"Were I a hacker, I'd apply my skill base to ad...",0,0,0,0,0,0,0,0,False


In [15]:
review_counts = df_flags['any_aspect_mentioned'].value_counts()
print(review_counts)

any_aspect_mentioned
False    167787
True     154406
Name: count, dtype: int64


In [18]:
# Set pandas display option to show full text in DataFrame
pd.set_option('display.max_colwidth', None)

false_aspect_reviews = df_flags[df_flags['any_aspect_mentioned'] == False]
false_aspect_reviews_sample = false_aspect_reviews.sample(n=5, random_state=42)

# Display the entire text of the sample reviews
print(false_aspect_reviews_sample['text'])


294029    Coffee $1.50 /2.00. One Hot Dog Is $5.00.  CCSF FAST FOOD
40537                      Frickin' awesomely delicious sandwiches!
136060                                                            0
79058               Average food and the chai tea was pretty awful.
399476                                          Try the meatballs!!
Name: text, dtype: object


### Aggregate to Users & Restaurants (neutral “preference” = share of reviews mentioning the aspect)

In [19]:
def aggregate_preferences(df_flags: pd.DataFrame, by: str) -> pd.DataFrame:
    # proportion of reviews where aspect appeared (neutral)
    grp = df_flags.groupby(by)
    counts = grp[flag_cols].count().rename(columns={c:f"{c}_n" for c in flag_cols})
    sums   = grp[flag_cols].sum().rename(columns={c:f"{c}_sum" for c in flag_cols})
    base = grp.size().rename("n_reviews").to_frame()

    out = base.join(counts).join(sums).reset_index()
    for a in ASPECT_LIST:
        num = out[f"asp_{a}_sum"]
        den = out[f"asp_{a}_n"].replace(0, np.nan)
        out[f"{a}_pref"] = (num / den).fillna(0.0)

    # Compact vector (ordered) for quick consumption: [food_pref, service_pref, ...]
    out["aspector"] = out.apply(lambda r: [float(r[f"{a}_pref"]) for a in ASPECT_LIST], axis=1)
    return out[[by, "n_reviews"] + [f"{a}_pref" for a in ASPECT_LIST] + ["aspector"]]

user_prefs = aggregate_preferences(df_flags, by="user_id")
item_prefs = aggregate_preferences(df_flags, by="gmap_id")

user_prefs.head(3), item_prefs.head(3)

(           user_id  n_reviews  food_pref  service_pref  price_pref  \
 0                0     101643   0.512126      0.271735    0.121336   
 1  1.000001066e+20         13   0.230769      0.230769    0.076923   
 2  1.000001862e+20          1   0.000000      1.000000    0.000000   
 
    ambience_pref  cleanliness_pref  portion_pref  wait_time_pref  \
 0       0.103578          0.027498      0.072942        0.102860   
 1       0.000000          0.000000      0.000000        0.076923   
 2       0.000000          0.000000      0.000000        0.000000   
 
    location_pref  \
 0       0.044519   
 1       0.000000   
 2       0.000000   
 
                                                                                                                                                                aspector  
 0  [0.5121257735407259, 0.2717353875820273, 0.12133644225377055, 0.1035782100095432, 0.02749820450006395, 0.0729415700048208, 0.10286001003512293, 0.04451856005824307]  
 1      

In [22]:
import pyarrow as pa
import pyarrow.parquet as pq

def coerce_for_parquet_with_list(df, keycol):
    df = df.copy()
    df[keycol] = df[keycol].astype(str)
    df["n_reviews"] = pd.to_numeric(df["n_reviews"], errors="coerce").fillna(0).astype("int32")
    for a in ASPECT_LIST:
        df[f"{a}_pref"] = pd.to_numeric(df[f"{a}_pref"], errors="coerce").astype("float32")
    # ensure list of floats
    df["aspector"] = df.apply(lambda r: [float(r[f"{a}_pref"]) for a in ASPECT_LIST], axis=1)
    return df

user_pq = coerce_for_parquet_with_list(user_prefs, "user_id")
item_pq = coerce_for_parquet_with_list(item_prefs, "gmap_id")

# build schema
def make_schema(key_name):
    fields = [pa.field(key_name, pa.string()),
              pa.field("n_reviews", pa.int32())]
    fields += [pa.field(f"{a}_pref", pa.float32()) for a in ASPECT_LIST]
    fields += [pa.field("aspector", pa.list_(pa.float32()))]
    return pa.schema(fields)

user_schema = make_schema("user_id")
item_schema = make_schema("gmap_id")

table_u = pa.Table.from_pandas(user_pq, schema=user_schema, preserve_index=False)
table_i = pa.Table.from_pandas(item_pq, schema=item_schema, preserve_index=False)

U_OUT = OUT_DIR / "user_aspect_prefs.parquet"
I_OUT = OUT_DIR / "item_aspect_prefs.parquet"
pq.write_table(table_u, U_OUT)
pq.write_table(table_i, I_OUT)

print("Saved:", U_OUT)
print("Saved:", I_OUT)


Saved: /Users/kienanana/Documents/SCHOOL/Y3S1/BT4222/PROJECT/data/processed/features/aspects/user_aspect_prefs.parquet
Saved: /Users/kienanana/Documents/SCHOOL/Y3S1/BT4222/PROJECT/data/processed/features/aspects/item_aspect_prefs.parquet


In [23]:
def quick_summary(df, keycol):
    nz = df[[f"{a}_pref" for a in ASPECT_LIST]].gt(0).sum(axis=1)
    return {
        "N": len(df),
        "mean_nonzero_aspects": float(nz.mean()),
        "pct_has_≥2_aspects": float((nz >= 2).mean()),
        "mean_reviews_per_entity": float(df["n_reviews"].mean()),
        "median_reviews_per_entity": float(df["n_reviews"].median()),
    }

print("[Users]", quick_summary(user_prefs, "user_id"))
print("[Items]", quick_summary(item_prefs, "gmap_id"))

[Users] {'N': 86303, 'mean_nonzero_aspects': 1.3114839576839739, 'pct_has_≥2_aspects': 0.37034633790250626, 'mean_reviews_per_entity': 3.7332769428640953, 'median_reviews_per_entity': 1.0}
[Items] {'N': 3722, 'mean_nonzero_aspects': 3.5548092423428264, 'pct_has_≥2_aspects': 0.5042987641053197, 'mean_reviews_per_entity': 86.5644814615798, 'median_reviews_per_entity': 56.0}


In [28]:
# sample of user preferences
user_sample = user_prefs.sample(5)
user_sample_df = pd.DataFrame(user_sample)
print("User Preferences Sample:")
user_sample_df

User Preferences Sample:


Unnamed: 0,user_id,n_reviews,food_pref,service_pref,price_pref,ambience_pref,cleanliness_pref,portion_pref,wait_time_pref,location_pref,aspector
19742,1.04136903e+20,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
31460,1.066186299e+20,5,0.6,0.0,0.2,0.2,0.0,0.0,0.0,0.0,"[0.6, 0.0, 0.2, 0.2, 0.0, 0.0, 0.0, 0.0]"
62874,1.133668063e+20,17,0.058824,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,"[0.058823529411764705, 0.11764705882352941, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
41880,1.088242132e+20,1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
30690,1.064592677e+20,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"


In [30]:
# sample of item preferences
item_sample = item_prefs.sample(5)
item_sample_df = pd.DataFrame(item_sample)
print("\nItem Preferences Sample:")
item_sample_df


Item Preferences Sample:


Unnamed: 0,gmap_id,n_reviews,food_pref,service_pref,price_pref,ambience_pref,cleanliness_pref,portion_pref,wait_time_pref,location_pref,aspector
2610,0x808f7d9faab8a4fb:0xda5bad5f2d93eff8,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2267,0x808587530515a969:0x95e6fa774ed53e17,110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
942,0x80858094e74fcae3:0x71a91d7e84f99922,125,0.552,0.336,0.144,0.136,0.024,0.04,0.064,0.064,"[0.552, 0.336, 0.144, 0.136, 0.024, 0.04, 0.064, 0.064]"
3072,0x808f7e3c72d32ecf:0x1d763751efbe1d8b,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
2964,0x808f7e29399e8f23:0xe6859d703c343638,50,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
