In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
import joblib
import requests
import ast
import re
import unicodedata

In [4]:
# Remove the cap on displayed columns so wide frames render in full.
pd.set_option("display.max_columns",None)

### Raw Data Import and Dataset Build

In [5]:
# Directory that holds the raw Yelp JSON files, resolved to an absolute path
# relative to the current working directory.
DATA = Path("dataset").resolve()
# Directory (not a DataFrame, despite the name) where the pickled working
# frame is read from by the load cell below.
TEST_DF = Path("scratch_work_files").resolve()

In [6]:
# # Load only the first 250,000 reviews and the first 250,000 businesses for testing
# reviews = []
# businesses = []
# with open(DATA / "yelp_academic_dataset_review.json", "r") as f1:
#     for i, line in enumerate(f1):
#         if i >= 250000:
#             break
#         reviews.append(json.loads(line))

# with open(DATA / "yelp_academic_dataset_business.json","r") as f2:
#     for i, line in enumerate(f2):
#         if i >= 250000:
#             break
#         businesses.append(json.loads(line))

# reviews_df = pd.merge(
#   pd.DataFrame(reviews),
#   pd.DataFrame(businesses),
#   how = "inner",
#   on = "business_id",
#   suffixes = ["_review","_restaurant"]
#   ).dropna()

# joblib.dump(reviews_df,TEST_DF / "review_testing_data.pkl")


In [7]:
# Load the cached working frame from disk.
# Presumably this is the merged review/business frame written by the
# commented-out build cell above — TODO confirm.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files that
# were produced locally by a trusted run of this notebook.
reviews_df_complete = joblib.load(TEST_DF / "review_testing_data.pkl")

In [8]:
# Evaluation sample configuration. A fixed seed keeps the sampled reviews (and
# therefore every downstream LLM result) reproducible across kernel restarts.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42

# Build the row filters against the untouched source frame so this cell is
# idempotent: re-running it always starts from reviews_df_complete.
categories_lower = reviews_df_complete["categories"].str.lower()
# na=False: a missing category string counts as "no match" instead of
# producing NaN in the mask (which boolean indexing rejects).
is_restaurant = categories_lower.str.contains("restaurants", regex=False, na=False)
is_hotel = categories_lower.str.contains("hotels", regex=False, na=False)
below_five_stars = reviews_df_complete["stars_review"] < 5

reviews_filtered = reviews_df_complete[is_restaurant & ~is_hotel & below_five_stars]

reviews_df = (
    reviews_filtered
    # Never ask for more rows than exist; DataFrame.sample raises otherwise.
    .sample(n=min(SAMPLE_SIZE, len(reviews_filtered)), random_state=SAMPLE_SEED)
    .sort_values(by="name", ascending=True)
    # Fresh 0..n-1 index so later column-wise concats align row for row.
    .reset_index(drop=True)
)

### Function Builds

In [9]:
def CleanReviewText(text):
    """
    Normalize a review string before it is sent to the LLM.

    Non-string input yields an empty string. Otherwise the text is folded to
    plain ASCII (accents dropped, unrepresentable characters discarded), every
    ampersand becomes the word "and", and all runs of whitespace collapse to a
    single space with the ends trimmed.
    """
    # Guard clause: anything that is not a str (NaN, None, numbers) is treated as empty
    if not isinstance(text, str):
        return ""

    # Decompose accented characters, then drop whatever is not ASCII (e.g., é → e)
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_only = decomposed.encode("ascii", "ignore").decode("utf-8")

    # Swap each ampersand (and any whitespace hugging it) for " and "
    with_and = re.sub(r'\s*&\s*', ' and ', ascii_only)

    # Squash tabs, newlines and repeated spaces down to one space
    single_spaced = re.sub(r'\s+', ' ', with_and)

    # Drop whitespace at either end
    return single_spaced.strip()


# Overwrite the review text column in place with its cleaned form.
# Note: after this cell runs, the original (uncleaned) text is only available
# from reviews_df_complete.
reviews_df["text"] = reviews_df["text"].apply(CleanReviewText)

In [10]:
def FormatClassList(values, max_items=None):
    """
    Render a sequence of class labels as an English list.

    Zero items give "", one item is returned as-is, two items are joined with
    "and", and three or more use commas with a final ", and" (Oxford comma).
    A truthy max_items keeps only that many leading items first.
    """
    items = values[:max_items] if max_items else values

    # Guard clauses for the short cases
    if not items:
        return ""
    if len(items) == 1:
        return items[0]

    leading, last = items[:-1], items[-1]
    if len(items) == 2:
        return f"{leading[0]} and {last}"

    # Three or more: comma-separated with an Oxford comma before the last item
    return f"{', '.join(leading)}, and {last}"

In [None]:
def BuildPrompt(business_name,classes,reprocess = False):
    """
    Build the prompt that will be passed to the LLM.
    Using a function allows the prompt to be printed and previewed.

    Parameters
    ----------
    business_name : str
        Name of the restaurant, interpolated into the instructions.
    classes : list of str
        Allowed labels. The prompt indexes positions 0 through 3, so at least
        four entries are required: position 0 is used as the all-positive
        label in the examples, positions 1 and 2 as the negative example
        labels, and the last entry as the "aspect not addressed" label.
    reprocess : bool, default False
        When truthy, a reminder to stay within the allowed labels is appended.

    Returns
    -------
    str
        The full prompt text.
    """

    # Every label in the prompt comes from the `classes` argument; nothing is
    # read from notebook-level globals, so the function works on a fresh
    # kernel and with any class list that is passed in.
    prompt = f"""
    You are a restaurant review classifier that is reviewing {business_name}.
    Given a review of {business_name}, classify the reviewer's attitude toward three key aspects: food and drinks (including prices), service, and ambiance.
    Return a JSON-style matrix in this form: {{"food": "[class]", "service": "[class]", "ambiance": "[class]"}}.
    For [class], return one of four possible values: {FormatClassList(classes)}.
    {FormatClassList(classes[:-1])} are classifications of the reviewer's feelings toward their experience.
    Return "{FormatClassList(classes[-1:])}" if the reviewer does not address a certain aspect of the experience.
    Here is an example review: "The appetizers at this place were phenomenal, but the margaritas were overpriced and watery. Service was HORRENDOUS."
    In this example, return: {{"food": "{classes[1]}", "service": "{classes[2]}", "ambiance": "{classes[3]}"}}.
    Here is another example: "Our server was outstanding and the entrees were perfectly cooked. We loved the decor, too!"
    In this example, return: {{"food": "{classes[0]}", "service": "{classes[0]}", "ambiance": "{classes[0]}"}}.
    If the reviewer says multiple things about an aspect, bias toward the negative; that is, classify according to the more negative statements and choose "{classes[0]}" only if the review is solely comprised of positive statements.
    No other output is desired.
    """
    if reprocess:
        prompt += f"\nPlease focus on selecting a class from these values: {FormatClassList(classes)}. Other class choices, even if more descriptive, are not helpful in this instance."

    return prompt

In [29]:
def ClassifyReview(review_text, business_name, classes, model='mistral',reprocess = False, timeout=None):
    """
    Build the prompt and pass to the LLM for evaluation.

    Sends one non-streaming chat request to the chat endpoint on
    localhost:11434, with the classification instructions as the system
    message and the review as the user message.

    Parameters
    ----------
    review_text : str
        The review to classify.
    business_name : str
        Restaurant name, forwarded to BuildPrompt.
    classes : list of str
        Allowed labels, forwarded to BuildPrompt.
    model : str, default 'mistral'
        Model name placed in the request payload.
    reprocess : bool, default False
        Forwarded to BuildPrompt to append the stricter reminder.
    timeout : float or tuple, optional
        Passed straight to requests.post. The default (None) keeps the
        previous behavior of waiting indefinitely; set a number of seconds to
        stop one stuck request from hanging the whole loop.

    Returns
    -------
    str
        The raw text content of the model's reply.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """

    url = "http://localhost:11434/api/chat"
    headers = {"Content-Type": "application/json"}

    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": BuildPrompt(business_name,classes,reprocess)
            },
            {
                "role": "user",
                "content": review_text
            }
        ],
        "stream": False
    }

    response = requests.post(url, json=payload, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of surfacing them later as a
    # confusing KeyError on the missing "message" field.
    response.raise_for_status()
    return response.json()["message"]["content"]


In [13]:
def ParseReviewAnalyticsOutput(text):
    """
    Parses potentially messy JSON-like LLM output by:
    - Dropping any commentary before the first opening brace
    - Truncating after the first closing brace that follows it
    - Quoting unquoted keys
    - Quoting unquoted string values
    - Falling back to regex if JSON parsing fails

    Parameters
    ----------
    text : str
        Raw model reply. Non-string input yields all-"none" labels.

    Returns
    -------
    dict
        {"food": ..., "service": ..., "ambiance": ...} with each label
        lower-cased and stripped; an aspect that cannot be found is "none".
        Labels are NOT validated against the allowed class list here.
    """
    aspects = ("food", "service", "ambiance")

    def _normalize(value):
        # JSON null means the model gave no label; anything else is coerced
        # to text so a stray number/list cannot crash the parse.
        if value is None:
            return "none"
        return str(value).strip().lower()

    if not isinstance(text, str):
        return {aspect: "none" for aspect in aspects}

    # Step 0: Isolate the first {...} span. Leading commentary ("Here is the
    # result: {...}") would otherwise make json.loads fail; trailing
    # commentary is cut at the first closing brace.
    text = text.strip()
    opening_brace_index = text.find("{")
    if opening_brace_index != -1:
        text = text[opening_brace_index:]
    closing_brace_index = text.find("}")
    if closing_brace_index != -1:
        text = text[:closing_brace_index + 1]

    # Step 1: Fix unquoted keys
    text = re.sub(r'([{,]\s*)(\w+)(\s*:)', r'\1"\2"\3', text)

    # Step 2: Fix unquoted string values (assume words are categorical, not numbers or booleans)
    text = re.sub(r':\s*([a-zA-Z_]+)(\s*[,}])', r': "\1"\2', text)

    # Try JSON parsing; json.JSONDecodeError is a ValueError subclass.
    try:
        parsed = json.loads(text)
    except ValueError:
        parsed = None

    if isinstance(parsed, dict):
        # Match keys case-insensitively so {"Food": ...} is still recognized.
        by_lower_key = {str(key).strip().lower(): value for key, value in parsed.items()}
        return {aspect: _normalize(by_lower_key.get(aspect, "none")) for aspect in aspects}

    # Fallback regex-based parsing. The aspect name may itself be wrapped in
    # quotes ("food": "angry"), so optional quotes are allowed on both sides.
    result = {aspect: "none" for aspect in aspects}
    for aspect in aspects:
        match = re.search(rf'["\']?{aspect}["\']?\s*[:=]\s*["\']?(\w+)["\']?', text, re.IGNORECASE)
        if match:
            result[aspect] = match.group(1).lower()

    return result


In [14]:
def GenerateReprocessData(df,classes):
    """
    Split a results frame into compliant rows and rows needing another pass.

    A row needs reprocessing when any of its "food", "service" or "ambiance"
    labels is not a member of `classes`.

    Returns a tuple (compliant, to_reprocess): `compliant` keeps its original
    index labels; `to_reprocess` is re-indexed from 0.
    """
    # One boolean mask per aspect column, True where the label is not allowed
    bad_food, bad_service, bad_ambiance = (
        ~df[aspect_col].isin(classes) for aspect_col in ("food", "service", "ambiance")
    )
    needs_reprocess = bad_food | bad_service | bad_ambiance

    flagged = df[needs_reprocess]
    compliant = df.drop(labels = flagged.index)
    return compliant, flagged.reset_index(drop=True)

### Review Evaluation Config

In [48]:
# Allowed sentiment labels. Order matters: BuildPrompt uses index 0 as the
# all-positive example label, indexes 1-3 in its first worked example, and
# treats the last entry as the "aspect not addressed" label.
valid_classes = ["positive","disappointed","angry","none"]
# Preview the exact system prompt (with the reprocess reminder appended),
# using a placeholder in place of a real restaurant name.
print(BuildPrompt(business_name="{restaurant name}",classes=valid_classes,reprocess=True))


    You are a restaurant review classifier that is reviewing {restaurant name}.
    Given a review of {restaurant name}, classify the reviewer's attitude toward three key aspects: food and drinks (including prices), service, and ambiance.
    Return a JSON-style matrix in this form: {"food": "[class]", "service": "[class]", "ambiance": "[class]"}.
    For [class], return one of four possible values: positive, disappointed, angry, and none.
    positive, disappointed, and angry are classifications of the reviewer's feelings toward their experience.
    Return "none" if the reviewer does not address a certain aspect of the experience.
    Here is an example review: "The appetizers at this place were phenomenal, but the margaritas were overpriced and watery. Service was HORRENDOUS."
    In this example, return: {"food": "disappointed", "service": "angry", "ambiance": "none"}.
    Here is another example: "Our server was outstanding and the entrees were perfectly cooked. We loved the deco

### Process Analytics

In [None]:
%%time

# First pass: classify every sampled review, one blocking HTTP request per row.
# Only the raw reply strings are collected here; parsing happens in a later cell.
# NOTE(review): a single failed request aborts the loop and discards nothing
# already appended, but the remaining rows are not processed.
result_list = []
for review in reviews_df.itertuples():
    result = ClassifyReview(review.text,business_name=review.name,classes=valid_classes,reprocess = False)
    result_list.append(result)

CPU times: user 310 ms, sys: 81.1 ms, total: 391 ms
Wall time: 4min 31s


In [17]:
# Parse each raw reply into its three aspect labels and attach them, column-wise,
# to the matching review's name and text (rows align on the shared 0..n-1 index).
result_df = pd.concat(
    [
        reviews_df[["name","text"]],
        pd.DataFrame([ParseReviewAnalyticsOutput(raw_reply) for raw_reply in result_list]),
    ],
    axis = 1,
)

In [18]:
# result_df["check_col"] = (
# result_df["food"].str.strip().str.lower()
# + result_df["service"].str.strip().str.lower()
# + result_df["ambiance"].str.strip().str.lower()
# )

# result_df[result_df["check_col"] == "nonenonenone"]

### Reprocess Analytics for Noncompliant Rows

In [19]:
# from random import randint
# counter = 0
# while counter < 5:
#     result_df.loc[randint(0,99),["food"]] = "N/A"
#     result_df.loc[randint(0,99),["ambiance"]] = "N/A"
#     result_df.loc[randint(0,99),["service"]] = "N/A"
#     counter +=1


In [20]:
# Move every row with at least one label outside valid_classes out of
# result_df and into to_reprocess.
# Note: result_df is rebound here, so re-running this cell alone yields an
# empty to_reprocess.
result_df,to_reprocess = GenerateReprocessData(result_df,valid_classes)

In [38]:
# Inspect the rows whose first-pass labels fell outside valid_classes.
to_reprocess

Unnamed: 0,name,text,food,service,ambiance
0,Cyrano's,We came here just for desert later in the even...,okay,angry,none
1,Joey B's On The Hill,We stopped for lunch on Sunday. We've tried to...,disappointed,positive,mixed
2,Sculley's Waterfront Restaurant,Pretty good food and drinks - would be happier...,positive,mixed,positive
3,Swiss Haus Cafe & Pastry Bar,When we first moved into Philadelphia and I wa...,disappointed,none,neutral
4,The Black Sheep Pub & Restaurant,A friend and I stopped by here to grab somethi...,disappointed,none (somewhat slow mentioned but with benefit...,none


In [39]:
%%time

# Second pass: re-classify only the noncompliant rows, this time with
# reprocess=True so the prompt carries the extra "stay within these values" reminder.
repr_result_list = []
for review_repr in to_reprocess.itertuples():
    repr_result = ClassifyReview(review_repr.text,business_name=review_repr.name,classes=valid_classes,reprocess = True)
    repr_result_list.append(repr_result)

CPU times: user 15.8 ms, sys: 5.41 ms, total: 21.2 ms
Wall time: 17.3 s


In [40]:
# Parse the second-pass replies and pair them, column-wise, with the
# corresponding name/text rows of to_reprocess (both use a 0..n-1 index).
reprocessed = pd.concat(
    [
        to_reprocess[["name","text"]],
        pd.DataFrame([ParseReviewAnalyticsOutput(raw_reply) for raw_reply in repr_result_list]),
    ],
    axis = 1,
)

### Final Reprocess Cycle: Replace Noncompliant Rows With 'none'

In [42]:
# Inspect the second-pass labels before any remaining invalid ones are overwritten.
reprocessed

Unnamed: 0,name,text,food,service,ambiance
0,Cyrano's,We came here just for desert later in the even...,disappointed,angry,none
1,Joey B's On The Hill,We stopped for lunch on Sunday. We've tried to...,disappointed,positive,none
2,Sculley's Waterfront Restaurant,Pretty good food and drinks - would be happier...,positive,disappointed,positive
3,Swiss Haus Cafe & Pastry Bar,When we first moved into Philadelphia and I wa...,disappointed,none,disappointed
4,The Black Sheep Pub & Restaurant,A friend and I stopped by here to grab somethi...,disappointed,none,positive


In [43]:
# Last resort: any label that is still outside valid_classes after the second
# pass is overwritten with "none", one aspect column at a time.
for aspect_col in ("food", "service", "ambiance"):
    still_invalid = ~reprocessed[aspect_col].isin(valid_classes)
    reprocessed.loc[still_invalid,[aspect_col]] = "none"


In [46]:
# Append the reprocessed rows beneath the rows that were compliant on the
# first pass and renumber the index.
# Note: result_df is rebound from itself, so running this cell more than once
# appends the reprocessed rows again (duplicates).
result_df = pd.concat([result_df,reprocessed],axis = 0).reset_index(drop = True)

In [47]:
# Final labelled reviews: first-pass compliant rows followed by reprocessed rows.
result_df

Unnamed: 0,name,text,food,service,ambiance
0,101 Taiwanese Cuisine,Overrated. Wait is long. Took me 90mins to get...,disappointed,angry,none
1,1200 Chophouse,We came on a Saturday evening. We got the Surf...,disappointed,positive,positive
2,312 Pizza Company,"I love me some deep dish pizza, I was born in ...",positive,positive,positive
3,6 North Cafe,This was my first time at this place. I went h...,positive,none,positive
4,Akira Hibachi & Sushi Bar,"Place was okay, sushi was good, hibachi was wa...",disappointed,positive,none
...,...,...,...,...,...
95,Cyrano's,We came here just for desert later in the even...,disappointed,angry,none
96,Joey B's On The Hill,We stopped for lunch on Sunday. We've tried to...,disappointed,positive,none
97,Sculley's Waterfront Restaurant,Pretty good food and drinks - would be happier...,positive,disappointed,positive
98,Swiss Haus Cafe & Pastry Bar,When we first moved into Philadelphia and I wa...,disappointed,none,disappointed
