In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
import joblib
import requests
import ast
import re
import unicodedata
import inflect
infl = inflect.engine()

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns",None)

### Raw Data Import and Dataset Build

In [3]:
# Project directories, resolved to absolute paths from the current working directory.
DATA = Path("dataset").resolve()  # raw Yelp JSON files (read by the commented-out build cell)
TEST_DF = Path("scratch_work_files").resolve()  # holds the cached review_testing_data.pkl

In [4]:
# # Load only the first 250,000 reviews and businesses for testing
# reviews = []
# businesses = []
# with open(DATA / "yelp_academic_dataset_review.json", "r") as f1:
#     for i, line in enumerate(f1):
#         if i >= 250000:
#             break
#         reviews.append(json.loads(line))

# with open(DATA / "yelp_academic_dataset_business.json","r") as f2:
#     for i, line in enumerate(f2):
#         if i >= 250000:
#             break
#         businesses.append(json.loads(line))

# reviews_df = pd.merge(
#   pd.DataFrame(reviews),
#   pd.DataFrame(businesses),
#   how = "inner",
#   on = "business_id",
#   suffixes = ["_review","_restaurant"]
#   ).dropna()

# joblib.dump(reviews_df,TEST_DF / "review_testing_data.pkl")


In [5]:
# Load the cached testing frame (presumably the review/business merge written by the
# commented-out build cell - confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load pickles that were produced locally or come from a trusted source.
reviews_df_complete = joblib.load(TEST_DF / "review_testing_data.pkl")

In [6]:
# Attach each business's mean review rating ("avg_stars") to every one of its review rows.
avg_stars_by_business = (
    reviews_df_complete
    .groupby("business_id")["stars_review"]
    .mean()
    .rename("avg_stars")
)
reviews_df_complete = pd.merge(
    # Drop any avg_stars left over from a previous run of this cell; otherwise the
    # merge would produce avg_stars_x / avg_stars_y and later lookups of "avg_stars"
    # would fail. On a first run the column is absent and errors="ignore" makes this a no-op.
    reviews_df_complete.drop(columns="avg_stars", errors="ignore"),
    avg_stars_by_business,
    how = "inner",
    on = "business_id"
)

In [7]:
# Keep rows whose categories mention restaurants and do not mention hotels.
categories_lower = reviews_df_complete["categories"].str.lower()
cond1 = categories_lower.str.contains("restaurants")
cond2 = categories_lower.str.contains("hotels")
# Optional rating filter; computed but currently disabled in the selection below.
cond3 = reviews_df_complete["avg_stars"] < 4
reviews_df_complete = reviews_df_complete[
    cond1
    & ~cond2
#    & (cond3)
]

In [8]:
# Count reviews per business, then pick one business with 100-200 reviews to analyse.
SAMPLE_SEED = 42  # fixed seed so Restart & Run All always selects the same business
review_counts = reviews_df_complete.groupby("business_id")["review_id"].count()
sample_candidates = review_counts[review_counts.between(100,200)]
if sample_candidates.empty:
    # Series.sample(1) on an empty Series also raises ValueError, but with a message
    # that does not explain the cause.
    raise ValueError("No business has between 100 and 200 reviews to sample from.")
sample_id = sample_candidates.sample(1, random_state=SAMPLE_SEED).index.tolist()

In [9]:
# Reviews of the sampled business only, reduced to the two columns used downstream.
# Selecting rows and columns in a single .loc call avoids copying the entire frame first;
# sort_values/reset_index already return a new, independent DataFrame.
reviews_df = (
    reviews_df_complete
    .loc[reviews_df_complete["business_id"].isin(sample_id), ["name","text"]]
    .sort_values(by = "name")
    .reset_index(drop = True)
)

In [10]:
# Preview the name/text rows selected for the sampled business.
reviews_df

Unnamed: 0,name,text
0,4 Rivers Smokehouse,The food here is tasty - love the brisket and ...
1,4 Rivers Smokehouse,Food was good. I ordered a pound of meat and t...
2,4 Rivers Smokehouse,I love 4r. How can you go wrong here!!?? The m...
3,4 Rivers Smokehouse,I have been anticipating this 4Rivers Smokehou...
4,4 Rivers Smokehouse,"Amazing! \n\nParking is an issue, but we went ..."
...,...,...
116,4 Rivers Smokehouse,"Giving 3 stars, I have been there 3 times, 1st..."
117,4 Rivers Smokehouse,Great ribs. Really great pulled pork. Love the...
118,4 Rivers Smokehouse,Made the drive here and attempted to go to lun...
119,4 Rivers Smokehouse,by far the best bbq in tampa bay


### Function Builds

In [11]:
# Typographic punctuation that NFKD normalisation does not decompose to ASCII.
# Without this mapping the ASCII encode step below silently deletes these characters
# (for example "don\u2019t" would become "dont").
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
})


def CleanReviewText(text):
    """
    Reduce review text to plain ASCII to make the LLM's interpretation job easier.

    Steps, in order:
    - map curly quotes and en/em dashes to their ASCII equivalents
    - strip accents and drop any remaining non-ASCII characters
    - replace "&" (with surrounding whitespace) by " and "
    - collapse every run of whitespace to a single space and trim the ends

    Args:
        text: the raw review text.

    Returns:
        The cleaned string, or "" when `text` is not a string (e.g. NaN/None).
    """

    if not isinstance(text, str):
        return ""

    # Preserve apostrophes, quotes and dashes that would otherwise be dropped below
    text = text.translate(_ASCII_PUNCTUATION)

    # Normalize Unicode (e.g., é → e); characters with no ASCII form are discarded
    text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("utf-8")

    # Replace ampersands with "and" when used as a connector
    text = re.sub(r'\s*&\s*', ' and ', text)

    # Replace all forms of whitespace (tabs, newlines, multiple spaces) with a single space
    text = re.sub(r'\s+', ' ', text)

    # Trim leading/trailing whitespace
    return text.strip()


# Clean every review before it is sent to the LLM (overwrites the "text" column).
reviews_df["text"] = reviews_df["text"].apply(CleanReviewText)

In [12]:
def FormatClassList(values, max_items=None):
    """
    Join a sequence of class names into a natural-English list.
    Useful for dynamically passing possible classes to the prompt.

    Examples: [] -> "", ["a"] -> "a", ["a", "b"] -> "a and b",
    ["a", "b", "c"] -> "a, b, and c".

    Args:
        values: iterable of class names (None is treated as empty). Items are
            converted with str() so the result is always a string.
        max_items: optional cap on how many leading items to include. None means
            no cap; 0 yields an empty string.

    Returns:
        The formatted string.
    """
    items = [] if values is None else [str(value) for value in values]

    # Compare against None explicitly: a plain truthiness test would treat
    # max_items=0 as "no limit" and return every item.
    if max_items is not None:
        items = items[:max_items]

    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return f"{items[0]} and {items[1]}"
    return ", ".join(items[:-1]) + f", and {items[-1]}"

In [13]:
def BuildPrompt(business_name,classes,reprocess = False):
    """
    Build the system prompt that will be passed to the LLM.
    Using a function allows the prompt to be printed and previewed.

    Args:
        business_name: restaurant name interpolated into the prompt.
        classes: ordered list of allowed labels. The worked examples index
            positions 0-3 and the last entry is used as the "aspect not addressed"
            label, so at least four entries are required (shorter lists raise
            IndexError).
        reprocess: when truthy, append an extra instruction urging the model to
            stick to the allowed labels.

    Returns:
        The prompt text as a string.
    """

    # Every class reference below uses the `classes` argument (not a notebook-level
    # global), so the prompt always matches the list the caller supplied.
    prompt = f"""
    You are a restaurant review classifier that is reviewing {business_name}.
    Given a review of {business_name}, classify the reviewer's attitude toward three key aspects: food and drinks (including prices), service, and ambiance.
    Return a JSON-style matrix in this form: {{"food": "[class]", "service": "[class]", "ambiance": "[class]"}}.
    For [class], return one of {infl.number_to_words(len(classes))} possible values: {FormatClassList(classes)}.
    {FormatClassList(classes[:-1])} are classifications of the reviewer's feelings toward their experience.
    Return "{FormatClassList(classes[-1:])}" if the reviewer does not address a certain aspect of the experience.
    Here is an example review: "The appetizers at this place were phenomenal, but the margaritas were overpriced and watery. Service was HORRENDOUS."
    In this example, return: {{"food": "{classes[1]}", "service": "{classes[2]}", "ambiance": "{classes[3]}"}}.
    Here is another example: "Our server was outstanding and the entrees were perfectly cooked. We loved the decor, too!"
    In this example, return: {{"food": "{classes[0]}", "service": "{classes[0]}", "ambiance": "{classes[0]}"}}.
    If the reviewer says multiple things about an aspect, bias toward the negative; that is, classify according to the more negative statements and choose "{classes[0]}" only if the review is solely comprised of positive statements.
    No other output is desired.
    """
    if reprocess:
        prompt += f"\nPlease focus on selecting a class from these values: {FormatClassList(classes)}. Other class choices, even if more descriptive, are not helpful in this instance."

    return prompt

In [14]:
def ClassifyReview(review_text, business_name, classes, model='mistral',reprocess = False):
    """
    Build the prompt and pass it, with the review, to the LLM for evaluation.

    Sends a non-streaming chat request to the local endpoint
    http://localhost:11434/api/chat with the prompt as the system message and the
    review as the user message.

    Args:
        review_text: the review to classify.
        business_name: restaurant name, forwarded to BuildPrompt.
        classes: allowed labels, forwarded to BuildPrompt.
        model: model name placed in the request payload.
        reprocess: forwarded to BuildPrompt to add the stricter instruction.

    Returns:
        The raw text content of the model's reply (not yet parsed).

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: for connection-level failures.
    """

    url = "http://localhost:11434/api/chat"
    headers = {"Content-Type": "application/json"}

    data = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": BuildPrompt(business_name,classes,reprocess)
            },
            {
                "role": "user",
                "content": review_text
            }
        ],
        "stream": False
    }

    response = requests.post(url, json=data, headers=headers)
    # Surface HTTP failures (unknown model, server error, ...) as an explicit HTTPError
    # instead of an opaque KeyError when the error payload has no "message" field.
    response.raise_for_status()
    return response.json()["message"]["content"]


In [15]:
def ParseReviewAnalyticsOutput(text):
    """
    Parses potentially messy JSON-like LLM output by:
    - Isolating the first {...} object (drops commentary before and after it)
    - Quoting unquoted keys
    - Quoting unquoted string values
    - Falling back to regex if JSON parsing fails

    Args:
        text: raw model reply. Non-string input (e.g. None) is treated as
            containing no classification.

    Returns:
        A dictionary with lower-cased classifications for "food", "service" and
        "ambiance"; an aspect that cannot be found is reported as "none".
    """
    aspects = ("food", "service", "ambiance")
    result = {aspect: "none" for aspect in aspects}

    if not isinstance(text, str):
        return result

    # Step 0: Keep only the first {...} object. Leading commentary would make
    # json.loads fail, and trailing commentary may contain stray braces.
    text = text.strip()
    opening_brace_index = text.find("{")
    closing_brace_index = text.find("}", max(opening_brace_index, 0))
    if closing_brace_index != -1:
        text = text[:closing_brace_index + 1]
    if opening_brace_index != -1:
        text = text[opening_brace_index:]

    # Step 1: Fix unquoted keys
    text = re.sub(r'([{,]\s*)(\w+)(\s*:)', r'\1"\2"\3', text)

    # Step 2: Fix unquoted string values (assume words are categorical, not numbers or booleans)
    text = re.sub(r':\s*([a-zA-Z_]+)(\s*[,}])', r': "\1"\2', text)

    # Try JSON parsing; json.JSONDecodeError is a ValueError subclass
    try:
        parsed = json.loads(text)
    except ValueError:
        parsed = None

    if isinstance(parsed, dict):
        # Match keys case-insensitively, mirroring the regex fallback below
        lowered = {str(key).lower(): value for key, value in parsed.items()}
        values = {aspect: lowered.get(aspect, "none") for aspect in aspects}
        # Only trust the JSON route when every label is a plain string;
        # otherwise fall through to the regex fallback.
        if all(isinstance(value, str) for value in values.values()):
            return {aspect: value.strip().lower() for aspect, value in values.items()}

    # Fallback regex-based parsing. The optional quotes around the key matter:
    # Step 1 quotes keys, so a pattern without them could never match JSON-style keys.
    for aspect in aspects:
        match = re.search(
            rf'["\']?{aspect}["\']?\s*[:=]\s*["\']?(\w+)["\']?',
            text,
            re.IGNORECASE,
        )
        if match:
            result[aspect] = match.group(1).lower()

    return result


In [16]:
def GenerateReprocessData(df,classes):
    """
    Split classified reviews into compliant rows and rows needing another pass.

    A row needs reprocessing when any of its "food", "service" or "ambiance"
    labels is not one of `classes`.

    Returns:
        (compliant, to_reprocess): `compliant` is `df` without the offending rows
        (original index labels kept); `to_reprocess` holds the offending rows
        with a fresh 0..n-1 index.
    """
    aspect_columns = ["food", "service", "ambiance"]
    all_labels_valid = df[aspect_columns].isin(classes).all(axis=1)

    flagged = df[~all_labels_valid]
    compliant = df.drop(labels = flagged.index)
    return compliant, flagged.reset_index(drop=True)

### Review Evaluation Config

In [17]:
# Labels the LLM may return. Order matters: BuildPrompt indexes into this list for its
# worked examples and uses the last entry as the "aspect not addressed" label.
valid_classes = ["positive","disappointed","angry","none"]
# Preview the prompt text, including the extra instruction appended when reprocess=True.
print(BuildPrompt(business_name="{restaurant name}",classes=valid_classes,reprocess=True))


    You are a restaurant review classifier that is reviewing {restaurant name}.
    Given a review of {restaurant name}, classify the reviewer's attitude toward three key aspects: food and drinks (including prices), service, and ambiance.
    Return a JSON-style matrix in this form: {"food": "[class]", "service": "[class]", "ambiance": "[class]"}.
    For [class], return one of four possible values: positive, disappointed, angry, and none.
    positive, disappointed, and angry are classifications of the reviewer's feelings toward their experience.
    Return "none" if the reviewer does not address a certain aspect of the experience.
    Here is an example review: "The appetizers at this place were phenomenal, but the margaritas were overpriced and watery. Service was HORRENDOUS."
    In this example, return: {"food": "disappointed", "service": "angry", "ambiance": "none"}.
    Here is another example: "Our server was outstanding and the entrees were perfectly cooked. We loved the deco

### Process Analytics

In [18]:
%%time

# First pass: one LLM call per review; raw replies are collected in row order.
result_list = []
for review in reviews_df.itertuples():
    result_list.append(
        ClassifyReview(
            review.text,
            business_name=review.name,
            classes=valid_classes,
            reprocess = False,
        )
    )

CPU times: user 397 ms, sys: 96.6 ms, total: 494 ms
Wall time: 2min 58s


In [20]:
# Parse each raw reply into food/service/ambiance labels and place them beside the review.
parsed_results = pd.DataFrame([ParseReviewAnalyticsOutput(raw_reply) for raw_reply in result_list])
result_df = pd.concat([reviews_df[["name","text"]], parsed_results], axis = 1)

In [21]:
# result_df["check_col"] = (
# result_df["food"].str.strip().str.lower()
# + result_df["service"].str.strip().str.lower()
# + result_df["ambiance"].str.strip().str.lower()
# )

# result_df[result_df["check_col"] == "nonenonenone"]

### Reprocess Analytics for Noncompliant Rows

In [22]:
# from random import randint
# counter = 0
# while counter < 5:
#     result_df.loc[randint(0,99),["food"]] = "N/A"
#     result_df.loc[randint(0,99),["ambiance"]] = "N/A"
#     result_df.loc[randint(0,99),["service"]] = "N/A"
#     counter +=1


In [23]:
# Split off rows with a label outside valid_classes; result_df keeps only compliant rows.
result_df,to_reprocess = GenerateReprocessData(result_df,valid_classes)

In [24]:
# Inspect the rows that will be sent back to the LLM.
to_reprocess

Unnamed: 0,name,text,food,service,ambiance
0,4 Rivers Smokehouse,I'm very excited that 4 Rivers came to South T...,positive,none,neutral
1,4 Rivers Smokehouse,"For anyone who knows or really loves 'Q, this ...",disappointed,none,neutral


In [25]:
%%time

# Second pass: re-ask the LLM for the noncompliant rows, with the stricter prompt.
repr_result_list = []
for review_repr in to_reprocess.itertuples():
    repr_result_list.append(
        ClassifyReview(
            review_repr.text,
            business_name=review_repr.name,
            classes=valid_classes,
            reprocess = True,
        )
    )

CPU times: user 6.14 ms, sys: 1.79 ms, total: 7.93 ms
Wall time: 4.03 s


In [26]:
# Parse the second-pass replies and place the labels beside the reprocessed reviews.
parsed_repr_results = pd.DataFrame([ParseReviewAnalyticsOutput(raw_reply) for raw_reply in repr_result_list])
reprocessed = pd.concat([to_reprocess[["name","text"]], parsed_repr_results], axis = 1)

### Final Reprocess Cycle: Replace Noncompliant Rows With 'none'

In [27]:
# Inspect the second-pass labels before any remaining invalid ones are overwritten.
reprocessed

Unnamed: 0,name,text,food,service,ambiance
0,4 Rivers Smokehouse,I'm very excited that 4 Rivers came to South T...,positive,none,neutral
1,4 Rivers Smokehouse,"For anyone who knows or really loves 'Q, this ...",disappointed,none,positive


In [28]:
# Any label still outside valid_classes after the second pass is replaced with "none".
if len(reprocessed) > 0:
    for aspect in ("food", "service", "ambiance"):
        still_invalid = ~reprocessed[aspect].isin(valid_classes)
        reprocessed.loc[still_invalid, [aspect]] = "none"


In [29]:
# Recombine the compliant first-pass rows with the reprocessed rows under a fresh 0..n-1 index.
result_df = pd.concat([result_df,reprocessed],axis = 0).reset_index(drop = True)

In [30]:
# One-hot encode the three aspect labels, then total the indicator columns per restaurant.
# NOTE(review): "text" is not dropped before the groupby sum, so depending on the pandas
# version it may be concatenated or excluded - confirm, and consider selecting columns first.
plot_df = pd.get_dummies(data = result_df, columns = ["food","service","ambiance"],dtype= "int").groupby("name").sum()

In [31]:
# Keep only the food_* indicator totals and flip them so each label becomes a row.
food_cols = list(filter(lambda col: col.startswith("food_"), plot_df.columns))
plot_df_food = plot_df.loc[:, food_cols].T

In [32]:
# Remove the "food_" text from the row labels so they read as plain class names.
plot_df_food.index = [col.replace("food_","") for col in plot_df_food.index]

In [33]:
# Show the per-label counts for the food aspect.
plot_df_food

name,4 Rivers Smokehouse
angry,3
disappointed,29
none,5
positive,84
