<a href="https://colab.research.google.com/github/jtsu03/Yelp-Review-NLP-Business-Insights-Analysis/blob/main/Yelp_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
yelp2 = pd.read_json("/content/drive/MyDrive/CRM projec/yelp_academic_dataset_review.json", lines=True)

In [None]:
yelp1 = pd.read_json("/content/drive/MyDrive/CRM projec/yelp_academic_dataset_business.json", lines=True)

In [None]:
yelp1.head(5)

In [None]:
yelp2.head ()

In [None]:
yelp2 = yelp2.rename(columns={"stars": "review_stars"})

In [None]:
yelp2

In [None]:
yelp1 = yelp1.rename(columns={"stars": "business_stars"})

In [None]:
yelp1

In [None]:
# Attach the business attributes to every review (yelp2 = reviews, yelp1 =
# businesses). The inner join keeps only reviews whose business_id also
# appears in the business table.
merged_df = yelp2.merge(yelp1, how="inner", on="business_id")


In [None]:
merged_df.head(5)

In [None]:
merged_df.columns

In [None]:
# Drop the location columns. errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the columns are
# already gone from merged_df.
merged_df = merged_df.drop(columns=["longitude", "latitude", "address"], errors="ignore")

In [None]:
merged_df.head(5)

In [None]:
merged_df["city"].value_counts().head(10)


In [None]:
# Cities kept for the rest of the analysis (used to filter merged_df by "city").
# NOTE(review): presumably copied by hand from the value_counts().head(10)
# output of the previous cell -- confirm it still matches if the data changes.
top10_cities = [
    "Philadelphia",
    "New Orleans",
    "Tampa",
    "Nashville",
    "Tucson",
    "Indianapolis",
    "Reno",
    "Santa Barbara",
    "Saint Louis",
    "Boise"
]


In [None]:
top10_df = merged_df[merged_df["city"].isin(top10_cities)]


In [None]:
len(top10_df)

In [None]:
cats = top10_df["categories"].str.split(",", expand=False)

In [None]:
cats_exploded = cats.explode().str.strip()


In [None]:
category_counts = cats_exploded.value_counts()

In [None]:
category_counts


In [None]:
# Show every category, but only for this one display. Setting
# display.max_rows globally (and never resetting it) would make every later
# bare DataFrame/Series output in the notebook render all of its rows.
with pd.option_context("display.max_rows", None):
    display(category_counts)

In [None]:
import re

# Alternation of restaurant/food-related category terms, written one group per
# line for readability. Matching is a case-insensitive substring search, so a
# short term such as "Food" also covers "Fast Food" and "Soul Food".
# NOTE(review): substring matching also accepts any category that merely
# contains a term (a name starting with "Photo" matches "Pho", "Team ..."
# matches "Tea"); anchor the terms if such false positives matter.
restaurant_pattern = r"""
Restaurant|Restaurants|
Food|
Breakfast|Brunch|
Cafe|Cafes|
Coffee|Tea|
Bakery|Bakeries|
Dessert|Ice Cream|Frozen Yogurt|Gelato|
Pizza|Burgers|Sandwiches|Soup|Salad|
Seafood|Steak|Steakhouses|
BBQ|Barbeque|
Diner|Fast Food|
Mexican|Italian|Chinese|Japanese|Sushi|Korean|Thai|Vietnamese|
Mediterranean|Greek|Indian|Latin American|Cajun/Creole|Soul Food|
Tapas|Ramen|Noodles|Tacos|Pho|Kebab|
Vegan|Vegetarian|Gluten[- ]?Free|
Donuts|Bagels|Candy Stores|Chocolatiers|
Juice Bars|Smoothies|
Food Trucks|Food Stands|Street Vendors|
Poke|
Cheesesteaks|
Buffets|
Gastropubs|Beer Bar|Beer Hall|
Wine Bars|Brewpubs|Breweries|
Patisserie|Cake Shop|
Halal|Kosher|
Farmers Market|Health Markets|Ethnic Food|Grocery|Meat Shops
"""

# Join the lines back into one alternation. Only the line breaks (and the
# whitespace around them) are removed: stripping *all* whitespace would turn
# multi-word terms such as "Ice Cream" into "IceCream", which can never match
# a category name, and would also empty the space out of "Gluten[- ]?Free".
restaurant_pattern = "".join(line.strip() for line in restaurant_pattern.splitlines())

# category_counts is a Series whose index holds the category names; keep the
# entries whose name matches any restaurant/food term.
restaurant_category_counts = category_counts[
    category_counts.index.to_series().str.contains(restaurant_pattern, case=False, regex=True)
]

# Optional: see the top restaurant-related categories
restaurant_category_counts.head(50)


In [None]:
def is_restaurant(cat_string):
    """Return True when ``cat_string`` contains a restaurant/food term.

    ``cat_string`` is searched case-insensitively against the module-level
    ``restaurant_pattern``. Anything that is not a ``str`` (for example a
    missing value) yields False.
    """
    if isinstance(cat_string, str):
        return re.search(restaurant_pattern, cat_string, flags=re.IGNORECASE) is not None
    return False

restaurants_df = top10_df[top10_df["categories"].apply(is_restaurant)].copy()


In [None]:
restaurants_df.head()

In [None]:
len(restaurants_df)

In [None]:
!pip install bertopic[all]

In [None]:
import re
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [None]:
restaurants_df.columns

In [None]:
print("Total rows in restaurant_df:", len(restaurants_df))


In [None]:
restaurant_df = restaurants_df.dropna(subset=["text"]).reset_index(drop=True)


In [None]:
restaurant_df = restaurant_df[restaurant_df["text"].str.strip() != ""].reset_index(drop=True)


In [None]:
print("Rows with valid text:", len(restaurant_df))


In [None]:
# Upper bound on the number of reviews handed to BERTopic
# (try 20_000 if this is too heavy).
sample_size = 50_000

# Down-sample reproducibly only when the cleaned frame exceeds the bound;
# otherwise work on an independent copy of the whole frame.
restaurants_sample = (
    restaurant_df.sample(n=sample_size, random_state=42).reset_index(drop=True)
    if len(restaurant_df) > sample_size
    else restaurant_df.copy()
)

print("Rows going into BERTopic:", len(restaurants_sample))

In [None]:
docs = restaurants_sample["text"].astype(str).str.strip().tolist()

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Sentence-embedding model used to embed each review before clustering.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Topic terms are extracted from unigrams and bigrams.
vectorizer_model = CountVectorizer(ngram_range=(1, 2))   # THIS replaces n_grams

# NOTE(review): no random seed is passed anywhere in this constructor, so
# topic ids presumably differ between runs -- later cells hard-code topic ids
# (e.g. show_examples(10), topic 2). Confirm and pin a seed if results must be
# reproducible.
topic_model = BERTopic(
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=50,
    verbose=True
)


In [None]:
# Fit the topic model on the sampled reviews; returns one topic id per
# document plus the accompanying probabilities.
topics, probs = topic_model.fit_transform(docs)

# Attach topic info back to the sampled DataFrame
# (docs was built from restaurants_sample["text"] in row order, so positions line up).
restaurants_sample["topic_id"] = topics
# NOTE(review): assumes probs holds one value per document -- confirm if
# per-topic probabilities are ever enabled on the model.
restaurants_sample["topic_confidence"] = probs

print("Unique topics found:", restaurants_sample["topic_id"].nunique())

In [None]:
topic_info = topic_model.get_topic_info()
print("\nTopic summary (first 10 rows):")
print(topic_info.head(10))

In [None]:
topic_info.head(10)

In [None]:
# Example: inspect one topic in detail, skipping topic -1 (the outlier bucket).
# The first non-outlier row is selected explicitly instead of assuming the
# outlier row exists and occupies position 0: if the model produced no
# outliers, topic_info.loc[1] would silently skip the first real topic.
real_topics = topic_info.loc[topic_info["Topic"] != -1, "Topic"]
if real_topics.empty:
    print("\nNo non-outlier topics were found.")
else:
    example_topic_id = real_topics.iloc[0]
    print(f"\nExample topic {example_topic_id}:")
    print(topic_model.get_topic(example_topic_id))

In [None]:
# 7a. Average review_stars per topic
topic_avg_rating = (
    restaurants_sample
    .groupby("topic_id")["review_stars"]
    .mean()
    .reset_index()
    .sort_values("review_stars", ascending=False)
)


In [None]:
topic_avg_rating.head(10)

In [None]:
topic_counts = (
    restaurants_sample
    .groupby("topic_id")
    .size()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

print("\nMost common topics:")
topic_counts.head(10)

In [None]:
topic_city_counts = (
    restaurants_sample
    .groupby(["city", "topic_id"])
    .size()
    .reset_index(name="count")
    .sort_values(["city", "count"], ascending=[True, False])
)

In [None]:
example_city = "Philadelphia"
print(f"\nTop topics in {example_city}:")
print(topic_city_counts[topic_city_counts["city"] == example_city].head(10))

In [None]:
# A notebook cell only renders its last expression, so each figure is shown
# explicitly; otherwise the first two plots are built and then discarded.
topic_model.visualize_topics().show()
topic_model.visualize_barchart().show()
topic_model.visualize_hierarchy().show()

In [None]:
cols_to_drop_for_csv = ["attributes", "hours"]
csv_df = restaurants_sample.drop(columns=[c for c in cols_to_drop_for_csv if c in restaurants_sample.columns])

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.visualize_hierarchy()

In [None]:
topic_info = topic_model.get_topic_info()
topic_info.head(10)

In [None]:
# Skip -1 (usually "misc / outliers")
for t in topic_info["Topic"].head(10):
    if t == -1:
        continue
    print("\nTopic", t)
    print(topic_model.get_topic(t))


In [None]:
def show_examples(topic_id, n=5):
    """Print up to ``n`` (at most 20) sample reviews assigned to ``topic_id``.

    Reads the module-level ``restaurants_sample`` frame. For each sampled
    review a header line (stars, city, business name) is printed, followed by
    the first 400 characters of the text. Sampling uses ``random_state=0`` so
    repeated calls show the same reviews.
    """
    topic_rows = restaurants_sample[restaurants_sample["topic_id"] == topic_id]
    if topic_rows.empty:
        print(f"No reviews found for topic {topic_id}.")
        return

    # Never request more rows than the topic has: DataFrame.sample raises
    # ValueError when n exceeds the population (sampling without replacement).
    subset = topic_rows.sample(n=min(n, 20, len(topic_rows)), random_state=0)
    for _, row in subset.iterrows():
        print(f"\nStars: {row['review_stars']} | City: {row['city']} | Biz: {row['name']}")
        print(row["text"][:400], "...")

show_examples(10)  # change topic id


In [None]:
# Topic metadata: one row per topic, including the outlier topic -1 if present.
topic_info = topic_model.get_topic_info()
topic_ids = topic_info["Topic"].tolist()

def build_label_from_topic(topic_id, n_words=4):
    """Return a comma-separated label made of the topic's first ``n_words`` terms.

    Falls back to ``"Topic <id>"`` when the model returns nothing usable for
    the topic (empty topic or outlier).
    """
    terms = topic_model.get_topic(topic_id)
    if terms:
        return ", ".join(word for word, _score in terms[:n_words])
    return f"Topic {topic_id}"

# topic_id -> auto label; the outlier topic gets a fixed name.
topic_label_map = {
    tid: "Outliers / Misc" if tid == -1 else build_label_from_topic(tid, n_words=4)
    for tid in topic_ids
}

# Label every sampled review, and mirror the labels onto topic_info so they
# can be inspected next to BERTopic's own names.
restaurants_sample["topic_label"] = restaurants_sample["topic_id"].map(topic_label_map)
topic_info["auto_label"] = topic_info["Topic"].map(topic_label_map)

topic_info[["Topic", "Count", "Name", "auto_label"]].head(15)


In [None]:
topic_stats = (
    restaurants_sample
    .groupby("topic_label")
    .agg(
        avg_stars=("review_stars", "mean"),
        count=("review_stars", "size")
    )
    .reset_index()
)

# Most positive topics
top_positive = topic_stats.sort_values("avg_stars", ascending=False).head(10)
print("Most positive topics:\n", top_positive)

# Most negative topics
top_negative = topic_stats.sort_values("avg_stars", ascending=True).head(10)
print("\nMost negative topics:\n", top_negative)


In [None]:
topic_city = (
    restaurants_sample
    .groupby(["city", "topic_label"])
    .size()
    .reset_index(name="count")
)

# Normalize to % within each city
topic_city["city_total"] = topic_city.groupby("city")["count"].transform("sum")
topic_city["pct_city"] = topic_city["count"] / topic_city["city_total"]

# Example: top topics in Philadelphia
topic_city[topic_city["city"] == "Philadelphia"] \
    .sort_values("pct_city", ascending=False) \
    .head(10)


In [None]:
topic_info[["Topic", "Count", "Name", "auto_label"]].head(50)


In [None]:
def show_examples(topic_id, n=5):
    """Print up to ``n`` (at most 20) sample reviews assigned to ``topic_id``.

    Reads the module-level ``restaurants_sample`` frame. For each sampled
    review a header line (stars, city, business name) is printed, followed by
    the first 400 characters of the text. Sampling uses ``random_state=0`` so
    repeated calls show the same reviews.
    """
    topic_rows = restaurants_sample[restaurants_sample["topic_id"] == topic_id]
    if topic_rows.empty:
        print(f"No reviews found for topic {topic_id}.")
        return

    # Never request more rows than the topic has: DataFrame.sample raises
    # ValueError when n exceeds the population (sampling without replacement).
    subset = topic_rows.sample(n=min(n, 20, len(topic_rows)), random_state=0)
    for _, row in subset.iterrows():
        print(f"\nStars: {row['review_stars']} | City: {row['city']} | Biz: {row['name']}")
        print(row["text"][:400], "...")

show_examples(2)  # change topic id


In [None]:
topic2_df = restaurants_sample[restaurants_sample["topic_id"] == 2].copy()
print(len(topic2_df), "reviews in topic 2")


In [None]:
topic2_biz = topic2_df.drop_duplicates(subset="business_id").copy()
print(len(topic2_biz), "unique businesses in topic 2")


In [None]:
# Drop missing categories to avoid errors
cats = topic2_biz["categories"].dropna().str.split(",", expand=False)

# Flatten into one long Series of category strings
cats_exploded = cats.explode().str.strip()

# Count how often each category appears
topic2_category_counts = cats_exploded.value_counts()

# See the top 30 categories for topic 2
topic2_category_counts.head(30)


In [None]:
topic_info = topic_model.get_topic_info()
topic_info.head()

In [None]:
topic_id_to_name = topic_info.set_index("Topic")["Name"].to_dict()

In [None]:
restaurants_sample["topic_label"] = restaurants_sample["topic_id"].map(topic_id_to_name)

In [None]:
restaurants_sample["topic_label"] = restaurants_sample["topic_label"].fillna("Other / Outlier")
restaurants_sample.loc[restaurants_sample["topic_id"] == -1, "topic_label"] = "Outlier topic"

In [None]:
restaurants_sample[["topic_id", "topic_label"]].head()

In [None]:
# 3.1 Broad category names that describe the kind of business, not a cuisine
generic_cats = {
    "Restaurants",
    "Food",
    "Nightlife",
    "Bars",
    "Event Planning & Services",
    "Shopping",
    "Hotels & Travel",
    "Local Services",
    "Beauty & Spas",
    "Active Life",
    "Home Services",
    "Automotive",
    "Health & Medical",
    "Arts & Entertainment",
    "Professional Services",
    "Pets",
    "Education",
    "Public Services & Government",
    "Religious Organizations",
}

def extract_primary_cuisine(cat_string):
    """Return the first non-generic category in a comma-separated string.

    Returns "Unknown" for a missing value or a string with no category names,
    and "Generic Restaurant" when every listed category is in ``generic_cats``.
    """
    if pd.isna(cat_string):
        return "Unknown"

    # Trim each comma-separated piece and discard the empty ones.
    names = [piece.strip() for piece in cat_string.split(",")]
    names = [name for name in names if name]
    if not names:
        return "Unknown"

    # The first category that is not a generic bucket is taken as the cuisine.
    return next(
        (name for name in names if name not in generic_cats),
        "Generic Restaurant",
    )


In [None]:
# 3.2 Apply function to your dataframe
restaurants_sample["cuisine_type"] = restaurants_sample["categories"].apply(extract_primary_cuisine)

# Preview results
restaurants_sample[["categories", "cuisine_type"]].head(10)

In [None]:
# Mean star rating over all sampled reviews; used as the baseline that
# per-topic averages are compared against. The printed label had a
# mis-encoded star emoji, restored here.
global_mean = restaurants_sample["review_stars"].mean()
print("⭐ Global mean rating:", round(global_mean, 3))

In [None]:
# 4.2 Mean stars and review count per topic_label
topic_stats = (
    restaurants_sample
    .groupby("topic_label")
    .agg(
        avg_stars=("review_stars", "mean"),
        n_reviews=("review_id", "count")
    )
    .reset_index()
)

In [None]:
# 4.3 Add an "impact" metric: how much each topic deviates from global average
topic_stats["impact_vs_global"] = topic_stats["avg_stars"] - global_mean

In [None]:
# 4.4 Sort topics by impact (high to low)
topic_stats_sorted = topic_stats.sort_values("impact_vs_global", ascending=False)

In [None]:
# 4.4 Sort topics by impact (high to low)
topic_stats_sorted = topic_stats.sort_values("impact_vs_global", ascending=False)

# The head of the sorted table holds the topics rated furthest above the
# global mean. Values are rounded for display only; topic_stats_sorted itself
# is left unrounded. (Header emoji were mis-encoded and are restored here.)
print("\n🔝 Top 15 POSITIVE topics (above-average satisfaction):")
print(
    topic_stats_sorted
    .head(15)
    .assign(
        avg_stars=lambda df: df["avg_stars"].round(3),
        impact_vs_global=lambda df: df["impact_vs_global"].round(3)
    )
)

# The tail holds the topics rated furthest below the global mean.
print("\n🔻 Bottom 15 NEGATIVE topics (below-average satisfaction):")
print(
    topic_stats_sorted
    .tail(15)
    .assign(
        avg_stars=lambda df: df["avg_stars"].round(3),
        impact_vs_global=lambda df: df["impact_vs_global"].round(3)
    )
)

In [None]:
restaurants_sample.columns

In [None]:
# Cuisine cluster -> category names that imply it. Clusters are checked in
# this order and the first hit wins, so a business tagged both "Pizza" and
# "Mexican" is reported as "Italian".
_CUISINE_KEYWORDS = {
    "Italian": ("Italian", "Pizza", "Pasta"),
    "Mexican": ("Mexican", "Tacos"),
    "Japanese": ("Japanese", "Sushi", "Ramen"),
    "Chinese": ("Chinese", "Cantonese", "Dim Sum", "Szechuan"),
    # Categories are compared by exact name, so the parenthesised Yelp
    # variants must be listed explicitly; plain "American" alone never matches
    # "American (Traditional)" or "American (New)".
    "American": ("American", "American (Traditional)", "American (New)", "Burgers", "Diners"),
    "Indian": ("Indian",),
    "Mediterranean": ("Mediterranean", "Greek", "Middle Eastern"),
    "Thai": ("Thai",),
    # "BBQ" is deliberately NOT listed here: it used to be, which classified
    # every BBQ-tagged business as Korean and made the "Barbecue" entry for
    # the same keyword unreachable.
    "Korean": ("Korean",),
    "Vietnamese": ("Vietnamese", "Pho"),
    "Seafood": ("Seafood",),
    "Barbecue": ("Barbeque", "BBQ"),
}


def map_to_cuisine(cat_string):
    """Map a comma-separated category string to a coarse cuisine cluster.

    Parameters
    ----------
    cat_string : str or missing value
        Comma-separated category names for one business.

    Returns
    -------
    str
        "Unknown" when ``cat_string`` is not a string (None, NaN, ...), the
        first cluster in ``_CUISINE_KEYWORDS`` with a keyword that exactly
        equals one of the categories, or "Other" when no cluster matches.
    """
    if not isinstance(cat_string, str):
        return "Unknown"

    cats = {c.strip() for c in cat_string.split(",") if c.strip()}

    for cuisine, keywords in _CUISINE_KEYWORDS.items():
        if not cats.isdisjoint(keywords):
            return cuisine

    return "Other"


In [None]:
restaurants_sample["cuisine_cluster"] = restaurants_sample["categories"].apply(map_to_cuisine)

In [None]:
restaurants_sample["cuisine_cluster"].value_counts().head(20)

In [None]:
# Restore topic ids/labels from the assignments made when the model was
# fitted. Calling topic_model.transform() on the training reviews re-predicts
# them, which is slow and may assign different topics than fit_transform()
# did, leaving these columns inconsistent with statistics computed earlier
# in the notebook (e.g. topic_stats).
topics = list(topic_model.topics_)
assert len(topics) == len(restaurants_sample), "topic assignments do not line up with restaurants_sample"
restaurants_sample["topic_id"] = topics

topic_info = topic_model.get_topic_info()
topic_map = topic_info.set_index("Topic")["Name"].to_dict()

restaurants_sample["topic_label"] = restaurants_sample["topic_id"].map(topic_map)
# Keep the same outlier naming as the earlier labelling cell, so tables
# grouped by topic_label before and after this cell use identical labels.
restaurants_sample["topic_label"] = restaurants_sample["topic_label"].fillna("Other / Outlier")
restaurants_sample.loc[restaurants_sample["topic_id"] == -1, "topic_label"] = "Outlier topic"


In [None]:
restaurants_sample[["topic_id", "topic_label"]].head()


In [None]:
topic_cuisine_stats = (
    restaurants_sample
    .groupby(["cuisine_cluster", "topic_label"])
    .agg(
        avg_stars=("review_stars", "mean"),
        n_reviews=("review_id", "count")
    )
    .reset_index()
)

topic_cuisine_stats.head()


In [None]:
topic_cuisine_stats = (
    restaurants_sample
    .groupby(["cuisine_cluster", "topic_label"])
    .agg(
        avg_stars=("review_stars", "mean"),
        n_reviews=("review_id", "count")
    )
    .reset_index()
)

topic_cuisine_stats.head()


In [None]:
# For every cuisine keep the single cuisine × topic row with the lowest mean
# rating: sort ascending by rating within each cuisine, then take the first
# row of each group. (The printed header had a mis-encoded warning emoji.)
worst_aspects_per_cuisine = (
    topic_cuisine_stats
    .sort_values(["cuisine_cluster", "avg_stars"], ascending=[True, True])
    .groupby("cuisine_cluster")
    .head(1)
)

print("⚠️ Lowest-rated aspect for each cuisine:")
worst_aspects_per_cuisine


In [None]:
# Mean rating and review count per cuisine cluster, best-rated first.
# The result is indexed by cuisine_cluster. (The printed header had a
# mis-encoded trophy emoji, restored here.)
cuisine_overall = (
    restaurants_sample
    .groupby("cuisine_cluster")
    .agg(
        avg_stars=("review_stars", "mean"),
        n_reviews=("review_id", "count")
    )
    .sort_values("avg_stars", ascending=False)
)

print("🏆 Cuisine Ranking by Customer Satisfaction:")
cuisine_overall


In [None]:
# Cuisine × topic combinations whose mean rating is below 3.5 stars, listed
# per cuisine from worst to best. (The printed header had mis-encoded
# characters for the siren emoji and the multiplication sign.)
weak_topics = topic_cuisine_stats[topic_cuisine_stats["avg_stars"] < 3.5]
weak_topics_sorted = (
    weak_topics.sort_values(["cuisine_cluster", "avg_stars"])
)

print("🚨 Weak topics (cuisine × topic) with avg stars < 3.5:")
weak_topics_sorted.head(30)


In [None]:
# Rank cuisine × topic combinations by "impact": review volume multiplied by
# the shortfall of the mean rating from 4 stars, so large and poorly rated
# combinations come first. (The printed header had a mis-encoded chart emoji.)
opportunities = (
    topic_cuisine_stats
    .assign(impact=lambda df: df["n_reviews"] * (4 - df["avg_stars"]))
    .sort_values("impact", ascending=False)
)

print("📈 High-impact improvement opportunities:")
opportunities.head(20)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# If not already computed:
global_mean = restaurants_sample["review_stars"].mean()

# Filter out tiny topics to keep plot readable
# NOTE(review): restaurants_sample holds at most 50_000 reviews, so a
# 2000-review floor may leave very few topics to plot -- confirm the threshold.
min_reviews_topic = 2000   # adjust if needed
plot_topics = topic_stats[topic_stats["n_reviews"] >= min_reviews_topic].copy()

# Sort by impact
plot_topics = plot_topics.sort_values("impact_vs_global")

# Horizontal bars: one per topic, length = deviation of the topic's mean
# rating from the global mean.
plt.figure(figsize=(9, 14))
sns.barplot(
    data=plot_topics,
    x="impact_vs_global",
    y="topic_label"
)
# Dashed reference line at zero, i.e. at the global mean.
plt.axvline(0, linestyle="--", linewidth=1)
plt.title("Impact of Review Topics on Rating (vs Global Mean)")
plt.xlabel("Impact on Stars (avg_stars - global_mean)")
plt.ylabel("Topic (Aspect)")
plt.tight_layout()
plt.show()


In [None]:
# If you don’t have it: mean rating and review count per cuisine cluster
# (indexed by cuisine_cluster).
cuisine_overall = (
    restaurants_sample
    .groupby("cuisine_cluster")
    .agg(
        avg_stars=("review_stars", "mean"),
        n_reviews=("review_id", "count")
    )
)

# NOTE(review): restaurants_sample holds at most 50_000 reviews, so a
# 5000-review floor may keep only a handful of cuisines -- confirm.
min_reviews_cuisine = 5000  # adjust to focus on big cuisines
plot_cuisines = cuisine_overall[cuisine_overall["n_reviews"] >= min_reviews_cuisine].copy()
# reset_index turns cuisine_cluster into a real column so that the
# y="cuisine_cluster" lookup below does not depend on seaborn resolving a
# variable name from the frame's index.
plot_cuisines = plot_cuisines.sort_values("avg_stars", ascending=False).reset_index()

plt.figure(figsize=(8, 6))
sns.barplot(
    data=plot_cuisines,
    x="avg_stars",
    y="cuisine_cluster"
)
plt.title("Average Rating by Cuisine (large cuisines only)")
plt.xlabel("Average Stars")
plt.ylabel("Cuisine")
plt.xlim(0, 5)
plt.tight_layout()
plt.show()


In [None]:
# 1) choose top cuisines by volume
top_cuisines = (
    restaurants_sample["cuisine_cluster"]
    .value_counts()
    .head(10)
    .index
)

# 2) choose top topics by volume. They are derived from topic_cuisine_stats
#    itself rather than from topic_stats: the two tables are built in
#    different cells and their topic labels can differ, in which case the
#    filter below would silently drop topics.
top_topics = (
    topic_cuisine_stats
    .groupby("topic_label")["n_reviews"]
    .sum()
    .nlargest(10)
    .index
    .tolist()
)

# 3) filter topic_cuisine_stats to those
heat_df = topic_cuisine_stats[
    topic_cuisine_stats["cuisine_cluster"].isin(top_cuisines)
    & topic_cuisine_stats["topic_label"].isin(top_topics)
].copy()

# 4) pivot for heatmap (rows = cuisines, columns = topics, cells = mean stars)
pivot = heat_df.pivot(
    index="cuisine_cluster",
    columns="topic_label",
    values="avg_stars"
)

plt.figure(figsize=(12, 6))
sns.heatmap(
    pivot,
    annot=True,
    fmt=".2f",
    linewidths=0.5
)
# The multiplication sign in the title was mis-encoded; restored here.
plt.title("Average Rating by Cuisine × Topic (Top Cuisines & Topics)")
plt.xlabel("Topic (Aspect)")
plt.ylabel("Cuisine")
plt.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=topic_stats,
    x="n_reviews",
    y="avg_stars"
)
plt.title("Topic Popularity vs Average Rating")
plt.xlabel("Number of Reviews with Topic")
plt.ylabel("Average Stars")
plt.ylim(0, 5)
plt.tight_layout()
plt.show()


In [None]:
# Mean rating and review count for every city × topic combination.
topic_by_city = (
    restaurants_sample
    .groupby(["city", "topic_label"])
    .agg(
        avg_stars=("review_stars", "mean"),
        n_reviews=("review_id", "count")
    )
    .reset_index()
)

# Ten cities with the most sampled reviews.
top_cities = (
    restaurants_sample["city"]
    .value_counts()
    .head(10)
    .index
)

# Ten topics with the most reviews. They are derived from topic_by_city
# itself rather than from topic_stats: the two tables are built in different
# cells and their topic labels can differ, in which case the filter below
# would silently drop topics.
top_topics_city = (
    topic_by_city
    .groupby("topic_label")["n_reviews"]
    .sum()
    .nlargest(10)
    .index
    .tolist()
)

heat_city = topic_by_city[
    topic_by_city["city"].isin(top_cities)
    & topic_by_city["topic_label"].isin(top_topics_city)
]

# Rows = cities, columns = topics, cells = mean stars.
pivot_city = heat_city.pivot(
    index="city",
    columns="topic_label",
    values="avg_stars"
)

plt.figure(figsize=(12, 6))
sns.heatmap(
    pivot_city,
    annot=True,
    fmt=".2f",
    linewidths=0.5
)
# The multiplication sign in the title was mis-encoded; restored here.
plt.title("Average Rating by City × Topic (Top Cities & Topics)")
plt.xlabel("Topic (Aspect)")
plt.ylabel("City")
plt.tight_layout()
plt.show()


In [None]:
# Ensure we have topic_cuisine_stats (from Step 5)
# Columns: cuisine_cluster, topic_label, avg_stars, n_reviews

# 7A: Find top positive/negative topics per cuisine
def top_topics_for_cuisine(cuisine, N=5, min_reviews=100):
    """Return the best- and worst-rated topics for one cuisine cluster.

    Parameters
    ----------
    cuisine : str
        Value of ``cuisine_cluster`` to select from ``topic_cuisine_stats``.
    N : int, default 5
        Number of rows returned in each of the two tables.
    min_reviews : int, default 100
        Cuisine × topic rows with fewer reviews than this are ignored.

    Returns
    -------
    (top_pos, top_neg) : tuple of DataFrame
        Rows with the highest and the lowest ``z_score``, where ``z_score`` is
        ``avg_stars`` standardised across the cuisine's remaining topics.
    """
    df = topic_cuisine_stats[topic_cuisine_stats["cuisine_cluster"] == cuisine].copy()
    df = df[df["n_reviews"] >= min_reviews]   # filter tiny topics

    # With zero or one remaining topic (std is NaN) or identical ratings
    # (std is 0) the z-score is undefined; report 0 rather than NaN/inf.
    spread = df["avg_stars"].std()
    if pd.isna(spread) or spread == 0:
        df["z_score"] = 0.0
    else:
        df["z_score"] = (df["avg_stars"] - df["avg_stars"].mean()) / spread

    top_pos = df.sort_values("z_score", ascending=False).head(N)
    top_neg = df.sort_values("z_score", ascending=True).head(N)

    return top_pos, top_neg

cuisine_list = restaurants_sample["cuisine_cluster"].unique()

# cuisine -> {"top_positive_topics": DataFrame, "top_negative_topics": DataFrame}
final_cuisine_topics = {}

for c in cuisine_list:
    pos, neg = top_topics_for_cuisine(c)
    final_cuisine_topics[c] = {
        "top_positive_topics": pos,
        "top_negative_topics": neg
    }

# Example: print for Mexican food (headers had mis-encoded emoji and dashes)
print("\n🔺 Mexican — Top Positive Topics:")
print(final_cuisine_topics["Mexican"]["top_positive_topics"])

print("\n🔻 Mexican — Top Negative Topics:")
print(final_cuisine_topics["Mexican"]["top_negative_topics"])


In [None]:
# 7B — City × Cuisine × Topic
# Mean rating and review count for every (city, cuisine, topic) combination.
city_cuisine_topic = (
    restaurants_sample
    .groupby(["city", "cuisine_cluster", "topic_label"])
    .agg(
        avg_stars=("review_stars", "mean"),
        n_reviews=("review_id", "count")
    )
    .reset_index()
)

# Add impact vs global for interpretability
# (difference between the combination's mean rating and global_mean).
city_cuisine_topic["impact_vs_global"] = (
    city_cuisine_topic["avg_stars"] - global_mean
)

print(city_cuisine_topic.head())


In [None]:
# 7C — Business-level topic KPI
# Mean rating and review count per business and topic; name, city and
# cuisine_cluster are part of the group key so they appear as columns.
business_topic_stats = (
    restaurants_sample
    .groupby(["business_id", "name", "city", "cuisine_cluster", "topic_label"])
    .agg(
        avg_stars=("review_stars", "mean"),
        n_reviews=("review_id", "count")
    )
    .reset_index()
)

# Add deviation from global
# (difference between the business/topic mean rating and global_mean).
business_topic_stats["impact_vs_global"] = (
    business_topic_stats["avg_stars"] - global_mean
)

print(business_topic_stats.head())


In [None]:
plt.figure()
sns.countplot(x="review_stars", data=restaurants_sample, order=sorted(restaurants_sample["review_stars"].unique()))
plt.title("Distribution of Review Ratings")
plt.xlabel("Review Stars")
plt.ylabel("Number of Reviews")
plt.tight_layout()
plt.show()


In [None]:
# Add a review length column if you haven't already
restaurants_sample["review_length"] = restaurants_sample["text"].str.len()

plt.figure()
sns.histplot(restaurants_sample["review_length"], bins=40, kde=True)
plt.title("Distribution of Review Lengths (Characters)")
plt.xlabel("Review Length")
plt.ylabel("Count")
plt.tight_layout()
plt.show()


In [None]:
plt.figure()
sns.boxplot(x="review_stars", y="review_length", data=restaurants_sample)
plt.title("Review Length by Rating")
plt.xlabel("Review Stars")
plt.ylabel("Review Length (Characters)")
plt.tight_layout()
plt.show()


In [None]:
top_cities = (
    restaurants_sample["city"]
    .value_counts()
    .head(10)
    .index
)

city_counts = (
    restaurants_sample[restaurants_sample["city"].isin(top_cities)]
    ["city"]
    .value_counts()
    .reindex(top_cities)
)

plt.figure()
sns.barplot(x=city_counts.index, y=city_counts.values)
plt.title("Top 10 Cities by Number of Reviews")
plt.xlabel("City")
plt.ylabel("Number of Reviews")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
city_avg = (
    restaurants_sample[restaurants_sample["city"].isin(top_cities)]
    .groupby("city")["review_stars"]
    .mean()
    .reindex(top_cities)
)

plt.figure()
sns.barplot(x=city_avg.index, y=city_avg.values)
plt.title("Average Review Rating by City (Top 10)")
plt.xlabel("City")
plt.ylabel("Average Stars")
plt.ylim(0, 5)
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
top_cuisines = (
    restaurants_sample["cuisine_cluster"]
    .value_counts()
    .head(10)
    .index
)

cuisine_counts = (
    restaurants_sample[restaurants_sample["cuisine_cluster"].isin(top_cuisines)]
    ["cuisine_cluster"]
    .value_counts()
    .reindex(top_cuisines)
)

plt.figure()
sns.barplot(x=cuisine_counts.index, y=cuisine_counts.values)
plt.title("Top 10 Cuisines by Number of Reviews")
plt.xlabel("Cuisine")
plt.ylabel("Number of Reviews")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
topic_counts = (
    restaurants_sample["topic_label"]
    .value_counts()
    .head(15)
)

plt.figure(figsize=(8, 6))
sns.barplot(x=topic_counts.values, y=topic_counts.index)
plt.title("Top 15 Topics by Number of Reviews")
plt.xlabel("Number of Reviews")
plt.ylabel("Topic")
plt.tight_layout()
plt.show()


In [None]:
top_topic_labels = topic_counts.index

topic_avg = (
    restaurants_sample[restaurants_sample["topic_label"].isin(top_topic_labels)]
    .groupby("topic_label")["review_stars"]
    .mean()
    .reindex(top_topic_labels)
)

plt.figure(figsize=(8, 6))
sns.barplot(x=topic_avg.values, y=topic_avg.index)
plt.title("Average Rating for Top 15 Topics")
plt.xlabel("Average Stars")
plt.ylabel("Topic")
plt.xlim(0, 5)
plt.tight_layout()
plt.show()


In [None]:
# Review counts per city (rows) and cuisine cluster (columns); combinations
# that never occur are filled with 0.
city_cuisine_pivot = (
    restaurants_sample
    .groupby(["city", "cuisine_cluster"])
    .size()
    .reset_index(name="n_reviews")
    .pivot(index="city", columns="cuisine_cluster", values="n_reviews")
    .fillna(0)
)

# Optional: limit to top 10 cities and top 10 cuisines to keep it readable
top_cities_for_heatmap = top_cities
top_cuisines_for_heatmap = top_cuisines

city_cuisine_pivot = city_cuisine_pivot.loc[top_cities_for_heatmap, top_cuisines_for_heatmap]

plt.figure(figsize=(10, 6))
sns.heatmap(city_cuisine_pivot, annot=False, cmap="Blues")
# The multiplication sign in the title was mis-encoded; restored here.
plt.title("Number of Reviews by City × Cuisine")
plt.xlabel("Cuisine")
plt.ylabel("City")
plt.tight_layout()
plt.show()


In [None]:
numeric_cols = ["review_stars", "business_stars", "review_length"]
numeric_cols = [c for c in numeric_cols if c in restaurants_sample.columns]

corr = restaurants_sample[numeric_cols].corr()

plt.figure()
sns.heatmap(corr, annot=True, vmin=-1, vmax=1, center=0)
plt.title("Correlation Between Numeric Features")
plt.tight_layout()
plt.show()


Improved visualizations: the cells below redraw the charts above with explicit figure sizes and clearer titles.

In [None]:
plt.figure(figsize=(8,4))
sns.countplot(
    x="review_stars",
    data=restaurants_sample,
    order=sorted(restaurants_sample["review_stars"].unique())
)
plt.title("Distribution of Yelp Review Ratings")
plt.xlabel("Stars")
plt.ylabel("Number of Reviews")
plt.tight_layout()
plt.show()


In [None]:
restaurants_sample["review_length"] = restaurants_sample["text"].str.len()

plt.figure(figsize=(8,4))
sns.histplot(restaurants_sample["review_length"], bins=40, kde=True)
plt.title("Distribution of Review Lengths")
plt.xlabel("Characters")
plt.ylabel("Count")
plt.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(8,4))
sns.boxplot(
    x="review_stars",
    y="review_length",
    data=restaurants_sample
)
plt.title("Review Length by Rating")
plt.xlabel("Stars")
plt.ylabel("Review Length")
plt.tight_layout()
plt.show()


In [None]:
top_cities = (
    restaurants_sample["city"].value_counts().head(10).index
)

city_counts = (
    restaurants_sample[restaurants_sample["city"].isin(top_cities)]
    ["city"]
    .value_counts()
    .reindex(top_cities)
)

plt.figure(figsize=(8,4))
sns.barplot(x=city_counts.index, y=city_counts.values)
plt.xticks(rotation=45, ha="right")
plt.title("Top 10 Cities by Review Volume")
plt.xlabel("City")
plt.ylabel("Number of Reviews")
plt.tight_layout()
plt.show()


In [None]:
city_avg = (
    restaurants_sample[restaurants_sample["city"].isin(top_cities)]
    .groupby("city")["review_stars"]
    .mean()
    .reindex(top_cities)
)

plt.figure(figsize=(8,4))
sns.barplot(x=city_avg.index, y=city_avg.values)
plt.xticks(rotation=45, ha="right")
plt.title("Average Rating by City")
plt.ylabel("Avg Stars")
plt.ylim(0,5)
plt.tight_layout()
plt.show()


In [None]:
top_cuisines = (
    restaurants_sample["cuisine_cluster"]
    .value_counts()
    .head(10)
    .index
)

cuisine_counts = (
    restaurants_sample["cuisine_cluster"]
    .value_counts()
    .reindex(top_cuisines)
)

plt.figure(figsize=(8,4))
sns.barplot(x=cuisine_counts.index, y=cuisine_counts.values)
plt.xticks(rotation=45, ha="right")
plt.title("Top 10 Cuisines by Review Volume")
plt.xlabel("Cuisine")
plt.ylabel("Review Count")
plt.tight_layout()
plt.show()


In [None]:
cuisine_avg = (
    restaurants_sample[restaurants_sample["cuisine_cluster"].isin(top_cuisines)]
    .groupby("cuisine_cluster")["review_stars"]
    .mean()
    .reindex(top_cuisines)
)

plt.figure(figsize=(8,4))
sns.barplot(x=cuisine_avg.index, y=cuisine_avg.values)
plt.xticks(rotation=45, ha="right")
plt.title("Average Rating by Cuisine")
plt.ylabel("Avg Stars")
plt.ylim(0,5)
plt.tight_layout()
plt.show()


In [None]:
topic_counts = (
    restaurants_sample["topic_label"]
    .value_counts()
    .head(15)
)

plt.figure(figsize=(10,6))
sns.barplot(x=topic_counts.values, y=topic_counts.index)
plt.title("Top 15 Topics by Review Volume")
plt.xlabel("Review Count")
plt.ylabel("Topic")
plt.tight_layout()
plt.show()



In [None]:
top_topic_labels = topic_counts.index

topic_avg = (
    restaurants_sample[restaurants_sample["topic_label"].isin(top_topic_labels)]
    .groupby("topic_label")["review_stars"]
    .mean()
    .reindex(top_topic_labels)
)

plt.figure(figsize=(10,6))
sns.barplot(x=topic_avg.values, y=topic_avg.index)
plt.title("Average Rating for Top 15 Topics")
plt.xlabel("Avg Stars")
plt.xlim(0,5)
plt.tight_layout()
plt.show()


In [None]:
# Review counts per city (rows) and cuisine cluster (columns); combinations
# that never occur are filled with 0.
city_cuisine_pivot = (
    restaurants_sample
    .groupby(["city", "cuisine_cluster"])
    .size()
    .reset_index(name="n_reviews")
    .pivot(index="city", columns="cuisine_cluster", values="n_reviews")
    .fillna(0)
)

plt.figure(figsize=(12,6))
sns.heatmap(city_cuisine_pivot, cmap="Blues")
# The multiplication sign in the title was mis-encoded; restored here.
plt.title("Review Volume: City × Cuisine")
plt.xlabel("Cuisine")
plt.ylabel("City")
plt.tight_layout()
plt.show()


In [None]:
numeric_cols = ["review_stars", "business_stars", "review_length"]
numeric_cols = [c for c in numeric_cols if c in restaurants_sample.columns]

corr = restaurants_sample[numeric_cols].corr()

plt.figure(figsize=(6,4))
sns.heatmap(corr, annot=True, cmap="coolwarm", center=0)
plt.title("Correlation Between Numeric Features")
plt.tight_layout()
plt.show()
