1. Review Data (yelp_academic_dataset_review.json)

In [None]:
import pandas as pd
import json

# Load review data.
# The file is opened as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform's locale encoding, which can fail on (or garble) non-ASCII
# review text. JSON interchange text is UTF-8 by specification.
with open('yelp_academic_dataset_review.json', 'r', encoding='utf-8') as f:
    # Peek at the first character to tell a single JSON array ('[') apart from
    # newline-delimited JSON, then rewind so the parser sees the whole file.
    first_char = f.read(1)
    f.seek(0)
    if first_char == '[':
        review_df = pd.DataFrame(json.load(f))
    else:
        review_df = pd.read_json(f, lines=True)

print("=" * 60)
print("REVIEW DATASET OVERVIEW")
print("=" * 60)
print(f"\nTotal reviews: {len(review_df):,}")
print(f"Unique businesses reviewed: {review_df['business_id'].nunique():,}")
print(f"Unique users: {review_df['user_id'].nunique():,}")

print("\n" + "=" * 60)
print("DATE RANGE")
print("=" * 60)
print(f"Earliest review: {review_df['date'].min()}")
print(f"Latest review: {review_df['date'].max()}")

print("\n" + "=" * 60)
print("STAR RATING DISTRIBUTION")
print("=" * 60)
# sort_index() orders the counts by star value rather than by frequency.
print(review_df['stars'].value_counts().sort_index().to_string())

print("\n" + "=" * 60)
print("REVIEW LENGTH STATS (characters)")
print("=" * 60)
# Adds a derived column to review_df holding the character count of each review.
review_df['text_length'] = review_df['text'].str.len()
print(review_df['text_length'].describe().to_string())

print("\n" + "=" * 60)
print("SAMPLE REVIEWS (first 3)")
print("=" * 60)
for i, row in review_df.head(3).iterrows():
    review_text = row['text']
    print(f"\n⭐ {row['stars']} stars:")
    # Show at most 300 characters; the ellipsis is added only when the text
    # was actually cut, so short reviews are not shown as truncated.
    if len(review_text) > 300:
        print(f"{review_text[:300]}...")
    else:
        print(review_text)
    print("-" * 40)

print("\n" + "=" * 60)
print("REVIEWS PER YEAR")
print("=" * 60)
# pd.to_datetime makes the year extraction work whether 'date' was loaded as
# strings (JSON-array branch) or already parsed (read_json branch).
review_df['year'] = pd.to_datetime(review_df['date']).dt.year
print(review_df['year'].value_counts().sort_index().to_string())

REVIEW DATASET OVERVIEW

Total reviews: 100,000
Unique businesses reviewed: 9,973
Unique users: 79,345

DATE RANGE
Earliest review: 2005-03-01 17:47:15
Latest review: 2018-10-04 18:22:35

STAR RATING DISTRIBUTION
stars
1.0    10921
2.0     7988
3.0    11362
4.0    25337
5.0    44392

REVIEW LENGTH STATS (characters)
count    100000.000000
mean        548.438330
std         501.703274
min           3.000000
25%         226.000000
50%         395.000000
75%         693.000000
max        5000.000000

SAMPLE REVIEWS (first 3)

⭐ 3.0 stars:
If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. 

The food is good, but it takes a very long time to come out. The w...
----------------------------------------

⭐ 5.0 stars:
I've taken a lot of spin classes over the years, and nothing compares to the classes at Body Cycle.

2. Tips Data (yelp_academic_dataset_tip.json)

In [None]:
import pandas as pd
import json

# Load tip data.
# The file is opened as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform's locale encoding, which can fail on (or garble) non-ASCII
# tip text. JSON interchange text is UTF-8 by specification.
with open('yelp_academic_dataset_tip.json', 'r', encoding='utf-8') as f:
    # Peek at the first character to tell a single JSON array ('[') apart from
    # newline-delimited JSON, then rewind so the parser sees the whole file.
    first_char = f.read(1)
    f.seek(0)
    if first_char == '[':
        tip_df = pd.DataFrame(json.load(f))
    else:
        tip_df = pd.read_json(f, lines=True)

print("=" * 60)
print("TIP DATASET OVERVIEW")
print("=" * 60)
print(f"\nTotal tips: {len(tip_df):,}")
print(f"Unique businesses: {tip_df['business_id'].nunique():,}")
print(f"Unique users: {tip_df['user_id'].nunique():,}")

print("\n" + "=" * 60)
print("DATE RANGE")
print("=" * 60)
print(f"Earliest tip: {tip_df['date'].min()}")
print(f"Latest tip: {tip_df['date'].max()}")

print("\n" + "=" * 60)
print("TIP LENGTH STATS (characters)")
print("=" * 60)
# Adds a derived column to tip_df holding the character count of each tip.
tip_df['text_length'] = tip_df['text'].str.len()
print(tip_df['text_length'].describe().to_string())

print("\n" + "=" * 60)
print("SAMPLE TIPS (first 5)")
print("=" * 60)
for i, row in tip_df.head(5).iterrows():
    # Each sample is capped at 150 characters to keep the listing compact.
    print(f"• {row['text'][:150]}")
    print("-" * 40)

TIP DATASET OVERVIEW

Total tips: 264,693
Unique businesses: 64,383
Unique users: 29,207

DATE RANGE
Earliest tip: 2009-04-17 00:34:55
Latest tip: 2022-01-19 18:42:44

TIP LENGTH STATS (characters)
count    264693.000000
mean         61.997393
std          54.518891
min           1.000000
25%          27.000000
50%          48.000000
75%          78.000000
max         500.000000

SAMPLE TIPS (first 5)
• Avengers time with the ladies.
----------------------------------------
• They have lots of good deserts and tasty cuban sandwiches
----------------------------------------
• It's open even when you think it isn't
----------------------------------------
• Very decent fried chicken
----------------------------------------
• Appetizers.. platter special for lunch
----------------------------------------


3. User Data (yelp_academic_dataset_user.json)

In [None]:
import pandas as pd
import json

# Load user data.
# The file is opened as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform's locale encoding, which can fail on (or garble) non-ASCII
# user names. JSON interchange text is UTF-8 by specification.
with open('yelp_academic_dataset_user.json', 'r', encoding='utf-8') as f:
    # Peek at the first character to tell a single JSON array ('[') apart from
    # newline-delimited JSON, then rewind so the parser sees the whole file.
    first_char = f.read(1)
    f.seek(0)
    if first_char == '[':
        user_df = pd.DataFrame(json.load(f))
    else:
        user_df = pd.read_json(f, lines=True)

print("=" * 60)
print("USER DATASET OVERVIEW")
print("=" * 60)
print(f"\nTotal users: {len(user_df):,}")

print("\n" + "=" * 60)
print("USER ACTIVITY STATS")
print("=" * 60)
print("\nReview count per user:")
print(user_df['review_count'].describe().to_string())

print("\n" + "=" * 60)
print("USER TENURE")
print("=" * 60)
print(f"Earliest user joined: {user_df['yelping_since'].min()}")
print(f"Latest user joined: {user_df['yelping_since'].max()}")

print("\n" + "=" * 60)
print("AVERAGE STARS GIVEN BY USERS")
print("=" * 60)
print(user_df['average_stars'].describe().to_string())

print("\n" + "=" * 60)
print("TOP 10 MOST ACTIVE REVIEWERS")
print("=" * 60)
# nlargest picks the 10 rows with the highest review_count; only the columns
# needed for the summary table are kept.
top_users = user_df.nlargest(10, 'review_count')[['name', 'review_count', 'average_stars', 'yelping_since']]
print(top_users.to_string(index=False))

USER DATASET OVERVIEW

Total users: 79,345

USER ACTIVITY STATS

Review count per user:
count    79345.000000
mean        71.786804
std        207.437559
min          1.000000
25%          7.000000
50%         20.000000
75%         56.000000
max      16567.000000

USER TENURE
Earliest user joined: 2004-10-14 23:50:06
Latest user joined: 2018-10-04 01:21:31

AVERAGE STARS GIVEN BY USERS
count    79345.000000
mean         3.776432
std          0.795032
min          1.000000
25%          3.400000
50%          3.880000
75%          4.290000
max          5.000000

TOP 10 MOST ACTIVE REVIEWERS
    name  review_count  average_stars       yelping_since
   Bruce         16567           3.67 2009-03-08 21:47:44
     Kim          9941           3.81 2006-05-31 21:27:42
  Nijole          8363           3.75 2011-11-29 15:50:53
  George          7738           3.49 2009-11-06 22:53:16
Jennifer          6679           3.34 2009-11-09 20:44:45
   Sunil          6459           3.53 2009-01-28 23:35:24

4. Check-in Data (yelp_academic_dataset_checkin.json)

In [None]:
import pandas as pd
import json

# Load checkin data.
# The file is opened as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform's locale encoding instead of the UTF-8 that JSON text uses.
with open('yelp_academic_dataset_checkin.json', 'r', encoding='utf-8') as f:
    # Peek at the first character to tell a single JSON array ('[') apart from
    # newline-delimited JSON, then rewind so the parser sees the whole file.
    first_char = f.read(1)
    f.seek(0)
    if first_char == '[':
        checkin_df = pd.DataFrame(json.load(f))
    else:
        # read_json treats a column named 'date' as date-like and may try to
        # parse it. Here 'date' is split as a comma-separated string below, so
        # automatic conversion is disabled to keep it as raw text in both branches.
        checkin_df = pd.read_json(f, lines=True, convert_dates=False)

print("=" * 60)
print("CHECKIN DATASET OVERVIEW")
print("=" * 60)
print(f"\nTotal businesses with check-ins: {len(checkin_df):,}")

print("\n" + "=" * 60)
print("CHECKIN FREQUENCY STATS")
print("=" * 60)
# Count check-ins per row by splitting the 'date' string on ', ' and taking
# the number of resulting pieces.
checkin_df['checkin_count'] = checkin_df['date'].str.split(', ').str.len()
print(checkin_df['checkin_count'].describe().to_string())

print("\n" + "=" * 60)
print("TOP 10 MOST CHECKED-IN BUSINESSES")
print("=" * 60)
top_checkins = checkin_df.nlargest(10, 'checkin_count')[['business_id', 'checkin_count']]
print(top_checkins.to_string(index=False))

CHECKIN DATASET OVERVIEW

Total businesses with check-ins: 9,337

CHECKIN FREQUENCY STATS
count     9337.000000
mean       137.754204
std        380.523718
min          1.000000
25%          9.000000
50%         33.000000
75%        112.000000
max      10063.000000

TOP 10 MOST CHECKED-IN BUSINESSES
           business_id  checkin_count
Xq-8-I0U8Artr7d70SjX-g          10063
GBTPC53ZrG1ZBY3DT8Mbcw           9390
PY9GRfzr4nTZeINf346QOw           7887
mcr1lAEdvGLMJhuPwI3I2A           7739
SZU9c8V2GuREDN5KgyHFJw           6148
gGyqnAlpFrka_qzpO7j4lQ           5905
SwBhaxfQPbyhsi0QHUAN0A           5394
8uF-bhJFgT4Tn6DTb27viA           5090
LUXRw-mr9emGL2gw4otvVA           4968
plobBsTtVUODb353xxFT_g           4930


5. Business Data (yelp_academic_dataset_business.json)

In [None]:
import pandas as pd
import json

# Load business data.
# The file is opened as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform's locale encoding, which can fail on (or garble) non-ASCII
# business names and cities. JSON interchange text is UTF-8 by specification.
with open('yelp_academic_dataset_business.json', 'r', encoding='utf-8') as f:
    # Peek at the first character to tell a single JSON array ('[') apart from
    # newline-delimited JSON, then rewind so the parser sees the whole file.
    first_char = f.read(1)
    f.seek(0)
    if first_char == '[':
        business_df = pd.DataFrame(json.load(f))
    else:
        business_df = pd.read_json(f, lines=True)

# Basic stats
print(f"Total businesses: {len(business_df)}")
print("\nTop 20 categories:")
# 'categories' is split on ', ' and exploded to one row per category before
# counting, so a business contributes once to each of its categories.
print(business_df['categories'].str.split(', ').explode().value_counts().head(20))
print("\nCities covered:")
print(business_df['city'].value_counts().head(15))
print("\nStates covered:")
print(business_df['state'].value_counts().head(10))

Total businesses: 9973

Top 20 categories:
categories
Restaurants                  4132
Food                         2062
Shopping                     1524
Nightlife                     923
Beauty & Spas                 916
Bars                          827
Event Planning & Services     769
Automotive                    668
American (Traditional)        645
Sandwiches                    631
Local Services                620
Home Services                 618
Pizza                         603
Health & Medical              583
American (New)                546
Fast Food                     513
Active Life                   508
Breakfast & Brunch            496
Coffee & Tea                  488
Burgers                       459
Name: count, dtype: int64

Cities covered:
city
Philadelphia        1093
Tucson               608
Tampa                577
Indianapolis         494
New Orleans          479
Nashville            457
Saint Louis          387
Reno                 378
Edmonton          