In [2]:

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

import os
import gc
import json


In [3]:

# Dataset location. The dict below hard-codes the Kaggle input directory and is
# used in place of the project-level Config helper, kept here for reference:
# from CFG import Config
# cfg_ref = Config()
# config = cfg_ref.get_config()
config = {'data_dir': '/kaggle/input/yelp-compressed-dataset'}


In [4]:

def _load_table(name):
    """Read ``<data_dir>/<name>.parquet`` and return it as a pandas DataFrame."""
    table_path = os.path.join(config['data_dir'], f'{name}.parquet')
    return pq.read_table(table_path).to_pandas()


# Every frame used further down is loaded here, so the notebook survives
# "Restart Kernel -> Run All": later cells read business_df, checkin_df and
# review_df, which previously existed only as leftover kernel state.
business_df = _load_table('business')
users_df = _load_table('user')
checkin_df = _load_table('checkin')

# The review table is the largest one (about 7M rows per the shape cell
# below), so this read dominates load time and memory.
review_df = _load_table('review')



In [5]:
# Dimensions (rows, columns) of users_df.
users_df.shape

(1987897, 22)

In [5]:
# Dimensions (rows, columns) of review_df.
review_df.shape

(6990280, 9)

In [6]:
# Dimensions (rows, columns) of business_df.
business_df.shape

(150346, 14)

In [7]:
# Draw fixed-size random subsets of the business and check-in tables; both use
# the same seed so the samples are reproducible across runs.
sample_kwargs = dict(n=10000, random_state=42)
business_df_sample = business_df.sample(**sample_kwargs)
checkin_df_sample = checkin_df.sample(**sample_kwargs)

for shape_label, sampled in (
    ("business_df_sample.shape", business_df_sample),
    ("checkin_df_sample.shape", checkin_df_sample),
):
    print(shape_label, sampled.shape)


business_df_sample.shape (10000, 14)
checkin_df_sample.shape (10000, 2)


In [8]:
# Keep only the sampled businesses whose category string mentions
# 'Restaurants'; rows with no categories at all are treated as non-matches.
mentions_restaurants = business_df_sample['categories'].str.contains(
    'Restaurants', regex=False, na=False
)
restaurants_df = business_df_sample[mentions_restaurants]
print("restaurants_df.shape", restaurants_df.shape)


restaurants_df.shape (3482, 14)


In [9]:
# Show a single row to see which columns the restaurant frame carries.
restaurants_df.head(n=1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
99277,6XOn1p3sbO22UjJGpmCgxg,China Wok,4319 Telegraph Rd,Saint Louis,MO,63129,38.486759,-90.30481,4.5,20,1,"{'AcceptsInsurance': None, 'AgesAllowed': None...","Restaurants, Chinese",


### What are the top categories of restaurants in the dataset?

In [10]:
# One row per category label with its frequency across the sampled restaurants.
# The output columns are named explicitly ('categories', 'count') because the
# names produced by a bare value_counts().reset_index() differ between pandas
# 1.x and 2.x, and the plotting cell relies on exactly these two names.
categories_freq = (
    restaurants_df['categories']
    .str.split(', ')
    .explode()
    .value_counts()
    .rename_axis('categories')
    .reset_index(name='count')
)


In [11]:
top_k = 20

# 'Restaurants' is the label the frame was filtered on, so it is left out of
# the ranking; the remaining labels are already ordered by frequency.
is_other_label = categories_freq['categories'] != 'Restaurants'
top_non_restaurant_cats_freq = categories_freq[is_other_label].head(top_k)

fig = px.pie(
    top_non_restaurant_cats_freq,
    names='categories',
    values='count',
    title=f'Top {top_k} Restaurant Categories',
)
fig.show()


### Where are the restaurants located on the map?

In [12]:

# One marker per sampled restaurant, placed by its latitude/longitude; hovering
# a marker shows the restaurant name.
geo_colors = dict(
    showland=True, landcolor="LightGreen",
    showocean=True, oceancolor="LightBlue",
)
zero_margins = dict(r=0, t=0, l=0, b=0)

fig = px.scatter_geo(
    restaurants_df,
    lat='latitude',
    lon='longitude',
    hover_name='name',
    title="Restaurants Geo-Location",
    projection="natural earth",
)
fig.update_geos(**geo_colors)
fig.update_layout(height=300, margin=zero_margins)
fig.show()


### How many unique cities and restaurants do we have in the dataset?

In [13]:
# Distinct values per column. Restaurants are counted by *name*, so several
# locations sharing one name contribute a single count.
distinct_counts = restaurants_df[['city', 'name']].nunique()

unique_cities = int(distinct_counts['city'])
unique_restaurants = int(distinct_counts['name'])

print("Number of unique cities:", unique_cities)
print("Number of unique restaurants:", unique_restaurants)



Number of unique cities: 409
Number of unique restaurants: 2933


### Which are the most common cuisines that restaurants offer?

In [14]:

# Frequency of every category label among the sampled restaurants.
category_counts = restaurants_df['categories'].str.split(', ').explode().value_counts()

# restaurants_df was filtered on the 'Restaurants' label, so that label appears
# on every row and would always be the tallest bar without saying anything
# about cuisine. Drop it (as the pie chart above does) before taking the top 10.
top_cuisines = category_counts.drop('Restaurants', errors='ignore').head(10)

# Create the bar plot with a different color for each bar
fig = px.bar(x=top_cuisines.index, y=top_cuisines.values, color=top_cuisines.index)

# Set the title and axis labels
fig.update_layout(title="Top 10 Most Common Cuisines",
                  xaxis_title="Cuisine",
                  yaxis_title="Count")

# Show the plot
fig.show()






### Which cities have the most restaurants in our dataset?

In [15]:
# The ten cities that host the most sampled restaurants (value_counts already
# orders them from most to least frequent).
top_cities = restaurants_df['city'].value_counts().head(10)

city_names = top_cities.index
fig = px.bar(x=city_names, y=top_cities.values, color=city_names)

# Title and axis labels are applied in a single layout update.
city_chart_layout = dict(
    title="Top Cities with the Most Number of Restaurants",
    xaxis_title="City",
    yaxis_title="Number of Restaurants",
)
fig.update_layout(**city_chart_layout)

fig.show()






### What does the distribution of reviews that restaurants get look like?
#### We can consider only restaurants with at least 100 reviews, to get a better idea.

In [16]:

# The chart is about restaurants, so it is built from restaurants_df. Reading
# business_df here would mix in every non-restaurant business and would make
# this the only plot not based on the sampled restaurant frame.
# Only restaurants with 100 to 2000 reviews (both bounds inclusive) are kept.
in_review_range = restaurants_df['review_count'].between(100, 2000)

fig = go.Figure()
fig.add_trace(go.Histogram(
        x=restaurants_df.loc[in_review_range, 'review_count'],
        nbinsx=50))

fig.update_layout(
    title='Distribution of Number of Reviews for Each Restaurant (Minimum 100 Reviews)',
    xaxis_title='Number of Reviews (Min: 100; Max: 2000)',
    yaxis_title='Frequency'
)

fig.show()


### What does the distribution of the ratings, in stars, that restaurants get look like?

In [17]:

# Histogram of the star rating of every sampled restaurant.
stars_trace = go.Histogram(x=restaurants_df['stars'], nbinsx=25)

fig = go.Figure(data=[stars_trace])
fig.update_layout(
    dict(
        title='Distribution of Restaurant Ratings',
        xaxis_title='Rating',
        yaxis_title='Frequency',
    )
)

fig.show()


### Is there any pattern between the number of reviews restaurants get and their ratings?

In [18]:

# Restrict the scatter to restaurants with at least 100 reviews.
well_reviewed = restaurants_df[restaurants_df['review_count'] >= 100]

axis_labels = {'review_count': 'Number of Reviews', 'stars': 'Ratings'}
fig = px.scatter(
    well_reviewed,
    x='review_count',
    y='stars',
    labels=axis_labels,
    title='Scatter Plot: Number of Reviews vs Ratings',
)

fig.show()


### At what time of day do restaurants tend to be busiest?

In [19]:
def extract_hour(dates):
    """Return the hour of day of every timestamp packed into one string.

    ``dates`` is a single string of ``'%Y-%m-%d %H:%M:%S'`` timestamps
    separated by ``', '``. The result is a NumPy array holding one hour value
    per timestamp, in the order they appear in the string.
    """
    stamps = dates.split(', ')
    parsed = pd.to_datetime(stamps, format='%Y-%m-%d %H:%M:%S')
    return parsed.hour.to_numpy()


In [20]:
# Each sampled row yields an array of check-in hours; the per-row arrays are
# then flattened into one long array for the histogram.
hours_per_row = checkin_df_sample['date'].apply(extract_hour)
all_checkin_hours = np.concatenate(hours_per_row.to_list())


In [21]:

# Histogram of check-in hours across the sampled rows.
fig = px.histogram(x=all_checkin_hours, nbins=24)

checkin_chart_layout = dict(
    title='Distribution of Check-in Hours',
    xaxis_title='Hour of Check-in',
    yaxis_title='Frequency',
)
fig.update_layout(**checkin_chart_layout)

fig.show()


### What does the trend of ratings for the 'Willie Mae's Scotch House' restaurant in New Orleans look like over time?

In [94]:
# Restaurant to study. The variable name `restanrant_name` (sic) is kept as is
# because the plotting cell further down reads it under this spelling.
restanrant_name = "Willie Mae's Scotch House"
city = 'New Orleans'

# Select by name AND city; the result stays a Series of matching business ids.
is_target = restaurants_df['name'].eq(restanrant_name) & restaurants_df['city'].eq(city)
curr_rest_business_id = restaurants_df.loc[is_target, 'business_id']
curr_rest_business_id


73096    VVH6k9-ycttH3TV_lk5WfQ
Name: business_id, dtype: object

In [86]:
# Extract the reviews of the selected restaurant, order them by date and
# smooth the star ratings with a trailing 90-day (about one quarter) mean.

# curr_rest_business_id is a Series of ids (a plain scalar id is accepted as
# well). Comparing review_df.business_id to it with `==` raises because the
# two Series are not identically labeled, so rows are matched by membership.
target_ids = np.atleast_1d(curr_rest_business_id)
curr_restau_data = review_df[review_df['business_id'].isin(target_ids)]

ratings_over_time = curr_restau_data[['date', 'stars']].copy()
ratings_over_time['date'] = pd.to_datetime(ratings_over_time['date'],
                                           format='%Y-%m-%d %H:%M:%S')
# A time-based rolling window requires a sorted datetime index.
ratings_over_time = ratings_over_time.sort_values(by='date').set_index('date')

# '90D' averages the reviews of the preceding 90 days; an integer window of 90
# would instead average the previous 90 *reviews*, whatever time they span.
# The variable keeps its historical name although the window is quarterly.
monthly_avg_ratings = ratings_over_time['stars'].rolling('90D').mean().reset_index()


In [99]:
# Scatter of the smoothed ratings over time with a LOWESS trendline on top.
smoothed_ratings = monthly_avg_ratings.dropna()
ratings_chart_title = f'Quarterly ratings for {restanrant_name} in {city} over time'

fig = px.scatter(
    smoothed_ratings,
    x='date',
    y='stars',
    title=ratings_chart_title,
    trendline="lowess",
    trendline_options={'frac': 0.1},
    trendline_color_override='magenta',
)
fig.show()

### How many new users, on average, have joined Yelp per month over the years?
#### Show the monthly variance for each year.

In [None]:
# Parse the signup timestamp once, then store its month and year as two new
# columns on users_df.
signup_ts = pd.to_datetime(users_df['yelping_since'])
users_df['yelping_since_month'] = signup_ts.dt.month
users_df['yelping_since_year'] = signup_ts.dt.year


In [None]:
# Number of users that signed up in each (year, month), in chronological order.
signup_keys = ['yelping_since_year', 'yelping_since_month']
yelp_user_cnt = (
    users_df.groupby(by=signup_keys)
    .size()
    .reset_index(name='user_count')
    .sort_values(signup_keys)
)

In [62]:
# One point per (year, month): the monthly signup count plotted against its
# year, with a LOWESS trendline across the years.
monthly_signups = yelp_user_cnt.dropna()
signup_axis_labels = {'user_count': "Monthly New Users",
                      'yelping_since_year': "Year"}

fig = px.scatter(
    monthly_signups,
    x='yelping_since_year',
    y='user_count',
    labels=signup_axis_labels,
    title='Monthly New users over time',
    trendline="lowess",
    trendline_options={'frac': 0.1},
    trendline_color_override='magenta',
)

fig.show()


#### Show total number of yelp users over time

In [63]:
# Running total of signups in row order (rows are sorted by year, then month).
yelp_user_cnt = yelp_user_cnt.assign(
    total_user_joined_count=yelp_user_cnt['user_count'].cumsum()
)

In [91]:
# Plot the cumulative number of users that have joined Yelp over time.

cum_users = yelp_user_cnt.dropna()

# Every row is one (year, month) pair. Plotting against the year alone puts
# twelve points on the same x position, which turns the line into vertical
# steps; a month-start date gives each row its own position on the axis.
month_parts = (
    cum_users[['yelping_since_year', 'yelping_since_month']]
    .rename(columns={'yelping_since_year': 'year', 'yelping_since_month': 'month'})
    .assign(day=1)
)
cum_users = cum_users.assign(joined_month=pd.to_datetime(month_parts))

fig = px.line(cum_users, x='joined_month', y='total_user_joined_count',
                title='Total new users joined Yelp over time',
                labels={'total_user_joined_count': "Total New Users Joined Yelp",
                       'joined_month': "Year"},
                )
fig.update_layout(
    xaxis=dict(
        dtick="M24",      # on a date axis: one tick every 24 months (2 years)
        tickformat="%Y",  # label ticks with the year only
    ),
)
fig.show()


#### How many friends do Yelp users tend to have? What does the distribution of the number of friends look like?
#### We can filter out users who have fewer than 10 or more than 1500 friends, to remove users who either do not use Yelp for networking or are potential bots.

In [14]:
def _count_friends(friends):
    """Return how many friends one ``friends`` cell holds.

    Accepts a missing value (counted as 0), a comma-separated id string (where
    an empty string or the literal 'None' counts as 0), or any sized container.
    """
    if friends is None:
        return 0
    if isinstance(friends, str):
        # NOTE(review): the storage format of this column is not visible here.
        # If it is a comma-separated string, a plain len() would count
        # characters rather than friends - confirm against the parquet schema.
        cleaned = friends.strip()
        if cleaned in ('', 'None'):
            return 0
        return len(cleaned.split(','))
    return len(friends)


num_friends = users_df['friends'].apply(_count_friends)

# Keep users with more than 10 and fewer than 1500 friends (both exclusive).
num_friends_filtered = (
    num_friends[(num_friends > 10) & (num_friends < 1500)]
    .to_frame()
    .rename(columns={'friends': "Number of friends"})
)

In [24]:
# Histogram of the filtered friend counts.
fig = px.histogram(num_friends_filtered, nbins=50)

friends_chart_layout = dict(
    title='Distribution of number of friends',
    xaxis_title='Number of friends',
    yaxis_title='Frequency',
)
fig.update_layout(**friends_chart_layout)

fig.show()

## Among the Yelp influencers, how many fans do they tend to have? Distribution.

### We can limit the number of fans to a minimum of 25, to filter out non-influencer users.
### We can limit the number of fans to a maximum of 2500, to filter out extremely popular influencers or bots, for clarity of view.


In [10]:
# Histogram of fan counts for users with more than 100 and fewer than 2500 fans.
# NOTE(review): the heading above this cell mentions a minimum of 25 fans while
# the filter uses 100 - confirm which threshold is intended.
fan_counts = users_df['fans']
within_fan_range = fan_counts.gt(100) & fan_counts.lt(2500)

fig = px.histogram(
    users_df.loc[within_fan_range, 'fans'],
    labels={"value": 'Number of fans',
            "count": 'Frequency'},
)
fig.show()