In [8]:

import os
import gc

import pandas as pd
import pyarrow.parquet as pq

import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt


In [9]:

# Notebook-wide settings; 'data_dir' is the folder holding the Yelp parquet files.
config = dict(
    data_dir=os.path.join('..', 'data', 'extracted_compressed_data'),
)


### First, let's fetch all the data about businesses that are restaurants.
#### For this, we can refer to the 'categories' attribute of the business data and select businesses that have 'restaurants' as one of the categories.

In [10]:
# Load the business table and keep a fixed-seed sample of 10,000 rows so the
# rest of the notebook stays fast and reproducible.
business_parquet_path = os.path.join(config['data_dir'], 'yelp_academic_dataset_business.parquet')
business_df = (
    pq.read_table(business_parquet_path)
    .to_pandas()
    .sample(n=10000, random_state=42)
)
print("business_df.shape", business_df.shape)


business_df.shape (10000, 14)


In [11]:

def restaurant_filter_fn(x):
    """Return True when the category string `x` mentions 'restaurants'.

    The match is a case-insensitive substring test; a missing value (None)
    is treated as "not a restaurant".
    """
    if x is None:
        return False
    return 'restaurants' in x.lower()

# Boolean mask: True for businesses whose category string mentions restaurants.
is_restaurant = business_df['categories'].apply(restaurant_filter_fn)
restaurants_df = business_df.loc[is_restaurant]
print("restaurants_df.shape", restaurants_df.shape)


restaurants_df.shape (3482, 14)


## What are the top categories of restaurants in the dataset?
#### For this, we'll again use the categories column from the business information dataframe and count the occurrences of every category except 'restaurants'.
##### We will plot the frequency of top categories with a pie chart.

In [12]:

# Lower-case each category string, split it into individual labels, flatten
# to one label per row, then count how often each label occurs.
lowercase_categories = restaurants_df['categories'].str.lower()
category_labels = lowercase_categories.apply(lambda cats: cats.split(', ')).explode()
categories_freq = category_labels.value_counts().reset_index()


In [13]:
top_k = 10

# The labels in `categories_freq` are lower-cased when it is built, so the
# umbrella label must be matched case-insensitively. Comparing against the
# capitalised 'Restaurants' never matches and leaves it in the chart.
is_umbrella_label = categories_freq['categories'].str.lower() == 'restaurants'
top_non_restaurant_cats_freq = categories_freq[~is_umbrella_label][:top_k]

fig = px.pie(
    top_non_restaurant_cats_freq,
    values='count',
    names='categories',
    title=f'Top {top_k} Restaurant Categories',
)
fig.show()


## Where are the restaurants located on map?

#### We have the geo-location of restaurants available as 'latitude' and 'longitude'. 
#### Let's plot those using a scatter plot on the globe with plotly scatter_geo function.

In [14]:

# One marker per restaurant, positioned by its latitude/longitude; hovering a
# marker shows the restaurant name.
fig = px.scatter_geo(
    restaurants_df,
    lat='latitude',
    lon='longitude',
    hover_name='name',
    title="Restaurants Geo-Location",
    projection="natural earth",
)

# Colour land and ocean so the markers stand out against the base map.
geo_style = dict(
    showland=True, landcolor="LightGreen",
    showocean=True, oceancolor="LightBlue",
)
fig.update_geos(**geo_style)

fig.update_layout(height=300, margin=dict(r=0, t=0, l=0, b=0))
fig.show()


## How many unique cities and restaurants do we have in the dataset?

In [15]:
# Distinct values in the 'city' and 'name' columns of the restaurant sample.
# NOTE(review): counting by 'name' treats same-named locations as one
# restaurant; confirm that is the intended definition of "unique".
unique_cities = restaurants_df['city'].nunique()
unique_restaurants = restaurants_df['name'].nunique()

print("Number of unique cities:", unique_cities)
print("Number of unique restaurants:", unique_restaurants)



Number of unique cities: 409
Number of unique restaurants: 2933


## Which are the most common cuisines that restaurants offer?

In [16]:
top_k = 10

# Split each category string into one column per label, stack the columns into
# a single Series (dropping the padding), then keep the top_k most frequent labels.
category_matrix = restaurants_df['categories'].str.split(', ', expand=True)
top_cuisines = category_matrix.stack().value_counts().head(top_k)

fig = px.bar(x=top_cuisines.index, y=top_cuisines.values, color=top_cuisines.index)

cuisine_layout = dict(
    title=f"Top {top_k} Most Common Cuisines",
    xaxis_title="Cuisine",
    yaxis_title="Count",
)
fig.update_layout(**cuisine_layout)

fig.show()


## Which cities have most number of restaurants, in our dataset?

In [17]:
top_k = 10

# Number of sampled restaurants per city, most frequent first.
city_counts = restaurants_df['city'].value_counts()
top_cities = city_counts.head(top_k)

fig = px.bar(x=top_cities.index, y=top_cities.values, color=top_cities.index)

city_layout = dict(
    title=f"Top-{top_k} Cities with the Most Number of Restaurants",
    xaxis_title="City",
    yaxis_title="Number of Restaurants",
)
fig.update_layout(**city_layout)

fig.show()


## What does the distribution of reviews that restaurants get look like?
#### To minimize noise and for clearer visualization, we will consider only the restaurants with at least 100 and at most 2000 reviews. 
##### (These limits were chosen by first analysing the unclipped distribution and then clipping the extreme values that clutter the part of the visualization that represents most restaurants, i.e. 'the middle section'.)

In [18]:
min_reviews = 100
max_reviews = 2000

# Keep only restaurants whose review count lies in [min_reviews, max_reviews]
# (both bounds inclusive).
review_counts = restaurants_df['review_count']
in_review_range = (review_counts >= min_reviews) & (review_counts <= max_reviews)

fig = go.Figure()
fig.add_trace(go.Histogram(x=review_counts[in_review_range], nbinsx=50))

fig.update_layout(
    # Plotly renders title text as HTML-like markup: a line break needs <br>,
    # a "\n" character is not shown as a new line.
    title='Distribution of Number of Reviews for Restaurants'
          f'<br>(Min: {min_reviews}; Max: {max_reviews})',
    xaxis_title='Number of Reviews',
    yaxis_title='Frequency',
)

fig.show()


## What does the distribution of the ratings, in stars, that restaurants get look like?

In [19]:

# Histogram of the 'stars' rating across the sampled restaurants.
rating_trace = go.Histogram(x=restaurants_df['stars'], nbinsx=25)

fig = go.Figure()
fig.add_trace(rating_trace)

rating_layout = dict(
    title='Distribution of Restaurant Ratings',
    xaxis_title='Rating',
    yaxis_title='Frequency',
)
fig.update_layout(**rating_layout)

fig.show()


## Is there any relation between the number of reviews restaurants get and their ratings?

In [20]:

# Only restaurants with at least 100 reviews are plotted.
well_reviewed = restaurants_df[restaurants_df['review_count'] >= 100]

axis_labels = {'review_count': 'Number of Reviews', 'stars': 'Ratings'}
fig = px.scatter(
    well_reviewed,
    x='review_count',
    y='stars',
    labels=axis_labels,
    title='Scatter Plot: Number of Reviews vs Ratings',
)

fig.show()


## At what time, do restaurants tend to be more busy?

#### For this, we'd need to use the checkin data from the checkin.parquet file.

#### Let's first load the check-in data and filter it to keep only the restaurants' data.
#### We can filter based on the business_id's that are present in the restaurants_df. 

In [21]:

checkin_df = pq.read_table(os.path.join(config['data_dir'], 'yelp_academic_dataset_checkin.parquet')).to_pandas()
print("Read checkin_df.shape: ", checkin_df.shape)

# Keep only check-in rows whose business_id belongs to a sampled restaurant.
# `isin` does a hashed membership test, instead of scanning the whole
# restaurant id array once per check-in row.
check_if_rest_mask = checkin_df['business_id'].isin(restaurants_df['business_id'])
checkin_df = checkin_df.loc[check_if_rest_mask]

print("After filtering out non-restaurant data: ")
print("checkin_df.shape: ", checkin_df.shape)

# Reclaim memory from the discarded rows; the trailing ';' keeps the collector's
# return value from being echoed as cell output.
gc.collect();


Read checkin_df.shape:  (131930, 2)
After filtering out non-restaurant data: 
checkin_df.shape:  (3415, 2)


4659

#### Now we can extract all the check-in hours from the check-in times across restaurants.

In [22]:
# Peek at the raw 'date' column: each row is a single string holding that
# business's check-in timestamps separated by ', '.
checkin_df['date']

3         2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012...
89        2015-12-12 02:42:54, 2015-12-22 03:13:48, 2015...
90        2020-12-16 21:36:49, 2020-12-19 20:42:51, 2020...
147       2020-09-09 21:54:07, 2020-09-09 22:12:01, 2020...
161       2011-03-26 01:57:04, 2013-08-21 17:25:54, 2015...
                                ...                        
131723    2011-04-22 22:50:28, 2011-09-24 22:23:05, 2012...
131808    2013-06-30 22:36:55, 2013-07-01 00:42:36, 2013...
131819    2012-03-04 00:25:43, 2012-03-04 21:03:34, 2012...
131838    2012-07-15 17:43:58, 2012-08-18 19:29:01, 2012...
131859    2014-05-11 19:03:37, 2014-05-11 19:07:03, 2014...
Name: date, Length: 3415, dtype: object

In [23]:
# Sanity check: (rows, columns) of the check-in table after the restaurant filter.
checkin_df.shape

(3415, 2)

In [24]:
def extract_hour(dates):
    """Return the hour of day for every timestamp in a ', '-separated string.

    Parameters
    ----------
    dates : str
        Timestamps formatted as '%Y-%m-%d %H:%M:%S', joined by ', '.

    Returns
    -------
    numpy.ndarray
        One integer hour (0-23) per timestamp, in input order.
    """
    stamps = pd.to_datetime(dates.split(', '), format='%Y-%m-%d %H:%M:%S')
    return stamps.hour.to_numpy()

# One array of check-in hours per business row ...
checkin_hours = checkin_df['date'].map(extract_hour)
# ... flattened so that every individual check-in contributes one value.
all_checkin_hours = checkin_hours.explode()

In [25]:
# Plot the check-in hours as a histogram, requesting 24 bins (one per hour of the day).
checkin_layout = dict(
    title='Distribution of Check-in Hours',
    xaxis_title='Hour of Check-in',
    yaxis_title='Frequency',
)

fig = px.histogram(x=all_checkin_hours, nbins=24)
fig.update_layout(**checkin_layout)

fig.show()


In [26]:
# Drop the check-in frame to free memory before the reviews table is loaded;
# the later cells shown here do not reference it again.
del checkin_df

### What does the trend of ratings for the 'Willie Mae's Scotch House' restaurant in New Orleans look like over time?

In [27]:
# The reviews table has ~7M rows, and only these three columns are used before
# `review_df` is deleted. Reading just them keeps the other columns (presumably
# including the review text; confirm against the schema) out of memory.
review_columns = ['business_id', 'date', 'stars']
review_df = pq.read_table(
    os.path.join(config['data_dir'], 'yelp_academic_dataset_review.parquet'),
    columns=review_columns,
).to_pandas()

In [28]:
# Restaurant to analyse, identified by its name and city.
restanrant_name = "Willie Mae's Scotch House"
city = 'New Orleans'

# Select the business_id of the row(s) matching both the name and the city.
name_matches = restaurants_df['name'] == restanrant_name
city_matches = restaurants_df['city'] == city
curr_rest_business_id = restaurants_df.loc[name_matches & city_matches, 'business_id']
curr_rest_business_id


73096    VVH6k9-ycttH3TV_lk5WfQ
Name: business_id, dtype: object

In [29]:
# Peek at the business_id column of the reviews table; it is the key used to
# pick out the selected restaurant's reviews.
review_df.business_id

0          XQfwVwDr-v0ZS3_CbbE5Xw
1          7ATYjTIgM3jUlt4UM3IypQ
2          YjUWPpI6HXG530lwP-fb2A
3          kxX2SOes4o-D3ZQBkiMRfA
4          e4Vwtrqf-wpJfwesgvdgxQ
                    ...          
6990275    jals67o91gcrD4DC81Vk6w
6990276    2vLksaMmSEcGbjI5gywpZA
6990277    R1khUUxidqfaJmcpmGd4aw
6990278    Rr9kKArrMhSLVE9a53q-aA
6990279    VAeEXLbEcI9Emt9KGYq9aA
Name: business_id, Length: 6990280, dtype: object

In [30]:
# Extract the ratings for the selected restaurant, order them by date and
# compute a rolling average over a quarter (a trailing 90-day window).

if curr_rest_business_id.empty:
    # The restaurant table is a sample, so the requested restaurant may be absent.
    raise ValueError(f"No business_id found for {restanrant_name!r} in {city!r}")
target_business_id = curr_rest_business_id.values[0]

curr_restau_data = review_df[review_df.business_id.values == target_business_id]

# Build a fresh frame (no in-place edits on a slice): parse the timestamps,
# sort chronologically and index by date so a time-based window can be used.
ratings_over_time = (
    curr_restau_data[['date', 'stars']]
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S'))
    .sort_values(by='date')
    .set_index('date')
)

# '90D' averages the reviews from the preceding 90 days. An integer window of
# 90 would instead average the previous 90 *rows*, whatever time they span.
# Time-based windows default to min_periods=1, so early points average fewer reviews.
monthly_avg_ratings = ratings_over_time.rolling('90D').mean().reset_index()


In [31]:
# Scatter the rolling-average ratings over time and overlay a LOWESS trendline.
rated_points = monthly_avg_ratings.dropna()

fig = px.scatter(
    rated_points,
    x='date',
    y='stars',
    title=f'Quarterly ratings for {restanrant_name} in {city} over time',
    trendline="lowess",
    trendline_options={'frac': 0.1},
    trendline_color_override='magenta',
)
fig.show()

In [32]:
# Drop the reviews table (about 7 million rows) to free memory before the
# users table is loaded.
del review_df

### How many new users, on average, have joined Yelp over the years?  
#### Also, show the data at monthly granularity, so the month-to-month variance within each year is visible.

In [33]:
# Load the full users table into pandas.
users_parquet_path = os.path.join(config['data_dir'], 'yelp_academic_dataset_user.parquet')
users_df = pq.read_table(users_parquet_path).to_pandas()

In [34]:
# Parse the sign-up timestamps once and derive both calendar parts from the
# same parsed Series, rather than re-parsing the whole column for each part.
yelping_since_ts = pd.to_datetime(users_df['yelping_since'])
users_df['yelping_since_month'] = yelping_since_ts.dt.month
users_df['yelping_since_year'] = yelping_since_ts.dt.year


In [35]:
# Number of users who signed up in each (year, month), in chronological order.
signup_period = ['yelping_since_year', 'yelping_since_month']
yelp_user_cnt = (
    users_df.groupby(by=signup_period)
    .size()
    .reset_index(name='user_count')
    .sort_values(signup_period)
)

In [36]:

# One point per (year, month): the months of a year share the same x position,
# so the vertical spread shows the within-year variation. A LOWESS trendline
# is drawn on top.
new_user_labels = {
    'user_count': "Monthly New Users",
    'yelping_since_year': "Year",
}

fig = px.scatter(
    yelp_user_cnt.dropna(),
    x='yelping_since_year',
    y='user_count',
    title='Monthly New users over time',
    labels=new_user_labels,
    trendline="lowess",
    trendline_options={'frac': 0.1},
    trendline_color_override='magenta',
)

fig.show()

## Show total number of yelp users over time

In [37]:
# Running total of sign-ups, accumulated in the table's chronological row order.
running_user_total = yelp_user_cnt['user_count'].cumsum()
yelp_user_cnt['total_user_joined_count'] = running_user_total

In [38]:

# Line chart of the cumulative number of users who have joined, by year.
total_user_labels = {
    'total_user_joined_count': "Total New Users Joined Yelp",
    'yelping_since_year': "Year",
}

fig = px.line(
    yelp_user_cnt.dropna(),
    x='yelping_since_year',
    y='total_user_joined_count',
    title='Total new users joined Yelp over time',
    labels=total_user_labels,
)

# dtick=2: spacing between the ticks shown on the x axis.
fig.update_layout(xaxis={'dtick': 2})
fig.show()

# NOTE(review): no matplotlib figure is created in the cells shown here, so this
# call looks like a leftover; confirm before removing.
plt.close()


## How many friends do yelp users tend to have? What does the distribution of number of friends look like?
#### To reduce noise and for clearer visualization, we will keep only users who have more than 10 and fewer than 1500 friends.

In [39]:
def _count_friends(friends):
    """Return how many friends one `friends` cell represents.

    NOTE(review): other multi-valued columns in this dataset (e.g. the check-in
    'date' column) are stored as a single ', '-separated string. If `friends`
    is stored the same way, `len()` would count characters rather than friends.
    Confirm the column's actual type. Both layouts are handled here:

    * str: number of non-empty comma-separated ids ('' and 'None' count as 0);
    * list-like: its length (the original behaviour);
    * None: 0.
    """
    if friends is None:
        return 0
    if isinstance(friends, str):
        if friends.strip() in ('', 'None'):
            return 0
        return sum(1 for friend_id in friends.split(',') if friend_id.strip())
    return len(friends)


num_friends = users_df['friends'].apply(_count_friends)

# Keep users with more than 10 and fewer than 1500 friends, as a one-column
# frame whose column name doubles as the plot label.
num_friends_filtered = (
    num_friends[(num_friends > 10) & (num_friends < 1500)]
    .to_frame()
    .rename(columns={'friends': "Number of friends"})
)

In [40]:
# Histogram of the filtered friend counts, requesting 50 bins.
friends_layout = dict(
    title='Distribution of number of friends',
    xaxis_title='Number of friends',
    yaxis_title='Frequency',
)

fig = px.histogram(num_friends_filtered, nbins=50)
fig.update_layout(**friends_layout)

fig.show()

## Among the Yelp influencers, how many fans do they tend to have? What does the distribution look like?

#### We'll limit the visualization to users with more than 100 and fewer than 2500 fans, to filter out non-influencer users and for clearer visualization.


In [41]:
# Fan counts of users with more than 100 and fewer than 2500 fans.
fan_counts = users_df['fans']
influencer_fans = fan_counts[(fan_counts > 100) & (fan_counts < 2500)]

fan_axis_labels = {"value": 'Number of fans', "count": 'Frequency'}
fig = px.histogram(influencer_fans, labels=fan_axis_labels)
fig.show()