In [5]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from task2_funcs import *
from tabulate import tabulate

# Task 2: Consultancy for opening a restaurant in the city of Philadelphia

<!-- Task 2 is more open in nature as there is no specific target. We don’t expect you to analyse all aspects of the problem. You can decide yourself on which approaches, summary statistics or analysis procedures you focus on. Grading will be based on scientific correctness, originality and presentation. -->

## Introduction

The goal of this report is to analyse customer review data to better understand the regional market so that the business is more likely to achieve success for the opening of their restaurant in Philadelphia. In particular, we aim to answer the following questions:

1. An insight on what restaurant consumers generally seem to like (for example in terms of food, service, location, etc…).
2. An analysis of the evolution of food trends in the area over time, in terms of consumer preferences. Do the preferences evolve over time, or do they seem stable?
3. Imagine you have to present your findings to the business owner and his investors. What advice would you give to the new business, based on your findings?

Given the nature of the task, we'll focus on customer reviews which concern restaurants located in the city of Philadelphia. First, let's take a look at the data to see how we can approach this task.

## Extracting restaurant reviews in Philadelphia
Before starting the analysis, we'll have to obtain the relevant review data. Let's start by importing the data and then inspecting their structures. It is worth mentioning that the reviews in the test data (`ATML2024_reviews_test.csv`) are NOT used in this analysis, since they don't contain the customers' ratings of the businesses, without which we cannot infer customer preferences for restaurants in Philadelphia.

In [6]:
# Reading the three raw tables: training reviews, user profiles and business listings
reviews_df = pd.read_csv("datasets/ATML2024_reviews_train.csv")
users_df = pd.read_csv("datasets/ATML2024_users.csv")
business_df = pd.read_csv("datasets/ATML2024_businesses.csv")

# Reference list of Philadelphia ZIP codes, used later to filter businesses in the city.
# The ZIP column is cast to string so that it can be matched against `postal_code`.
zip_code_philly = pd.read_excel("datasets/PHILADELPHIA.xlsx")
zip_code_philly = zip_code_philly.astype({"ZIP Code 5": "string"})

### Glimpsing at the data
The cells below show the data types of the columns as well as the first 5 rows of each dataset. Based on the output, in order to extract reviews about restaurants in Philadelphia, we can first keep only the businesses that are based in Philadelphia according to the `city` column, and then look at those whose categories include restaurants. One crucial thing to note, however, is that due to the textual nature of the data, there's no guarantee that the `city` column is free of typos or that it refers to Philadelphia in a standardised way. For example, the city is sometimes referred to as Philly. We shall inspect this column in more detail to ensure that we include all the restaurant reviews in Philadelphia (or at least that we don't miss too many because of typos).

We can also notice that some columns aren't in the correct data types and will need to be changed if they're to be used in the following analysis. For instance, the date-related columns (`date` in `reviews_df` and `user_since` in `users_df`) are wrongly marked as `object`, and the `premium_account` column is just a string of years concatenated together, which might cause some trouble if we'd like to look at the number of premium users by year. But for now let's focus on filtering restaurant reviews in Philadelphia.

In [7]:
# `DataFrame.info()` prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050000 entries, 0 to 1049999
Data columns (total 9 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   id           1050000 non-null  int64 
 1   user_id      1050000 non-null  object
 2   business_id  1050000 non-null  object
 3   rating       1050000 non-null  int64 
 4   useful       1050000 non-null  int64 
 5   funny        1050000 non-null  int64 
 6   cool         1050000 non-null  int64 
 7   text         1050000 non-null  object
 8   date         1050000 non-null  object
dtypes: int64(5), object(4)
memory usage: 72.1+ MB
None


In [8]:
# First five reviews rendered as an org-mode style text table (index column hidden)
reviews_preview = tabulate(reviews_df.head(), headers="keys", tablefmt="orgtbl", showindex=False)
print(reviews_preview)

|   id | user_id                | business_id            |   rating |   useful |   funny |   cool | text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      | date    

In [9]:
# `DataFrame.info()` prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 747468 entries, 0 to 747467
Data columns (total 19 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             747468 non-null  object 
 1   name                747457 non-null  object 
 2   user_since          747468 non-null  object 
 3   useful              747468 non-null  float64
 4   funny               747468 non-null  float64
 5   cool                747468 non-null  float64
 6   premium_account     57420 non-null   object 
 7   friends             747468 non-null  float64
 8   fans                747468 non-null  float64
 9   compliment_hot      747468 non-null  float64
 10  compliment_more     747468 non-null  float64
 11  compliment_profile  747468 non-null  float64
 12  compliment_cute     747468 non-null  float64
 13  compliment_list     747468 non-null  float64
 14  compliment_note     747468 non-null  float64
 15  compliment_plain    747468 non-nul

In [10]:
# First five user profiles rendered as an org-mode style text table (index column hidden)
users_preview = tabulate(users_df.head(), headers="keys", tablefmt="orgtbl", showindex=False)
print(users_preview)

| user_id                | name   | user_since          |   useful |   funny |   cool | premium_account                                                   |   friends |   fans |   compliment_hot |   compliment_more |   compliment_profile |   compliment_cute |   compliment_list |   compliment_note |   compliment_plain |   compliment_cool |   compliment_funny |   compliment_writer |
|------------------------+--------+---------------------+----------+---------+--------+-------------------------------------------------------------------+-----------+--------+------------------+-------------------+----------------------+-------------------+-------------------+-------------------+--------------------+-------------------+--------------------+---------------------|
| w7IdXgBVXKjZS5UYDO8cVq | Walker | 2007-01-25 16:47:26 |     7217 |    1259 |   5994 | 2007                                                              |     14995 |    267 |              250 |                65 |                   

In [11]:
# `DataFrame.info()` prints the summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
business_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138210 entries, 0 to 138209
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   business_id  138210 non-null  object 
 1   name         138210 non-null  object 
 2   address      133772 non-null  object 
 3   city         138210 non-null  object 
 4   state        138210 non-null  object 
 5   postal_code  138145 non-null  object 
 6   latitude     138210 non-null  float64
 7   longitude    138210 non-null  float64
 8   attributes   126589 non-null  object 
 9   categories   138136 non-null  object 
 10  hours        117852 non-null  object 
dtypes: float64(2), object(9)
memory usage: 11.6+ MB
None


In [12]:
# First five businesses rendered as an org-mode style text table (index column hidden)
business_preview = tabulate(business_df.head(), headers="keys", tablefmt="orgtbl", showindex=False)
print(business_preview)

| business_id            | name                     | address                         | city         | state   |   postal_code |   latitude |   longitude | attributes                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 | categories                                                                         | hours                                                                                                                                     

### Extracting businesses located in Philadelphia
The first step is to extract businesses located in Philadelphia. This [Wikipedia page](https://en.wikipedia.org/wiki/List_of_Philadelphia_neighborhoods) shows the many names of neighbourhoods and districts in the city which do not necessarily contain "Philadelphia" (and the code output below shows that the city names are not free of typos...), so simply matching the `city` column with literal strings will probably miss out a considerable amount of businesses. Therefore, we will resort to using ZIP codes to extract businesses located in the city of Philadelphia.

In [13]:
# Listing every distinct spelling in `city` that contains "phi" (case-insensitive)
import re

philly_pattern = re.compile(r"phi.*", flags=re.IGNORECASE)
unique_cities = business_df['city'].unique()
philly_matches = [philly_pattern.search(city) is not None for city in unique_cities]
print(unique_cities[philly_matches])  # Typos...

['Philadelphia' 'Philadephia' 'PHILA' 'Southwest Philadelphia' 'Phila'
 'Philadelphia PA' 'Philadelphila ' 'PHILADELPHIA' 'West Philadelphia'
 'philadelphia' 'Philiidelphia' 'Philly' 'Philadelphia (Northeast Philly)'
 'Philadelphia ' 'Philiadelphia']


In [14]:
# Getting businesses in Philly based on zip codes.
# `.copy()` makes `business_philly` an independent frame: a later cell overwrites
# its `categories` column, and writing into a filtered selection of `business_df`
# can trigger pandas' SettingWithCopyWarning.
zip_code_list = zip_code_philly["ZIP Code 5"].to_list()
business_philly = business_df.query("postal_code in @zip_code_list").copy()
business_philly.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,attributes,categories,hours
2,wm9eoqjytVbC7dQcM4WSTM,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
13,AeNmR681lBdMyqu8eqTTUM,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.953949,-75.143226,"{'RestaurantsReservations': 'True', 'Restauran...","Sushi Bars, Restaurants, Japanese","{'Tuesday': '13:30-22:0', 'Wednesday': '13:30-..."
17,gCT6F7gqR50heBwQJcaeOR,BAP,1224 South St,Philadelphia,PA,19147,39.943223,-75.162568,"{'NoiseLevel': ""u'quiet'"", 'GoodForMeal': ""{'d...","Korean, Restaurants","{'Monday': '11:30-20:30', 'Tuesday': '11:30-20..."
26,g7r5IhhJGdpyFyoWB27NdQ,Bar One,767 S 9th St,Philadelphia,PA,19147,39.939825,-75.157447,"{'Smoking': ""u'no'"", 'NoiseLevel': ""u'average'...","Cocktail Bars, Bars, Italian, Nightlife, Resta...","{'Monday': '16:0-0:0', 'Tuesday': '16:0-0:0', ..."
29,Q_dELKJOieBlN9M42zobjM,DeSandro on Main,4105 Main St,Philadelphia,PA,19127,40.022466,-75.218314,"{'RestaurantsReservations': 'False', 'Caters':...","Pizza, Restaurants, Salad, Soup","{'Tuesday': '17:0-21:30', 'Wednesday': '17:0-1..."


### Checking Restaurant-related Businesses
We shall also pre-process the `categories` column to ensure that we do not miss restaurant-related reviews due to typos. Using regex to look for categories containing part of the spelling of "restaurants" (namely "Rest"), we see that several results are returned, of which 3 contain the term 'restaurant'.

In [15]:
from itertools import chain

# Lower-case the categories so that matching is case-insensitive from here on.
# `assign` builds a new frame instead of writing through `.loc` into a filtered
# selection of `business_df`, which can raise SettingWithCopyWarning.
business_philly = business_philly.assign(categories=business_philly["categories"].str.lower())

# Getting all the distinct business categories: each non-null entry is a
# comma-separated string, so split it and flatten the resulting lists.
categories_split = chain.from_iterable(
    business_philly["categories"].dropna().str.split(r",\s+", regex=True)
)
# Sorted so that the order (and hence the displayed output) is the same on every
# run; plain `set` iteration order of strings varies between interpreter sessions.
unique_categories = sorted(set(categories_split))

# Checking which categories might contain part of the spelling of the word restaurant
resto_matches = [re.search(r"Rest.*", cat, flags=re.IGNORECASE) is not None for cat in unique_categories]
np.array(unique_categories)[resto_matches]

array(['pop-up restaurants', 'rest stops', 'restaurants',
       'restaurant supplies', 'art restoration', 'damage restoration'],
      dtype='<U35')

Unlike the case of city names where we can resort to zip codes as a reference, we need another way to avoid missing restaurant-related businesses in the dataset. Here, we can use Jaro similarity to check whether restaurants are referred to in multiple ways due to typos. Mathematically, the Jaro similarity $sim_j$ between two strings $s_1 \; \text{and} \; s_2$ is defined as follows (a more detailed discussion of the Jaro similarity and its variants can be found [here](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance)):

\begin{align}
sim_j = 
\begin{cases}
0 \quad &\text{if} \; m = 0 \\
\frac{1}{3}(\frac{m}{|s_1|} + \frac{m}{|s_2|} + \frac{m-t}{m}) \; &\text{otherwise}
\end{cases},
\end{align}

where $|s_i|$ is the length of the string $s_i$, $m$ is the number of matching characters (characters in $s_1 \; \text{and} \; s_2$ are matching only if they are identical and at most $\left\lfloor \frac{\max(|s_1|, |s_2|)}{2} \right\rfloor - 1$ characters apart), and $t$ is the number of transpositions (i.e. swapping the positions of two characters), which is calculated as the number of matching characters that are not in the correct order divided by two.

Admittedly, there exist other string distance measures such as the Levenshtein distance as briefly mentioned in class. Nevertheless, we will use Jaro similarity because its output always ranges from 0 to 1 no matter the lengths of the two strings under comparison which makes thresholding easier. Moreover, Jaro similarity satisfies the mathematical definition of a distance metric, whereas its variant Jaro-Winkler similarity violates the triangle inequality.

If we use a threshold of 0.7 for the Jaro score to match the categories, only the top 3 results have their spellings containing the word 'restaurant.' Indeed, only the categories of *restaurants* and *pop-up restaurants* will be relevant for helping the business owners understand the customer preferences of the regional market, as `restaurant supplies` is more about the operation of a restaurant per se rather than on the customer's side. Therefore, we can just extract businesses whose categories are under *restaurants* or *pop-up restaurants* in `business_df`. 

In [16]:
# Checking how many categories can be considered as variants of restaurant-related
# businesses, using the Jaro-score cut-off discussed in the text above
jaro_threshold = 0.7
jaro_replace_names(unique_categories, "restaurants", threshold=jaro_threshold)

restaurants -> restaurants, score: 1.00
restaurant supplies -> restaurants, score: 0.86
pop-up restaurants -> restaurants, score: 0.78
austrian -> restaurants, score: 0.74
arts & crafts -> restaurants, score: 0.74
party bus rentals -> restaurants, score: 0.73
art space rentals -> restaurants, score: 0.73
vegetarian -> restaurants, score: 0.72
rest stops -> restaurants, score: 0.72
aerial tours -> restaurants, score: 0.71
real estate -> restaurants, score: 0.71
shutters -> restaurants, score: 0.71
southern -> restaurants, score: 0.71


In [17]:
# Getting only restaurant businesses in Philadelphia.
# A single substring test is enough: "pop-up restaurants" itself contains
# "restaurants", while "restaurant supplies" does not and is therefore excluded.
# `case=False` keeps the match correct even if `categories` has not been
# lower-cased, and `na=False` treats businesses without categories as non-matches
# instead of putting missing values into the boolean mask.
restaurant_masks = business_philly["categories"].str.contains(
    "restaurants", case=False, na=False, regex=False
)
resto_philly = business_philly.loc[restaurant_masks, :]
resto_philly.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,attributes,categories,hours
2,wm9eoqjytVbC7dQcM4WSTM,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","restaurants, food, bubble tea, coffee & tea, b...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
13,AeNmR681lBdMyqu8eqTTUM,Tuna Bar,205 Race St,Philadelphia,PA,19106,39.953949,-75.143226,"{'RestaurantsReservations': 'True', 'Restauran...","sushi bars, restaurants, japanese","{'Tuesday': '13:30-22:0', 'Wednesday': '13:30-..."
17,gCT6F7gqR50heBwQJcaeOR,BAP,1224 South St,Philadelphia,PA,19147,39.943223,-75.162568,"{'NoiseLevel': ""u'quiet'"", 'GoodForMeal': ""{'d...","korean, restaurants","{'Monday': '11:30-20:30', 'Tuesday': '11:30-20..."
26,g7r5IhhJGdpyFyoWB27NdQ,Bar One,767 S 9th St,Philadelphia,PA,19147,39.939825,-75.157447,"{'Smoking': ""u'no'"", 'NoiseLevel': ""u'average'...","cocktail bars, bars, italian, nightlife, resta...","{'Monday': '16:0-0:0', 'Tuesday': '16:0-0:0', ..."
29,Q_dELKJOieBlN9M42zobjM,DeSandro on Main,4105 Main St,Philadelphia,PA,19127,40.022466,-75.218314,"{'RestaurantsReservations': 'False', 'Caters':...","pizza, restaurants, salad, soup","{'Tuesday': '17:0-21:30', 'Wednesday': '17:0-1..."


### Getting restaurant review data in Philly
We can finally merge `reviews_df` with `resto_philly` to extract reviews (particularly the ratings) of restaurants located in the city of Philadelphia while also combining the restaurants' information into one table for analysis later on.

In [18]:
# Joining user reviews with restaurant information (an inner join on `business_id`
# keeps only reviews whose business is in `resto_philly`), then dropping the
# redundant columns in the same chain
redundant_columns = ['id', 'user_id', 'business_id', 'state']
resto_reviews_philly = (
    reviews_df
    .merge(resto_philly, how="inner", on="business_id")
    .drop(columns=redundant_columns)
)

We now inspect the restaurant review data to see if more cleaning needs to be done. The data type of the `date` column is defined as `object` which will hinder our ability to analyse changes in customer preferences over time later, so let's convert it into `datetime` type object. By looking at the first few rows of the data, we also notice that the `attributes` column is in json format. It will be good to convert them into something easier to use for subsequent analysis if needed, namely by removing the symbols specific to json format with regex.

Missing values are also present in some columns of our data, and this is relatively the most serious for `hours`. Given the primary task is to make sense of customer preferences for restaurants in Philadelphia, opening hours are likely to be less informative than food style and service for deciding the optimal business strategy to open a new restaurant in the city. Accordingly, we will simply drop the `hours` column first, and then check how many rows will be omitted if we only include rows which contain non-null values in all the other columns.

In [19]:
# Inspecting the data types of the columns.
# `info()` prints its own summary and returns None, so it is not wrapped in
# print() (that would add a stray "None" line before the table below).
resto_reviews_philly.info()
resto_reviews_philly.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103290 entries, 0 to 103289
Data columns (total 15 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   rating       103290 non-null  int64  
 1   useful       103290 non-null  int64  
 2   funny        103290 non-null  int64  
 3   cool         103290 non-null  int64  
 4   text         103290 non-null  object 
 5   date         103290 non-null  object 
 6   name         103290 non-null  object 
 7   address      103162 non-null  object 
 8   city         103290 non-null  object 
 9   postal_code  103290 non-null  object 
 10  latitude     103290 non-null  float64
 11  longitude    103290 non-null  float64
 12  attributes   103227 non-null  object 
 13  categories   103290 non-null  object 
 14  hours        99578 non-null   object 
dtypes: float64(2), int64(4), object(9)
memory usage: 11.8+ MB
None


Unnamed: 0,rating,useful,funny,cool,text,date,name,address,city,postal_code,latitude,longitude,attributes,categories,hours
0,4,0,0,0,One of my favorite places to grab a quick bit ...,2014-05-28 19:05:26,Chickie's & Pete's,1526 Packer Ave,Philadelphia,19145,39.911417,-75.174511,"{'GoodForKids': 'True', 'Corkage': 'False', 'H...","seafood, nightlife, sports bars, bars, restaur...","{'Monday': '11:0-2:0', 'Tuesday': '11:0-2:0', ..."
1,5,0,0,0,I love this place! There's nothing like authen...,2013-05-28 16:52:23,Guacamole Mex-Grill,4612 Woodland Ave,Philadelphia,19143,39.943971,-75.209914,"{'Caters': 'False', 'RestaurantsAttire': ""u'ca...","mexican, restaurants, specialty food, ethnic f...","{'Monday': '11:0-20:0', 'Tuesday': '11:0-20:0'..."
2,4,2,0,0,Situated on Race street away from the main str...,2014-12-21 15:03:49,Shiao Lan Kung,930 Race St,Philadelphia,19107,39.955247,-75.155409,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","noodles, seafood, restaurants, chinese","{'Tuesday': '15:0-2:0', 'Wednesday': '15:0-2:0..."
3,5,0,0,0,"Outstanding venue, great band, and the food is...",2015-10-17 01:07:26,Amari's Restaurant,5037 Baltimore Ave,Philadelphia,19143,39.947985,-75.224744,"{'HasTV': 'True', 'RestaurantsAttire': ""u'casu...","soul food, american (new), breakfast & brunch,...","{'Wednesday': '7:30-15:0', 'Thursday': '7:30-1..."
4,5,4,0,2,We loved our experience at Elwood.\n\nWhen you...,2019-05-04 12:24:05,Elwood,1007 Frankford Ave,Philadelphia,19125,39.966404,-75.134227,"{'RestaurantsReservations': 'True', 'OutdoorSe...","american (traditional), american (new), food, ...","{'Thursday': '17:0-22:0', 'Friday': '17:0-22:0..."


In [20]:
# Converting the date column into datetime data type for easier manipulation;
# `normalize()` resets the time-of-day part to midnight.
resto_reviews_philly["date"] = pd.to_datetime(resto_reviews_philly['date']).dt.normalize()

# Cleaning attributes: strip braces and quotes, plus the Python-2 style `u` prefix
# of values such as u'casual'. The look-ahead restricts the `u` removal to a `u`
# that is immediately followed by a quote; a bare `\bu` would also eat the first
# letter of any word starting with "u" (e.g. "upscale" -> "pscale").
attribute_noise = r"\{|\}|'|\"|\bu(?=['\"])"
resto_reviews_philly["attributes"] = resto_reviews_philly["attributes"].str.replace(attribute_noise, "", regex=True)
resto_reviews_philly["attributes"] = resto_reviews_philly["attributes"].str.split(", ")
resto_reviews_philly["attributes"].iloc[0][:5]  # I'll leave it like this for now but do further pre-processing if you need to

['GoodForKids: True',
 'Corkage: False',
 'HasTV: True',
 'RestaurantsAttire: casual',
 'RestaurantsTakeOut: True']

If we look at the shape of the `resto_reviews_philly` dataset after first dropping the `hours` column and then the rows containing any null values, the number of observations decreases by fewer than 200, which is minimal, so we will just proceed with this approach for dealing with missing values.

In [21]:
# Dropping the hours column, then the rows which contain missing values in any of
# the remaining columns. `errors="ignore"` and rebinding (instead of in-place
# mutation) make the cell safe to re-run: a second run no longer raises KeyError
# because `hours` is already gone.
resto_reviews_philly = (
    resto_reviews_philly
    .drop(columns=["hours"], errors="ignore")
    .dropna()
)

# Checking how many obs we lose
resto_reviews_philly.info()  # The loss in observation is fewer than 200 rows which is minimal

<class 'pandas.core.frame.DataFrame'>
Index: 103099 entries, 0 to 103289
Data columns (total 14 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   rating       103099 non-null  int64         
 1   useful       103099 non-null  int64         
 2   funny        103099 non-null  int64         
 3   cool         103099 non-null  int64         
 4   text         103099 non-null  object        
 5   date         103099 non-null  datetime64[ns]
 6   name         103099 non-null  object        
 7   address      103099 non-null  object        
 8   city         103099 non-null  object        
 9   postal_code  103099 non-null  object        
 10  latitude     103099 non-null  float64       
 11  longitude    103099 non-null  float64       
 12  attributes   103099 non-null  object        
 13  categories   103099 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(4), object(7)
memory usage: 11.8+ MB


Looking again at the `info()` of the dataset, every column now has the correct data type, and the missing values are removed. Looking at the first 5 rows of the dataset, our pre-processing steps also seem to have worked as intended. However, if we would like to use machine learning techniques (e.g. dimensionality reduction) on the textual data (namely, `text`, `categories` and potentially `attributes`), then we will need a way to convert them into numerical embeddings which can be understood by computers. We shall discuss this more when we analyse the data in later parts of this report.

In [22]:
# Previewing the first five rows after cleaning: format-wise the data look good
resto_reviews_philly.head(n=5)

Unnamed: 0,rating,useful,funny,cool,text,date,name,address,city,postal_code,latitude,longitude,attributes,categories
0,4,0,0,0,One of my favorite places to grab a quick bit ...,2014-05-28,Chickie's & Pete's,1526 Packer Ave,Philadelphia,19145,39.911417,-75.174511,"[GoodForKids: True, Corkage: False, HasTV: Tru...","seafood, nightlife, sports bars, bars, restaur..."
1,5,0,0,0,I love this place! There's nothing like authen...,2013-05-28,Guacamole Mex-Grill,4612 Woodland Ave,Philadelphia,19143,39.943971,-75.209914,"[Caters: False, RestaurantsAttire: casual, Noi...","mexican, restaurants, specialty food, ethnic f..."
2,4,2,0,0,Situated on Race street away from the main str...,2014-12-21,Shiao Lan Kung,930 Race St,Philadelphia,19107,39.955247,-75.155409,"[RestaurantsGoodForGroups: True, RestaurantsTa...","noodles, seafood, restaurants, chinese"
3,5,0,0,0,"Outstanding venue, great band, and the food is...",2015-10-17,Amari's Restaurant,5037 Baltimore Ave,Philadelphia,19143,39.947985,-75.224744,"[HasTV: True, RestaurantsAttire: casual, DogsA...","soul food, american (new), breakfast & brunch,..."
4,5,4,0,2,We loved our experience at Elwood.\n\nWhen you...,2019-05-04,Elwood,1007 Frankford Ave,Philadelphia,19125,39.966404,-75.134227,"[RestaurantsReservations: True, OutdoorSeating...","american (traditional), american (new), food, ..."


In [23]:
# Summary statistics of the numeric and datetime columns of the cleaned reviews
resto_reviews_philly.describe()

Unnamed: 0,rating,useful,funny,cool,date,latitude,longitude
count,103099.0,103099.0,103099.0,103099.0,103099,103099.0,103099.0
mean,3.813296,1.096102,0.368364,0.544205,2015-12-05 20:18:41.423098368,39.961504,-75.160255
min,1.0,0.0,0.0,0.0,2005-05-26 00:00:00,39.865466,-75.325
25%,3.0,0.0,0.0,0.0,2013-06-18 00:00:00,39.947631,-75.17157
50%,4.0,0.0,0.0,0.0,2016-03-22 00:00:00,39.950956,-75.16177
75%,5.0,1.0,0.0,1.0,2018-08-19 00:00:00,39.961327,-75.150251
max,5.0,115.0,82.0,112.0,2022-01-19 00:00:00,40.141488,-74.940729
std,1.303016,2.475243,1.378088,1.794338,,0.035732,0.035684


In [24]:
# NOTE(review): this whole cell is disabled (commented out). It refers to
# `restos_philly`, which is not defined in the cells above (the frames built so far
# are `resto_philly` and `resto_reviews_philly`) — fix the name before re-enabling.

# # Creating copy of the restaurant data
# batch_n = int(5e4)
# restos_philly.iloc[:batch_n, :].to_csv("restos_philly_batch1.csv", index=False)
# restos_philly.iloc[batch_n:, :].to_csv("restos_philly_batch2.csv", index=False)


# # Loading the pre-processed data in csv files
# restos_b1 = pd.read_csv("restos_philly_batch1.csv")
# restos_b2 = pd.read_csv("restos_philly_batch2.csv")
# restos_all = pd.concat([restos_b1, restos_b2])
# assert restos_all.shape == restos_philly.shape

## Exploring the restaurant reviews data

Now that we have extracted the restaurant review data in Philadelphia, let's start by some exploratory data analysis (EDA) to make sense of the data and (hopefully) find some inspirations of what advice can be given to the business owner. 

### How often are restaurants reviewed by customers?

To begin with, how often is each restaurant reviewed by customers in Philly? The histogram below indicates that very few restaurants have received more than 30 reviews. 

<!-- Some EDAs
1. Distribution of the count of reviews per resto
2. 
-->

<!-- Some ideas of the analysis: 
1. Use embeddings on reviews and categories and then run LSA to summarise them as "themes" (e.g., what is a review about in terms of food, service etc.? How can we group many categories into one based on their similarity?) 
2. Analyse how the themes are related to rating to understand what customers like/ don't like about
-->

In [25]:
# Tally of reviews per restaurant name.
# NOTE(review): the tally is keyed by `name`, so outlets sharing a name are pooled
# into one entry — confirm that this is intended.
review_counts = resto_reviews_philly["name"].value_counts()

# Histogram of those tallies, with axis labels applied in the same chain
fig = px.histogram(
    review_counts,
    title=" Histogram of the number of customer reviews per restaurant in Philadelphia",
).update_layout(
    xaxis_title="Number of customer reviews",
    yaxis_title="Count of restaurants",
    showlegend=False,
)

fig.show()

In particular, which are the most frequently reviewed restaurants in the city? And what are their average ratings and types of restaurant? We can observe from below that the cuisine styles of the top 20 most reviewed restaurants are quite diverse, as we have cafés, Chinese restaurants and steakhouses among other types. Furthermore, most of the restaurants receive average ratings of at least 4, which could imply that customers in Philadelphia tend to review restaurants which they like, although there is the exception of Geno's Steaks, which only has an average rating of around 2.5.

In [26]:
# The 20 most reviewed restaurants (`value_counts` already sorts in descending order)
top_20_counts = review_counts.iloc[:20]
top_20_restos = top_20_counts.index

# Mean rating of each of those restaurants, computed from their reviews only
in_top_20 = resto_reviews_philly["name"].isin(top_20_restos)
top_20_avg_ratings = (
    resto_reviews_philly
    .loc[in_top_20]
    .groupby("name")["rating"]
    .mean()
)

# Both series are aligned on the restaurant name; `index` fixes the row order
top_20s = pd.DataFrame(
    {"reviews_received": top_20_counts, "average_rating": top_20_avg_ratings},
    index=top_20_restos,
)
top_20s

Unnamed: 0_level_0,reviews_received,average_rating
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Reading Terminal Market,888,4.603604
Pat's King of Steaks,624,3.205128
Green Eggs Café,560,3.882143
Sabrina's Café,530,4.173585
Dim Sum Garden,523,3.923518
El Vez,511,4.017613
Geno's Steaks,505,2.493069
Barbuzzo,440,4.279545
Zahav,439,4.526196
Parc,426,4.239437


Below we also look at the proportion of reviews about the top 20 most reviewed restaurants out of all the restaurants in the dataset. They altogether received about 9% of the reviews from customers in Philadelphia, which is not a small proportion.

In [27]:
# Number of distinct restaurant names, and the share of all reviews that went to
# the 20 most reviewed ones.
unique_restos = resto_reviews_philly["name"].unique()
n_restos = len(unique_restos)

# The share is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
top_20_share = top_20s['reviews_received'].sum() / resto_reviews_philly.shape[0] * 100
print(
    f"There are {n_restos} restaurants in Philadelphia under the current dataset, "
    f"and the top 20 most reviewed restaurants received {top_20_share:.2f}% of reviews."
)

There are 4691 restaurants in Philadelphia under the current dataset, and the top 20 most reviewed restaurants received 8.82% of reviews.


### A brief look on the evolution of ratings over time

Apart from the distribution of the number of reviews among restaurants, we shall also explore how the customer ratings evolved throughout the period covered by the data. The dates of the reviews are originally recorded to the second, which is too fine-grained for understanding the rating trends. Therefore, we will start by aggregating the dates to a monthly level. The restaurant reviews covered in this dataset were created from May 2005 to January 2022.

In [28]:
# Snapping every review date to the first day of its month
# (datetime -> monthly period -> timestamp at the start of that period)
review_months = resto_reviews_philly["date"].dt.to_period("M")
resto_reviews_philly["date"] = review_months.dt.to_timestamp()
resto_reviews_philly["date"].describe()

count                           103099
mean     2015-11-21 03:49:47.260787968
min                2005-05-01 00:00:00
25%                2013-06-01 00:00:00
50%                2016-03-01 00:00:00
75%                2018-08-01 00:00:00
max                2022-01-01 00:00:00
Name: date, dtype: object

It will also be useful to look at how the average ratings evolved throughout this period, and more importantly how many reviews (or observations) there are for each month in the dataset, since this will affect how confident we can be about the food preferences of customers in a given month.

In the line plot on the left below, the monthly average rating throughout the period fluctuated around 3.7, albeit there was some erratic movement up to the end of 2007. What could be the reasons? The line plot on the right showing the number of reviews per month might give us an answer, namely, there were very few (sometimes even only a single digit) reviews for each month until December 2007, and thus the variances of the average rating for these months will be higher than subsequent months starting from 2008 when there were substantively more observations. We shall bear this characteristic in mind as we analyse the evolution of food trends in later sections, and we will discuss how to handle this accordingly.

Another interesting observation from the right line plot is that the number of reviews dropped sharply from February 2020 onwards. Given how countries (including the US) responded to the COVID-19 outbreak with lockdowns (one of the measures being the prohibition of dine-in at restaurants), it is not surprising that customers had fewer (or even no) chances to visit restaurants and were thus not able to give as many reviews as before the COVID outbreak.

In [29]:
# Monthly average rating and monthly review volume, plotted side by side
from plotly.subplots import make_subplots

ratings_by_month = resto_reviews_philly.groupby("date")["rating"]
monthly_avg_rating = ratings_by_month.mean()
monthly_rating_no = ratings_by_month.count()

monthly_trend = pd.DataFrame({"review_no": monthly_rating_no,
                              "avg_rating": monthly_avg_rating})

fig = make_subplots(rows=1, cols=2)

# Left panel (col 1): average rating; right panel (col 2): number of reviews
for panel_col, series_name in enumerate(("avg_rating", "review_no"), start=1):
    line = go.Scatter(
        x=monthly_trend.index,
        y=monthly_trend[series_name],
        mode="lines",
        showlegend=False,
    )
    fig.add_trace(line, row=1, col=panel_col)

fig.update_layout(
    title_text="Trends of customer reviews on restaurants from May 2005 to January 2022 in Philadelphia",
    xaxis1_title="Time",
    yaxis1_title="Average rating",
    xaxis2_title="Time",
    yaxis2_title="Number of reviews",
)

fig.show()

## The evolution of food trends over time

Besides looking at what customers in general like about restaurants, it will also be beneficial to understand whether customers' food preferences change over time. For instance, there might exist "seasonality" for certain types of cuisine: fondue 🫕, for example, is usually more popular with customers in winter than in summer. Knowledge of food trends in Philadelphia can help the business owner decide which type of restaurant to open and whether he will have to adjust the restaurant's service capacity depending on the season. 

But how should we define "food trends" with the data we are using? Recall that the `categories` column, now that only restaurant-type businesses are included, contains information about the types of catering services or cuisine a restaurant offers. Accordingly, we will see how the average ratings for categories of restaurants changed over the period of time covered in the dataset. 

Continuing from where we left off about the number of reviews per month in the dataset, we will only analyse the food trends with data from 2008 because the earlier period in the dataset contains too few monthly observations to draw any reliable conclusions about the food trends. 

### Clustering restaurants into different types 
Initially, one might just want to directly work on the categories of the restaurants to examine food trends. On a closer look, however, there are over 300 unique categories describing businesses with restaurants being one of their services in Philadelphia. Using the categories directly will then make presenting the food trends in a concise manner to the business owner difficult.

In [30]:
# Checking how many categories of restaurants there are in the dataset
# Each review's comma-separated category string is turned into a list of category names
split_categories = resto_reviews_philly["categories"].str.split(r",\s+", regex = True)
resto_reviews_philly["categories"] = split_categories
resto_cat_list = split_categories.tolist()
# Flatten the per-review lists into the set of distinct category names
unique_resto_types = {category for categories in resto_cat_list for category in categories}
print(f"There are a total of {len(unique_resto_types)} categories related to restaurants in the dataset.")

There are a total of 368 categories related to restaurants in the dataset.


We therefore need a way to summarise all the restaurants in Philadelphia as a few major types and then look at how the ratings have evolved for each type throughout the period. Cluster analysis, a family of unsupervised learning algorithms that finds groups of similar observations in the data (with the number of groups being pre-defined), will be very helpful for this task. The first step is to convert the restaurants, whose categories are stored as text in the dataset, into a representation that can be understood by clustering algorithms, which usually require numerical inputs. Word embeddings are well suited to this, since they represent natural language as vectors.

Specifically, below are the steps of using cluster analysis to summarise the restaurants into a few major types:
1. Train word embeddings for the categories of restaurant-related businesses
2. Use the above word embeddings to create an embedding vector of each restaurant
3. Perform clustering of the restaurants and see how many groups fit the restaurants best

For word embeddings, we will opt for the Word2Vec algorithm and train it on the restaurant categories in the dataset. A brief explanation of this decision: we are training our own Word2Vec model because our task is quite specific, namely, understanding the food preferences of customers in Philadelphia across several major types of restaurants. If we used pre-trained embedding models, we would risk misrepresenting categories that have multiple connotations in English, because their embeddings would not reflect only what they mean as restaurant categories. For instance, "Chinese" is one category of restaurants included in the dataset, but the word is also used in other contexts such as politics and media, and it is very likely that pre-trained models (usually trained on a large corpus of documents from the Internet) will not have an embedding for the word "Chinese" that captures solely its meaning as a cuisine style.

As for using the Word2Vec algorithm, one of the reasons is that it can be easily trained on personal computers via `gensim`, unlike GloVe, which requires using the command line to train, or Transformer architectures, which are prohibitively expensive for individuals to train on their own computers. Moreover, Word2Vec can capture more nuances of the restaurant categories beyond their similarities than rudimentary embedding methods like the bag of words or TF-IDF. 

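Since Word2Vec is already discussed in detail during module 7 of this course, here we will just quickly mention its main characteristics. In essence, Word2Vec obtains embeddings for each word $v$ in the vocabulary $V$ by training a classifier to predict whether another word $c$ occurs within a fixed-size context window around $v$ (we use the skip-gram variant with negative sampling). The parameter matrices, $W$ (target-word embeddings) and $C$ (context-word embeddings), are trained via minimising the negative log-likelihood over all words as target words. 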

In [31]:
# For training word embeddings and clustering
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

# For faster computation
import multiprocessing
# Number of CPUs reported by the OS; used further down to set the number of Word2Vec worker threads
cores = multiprocessing.cpu_count()

Below we extract the unique entries of restaurants in the dataset along with their categories. We notice that the extracted data below have more rows than the total number of restaurants reviewed in Philadelphia in our dataset. What is the reason behind this? 

In [32]:
# Getting all the distinct restaurants and their respective categories
# Lists are unhashable, so each category list is frozen into a tuple before de-duplicating
resto_reviews_philly["categories"] = resto_reviews_philly["categories"].map(tuple)
resto_cats = resto_reviews_philly.loc[:, ["name", "categories"]].drop_duplicates()
resto_cats.shape

(5294, 2)

It seems that there are restaurants which have more than one entry of their categories in the data, and if we look at the case of Taco Bell, we observe that some categories are only present in some review entries or are ordered in a different way, causing certain restaurants to have multiple, differing lists of categories. A possible reason might be that the categories of the restaurants are determined by user inputs, so there exist discrepancies about the categories of the restaurants in the original dataset. To remedy this issue, we can combine these repeated entries into one by using `set` operations in Python. 

In [33]:
# Restaurants having more than one entry for categories
# Number of distinct category tuples recorded under each restaurant name
restaurant_counts = resto_cats["name"].value_counts()
print(restaurant_counts)

# Taco Bell as a worked example of one name carrying several different category tuples
taco_bell_entries = resto_cats.query("name == 'Taco Bell'")
taco_bell_entries

name
Wawa                            35
McDonald's                      30
Dunkin'                         20
Popeyes Louisiana Kitchen       16
Taco Bell                       14
                                ..
Saloon Restaurant                1
Suya Suya West African Grill     1
The Fresh Grocer of LaSalle      1
Ansill                           1
DOHO Taqueria                    1
Name: count, Length: 4691, dtype: int64


Unnamed: 0,name,categories
6722,Taco Bell,"(mexican, tacos, restaurants, tex-mex, breakfa..."
10292,Taco Bell,"(italian, restaurants, tex-mex, fast food, piz..."
10791,Taco Bell,"(restaurants, mexican, tacos, tex-mex, breakfa..."
15087,Taco Bell,"(mexican, breakfast & brunch, tacos, fast food..."
15761,Taco Bell,"(restaurants, breakfast & brunch, tacos, fast ..."
16033,Taco Bell,"(restaurants, breakfast & brunch, tacos, mexic..."
25522,Taco Bell,"(restaurants, tex-mex, mexican, fast food, tacos)"
28671,Taco Bell,"(mexican, restaurants)"
33631,Taco Bell,"(restaurants, mexican)"
36756,Taco Bell,"(restaurants, fast food, mexican, tex-mex, tacos)"


In [34]:
# Concatenating all distinct categories for each restaurant
def combine_cats(restaurant):
    """Return the distinct categories found across every `resto_cats` row named `restaurant`.

    Parameters
    ----------
    restaurant
        Restaurant name to look up in the `name` column of the global `resto_cats`.

    Returns
    -------
    list
        The unique categories, in arbitrary (set) order; empty if no row matches.
    """
    categories = resto_cats.query("name == @restaurant")["categories"].to_list()
    return list(set(chain(*categories)))

# Merge every restaurant's category entries in a single pass over the rows,
# rather than running one DataFrame query per restaurant name
merged_cats = {}
for resto, cats in zip(resto_cats["name"], resto_cats["categories"]):
    merged_cats.setdefault(resto, set()).update(cats)

# Keep the row order of `restaurant_counts` (names with the most category entries first)
resto_cats_dict = {resto: list(merged_cats[resto]) for resto in restaurant_counts.index}

resto_cats = pd.DataFrame({"name": resto_cats_dict.keys(),
                           "categories": resto_cats_dict.values()})

# Sanity check: after merging there is exactly one row per restaurant name.
# NOTE(review): `n_restos` is defined earlier in the notebook; presumably the number of distinct restaurant names - confirm
assert resto_cats.shape[0] == n_restos
resto_cats.head()

Unnamed: 0,name,categories
0,Wawa,"[coffee & tea, convenience stores, gas station..."
1,McDonald's,"[coffee & tea, american (traditional), burgers..."
2,Dunkin',"[coffee & tea, bagels, donuts, sandwiches, bre..."
3,Popeyes Louisiana Kitchen,"[southern, american (traditional), cajun/creol..."
4,Taco Bell,"[italian, pizza, breakfast & brunch, tacos, te..."


Now we will train the word embeddings and then a clustering model for the restaurants, in order to group them into a few major types.

In [None]:
# A function for converting restaurants into embedding vectors based on its categories
def resto2vec(tokens, embedding_wv, normalize=True):
    """Returns the embedding of a restaurant-related business as the mean of the tokens/words embeddings of its categories.

    Categories that are missing from the embedding vocabulary (e.g. dropped by
    Word2Vec's `min_count` threshold) are skipped instead of raising `KeyError`.

    Parameters
    ----------
    tokens : iterable of str
        The categories of one restaurant.
    embedding_wv
        Trained word vectors exposing `key_to_index`, `get_vector` and
        `vector_size` (e.g. the `.wv` attribute of a gensim `Word2Vec` model).
    normalize : bool, default True
        Passed to `get_vector` as `norm`, i.e. whether each category vector is
        L2-normalised before averaging.

    Returns
    -------
    numpy.ndarray
        Vector of length `embedding_wv.vector_size`. A zero vector is returned
        when `tokens` is empty or none of them is in the vocabulary, so that
        downstream code never receives NaN.
    """
    known_tokens = [tok for tok in tokens if tok in embedding_wv.key_to_index]
    if not known_tokens:
        return np.zeros(embedding_wv.vector_size)
    sent_mean = np.array([embedding_wv.get_vector(tok, norm=normalize) for tok in known_tokens]).mean(axis=0)
    return sent_mean

In [None]:
# Skip-gram (sg = 1) Word2Vec with negative sampling, trained on the per-review category lists.
# `max(1, cores - 1)` keeps at least one worker thread on a single-core machine, where `cores - 1` would be 0.
# NOTE: per the gensim documentation, `seed` only gives fully reproducible vectors when training
# with a single worker thread (and a fixed PYTHONHASHSEED); with several workers results can vary between runs.
w2v_resto = Word2Vec(resto_cat_list, vector_size = 30, window = 2, min_count = 5, sg = 1, negative = 2, ns_exponent = 0.75,
                     alpha = 0.01, min_alpha = 0.0001, workers = max(1, cores - 1), epochs = 30, seed = 123)

In [None]:
# Sanity check of the trained embeddings: the ten categories closest to "chinese"
w2v_resto.wv.most_similar(positive = ["chinese"], topn = 10)

[('taiwanese', 0.6674388647079468),
 ('noodles', 0.6260411143302917),
 ('dim sum', 0.5680884122848511),
 ('ramen', 0.5618141889572144),
 ('shanghainese', 0.5361201763153076),
 ('cantonese', 0.5340621471405029),
 ('hotel bar', 0.5229731202125549),
 ('hong kong style cafe', 0.5224063992500305),
 ('szechuan', 0.5212990045547485),
 ('do-it-yourself food', 0.5171496272087097)]