In [1]:
import pandas as pd

# Columns that are used for the initial dataframe
cols = ['business_id', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'categories']

business_df = pd.read_csv('..\Yelp Dataset\yelp_business.csv', index_col='business_id', usecols=cols)

In [2]:
business_df.head()

Unnamed: 0_level_0,city,state,postal_code,latitude,longitude,stars,review_count,categories
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
M82c96ICluambjx0wWg6Bg,110 Las Vegas,NV,89147,36.099575,-115.306722,5.0,63,Smog Check Stations;Automotive;Oil Change Stat...
LixFCMGKdptI8WRsjAl5cQ,AGINCOURT,ON,M1W 2T4,43.794282,-79.32994,2.5,6,Fast Food;Burgers;Restaurants
2DBNUgEeFaQ3FznUA7P1eQ,Aberdour,FIF,KY3 0SL,56.055265,-3.297981,4.0,4,Landmarks & Historical Buildings;Public Servic...
D0ohQMGTZ8QhGiMIh6aTFg,Aberlady,ELN,EH32 0PX,56.003354,-2.870777,3.5,4,Food;Farmers Market
LtfdxQfCYiIK8sXXW0m7rA,Aberlady,ELN,EH32 0RF,56.008924,-2.861384,5.0,3,Restaurants;British


In [3]:
business_df.shape

(174567, 8)

In [4]:
business_df.describe(include='all')

Unnamed: 0,city,state,postal_code,latitude,longitude,stars,review_count,categories
count,174566,174566,173944.0,174566.0,174566.0,174567.0,174567.0,174567
unique,1093,67,16004.0,,,,,76419
top,Las Vegas,AZ,89109.0,,,,,Restaurants;Pizza
freq,26775,52214,2965.0,,,,,990
mean,,,,38.627312,-92.679009,3.632196,30.137059,
std,,,,5.389012,26.240079,1.003739,98.208174,
min,,,,-36.086009,-142.46665,1.0,3.0,
25%,,,,33.63155,-112.125879,3.0,4.0,
50%,,,,36.144257,-89.410128,3.5,8.0,
75%,,,,43.606181,-79.657609,4.5,23.0,


Because we are visualizing only restaurant review data, we want to extract observations that have Restaurants value in their category attribute.

In [5]:
business_df.categories.value_counts()

Restaurants;Pizza                                                                                                                             990
Pizza;Restaurants                                                                                                                             987
Food;Coffee & Tea                                                                                                                             978
Nail Salons;Beauty & Spas                                                                                                                     936
Coffee & Tea;Food                                                                                                                             929
Beauty & Spas;Nail Salons                                                                                                                     909
Mexican;Restaurants                                                                                                         

In [6]:
# Matching 'Restaurants' string with categories attribute and extracting a new dataframe
restaurants_df = business_df[business_df['categories'].str.contains('Restaurants')]

In [7]:
restaurants_df.shape

(54618, 8)

In [8]:
restaurants_df.head()

Unnamed: 0_level_0,city,state,postal_code,latitude,longitude,stars,review_count,categories
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LixFCMGKdptI8WRsjAl5cQ,AGINCOURT,ON,M1W 2T4,43.794282,-79.32994,2.5,6,Fast Food;Burgers;Restaurants
LtfdxQfCYiIK8sXXW0m7rA,Aberlady,ELN,EH32 0RF,56.008924,-2.861384,5.0,3,Restaurants;British
UY1BHeeb1oE0-uk0EKqaGg,Ahwatukee,AZ,85044,33.348551,-111.975151,1.5,7,Fast Food;Food;Restaurants;Burgers
qJf61TR4Jq9Xph5RiXPS9A,Ahwatukee,AZ,85048,33.305289,-111.992538,4.0,270,Restaurants;Coffee & Tea;Food;Creperies
0Rni7ocMC_Lg2UH0lDeKMQ,Ahwatukee,AZ,85044,33.318059,-111.983528,3.5,74,Restaurants;Pizza;Italian;Sandwiches


Inspecting the DF for missing and duplicate values

In [9]:
restaurants_df.isna().sum()

city              0
state             0
postal_code     101
latitude          0
longitude         0
stars             0
review_count      0
categories        0
dtype: int64

There are some missing values, but so few that we can drop them.

In [10]:
r_df_nona = restaurants_df.dropna(axis=0, how='any')
r_df_nona.isna().sum()

city            0
state           0
postal_code     0
latitude        0
longitude       0
stars           0
review_count    0
categories      0
dtype: int64

In [11]:
r_df_nona[r_df_nona.duplicated()]

Unnamed: 0_level_0,city,state,postal_code,latitude,longitude,stars,review_count,categories
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
e8cFfRySb8n9lMXRnyzeIA,Charlotte,NC,28270,35.137593,-80.73942,3.0,7,Restaurants;Chinese


Only one duplicate value so we can drop it.

In [12]:
r_df = r_df_nona.drop_duplicates(keep='first')
r_df.duplicated().value_counts()

False    54516
dtype: int64

In [13]:
r_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54516 entries, LixFCMGKdptI8WRsjAl5cQ to mPuJAD2dEw3Wax-JlUnEBw
Data columns (total 8 columns):
city            54516 non-null object
state           54516 non-null object
postal_code     54516 non-null object
latitude        54516 non-null float64
longitude       54516 non-null float64
stars           54516 non-null float64
review_count    54516 non-null int64
categories      54516 non-null object
dtypes: float64(3), int64(1), object(4)
memory usage: 3.7+ MB


We want to replace the Restaurants string in categories attribute, because it doesn't provide any additional information

In [14]:
r_df['categories'] = r_df['categories'].replace({';Restaurants' : '', 'Restaurants;' : '', 'Restaurants' : ''}, regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
r_df.head()

Unnamed: 0_level_0,city,state,postal_code,latitude,longitude,stars,review_count,categories
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
LixFCMGKdptI8WRsjAl5cQ,AGINCOURT,ON,M1W 2T4,43.794282,-79.32994,2.5,6,Fast Food;Burgers
LtfdxQfCYiIK8sXXW0m7rA,Aberlady,ELN,EH32 0RF,56.008924,-2.861384,5.0,3,British
UY1BHeeb1oE0-uk0EKqaGg,Ahwatukee,AZ,85044,33.348551,-111.975151,1.5,7,Fast Food;Food;Burgers
qJf61TR4Jq9Xph5RiXPS9A,Ahwatukee,AZ,85048,33.305289,-111.992538,4.0,270,Coffee & Tea;Food;Creperies
0Rni7ocMC_Lg2UH0lDeKMQ,Ahwatukee,AZ,85044,33.318059,-111.983528,3.5,74,Pizza;Italian;Sandwiches


Now we want to split the categories column by ";" character and add the first given category (the most important one) to the r_df.

In [16]:
r_df['category'] = r_df.categories.str.split(';').str[0]
r_df.category.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Pizza                        3387
Fast Food                    2847
Mexican                      2557
Chinese                      2538
Food                         2520
Italian                      2326
Sandwiches                   2300
American (Traditional)       2043
Burgers                      1825
Nightlife                    1568
Breakfast & Brunch           1496
Bars                         1487
American (New)               1386
Japanese                     1159
Cafes                        1023
Sushi Bars                    907
Thai                          832
Seafood                       800
Indian                        799
Chicken Wings                 786
Barbeque                      671
Vietnamese                    647
Mediterranean                 623
Asian Fusion                  601
Delis                         582
Coffee & Tea                  582
Steakhouses                   564
Canadian (New)                552
Greek                         519
Salad         

In [17]:
# Create DataFrame out of food categories and count frequencies of the first given cuisine
food_categories = r_df.categories.str.split(';').str[0]
food_categories.value_counts()

Pizza                        3387
Fast Food                    2847
Mexican                      2557
Chinese                      2538
Food                         2520
Italian                      2326
Sandwiches                   2300
American (Traditional)       2043
Burgers                      1825
Nightlife                    1568
Breakfast & Brunch           1496
Bars                         1487
American (New)               1386
Japanese                     1159
Cafes                        1023
Sushi Bars                    907
Thai                          832
Seafood                       800
Indian                        799
Chicken Wings                 786
Barbeque                      671
Vietnamese                    647
Mediterranean                 623
Asian Fusion                  601
Delis                         582
Coffee & Tea                  582
Steakhouses                   564
Canadian (New)                552
Greek                         519
Salad         

In [18]:
# Let's pick 50 most frequent out of these and read them into a list

# Read all food categories to an list
food_categories_list = food_categories.value_counts().index.tolist()

food_categories_50 = food_categories_list[0:51]

In [19]:
#r_df = pd.merge(r_df, food_categories, how='inner', on='business_id')