### Combining business and reviews for Toronto + expanding attributes

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import sys
np.set_printoptions(threshold=sys.maxsize)

pd.set_option('display.max_columns', 999)  # This allows us to view all the columns
pd.set_option('display.max_rows', 999)

  import pandas.util.testing as tm


Read in `business` json file

In [2]:
business = pd.read_json('../Dataset/yelp_academic_dataset_business.json', lines=True)
business.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."


Keep only business that are still **open**

In [3]:
business.is_open.value_counts()

1    158525
0     34084
Name: is_open, dtype: int64

In [4]:
# Keep only business that are still open
# Drop columns that may not be relavent
df_business = business[business['is_open']==1].drop(['is_open'], axis=1)
print(df_business.shape)
df_business.head(2)

(158525, 13)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."


Explode categories for Toronto alone

In [5]:
df_business_toronto = df_business[df_business.city == 'Toronto']
print(df_business_toronto.shape)
df_business_toronto.head(2)

(14331, 13)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours
19,zSpQmEBvRe3IhTUlMSA6HQ,Totum Life Science,"445 King Street W, Suite 101",Toronto,ON,M5V 1K4,43.645041,-79.395799,4.0,23,"{'AcceptsInsurance': 'True', 'ByAppointmentOnl...","Trainers, Health & Medical, Active Life, Physi...","{'Monday': '6:0-22:0', 'Tuesday': '6:0-22:0', ..."
26,C9oCPomVP0mtKa8z99E3gg,Bakery Gateau,"865 York Mills Road, Unit 1",Toronto,ON,M3B 1Y6,43.754093,-79.349548,4.5,8,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Bakeries, Food","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [6]:
# df.explode requires pandas ver 0.25
# Create one row for each series that contain comma-separated items
df_explode_toronto = df_business_toronto.assign(categories = df_business_toronto.categories.str.split(', ')).explode('categories')
df_explode_toronto.sample(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours
58271,2JieLs02McAWSFnqVtnDhA,Heritage Toronto,157 King Street E,Toronto,ON,M5C 1G9,43.650194,-79.372137,4.5,3,{'GoodForKids': 'True'},Tours,"{'Monday': '10:0-16:0', 'Tuesday': '10:0-16:0'..."
19996,8m8flB_cNJseSHQcok7r-A,The Calm on Dundas,3148 Dundas Street W,Toronto,ON,M6P 2A1,43.66574,-79.474851,5.0,15,"{'WheelchairAccessible': 'True', 'RestaurantsP...",Acupuncture,"{'Monday': '7:0-20:0', 'Tuesday': '9:0-20:0', ..."
170986,yWTzqlOK61gbfsx4JGFHeQ,East York Massage and Physiotherapy Studio,1557 O`Connor Drive,Toronto,ON,M4B 2V7,43.714932,-79.304972,5.0,3,"{'RestaurantsPriceRange2': '2', 'AcceptsInsura...",Fitness & Instruction,


In [7]:
print('Total number of categories:', len(df_explode_toronto.categories.value_counts()))
print('Top 20 categories:')
df_explode_toronto.categories.value_counts()[:30]

Total number of categories: 888
Top 20 categories:


Restaurants                  5253
Food                         3041
Shopping                     2564
Beauty & Spas                1618
Nightlife                    1165
Bars                         1045
Health & Medical              973
Coffee & Tea                  952
Local Services                836
Event Planning & Services     742
Fashion                       703
Active Life                   632
Specialty Food                612
Hair Salons                   576
Home Services                 548
Arts & Entertainment          495
Sandwiches                    493
Cafes                         484
Breakfast & Brunch            479
Fast Food                     448
Hotels & Travel               446
Chinese                       438
Pizza                         427
Bakeries                      426
Canadian (New)                414
Nail Salons                   400
Desserts                      385
Italian                       378
Hair Removal                  376
Automotive    

In [8]:
# Food categories in the top 30 categories
categories_toronto = ['Restaurants','Food','Bars','Coffee & Tea','Specialty Food','Sandwiches','Cafes','Breakfast & Brunch','Fast Food','Chinese','Pizza','Bakeries','Canadian (New)','Desserts','Italian']

In [9]:
categories_toronto = '|'.join(categories_toronto)
categories_toronto

'Restaurants|Food|Bars|Coffee & Tea|Specialty Food|Sandwiches|Cafes|Breakfast & Brunch|Fast Food|Chinese|Pizza|Bakeries|Canadian (New)|Desserts|Italian'

Filtering dataset to only food businesses

In [10]:
# Keep only business with categories that are in categories_lvl1
business_food = df_business_toronto[df_business_toronto['categories'].str.contains(
                         categories_toronto, case=False, na=False)]
print(business_food.shape)
business_food.head()

(7143, 13)


  return func(self, *args, **kwargs)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours
26,C9oCPomVP0mtKa8z99E3gg,Bakery Gateau,"865 York Mills Road, Unit 1",Toronto,ON,M3B 1Y6,43.754093,-79.349548,4.5,8,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Bakeries, Food","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."
29,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W,Toronto,ON,M6J 1J5,43.642889,-79.425429,3.0,57,"{'WiFi': 'u'no'', 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ..."
83,RFbMVekR8lU9tPJ8sWrwHg,Tavolino,619 Mount Pleasant Road,Toronto,ON,M4S 2M5,43.704229,-79.38823,4.0,18,"{'RestaurantsPriceRange2': '2', 'HasTV': 'Fals...","Italian, Restaurants",
96,NPHZkn1e-tSJAbo8Zm9rYw,Burrito Bandidos,1614 Queen Street E,Toronto,ON,M4L 1G4,43.666181,-79.316468,3.5,43,"{'RestaurantsGoodForGroups': 'True', 'Restaura...","Tex-Mex, Mexican, Restaurants","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
116,gyFYZV4b_9TxG1ulQNi0Ig,Paramount Fine Foods,165 East Liberty Street,Toronto,ON,M6K 3K4,43.638442,-79.417237,2.0,40,"{'RestaurantsTakeOut': 'True', 'RestaurantsTab...","Middle Eastern, Restaurants, Salad, Breakfast ...","{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'..."


In [11]:
business_food = business_food.reset_index(drop = True)
business_food.shape

(7143, 13)

Splitting attributes

In [12]:
# Finding null attributes
business_food['attributes'].isnull().sum()

393

In [13]:
# Removing null attributes and resetting index
business_food = business_food.dropna(subset=['attributes'])
business_food = business_food.reset_index(drop = True)
business_food.shape

(6750, 13)

In [14]:
business_food['attributes'][0]

{'RestaurantsDelivery': 'False',
 'RestaurantsPriceRange2': '2',
 'RestaurantsTakeOut': 'True',
 'BikeParking': 'True',
 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': False, 'valet': False}"}

In [15]:
# Looking at the attributes in the first business/ row
print(len(business_food['attributes'][0].keys()))
business_food['attributes'][0].keys()

5


dict_keys(['RestaurantsDelivery', 'RestaurantsPriceRange2', 'RestaurantsTakeOut', 'BikeParking', 'BusinessParking'])

In [16]:
# Finding all the unique attributes in all the businesses/ rows 
key_list = []
for i in range(6750):
    for key in business_food['attributes'][i].keys():
        if key not in key_list:
            key_list.append(key)

In [17]:
key_list

['RestaurantsDelivery',
 'RestaurantsPriceRange2',
 'RestaurantsTakeOut',
 'BikeParking',
 'BusinessParking',
 'WiFi',
 'Caters',
 'HasTV',
 'OutdoorSeating',
 'RestaurantsAttire',
 'GoodForKids',
 'Ambience',
 'NoiseLevel',
 'RestaurantsReservations',
 'GoodForMeal',
 'RestaurantsGoodForGroups',
 'RestaurantsTableService',
 'Alcohol',
 'DogsAllowed',
 'WheelchairAccessible',
 'DriveThru',
 'Music',
 'Smoking',
 'GoodForDancing',
 'HappyHour',
 'BestNights',
 'CoatCheck',
 'ByAppointmentOnly',
 'BusinessAcceptsCreditCards',
 'AcceptsInsurance',
 'DietaryRestrictions',
 'AgesAllowed',
 'HairSpecializesIn',
 'RestaurantsCounterService',
 'BusinessAcceptsBitcoin']

In [18]:
counter = 0
for i in range(business_food.shape[0]):
    
    ats = [i.split(':', 1)[0] for i in business_food['attributes'][i]]
    contain = [i for i in ats if i in key_list]
    
    if len(contain) == len (key_list):
            counter += 1

print("Number of restaurants containing all attributes:  ", counter)
print("Total Number of restaurants:  ", len(business_food))

Number of restaurants containing all attributes:   0
Total Number of restaurants:   6750


In [19]:
# Expanding attributes to multiple columns
for col in key_list:
    business_food[col]= np.nan
#loop through each attribute
idx = 0
for attr in business_food['attributes']:
    # print(idx)
    if attr is None:
        idx+=1
        continue
    for k,v in attr.items():
        #print('value: '+v)
#         if'{' in v:
#             continue
        business_food[k][idx] = v
    idx+=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [20]:
business_food['attributes'].iloc[1]

{'WiFi': "u'no'",
 'BikeParking': 'True',
 'RestaurantsPriceRange2': '2',
 'BusinessParking': "{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}",
 'RestaurantsTakeOut': 'True',
 'Caters': 'False'}

In [21]:
# Viewing the expanded dataset
business_food.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,RestaurantsDelivery,RestaurantsPriceRange2,RestaurantsTakeOut,BikeParking,BusinessParking,WiFi,Caters,HasTV,OutdoorSeating,RestaurantsAttire,GoodForKids,Ambience,NoiseLevel,RestaurantsReservations,GoodForMeal,RestaurantsGoodForGroups,RestaurantsTableService,Alcohol,DogsAllowed,WheelchairAccessible,DriveThru,Music,Smoking,GoodForDancing,HappyHour,BestNights,CoatCheck,ByAppointmentOnly,BusinessAcceptsCreditCards,AcceptsInsurance,DietaryRestrictions,AgesAllowed,HairSpecializesIn,RestaurantsCounterService,BusinessAcceptsBitcoin
0,C9oCPomVP0mtKa8z99E3gg,Bakery Gateau,"865 York Mills Road, Unit 1",Toronto,ON,M3B 1Y6,43.754093,-79.349548,4.5,8,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Bakeries, Food","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...",False,2,True,True,"{'garage': False, 'street': False, 'validated'...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W,Toronto,ON,M6J 1J5,43.642889,-79.425429,3.0,57,"{'WiFi': 'u'no'', 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...",,2,True,True,"{'garage': False, 'street': True, 'validated':...",u'no',False,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,RFbMVekR8lU9tPJ8sWrwHg,Tavolino,619 Mount Pleasant Road,Toronto,ON,M4S 2M5,43.704229,-79.38823,4.0,18,"{'RestaurantsPriceRange2': '2', 'HasTV': 'Fals...","Italian, Restaurants",,,2,,True,"{'garage': False, 'street': True, 'validated':...",,,False,False,u'casual',False,"{'romantic': False, 'intimate': False, 'classy...",u'quiet',True,"{'dessert': False, 'latenight': False, 'lunch'...",True,True,u'full_bar',,,,,,,,,,,,,,,,,


Rename column names of attributes to attr_`column name`

In [22]:
# Rename column names of attributes to attr_column name
business_food.iloc[:, 13:].columns.values

array(['RestaurantsDelivery', 'RestaurantsPriceRange2',
       'RestaurantsTakeOut', 'BikeParking', 'BusinessParking', 'WiFi',
       'Caters', 'HasTV', 'OutdoorSeating', 'RestaurantsAttire',
       'GoodForKids', 'Ambience', 'NoiseLevel', 'RestaurantsReservations',
       'GoodForMeal', 'RestaurantsGoodForGroups',
       'RestaurantsTableService', 'Alcohol', 'DogsAllowed',
       'WheelchairAccessible', 'DriveThru', 'Music', 'Smoking',
       'GoodForDancing', 'HappyHour', 'BestNights', 'CoatCheck',
       'ByAppointmentOnly', 'BusinessAcceptsCreditCards',
       'AcceptsInsurance', 'DietaryRestrictions', 'AgesAllowed',
       'HairSpecializesIn', 'RestaurantsCounterService',
       'BusinessAcceptsBitcoin'], dtype=object)

In [23]:
new_names = [(i,'attr_'+i) for i in business_food.iloc[:, 13:].columns.values]
business_food.rename(columns = dict(new_names), inplace=True)

In [24]:
business_food.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'attributes',
       'categories', 'hours', 'attr_RestaurantsDelivery',
       'attr_RestaurantsPriceRange2', 'attr_RestaurantsTakeOut',
       'attr_BikeParking', 'attr_BusinessParking', 'attr_WiFi', 'attr_Caters',
       'attr_HasTV', 'attr_OutdoorSeating', 'attr_RestaurantsAttire',
       'attr_GoodForKids', 'attr_Ambience', 'attr_NoiseLevel',
       'attr_RestaurantsReservations', 'attr_GoodForMeal',
       'attr_RestaurantsGoodForGroups', 'attr_RestaurantsTableService',
       'attr_Alcohol', 'attr_DogsAllowed', 'attr_WheelchairAccessible',
       'attr_DriveThru', 'attr_Music', 'attr_Smoking', 'attr_GoodForDancing',
       'attr_HappyHour', 'attr_BestNights', 'attr_CoatCheck',
       'attr_ByAppointmentOnly', 'attr_BusinessAcceptsCreditCards',
       'attr_AcceptsInsurance', 'attr_DietaryRestrictions', 'attr_AgesAllowed',
       'attr_HairSpecializesIn

Finding for columns with dictionary embedded in them

In [25]:
for i in business_food.columns:
    if 'attr_' in i:
        if type(business_food[i].value_counts().index[0]) == str:
            len_index = len(business_food[i].value_counts().index[0])
            if len_index > 15:
                print(i)

attr_BusinessParking
attr_Ambience
attr_GoodForMeal
attr_Music
attr_BestNights
attr_DietaryRestrictions
attr_HairSpecializesIn


Remove rows attributing to `attr_HairSpecializesIn`

In [26]:
business_food.shape[0]

6750

In [27]:
business_food = business_food[business_food.attr_HairSpecializesIn.isnull() == True]

In [28]:
business_food = business_food.drop(['attr_HairSpecializesIn'], axis = 1)

In [29]:
business_food.reset_index(drop= True)
business_food.shape

(6749, 47)

In [30]:
column_list = []
for i in business_food.columns:
    if 'attr_' in i:
        if type(business_food[i].value_counts().index[0]) == str:
            len_index = len(business_food[i].value_counts().index[0])
            if len_index > 15:
                column_list.append(i)
                
column_list

['attr_BusinessParking',
 'attr_Ambience',
 'attr_GoodForMeal',
 'attr_Music',
 'attr_BestNights',
 'attr_DietaryRestrictions']

Join with reviews

In [31]:
# Set up your local path
review_json_path = '../Dataset/yelp_academic_dataset_review.json'

In [32]:
import pandas as pd

# Set chunk size (smaller if dataset is smaller)
# 2019 Yelp review.json has more than 6 million reviews(rows)
size = 1000000
review = pd.read_json(review_json_path, lines=True,
                      # identifying the data type of each column can reduce memory usage
                      dtype={'review_id':str,'user_id':str,'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,'funny':int,'cool':int},
                      chunksize=size)

In [33]:
# There are multiple chunks to be read
chunk_list = []
for chunk in review:
    # Drop columns that aren't needed
    chunk = chunk.drop(['review_id','useful','funny','cool'], axis=1)
    # Renaming column name to avoid conflict with business overall star rating
    chunk = chunk.rename(columns={'stars': 'review_stars'})
    # Inner merge with edited business file so only reviews related to the business remain
    chunk_merged = pd.merge(business_food, chunk, on='business_id', how='inner')
    # Show feedback on progress
    print(f"{chunk_merged.shape[0]} out of {size:,} related reviews")
    chunk_list.append(chunk_merged)
    
# After trimming down the review file, concatenate all relevant data back to one dataframe
df = pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)
print(df.shape)
df.sample(3)

50166 out of 1,000,000 related reviews
52650 out of 1,000,000 related reviews
48920 out of 1,000,000 related reviews
53878 out of 1,000,000 related reviews
53385 out of 1,000,000 related reviews
52057 out of 1,000,000 related reviews
38479 out of 1,000,000 related reviews
(349535, 51)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,attr_RestaurantsDelivery,attr_RestaurantsPriceRange2,attr_RestaurantsTakeOut,attr_BikeParking,attr_BusinessParking,attr_WiFi,attr_Caters,attr_HasTV,attr_OutdoorSeating,attr_RestaurantsAttire,attr_GoodForKids,attr_Ambience,attr_NoiseLevel,attr_RestaurantsReservations,attr_GoodForMeal,attr_RestaurantsGoodForGroups,attr_RestaurantsTableService,attr_Alcohol,attr_DogsAllowed,attr_WheelchairAccessible,attr_DriveThru,attr_Music,attr_Smoking,attr_GoodForDancing,attr_HappyHour,attr_BestNights,attr_CoatCheck,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,attr_AcceptsInsurance,attr_DietaryRestrictions,attr_AgesAllowed,attr_RestaurantsCounterService,attr_BusinessAcceptsBitcoin,user_id,review_stars,text,date
304987,tngl2VZ4VGpK3qFpmU0NMw,Scaddabush,200 Front Street W,Toronto,ON,M5V 3K2,43.644936,-79.386151,3.5,148,"{'RestaurantsDelivery': 'False', 'RestaurantsR...","Restaurants, Italian","{'Monday': '11:30-0:0', 'Tuesday': '11:30-0:0'...",False,2,True,True,"{'garage': False, 'street': False, 'validated'...",u'free',,True,True,'casual',True,"{'touristy': False, 'hipster': False, 'romanti...",u'average',True,"{'dessert': False, 'latenight': False, 'lunch'...",True,True,u'full_bar',,,,,,,,,,,,,,,,,tXEqXYByl9NUD1WO2DjpPA,4,Went for dinner before a Raptors game and the ...,2017-01-26 18:57:31
278274,rA1cY_AXPEOTV_eOTYsU1w,Côte de Boeuf,130 Ossington Avenue,Toronto,ON,M6J 2Z5,43.647205,-79.420167,4.5,44,"{'BikeParking': 'True', 'RestaurantsTakeOut': ...","Food, Specialty Food, Meat Shops",,,2,True,True,"{'garage': False, 'street': True, 'validated':...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,oqrUVs4daCH9UDW_xA5M5w,5,Unique French butcher shop turn farm to table ...,2016-08-04 23:40:49
111910,nZ4HW543eDyqqE62QYnChg,The Burger Cellar,3391 Yonge Street,Toronto,ON,M4N 2M8,43.732407,-79.403917,3.5,124,"{'Alcohol': 'u'full_bar'', 'RestaurantsPriceRa...","Food, Restaurants, Beer, Wine & Spirits, Salad...","{'Monday': '11:30-22:0', 'Tuesday': '11:30-22:...",False,2,True,True,"{'garage': False, 'street': True, 'validated':...",u'free',True,True,True,u'casual',True,"{'romantic': False, 'intimate': False, 'classy...",u'average',True,,True,,u'full_bar',,,False,"{'dj': False, 'background_music': True, 'no_mu...",,False,,,,,,,,,,,5lxTlix1T_3NRjviwZ3Ayg,3,I want to start by saying that my waitress was...,2013-10-09 12:37:09


Keep businesses that have more than 75 user reviews

In [34]:
sum(df.user_id.value_counts() >= 75)

421

In [35]:
df['user_freq'] = df.groupby('user_id')['user_id'].transform('count')

In [36]:
df[['business_id','user_id','user_freq']].sort_values(by='user_freq', ascending = False)

Unnamed: 0,business_id,user_id,user_freq
123598,Qzbcq82RJKIcAl0HSoSBJQ,CxDOIDnH8gp9KXzpBHJYXw,964
73488,g6GXqg-QdDiQGLYMVqNOUw,CxDOIDnH8gp9KXzpBHJYXw,964
149312,TU95jEn8aGitY8hZowXaBg,CxDOIDnH8gp9KXzpBHJYXw,964
72865,L_JIwCB8OeZLiHWg6VTydA,CxDOIDnH8gp9KXzpBHJYXw,964
206536,fJ-2acaqvWOsujUTAJB-ew,CxDOIDnH8gp9KXzpBHJYXw,964
...,...,...,...
310502,AnemBUVAb9NRaselL2KITA,-HcPUpMULILM_pFHn8_RUg,1
310503,AnemBUVAb9NRaselL2KITA,FerEcJNdsV6SzaTctwK-XA,1
224864,YY23OABxBKOryNqtmMOKjA,zRLTuVhf5FIsPY48c5ldRQ,1
224863,YY23OABxBKOryNqtmMOKjA,cGDvfeuBQAz_BY0Y-g5xYQ,1


In [37]:
df = df[df['user_freq'] >= 75]

In [38]:
df.shape

(54643, 52)

Expand the embedded dictionaries

In [39]:
import ast

In [40]:
# Write a function to expand the dictionaries
# Input, business_food dataframe and a list of columns to expand 
def expand_dict(x,y):
    
    merged_df = y
    # for each attribute in column list
    for i in x: 
        
        print(i)
        
        # create a new dataframe - y_attr
        y_attr = y[['business_id',i]]
        print('Initial shape:',y_attr.shape)
        
        # remove null columns
        y_attr = y_attr.dropna(subset = [i])
        # reset index
        y_attr = y_attr.reset_index(drop = True)
        print('Shape after dropping null values:', y_attr.shape)
        
        # drop duplicated rows - save memory 
        y_attr = y_attr.drop_duplicates()
        print('Shape after dropping duplicates', y_attr.shape)
        
        # reset index
        y_attr = y_attr.reset_index(drop=True)
        
        num = y_attr.shape[0]
        print(num)
        
        # finding all the keys in each embedded dictionary
        key_list = []
        for a in range(num):
            if y_attr[i][a] == 'None':
                a+=1
            else:
                res = ast.literal_eval(y_attr[i][a])
                for key in res.keys():
                    if key not in key_list:
                        key_list.append(key)
        
        print(key_list)
        
        # epanding each new dataframe to by the number of dictionary columns
        for col in key_list:
            y_attr[col] = np.nan
            
        # inserting values from embedded dictionaries to their respected column/cells
        idx = 0
        for b in range(num):
            if y_attr[i][b] == 'None':
                b+=1
            else:
                res = ast.literal_eval(y_attr[i][b])
                for k,v in res.items():
                    y_attr[k][idx] = v
                idx+=1
                
        new_names = [(c,str(i)+'_'+c) for c in y_attr.iloc[:, 2:].columns.values]
        y_attr.rename(columns = dict(new_names), inplace=True)
        
        print(y_attr.columns)
        print(y_attr.shape)
        
        # merge to business_food dataframe
        merged_df = pd.merge(merged_df, y_attr, on = 'business_id', how = 'left')
        
        print("")
        print(merged_df.shape)
   
    return merged_df

In [41]:
df_toronto = expand_dict(column_list, df)

attr_BusinessParking
Initial shape: (54643, 2)
Shape after dropping null values: (52904, 2)
Shape after dropping duplicates (5017, 2)
5017
['garage', 'street', 'validated', 'lot', 'valet']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Index(['business_id', 'attr_BusinessParking', 'attr_BusinessParking_garage',
       'attr_BusinessParking_street', 'attr_BusinessParking_validated',
       'attr_BusinessParking_lot', 'attr_BusinessParking_valet'],
      dtype='object')
(5017, 7)

(54643, 58)
attr_Ambience
Initial shape: (54643, 2)
Shape after dropping null values: (44390, 2)
Shape after dropping duplicates (3835, 2)
3835
['romantic', 'intimate', 'classy', 'hipster', 'divey', 'touristy', 'trendy', 'upscale', 'casual']
Index(['business_id', 'attr_Ambience', 'attr_Ambience_romantic',
       'attr_Ambience_intimate', 'attr_Ambience_classy',
       'attr_Ambience_hipster', 'attr_Ambience_divey',
       'attr_Ambience_touristy', 'attr_Ambience_trendy',
       'attr_Ambience_upscale', 'attr_Ambience_casual'],
      dtype='object')
(3835, 11)

(54643, 68)
attr_GoodForMeal
Initial shape: (54643, 2)
Shape after dropping null values: (37976, 2)
Shape after dropping duplicates (2856, 2)
2856
['dessert', 'latenight', 'lunch', 'din

In [42]:
print(df_toronto.shape)
df_toronto.head()

(54643, 99)


Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,attr_RestaurantsDelivery,attr_RestaurantsPriceRange2,attr_RestaurantsTakeOut,attr_BikeParking,attr_BusinessParking_x,attr_WiFi,attr_Caters,attr_HasTV,attr_OutdoorSeating,attr_RestaurantsAttire,attr_GoodForKids,attr_Ambience_x,attr_NoiseLevel,attr_RestaurantsReservations,attr_GoodForMeal_x,attr_RestaurantsGoodForGroups,attr_RestaurantsTableService,attr_Alcohol,attr_DogsAllowed,attr_WheelchairAccessible,attr_DriveThru,attr_Music_x,attr_Smoking,attr_GoodForDancing,attr_HappyHour,attr_BestNights_x,attr_CoatCheck,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,attr_AcceptsInsurance,attr_DietaryRestrictions_x,attr_AgesAllowed,attr_RestaurantsCounterService,attr_BusinessAcceptsBitcoin,user_id,review_stars,text,date,user_freq,attr_BusinessParking_y,attr_BusinessParking_garage,attr_BusinessParking_street,attr_BusinessParking_validated,attr_BusinessParking_lot,attr_BusinessParking_valet,attr_Ambience_y,attr_Ambience_romantic,attr_Ambience_intimate,attr_Ambience_classy,attr_Ambience_hipster,attr_Ambience_divey,attr_Ambience_touristy,attr_Ambience_trendy,attr_Ambience_upscale,attr_Ambience_casual,attr_GoodForMeal_y,attr_GoodForMeal_dessert,attr_GoodForMeal_latenight,attr_GoodForMeal_lunch,attr_GoodForMeal_dinner,attr_GoodForMeal_brunch,attr_GoodForMeal_breakfast,attr_Music_y,attr_Music_dj,attr_Music_background_music,attr_Music_jukebox,attr_Music_live,attr_Music_video,attr_Music_karaoke,attr_Music_no_music,attr_BestNights_y,attr_BestNights_monday,attr_BestNights_tuesday,attr_BestNights_friday,attr_BestNights_wednesday,attr_BestNights_thursday,attr_BestNights_sunday,attr_BestNights_saturday,attr_DietaryRestrictions_y,attr_DietaryRestrictions_dairy-free,attr_DietaryRestrictions_gluten-free,attr_DietaryRestrictions_vegan,attr_DietaryRestrictions_kosher,attr_DietaryRestrictions_halal,attr_DietaryRestrictions_soy-free,attr_DietaryRestrictions_vegetarian
0,C9oCPomVP0mtKa8z99E3gg,Bakery Gateau,"865 York Mills Road, Unit 1",Toronto,ON,M3B 1Y6,43.754093,-79.349548,4.5,8,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Bakeries, Food","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...",False,2,True,True,"{'garage': False, 'street': False, 'validated'...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,orh0HRUNCWuQMt9Iia_osg,3,Oh? Another patbingsu review? This one was bet...,2017-07-30 23:32:27,283,"{'garage': False, 'street': False, 'validated'...",False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,C9oCPomVP0mtKa8z99E3gg,Bakery Gateau,"865 York Mills Road, Unit 1",Toronto,ON,M3B 1Y6,43.754093,-79.349548,4.5,8,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Bakeries, Food","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...",False,2,True,True,"{'garage': False, 'street': False, 'validated'...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,orh0HRUNCWuQMt9Iia_osg,3,My first patbingsu!\n\nBakery Gateau is locate...,2015-07-01 22:40:22,283,"{'garage': False, 'street': False, 'validated'...",False,False,False,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W,Toronto,ON,M6J 1J5,43.642889,-79.425429,3.0,57,"{'WiFi': 'u'no'', 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...",,2,True,True,"{'garage': False, 'street': True, 'validated':...",u'no',False,,,,,,,,,,,,,,,,,,,,,,,,,,,,h3SfoZcs04WnJErnFaeBpQ,3,I've been to Bolt twice now. It's a good spot...,2014-01-05 06:10:24,98,"{'garage': False, 'street': True, 'validated':...",False,True,False,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W,Toronto,ON,M6J 1J5,43.642889,-79.425429,3.0,57,"{'WiFi': 'u'no'', 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...",,2,True,True,"{'garage': False, 'street': True, 'validated':...",u'no',False,,,,,,,,,,,,,,,,,,,,,,,,,,,,3uWCWMWcrn8YSH_qBvlr6w,4,"In the time of the 2016 Rio Olympics, big ups ...",2016-08-15 14:15:21,149,"{'garage': False, 'street': True, 'validated':...",False,True,False,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W,Toronto,ON,M6J 1J5,43.642889,-79.425429,3.0,57,"{'WiFi': 'u'no'', 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...",,2,True,True,"{'garage': False, 'street': True, 'validated':...",u'no',False,,,,,,,,,,,,,,,,,,,,,,,,,,,,eMcT_F41kJzVtAiTPDzIUA,4,Loved the smoothie & salad. The people serving...,2016-11-15 18:21:09,85,"{'garage': False, 'street': True, 'validated':...",False,True,False,False,False,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [43]:
df_toronto.to_csv("../Dataset/toronto_business_attributes.csv", index = False)

#### Clean data

Remove repeated columns

In [44]:
toronto = df_toronto

In [45]:
for i in toronto.columns:
    if ('_y' in i) or ('_x' in i):
        toronto = toronto.drop(i, axis = 1)

In [46]:
toronto.shape

(54643, 87)

In [47]:
# renaming columns with _x to just column names
for i in toronto.columns:
    if '_x' in i:
        i = i[:-2]
        print(i)


Remove unnecessary columns

In [48]:
toronto = toronto.drop(columns=['city','state','user_freq'])
toronto.shape

(54643, 84)

Cleaning attributes

In [49]:
sum = 0
attr_col = []
for i in toronto.columns:
    if 'attr_' in i:
        toronto[i] = toronto[i].astype(str)
        sum = sum + 1 
        attr_col.append(i)

In [50]:
print('Sum of attributes columns:',sum)

Sum of attributes columns: 69


In [51]:
# cleaning True False columns
for i in attr_col:
    a = toronto[i].value_counts().keys()
    if ('True' in a) or ('False' in a):
        toronto[i] = toronto[i].map({'True':1,'False':0,'nan':0,'None':0})

In [52]:
str_col = []
for i in attr_col:
    a = toronto[i].value_counts().keys()
    if (a.dtype != 'int64') and (a.dtype != 'float64'):
        str_col.append(i)
        print(i,'\n',a)
        print("")

attr_RestaurantsPriceRange2 
 Index(['2', '1', '3', '2.0', '1.0', 'nan', '4', '3.0', '4.0', 'None'], dtype='object')

attr_WiFi 
 Index(['u'no'', 'u'free'', 'nan', ''no'', ''free'', 'u'paid'', ''paid'',
       'None'],
      dtype='object')

attr_RestaurantsAttire 
 Index(['u'casual'', 'nan', ''casual'', 'u'dressy'', ''dressy'', 'None',
       ''formal'', 'u'formal''],
      dtype='object')

attr_NoiseLevel 
 Index(['u'average'', 'nan', 'u'loud'', 'u'quiet'', ''average'', 'u'very_loud'',
       ''loud'', ''quiet'', 'None', ''very_loud''],
      dtype='object')

attr_Alcohol 
 Index(['u'full_bar'', 'u'none'', 'nan', 'u'beer_and_wine'', ''none'',
       ''beer_and_wine'', ''full_bar'', 'None'],
      dtype='object')

attr_Smoking 
 Index(['nan', 'u'no'', 'u'outdoor'', 'u'yes'', 'None'], dtype='object')

attr_AgesAllowed 
 Index(['nan', 'u'19plus''], dtype='object')



In [53]:
# attr_RestaurantsPriceRange2
toronto['attr_RestaurantsPriceRange2'] = toronto['attr_RestaurantsPriceRange2'].map({
    '2.0': 2, '1.0': 1, '2': 2,
    '3.0': 3, '1': 1, '3': 3,
    'nan': 0, '4.0': 4, '4': 4, 'None': 0
})

toronto.attr_RestaurantsPriceRange2.value_counts()

2    35991
1    11480
3     5594
0      802
4      776
Name: attr_RestaurantsPriceRange2, dtype: int64

In [54]:
# attr_WiFi
for i in range(toronto.shape[0]):
    if ('no' in toronto.attr_WiFi.iloc[i]) or ('None' in toronto.attr_WiFi.iloc[i]) or ('nan' in toronto.attr_WiFi.iloc[i]):
        toronto.attr_WiFi.iloc[i] = 'no'
    else:
        if 'free' in toronto.attr_WiFi.iloc[i]:
            toronto.attr_WiFi.iloc[i] = 'free'
        else:
            if 'paid' in toronto.attr_WiFi.iloc[i]:
                toronto.attr_WiFi.iloc[i] = 'paid'

toronto.attr_WiFi.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


no      32587
free    21888
paid      168
Name: attr_WiFi, dtype: int64

In [55]:
for i in range(toronto.shape[0]):
    if ('None' in toronto.attr_RestaurantsAttire.iloc[i]) or ('nan' in toronto.attr_RestaurantsAttire.iloc[i]):
        toronto.attr_RestaurantsAttire.iloc[i] = 'no'
    else:
        if 'casual' in toronto.attr_RestaurantsAttire.iloc[i]:
            toronto.attr_RestaurantsAttire.iloc[i] = 'casual'
        else:
            if 'dressy' in toronto.attr_RestaurantsAttire.iloc[i]:
                toronto.attr_RestaurantsAttire.iloc[i] = 'dressy'
            else:
                if 'formal' in toronto.attr_RestaurantsAttire.iloc[i]:
                    toronto.attr_RestaurantsAttire.iloc[i] = 'formal'
                    
                    
toronto.attr_RestaurantsAttire.value_counts()

casual    39893
no        12663
dressy     2076
formal       11
Name: attr_RestaurantsAttire, dtype: int64

In [56]:
a = 'attr_NoiseLevel' 
for i in range(toronto.shape[0]):
    if ('nan' in toronto[a].iloc[i]) or ('None' in toronto[a].iloc[i]):
        toronto[a].iloc[i] = 'no'
    elif 'average' in toronto[a].iloc[i]:
        toronto[a].iloc[i] = 'average'
    elif 'loud' in toronto[a].iloc[i]:
        toronto[a].iloc[i] = 'loud'
    elif 'very_loud' in toronto[a].iloc[i]:
        toronto[a].iloc[i] = 'very_loud'
    elif 'quiet' in toronto[a].iloc[i]:
        toronto[a].iloc[i] = 'quiet'
        
toronto[a].value_counts()

average    30407
no         11436
loud        8848
quiet       3952
Name: attr_NoiseLevel, dtype: int64

In [57]:
a = 'attr_Alcohol'
for i in range(toronto.shape[0]):
    if ('nan' in toronto[a].iloc[i]) or ('None' in toronto[a].iloc[i]) or ('none' in toronto[a].iloc[i]):
        toronto[a].iloc[i] = 'no'
    elif 'full_bar' in toronto[a].iloc[i]:
        toronto[a].iloc[i] = 'full_bar'
    elif 'beer_and_wine' in toronto[a].iloc[i]:
        toronto[a].iloc[i] = 'beer_and_wine'

toronto[a].value_counts()

full_bar         23651
no               23428
beer_and_wine     7564
Name: attr_Alcohol, dtype: int64

In [58]:
a = 'attr_Smoking'
for i in range(toronto.shape[0]):
    if ('nan' in toronto[a].iloc[i]) or ('None' in toronto[a].iloc[i]) or ('no' in toronto[a].iloc[i]):
        toronto[a].iloc[i] = 'no'
    elif 'outdoor' in toronto[a].iloc[i]:
        toronto[a].iloc[i] = 'outdoor'
    elif 'yes' in toronto[a].iloc[i]:
        toronto[a].iloc[i] = 'yes'

toronto[a].value_counts()

no         53684
outdoor      943
yes           16
Name: attr_Smoking, dtype: int64

In [59]:
a = 'attr_AgesAllowed'
for i in range(toronto.shape[0]):
    if ('nan' in toronto[a].iloc[i]) or ('None' in toronto[a].iloc[i]) or ('no' in toronto[a].iloc[i]):
        toronto[a].iloc[i] = 'no'
    elif '19plus' in toronto[a].iloc[i]:
        toronto[a].iloc[i] = '19plus'

toronto[a].value_counts()

no        54615
19plus       28
Name: attr_AgesAllowed, dtype: int64

In [60]:
for i in attr_col:
    a = toronto[i].value_counts().keys()
    if (a.dtype == 'int64') or (a.dtype == 'float64'):
        toronto[i] = toronto[i].fillna(0)

In [61]:
for i in attr_col:
    a = toronto[i].value_counts().keys()
    if (a.dtype != 'int64') and (a.dtype != 'float64'):
        toronto[i] = toronto[i].fillna('no')

In [62]:
col_to_encode = []
for i in toronto.columns:
    if 'attr_' in i:
        a = toronto[i].value_counts().keys()
        if (a.dtype == 'object'):
            col_to_encode.append(i)
            print(i,'\n',a)
            print("")

attr_WiFi 
 Index(['no', 'free', 'paid'], dtype='object')

attr_RestaurantsAttire 
 Index(['casual', 'no', 'dressy', 'formal'], dtype='object')

attr_NoiseLevel 
 Index(['average', 'no', 'loud', 'quiet'], dtype='object')

attr_Alcohol 
 Index(['full_bar', 'no', 'beer_and_wine'], dtype='object')

attr_Smoking 
 Index(['no', 'outdoor', 'yes'], dtype='object')

attr_AgesAllowed 
 Index(['no', '19plus'], dtype='object')



In [63]:
toronto = pd.get_dummies(toronto, prefix_sep='_', columns=col_to_encode)

In [64]:
toronto.head()

Unnamed: 0,business_id,name,address,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours,attr_RestaurantsDelivery,attr_RestaurantsPriceRange2,attr_RestaurantsTakeOut,attr_BikeParking,attr_Caters,attr_HasTV,attr_OutdoorSeating,attr_GoodForKids,attr_RestaurantsReservations,attr_RestaurantsGoodForGroups,attr_RestaurantsTableService,attr_DogsAllowed,attr_WheelchairAccessible,attr_DriveThru,attr_GoodForDancing,attr_HappyHour,attr_CoatCheck,attr_ByAppointmentOnly,attr_BusinessAcceptsCreditCards,attr_AcceptsInsurance,attr_RestaurantsCounterService,attr_BusinessAcceptsBitcoin,user_id,review_stars,text,date,attr_BusinessParking_garage,attr_BusinessParking_street,attr_BusinessParking_validated,attr_BusinessParking_lot,attr_BusinessParking_valet,attr_Ambience_romantic,attr_Ambience_intimate,attr_Ambience_classy,attr_Ambience_hipster,attr_Ambience_divey,attr_Ambience_touristy,attr_Ambience_trendy,attr_Ambience_upscale,attr_Ambience_casual,attr_GoodForMeal_dessert,attr_GoodForMeal_latenight,attr_GoodForMeal_lunch,attr_GoodForMeal_dinner,attr_GoodForMeal_brunch,attr_GoodForMeal_breakfast,attr_Music_dj,attr_Music_background_music,attr_Music_jukebox,attr_Music_live,attr_Music_video,attr_Music_karaoke,attr_Music_no_music,attr_BestNights_monday,attr_BestNights_tuesday,attr_BestNights_friday,attr_BestNights_wednesday,attr_BestNights_thursday,attr_BestNights_sunday,attr_BestNights_saturday,attr_DietaryRestrictions_dairy-free,attr_DietaryRestrictions_gluten-free,attr_DietaryRestrictions_vegan,attr_DietaryRestrictions_kosher,attr_DietaryRestrictions_halal,attr_DietaryRestrictions_soy-free,attr_DietaryRestrictions_vegetarian,attr_WiFi_free,attr_WiFi_no,attr_WiFi_paid,attr_RestaurantsAttire_casual,attr_RestaurantsAttire_dressy,attr_RestaurantsAttire_formal,attr_RestaurantsAttire_no,attr_NoiseLevel_average,attr_NoiseLevel_loud,attr_NoiseLevel_no,attr_NoiseLevel_quiet,attr_Alcohol_beer_and_wine,attr_Alcohol_full_bar,attr_Alcohol_no,attr_Smoking_no,attr_Smoking_outdoor,attr_Smoking_yes,attr_AgesAllowed_19plus,attr_AgesAllowed_no
0,C9oCPomVP0mtKa8z99E3gg,Bakery Gateau,"865 York Mills Road, Unit 1",M3B 1Y6,43.754093,-79.349548,4.5,8,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Bakeries, Food","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...",0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,orh0HRUNCWuQMt9Iia_osg,3,Oh? Another patbingsu review? This one was bet...,2017-07-30 23:32:27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1
1,C9oCPomVP0mtKa8z99E3gg,Bakery Gateau,"865 York Mills Road, Unit 1",M3B 1Y6,43.754093,-79.349548,4.5,8,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Bakeries, Food","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...",0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,orh0HRUNCWuQMt9Iia_osg,3,My first patbingsu!\n\nBakery Gateau is locate...,2015-07-01 22:40:22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1
2,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W,M6J 1J5,43.642889,-79.425429,3.0,57,"{'WiFi': 'u'no'', 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...",0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,h3SfoZcs04WnJErnFaeBpQ,3,I've been to Bolt twice now. It's a good spot...,2014-01-05 06:10:24,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1
3,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W,M6J 1J5,43.642889,-79.425429,3.0,57,"{'WiFi': 'u'no'', 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...",0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3uWCWMWcrn8YSH_qBvlr6w,4,"In the time of the 2016 Rio Olympics, big ups ...",2016-08-15 14:15:21,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1
4,NDuUMJfrWk52RA-H-OtrpA,Bolt Fresh Bar,1170 Queen Street W,M6J 1J5,43.642889,-79.425429,3.0,57,"{'WiFi': 'u'no'', 'BikeParking': 'True', 'Rest...","Juice Bars & Smoothies, Food, Restaurants, Fas...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ...",0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,eMcT_F41kJzVtAiTPDzIUA,4,Loved the smoothie & salad. The people serving...,2016-11-15 18:21:09,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0,1


In [65]:
def basic_details(df):
    print('Row:{}, columns:{}'.format(df.shape[0],df.shape[1]))
    k = pd.DataFrame()
    #k['number of Unique value'] = df.nunique()
    k['Number of missing value'] = df.isnull().sum()
    k['Data type'] = df.dtypes
    return k

In [66]:
basic_details(toronto)

Row:54643, columns:97


Unnamed: 0,Number of missing value,Data type
business_id,0,object
name,0,object
address,0,object
postal_code,0,object
latitude,0,float64
longitude,0,float64
stars,0,float64
review_count,0,int64
attributes,0,object
categories,0,object


In [67]:
for i in toronto.columns:
    if toronto[i].dtype == 'uint8':
        toronto[i] = toronto[i].astype('int64')

In [68]:
basic_details(toronto)

Row:54643, columns:97


Unnamed: 0,Number of missing value,Data type
business_id,0,object
name,0,object
address,0,object
postal_code,0,object
latitude,0,float64
longitude,0,float64
stars,0,float64
review_count,0,int64
attributes,0,object
categories,0,object


In [69]:
toronto.to_csv('../Dataset/CLEANED_toronto_business_attributes.csv', index = False)