In [1]:
%%time
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

CPU times: user 1.64 s, sys: 501 ms, total: 2.14 s
Wall time: 2.03 s


In [2]:
df = pd.read_json('./yelp_academic_dataset_business.json', lines=True)

In [3]:
df.shape

(188593, 15)

In [4]:
df.head(3)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV
2,1335 rue Beaubien E,"{'Alcohol': 'beer_and_wine', 'Ambience': '{'ro...",O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",Montréal,"{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",0,45.540503,-73.5993,La Bastringue,Rosemont-La Petite-Patrie,H2G 1K7,5,4.0,QC


In [5]:
## We need to remove the NaN cells for Count Vectorizer to work.
df.categories.isna().sum()
df.categories = df.categories.fillna('supercali')     # Arbitrary word ro replace NaN
df.categories.isna().sum()

0

In [6]:
cvec = CountVectorizer()
cvec = cvec.fit(df['categories'])
df_vec = pd.DataFrame(cvec.transform(df['categories']).todense(), columns=cvec.get_feature_names())

In [7]:
df_vec.shape

(188593, 1393)

In [8]:
len(df_vec.columns)

1393

In [9]:
## Top 100 business types according to Count Vectorizer.
## With Count Vectorizer business types labels do not match the labels from the dataset
## e.g. 'beauty and spas' becomes two different categories.
top_100 = df_vec.sum().sort_values(ascending=False).head(50).tolist
top_100

<bound method IndexOpsMixin.tolist of services         68098
restaurants      57180
food             43023
shopping         30877
home             29626
spas             22668
beauty           20961
bars             20392
medical          19180
health           17551
hair             14992
local            13762
event            13655
repair           13271
automotive       12656
salons           12445
nightlife        12438
stores           12055
planning         11950
american         11436
auto             10838
life              9647
active            9119
arts              8998
estate            8590
hotels            8383
real              8374
tea               7784
pet               7779
fashion           7571
coffee            7136
sandwiches        6912
traditional       6866
fast              6812
pizza             6603
travel            6264
entertainment     6121
new               6107
shops             6053
garden            6028
dentists          5972
specialty         5

In [10]:
## Creating new column. Each row has a list of strings business types.
df['categories_2'] = [i.split(',') for i in df['categories']]

In [11]:
## Creating a list containing all cleaned business types.
%time
business_list = []                             
for index, value in df.categories_2.items():
    for i in value:
        business_list.append(i.strip())        #cleaning each business type

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.34 µs


In [12]:
## Examining the list.
business_list

['Tours',
 'Breweries',
 'Pizza',
 'Restaurants',
 'Food',
 'Hotels & Travel',
 'Chicken Wings',
 'Burgers',
 'Caterers',
 'Street Vendors',
 'Barbeque',
 'Food Trucks',
 'Food',
 'Restaurants',
 'Event Planning & Services',
 'Breakfast & Brunch',
 'Restaurants',
 'French',
 'Sandwiches',
 'Cafes',
 'Insurance',
 'Financial Services',
 'Home & Garden',
 'Nurseries & Gardening',
 'Shopping',
 'Local Services',
 'Automotive',
 'Electronics Repair',
 'Coffee & Tea',
 'Food',
 'Food',
 'Bakeries',
 'Restaurants',
 'Thai',
 'Mexican',
 'Restaurants',
 'Flowers & Gifts',
 'Gift Shops',
 'Shopping',
 'Restaurants',
 'Japanese',
 'Cajun/Creole',
 'Southern',
 'Restaurants',
 'Bars',
 'Sports Bars',
 'Dive Bars',
 'Burgers',
 'Nightlife',
 'Sandwiches',
 'Restaurants',
 'Restaurants',
 'Pakistani',
 'Indian',
 'Middle Eastern',
 'Beauty & Spas',
 'Barbers',
 'Delis',
 'Restaurants',
 'Sandwiches',
 'Nightlife',
 'Bars',
 'American (Traditional)',
 'Tapas/Small Plates',
 'Poutineries',
 'Supper 

In [13]:
## Removing duplicates from the list.
business_list_5 = set(business_list) 

In [14]:
## Checking lenght of list of business types without duplicates.
len(business_list_5)

1306

In [15]:
business_list_5

{'Wedding Chapels',
 'Henna Artists',
 'Refinishing Services',
 'Photo Booth Rentals',
 'Tax Law',
 'Dialysis Clinics',
 'Appliances & Repair',
 'Himalayan/Nepalese',
 'Car Brokers',
 'Drywall Installation & Repair',
 'Watch Repair',
 'Health Markets',
 'Discount Store',
 'Asian Fusion',
 'Counseling & Mental Health',
 'Olive Oil',
 'Newspapers & Magazines',
 'Halfway Houses',
 'Spin Classes',
 'Business Law',
 'Hotels & Travel',
 'Auto Glass Services',
 'Courthouses',
 'Bicycles',
 'Backshop',
 'Skin Care',
 'Body Contouring',
 'IV Hydration',
 'Cardiologists',
 'Reflexology',
 'Motorcycle Gear',
 'Playgrounds',
 'Fur Clothing',
 'Lebanese',
 'Tickets',
 'Hot Tub & Pool',
 'Bikes',
 'Day Camps',
 'Trainers',
 'Venezuelan',
 'Cardio Classes',
 'Brewpubs',
 'Commercial Real Estate',
 'Bed & Breakfast',
 'Food Trucks',
 'Registry Office',
 'Pet Insurance',
 'Zoos',
 'Weight Loss Centers',
 'Speakeasies',
 'Eyebrow Services',
 'Blood & Plasma Donation Centers',
 'Coffee & Tea',
 'Private 

In [16]:
## Creating a dictionary which has for each key = business type label, value = count.
business_dictionary = {}
for i in business_list_5:
    business_dictionary[i] = 0

In [17]:
## Updating dictionary with right count.
%time
for cell in df.categories_2:
    for category in cell:
        business_dictionary[category.strip()] += 1

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 7.15 µs


In [18]:
#business_dictionary['Cafes']

In [19]:
## Checking...
keys = business_dictionary.keys()
len(keys)

1306

In [20]:
## Creating DataFrame out of the dictionary containing business types and their count.
final = pd.DataFrame([business_dictionary])    
final = final.T
final = final.rename(columns={0:'Count'})
final.sort_values('Count', ascending=False)

Unnamed: 0,Count
Restaurants,57173
Shopping,30231
Food,27118
Beauty & Spas,18967
Home Services,18634
Health & Medical,16157
Local Services,12906
Automotive,12656
Nightlife,12438
Bars,10853


In [21]:
final.to_csv('./categories_count')

In [22]:
final = pd.read_csv('./categories_count')

In [23]:
final.columns

Index(['Unnamed: 0', 'Count'], dtype='object')

In [24]:
final = final.rename(columns={'Unnamed: 0': 'Category'})

In [25]:
final.to_csv('./categories_count')

In [35]:
#pd.read_csv('./categories_count')

In [27]:
final_sorted = final.sort_values('Count', ascending=False)

In [28]:
macro_categories = ['Active Life','Arts & Entertainment','Automotive','Beauty & Spas',
'Education','Event Planning & Services','Financial Services','Food','Health & Medical',
'Home Services','Hotels & Travel','Local Services','Mass Media','Nightlife','Pets',
'Professional Services','Public Services & Government','Real estate','Religious Organizations',
'Restaurants','Shopping', 'Prova']

In [29]:
top_150 = final.sort_values('Count', ascending=False)[0:151]
top_150.head(3)

Unnamed: 0,Category,Count
1006,Restaurants,57173
1060,Shopping,30231
466,Food,27118


In [30]:
top_150['Category'][0:3]

1006    Restaurants
1060       Shopping
466            Food
Name: Category, dtype: object

In [31]:
for i in macro_categories:
    if i in list(top_150['Category']):
        pass #print(i, 'True')
    else:
        print(i, 'False')

Mass Media False
Real estate False
Religious Organizations False
Prova False


In [32]:
macro_not_in_top_150 = ['Mass Media', 'Real estate', 'Religious Organizations']

In [33]:
top_150 = top_150['Category'].tolist()

In [34]:
#top_150