#### This Notebook:
    1. Determines the most frequent business categories in the Kaggle Data Set.
    2. Develops dictionary and function to map Kaggle business categories: top 150 busine4ss categories are left as they are, the remaining are mapped to the corresponding Macro Category.

# 1. Assessing most frequent business categories in the Kaggle Data Set.

In [1]:
#%%timeit
import pandas as pd
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

In [2]:
df = pd.read_json('./yelp_academic_dataset_business.json', lines=True)

In [3]:
df.shape

(188593, 15)

In [4]:
df.head(3)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCredi...",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Ho...",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0'...",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB
1,,"{'Alcohol': 'none', 'BikeParking': 'False', 'B...",AjEbIBw6ZFfln7ePHha9PA,"Chicken Wings, Burgers, Caterers, Street Vendo...",Henderson,"{'Friday': '17:0-23:0', 'Saturday': '17:0-23:0...",0,35.960734,-114.939821,CK'S BBQ & Catering,,89002,3,4.5,NV
2,1335 rue Beaubien E,"{'Alcohol': 'beer_and_wine', 'Ambience': '{'ro...",O8S5hYJ1SMc8fA4QBtVujA,"Breakfast & Brunch, Restaurants, French, Sandw...",Montréal,"{'Monday': '10:0-22:0', 'Tuesday': '10:0-22:0'...",0,45.540503,-73.5993,La Bastringue,Rosemont-La Petite-Patrie,H2G 1K7,5,4.0,QC


In [5]:
## We need to remove the NaN cells for Count Vectorizer to work.
df.categories.isna().sum()
df.categories = df.categories.fillna('supercali')     # Arbitrary word ro replace NaN
df.categories.isna().sum()

0

In [6]:
cvec = CountVectorizer()
cvec = cvec.fit(df['categories'])
df_vec = pd.DataFrame(cvec.transform(df['categories']).todense(), columns=cvec.get_feature_names())

In [7]:
df_vec.shape

(188593, 1393)

In [8]:
len(df_vec.columns)

1393

In [9]:
## Top 100 business types according to Count Vectorizer.
## With Count Vectorizer business types labels do not match the labels from the dataset
## e.g. 'beauty and spas' becomes two different categories.
#top_100 = df_vec.sum().sort_values(ascending=False).head(50).tolist
#top_100

In [11]:
## Creating new column. Each row has a list of strings business types.
df['categories_2'] = [i.split(',') for i in df['categories']]

In [12]:
## Creating a list containing all cleaned business types.
%time
business_list = []                             
for index, value in df.categories_2.items():
    for i in value:
        business_list.append(i.strip())        #cleaning each business type

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.87 µs


In [13]:
## Examining the list.
## business_list

In [14]:
## Removing duplicates from the list.
business_list_5 = set(business_list) 

In [15]:
## Checking lenght of list of business types without duplicates.
len(business_list_5)

1306

In [16]:
#business_list_5

In [17]:
## Creating a dictionary which has for each key = business type label, value = count.
business_dictionary = {}
for i in business_list_5:
    business_dictionary[i] = 0

In [18]:
## Updating dictionary with right count.
%time
for cell in df.categories_2:
    for category in cell:
        business_dictionary[category.strip()] += 1

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


In [19]:
#business_dictionary['Cafes']

In [20]:
## Checking...
keys = business_dictionary.keys()
len(keys)

1306

In [21]:
## Creating DataFrame out of the dictionary containing business types and their count.
final = pd.DataFrame([business_dictionary])    
final = final.T
final = final.rename(columns={0:'Count'})
final = final.sort_values('Count', ascending=False)
final.head(5)

Unnamed: 0,Count
Restaurants,57173
Shopping,30231
Food,27118
Beauty & Spas,18967
Home Services,18634


In [22]:
#final.to_csv('./categories_count')

In [23]:
#final = pd.read_csv('./categories_count')

In [24]:
final.columns

Index(['Count'], dtype='object')

In [25]:
final = final.rename(columns={'Unnamed: 0': 'Category'})

In [26]:
final.to_csv('./categories_count')

In [27]:
#pd.read_csv('./categories_count')

In [28]:
final_sorted = final.sort_values('Count', ascending=False)

In [29]:
## Defining variable macro categories
macro_categories = ['Active Life','Arts & Entertainment','Automotive','Beauty & Spas',
'Education','Event Planning & Services','Financial Services','Food','Health & Medical',
'Home Services','Hotels & Travel','Local Services','Mass Media','Nightlife','Pets',
'Professional Services','Public Services & Government','Real estate','Religious Organizations',
'Restaurants','Shopping', 'Prova']

In [30]:
top_150 = final.sort_values('Count', ascending=False)[0:151]
top_150.head(3)

Unnamed: 0,Count
Restaurants,57173
Shopping,30231
Food,27118


In [33]:
top_150['Count'][0:3]

Restaurants    57173
Shopping       30231
Food           27118
Name: Count, dtype: int64

In [34]:
## Which macro categories are not in the top 150 titles?
macro_not_in_top_150 = ['Mass Media', 'Real estate', 'Religious Organizations']

In [36]:
top_150 = top_150['Count'].tolist()

# 2. Mapping titles to Macro Categories (22 in total) if title is not in top 150 titles by frequency.

In [37]:
## Import the complete list of categories labels.
cat = pd.read_json('./categories.json', orient='records')
cat.head(5)

Unnamed: 0,alias,country_blacklist,country_whitelist,parents,title
0,3dprinting,,,[localservices],3D Printing
1,abruzzese,,[IT],[italian],Abruzzese
2,absinthebars,,[CZ],[bars],Absinthe Bars
3,acaibowls,"[TR, AR, IT, MX, PL, CL]",,[food],Acai Bowls
4,accessories,,,[fashion],Accessories


In [49]:
## Creating sub macro category list - which includes also macro categories!

sub_master_list = []                         #instantiate empty container
for i in cat.parents:                        #iterate over each cell == list in column parent
    for y in i:                              #iterate over each list element
        sub_master_list.append(y)            #append each element to list containers
        
sub_master_list = list(set(sub_master_list)) #deleting duplicates
sub_master_list.sort()                       #sorting
len(sub_master_list)                         #checking

121

In [50]:
## Creating list of macro categories
## Macrocategories are titles whose corresponding 'parents' list is empty.

master_list = []                             #instantiate empty container
for index, value in enumerate(cat.title):    #iterate over each cell in column title
    if cat.parents.iloc[index] == []:                 #if title value has no parent
        master_list.append(value)            #append to list container
#master_list

In [51]:
## Making strings in master list comparable to elements in sub master list 
master_list_minnospace = [i.lower().replace(' ', '') for i in master_list]
#master_list_minnospace

In [52]:
## Displaying master categories found in sub master list
master_list_parent_format = []
for i in sub_master_list:
    if i in master_list_minnospace:
        master_list_parent_format.append(i)
master_list_parent_format

['bicycles',
 'education',
 'financialservices',
 'food',
 'homeservices',
 'localflavor',
 'localservices',
 'massmedia',
 'nightlife',
 'pets',
 'restaurants',
 'shopping']

In [53]:
## Manually creating macro categories not identified in sub master list
missing_macro_categories = ['active', 'arts', 'auto', 'beautysvc', 'eventservices',
                            'health', 'hotelstravel', 'professional',
                           'publicservices&government', 'religiousorganizations']

## Adding missing_macro_categories to master list in 'parent' format
## Now we have a complete list of master categories written in 'parent format'
master_list_parent_format = missing_macro_categories + master_list_parent_format

In [54]:
## Checking
len(master_list_parent_format)
len(master_list)

22

In [55]:
#master_list

In [56]:
#master_list_parent_format

In [57]:
## Create dictionary level 1: 22 k, 121 v
## Create function that first iterates dict level 2, than level 1 to assign position

In [58]:
## Create dictionary level 2: 121 keys (macro and sub macro categories), 1565 values
dict_2 = {}
for i in sub_master_list:
    dict_2[i] = []
    
for row, cell in enumerate(cat.title):
    try:
        parent = cat.parents[row]
        parent = parent[0]
        dict_2[parent].append(cell)
    except:
        print(cell)

Active Life
Arts & Entertainment
Automotive
Beauty & Spas
Bicycles
Education
Event Planning & Services
Financial Services
Food
Health & Medical
Home Services
Hotels & Travel
Local Flavor
Local Services
Mass Media
Nightlife
Pets
Professional Services
Public Services & Government
Religious Organizations
Restaurants
Shopping


In [60]:
## Create dictionary level 1: 22 k, 121 v
dict_1 = {}
for i in master_list:
    dict_1[i] = []

In [62]:
## Add manually sub_cat_list to dict_1 values
dict_1 = {'Active Life': ['active', 'diving', 'fitness', 'gyms', 'martialarts', 'othersalons', 'parks','zoos'],
 'Arts & Entertainment': ['arts', 'farms', 'festivals', 'movietheaters', 'museums', 'psychic_astrology','social_clubs', 'wineries'],
 'Automotive': ['auto', 'autoglass', 'autopartssupplies', 'autorepair'],
 'Beauty & Spas': ['beautysvc', 'hair', 'hairremoval', 'medicalspa', 'skincare', 'tanning'],
 'Bicycles': ['bicycles'],
 'Education': ['artclasses', 'education', 'specialtyschools', 'tastingclasses'],
 'Event Planning & Services': ['eventservices', 'partyequipmentrentals', 'photographers'],
 'Financial Services': ['financialservices', 'insurance'],
 'Food': ['breweries', 'food', 'gourmet', 'jpsweets'],
 'Health & Medical': ['c_and_mh', 'cannabis_clinics', 'crisispregnancycenters', 'dentalhygienists', 'diagnosticservices', 'dentists', 'health', 'medcenters', 'opthamalogists', 'physicians'],
 'Home Services': ['homeservices', 'landscaping', 'plumbing', 'realestateagents', 'realestatesvcs', 'tcm', 'utilities'],
 'Hotels & Travel': ['airports', 'hotels', 'hotelstravel', 'tours',  'transport', 'travelservices'],
 'Local Flavor': ['localflavor'],
 'Local Services': ['funeralservices', 'itservices', 'junkremovalandhauling', 'laundryservices', 'localservices', 'nonprofit'],
 'Mass Media': ['massmedia'],
 'Nightlife': ['bars', 'nightlife', 'adultentertainment',],
 'Pets': ['pet_sitting', 'pets', 'petservices', 'petstore'],
 'Professional Services': ['estateplanning', 'lawyers', 'legalservices', 'professional', 'wholesalers'],
 'Public Services & Government': ['publicservicesgovt'],
 'Religious Organizations': ['religiousorgs'],
 'Restaurants': ['african', 'arabian', 'belgian', 'brazilian', 'breakfast_brunch', 'caribbean', 'cafes', 'chinese', 'donburi', 'french', 'german', 'italian', 'japanese', 'latin', 'malaysian', 'mediterranean', 'mexican', 'mideastern', 'polish', 'portuguese', 'restaurants', 'spanish', 'turkish', ],
 'Shopping': ['artsandcrafts', 'fashion', 'flowers', 'gardening','homeandgarden', 'kitchenandbath', 'media', 'musicinstrumentservices', 'opticians', 'shopping','sportgoods', 'sportswear']}

In [63]:
## Reversing Key Value Pairs in dict_2
dict_2B = {}
for k, v in dict_2.items():
    for i in v:
        dict_2B[i] = k

In [64]:
## Reversing Key Value Pairs in dict_1
dict_2A = {}
for k, v in dict_1.items():
    for i in v:
        dict_2A[i] = k

In [65]:
def map_category(category):
    if category not in top_150:
        if category in dict_2B.keys():
            title = dict_2B[category]
        else:
            title = category
        if title in dict_2A.keys():
            title_2 = dict_2A[title]
            return(title_2)
        else:
            return(title)
    else:
        return(category)

In [66]:
## Applying mapping functiomn to dataset
cat['title_3'] = cat['country_blacklist']                      # adding a column to our df by copying an existing one
for index, cell in enumerate(cat['title']):                    
    cat['title_3'][index] = map_category(cat['title'][index])

In [67]:
result = []
for word in top_150:
    result.append(map_category(word))

In [None]:
#json.dump(dict_2B, open("dict_2B.json","w"))

In [None]:
#json.load('./dict_2B.json'())