In [1]:
import pandas as pd

# Let's import `west_df.csv` to create binary columns of shop categories

In [2]:
df = pd.read_csv('./datasets/west_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,category1,category2,category3,category4,category5,city,latitude,longitude,price,rating,zip_code
0,0,mexican,?,?,?,?,Los Angeles,33.97499,-118.24696,1.0,3.5,90001
1,2,desserts,chocolate,?,?,?,Los Angeles,33.97363,-118.24989,2.0,5.0,90001
2,4,foodtrucks,mexican,?,?,?,Los Angeles,34.060716,-118.344931,1.0,4.5,90017
3,6,bakeries,cupcakes,customcakes,?,?,Los Angeles,33.974865,-118.240467,2.0,4.0,90001
4,8,mexican,?,?,?,?,South Gate,33.956748,-118.223968,1.0,4.0,90280


In [3]:
df.shape

(69335, 12)

In [4]:
cats = ['category1', 'category2', 'category3', 'category4', 'category5', 'zip_code']
cat_df = df[cats]
cat_df.head()

Unnamed: 0,category1,category2,category3,category4,category5,zip_code
0,mexican,?,?,?,?,90001
1,desserts,chocolate,?,?,?,90001
2,foodtrucks,mexican,?,?,?,90017
3,bakeries,cupcakes,customcakes,?,?,90001
4,mexican,?,?,?,?,90280


In [5]:
categories_df = cat_df.astype(str).groupby('zip_code').agg(lambda x: ' '.join(x.unique()))

In [6]:
categories_df.head()

Unnamed: 0_level_0,category1,category2,category3,category4,category5
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
90000,salad,tradamerican,wraps,?,?
90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?
90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?
90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?
90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?


In [7]:
categories_df.shape

(2655, 5)

In [8]:
#combining all category columns together in order to turn into lists 
categories_df['all'] = categories_df['category1'] + ' ' + categories_df['category2'] + ' ' + categories_df['category3'] + ' ' + categories_df['category4'] + ' ' + categories_df['category5'] 

In [9]:
categories_df.head()

Unnamed: 0_level_0,category1,category2,category3,category4,category5,all
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
90000,salad,tradamerican,wraps,?,?,salad tradamerican wraps ? ?
90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...
90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...
90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...
90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...


In [10]:
categories_df['all'][0].split(' ')

['salad', 'tradamerican', 'wraps', '?', '?']

In [11]:
import numpy as np

# Getting unique categories for each zip code

In [12]:
categories_df['list_cat'] = categories_df['all'].str.split(' ').apply(np.unique)

In [13]:
categories_df.head()

Unnamed: 0_level_0,category1,category2,category3,category4,category5,all,list_cat
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
90000,salad,tradamerican,wraps,?,?,salad tradamerican wraps ? ?,"[?, salad, tradamerican, wraps]"
90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...,"[?, bakeries, breakfast_brunch, burgers, cater..."
90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...,"[?, breakfast_brunch, chinese, coffee, dessert..."
90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...,"[?, bakeries, bbq, breakfast_brunch, burgers, ..."
90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...,"[?, asianfusion, bakeries, bbq, beer_and_wine,..."


In [14]:
#sanity check
categories_df['list_cat'][50]

array(['?', 'breakfast_brunch', 'cafes', 'cocktailbars', 'coffee',
       'foodtrucks', 'italian', 'mexican', 'poke', 'salad', 'sandwiches',
       'sportsbars', 'streetvendors', 'tradamerican', 'venues'],
      dtype='<U16')

# Creating an object containing unique categories of ALL zipcodes

In [15]:
unique_cats = categories_df['list_cat'].apply(pd.Series).stack().unique()

In [16]:
len(unique_cats)

469

In [17]:
type(unique_cats)

numpy.ndarray

In [18]:
#sanity check 
'mexican' in unique_cats

True

In [19]:
for word in categories_df['list_cat'][0]:
    print(word)

?
salad
tradamerican
wraps


In [43]:
cat_dict = {}

def categorize(text):
  
    for word in text:
        #for category in unique_cats:
        if word in unique_cats:
            cat_dict[word] = 1
        else: 
            cat_dict[word] = 0
           
    return cat_dict


In [99]:
cat_dictionary = {}
def categorize(text):
    for category in unique_cats:
        #for cell in text:
            #for word in cell:
        if category in text:
            cat_dictionary[category] = 1
        else: 
            cat_dictionary[category] = 0
    return cat_dictionary

In [100]:
#creating new column from array to list 
categories_df['cat_list'] = categories_df['list_cat'].tolist()

In [101]:
categories_df.head()

Unnamed: 0_level_0,category1,category2,category3,category4,category5,all,list_cat,cat_list
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
90000,salad,tradamerican,wraps,?,?,salad tradamerican wraps ? ?,"[?, salad, tradamerican, wraps]","[?, salad, tradamerican, wraps]"
90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...,"[?, bakeries, breakfast_brunch, burgers, cater...","[?, bakeries, breakfast_brunch, burgers, cater..."
90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...,"[?, breakfast_brunch, chinese, coffee, dessert...","[?, breakfast_brunch, chinese, coffee, dessert..."
90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...,"[?, bakeries, bbq, breakfast_brunch, burgers, ...","[?, bakeries, bbq, breakfast_brunch, burgers, ..."
90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...,"[?, asianfusion, bakeries, bbq, beer_and_wine,...","[?, asianfusion, bakeries, bbq, beer_and_wine,..."


In [102]:
categories_df['cat_list'][5]

array(['?', 'argentine', 'asianfusion', 'bakeries', 'bars', 'bbq',
       'beerbar', 'breakfast_brunch', 'bubbletea', 'burgers', 'cafes',
       'chicken_wings', 'chickenshop', 'chinese', 'cocktailbars',
       'coffee', 'desserts', 'donuts', 'empanadas', 'foodtrucks',
       'hotdogs', 'hotpot', 'icecream', 'italian', 'izakaya', 'japanese',
       'juicebars', 'karaoke', 'korean', 'latin', 'lounges',
       'newamerican', 'noodles', 'peruvian', 'ramen', 'salvadoran',
       'sandwiches', 'shavedice', 'soup', 'sportsbars', 'steak', 'sushi',
       'tapasmallplates', 'thai', 'vietnamese'], dtype='<U16')

In [103]:
cat_df = categories_df['cat_list'].apply(categorize)

In [104]:
cat_df

zip_code
90000    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90001    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90002    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90003    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90004    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90005    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90006    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90007    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90008    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90010    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90011    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90012    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90013    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90014    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90015    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90016    {'?': 1, 'salad': 0, 'tradamerican': 0, 'wraps...
90017    {'?': 1, 'salad': 0, 'tradamerican': 0

In [105]:
df = pd.DataFrame.from_dict(cat_df)

In [106]:
df['cat_list'][0]

{'?': 1,
 'salad': 0,
 'tradamerican': 0,
 'wraps': 0,
 'bakeries': 0,
 'breakfast_brunch': 1,
 'burgers': 1,
 'catering': 0,
 'chinese': 0,
 'chocolate': 0,
 'comfortfood': 0,
 'cupcakes': 0,
 'customcakes': 0,
 'desserts': 0,
 'foodtrucks': 0,
 'gelato': 0,
 'herbsandspices': 0,
 'hotdog': 0,
 'icecream': 0,
 'japanese': 0,
 'juicebars': 0,
 'mexican': 0,
 'pizza': 0,
 'salvadoran': 0,
 'sandwiches': 0,
 'seafood': 0,
 'seafoodmarkets': 0,
 'streetvendors': 0,
 'tacos': 0,
 'vegan': 0,
 'vietnamese': 0,
 'coffee': 1,
 'bbq': 0,
 'fishnchips': 0,
 'foodstands': 0,
 'hotdogs': 1,
 'latin': 0,
 'noodles': 0,
 'soulfood': 0,
 'waffles': 0,
 'asianfusion': 0,
 'beer_and_wine': 0,
 'cafes': 0,
 'cakeshop': 0,
 'cheese': 0,
 'colombian': 0,
 'gastropubs': 0,
 'german': 0,
 'gluten_free': 0,
 'italian': 0,
 'korean': 0,
 'newamerican': 1,
 'soup': 0,
 'sushi': 0,
 'tea': 0,
 'thai': 0,
 'vegetarian': 0,
 'argentine': 0,
 'bars': 1,
 'beerbar': 0,
 'bubbletea': 0,
 'chicken_wings': 0,
 'chick

In [98]:
df['cat_list'][10]

{'?': 1,
 'salad': 0,
 'tradamerican': 0,
 'wraps': 0,
 'bakeries': 0,
 'breakfast_brunch': 1,
 'burgers': 1,
 'catering': 0,
 'chinese': 0,
 'chocolate': 0,
 'comfortfood': 0,
 'cupcakes': 0,
 'customcakes': 0,
 'desserts': 0,
 'foodtrucks': 0,
 'gelato': 0,
 'herbsandspices': 0,
 'hotdog': 0,
 'icecream': 0,
 'japanese': 0,
 'juicebars': 0,
 'mexican': 0,
 'pizza': 0,
 'salvadoran': 0,
 'sandwiches': 0,
 'seafood': 0,
 'seafoodmarkets': 0,
 'streetvendors': 0,
 'tacos': 0,
 'vegan': 0,
 'vietnamese': 0,
 'coffee': 1,
 'bbq': 0,
 'fishnchips': 0,
 'foodstands': 0,
 'hotdogs': 1,
 'latin': 0,
 'noodles': 0,
 'soulfood': 0,
 'waffles': 0,
 'asianfusion': 0,
 'beer_and_wine': 0,
 'cafes': 0,
 'cakeshop': 0,
 'cheese': 0,
 'colombian': 0,
 'gastropubs': 0,
 'german': 0,
 'gluten_free': 0,
 'italian': 0,
 'korean': 0,
 'newamerican': 1,
 'soup': 0,
 'sushi': 0,
 'tea': 0,
 'thai': 0,
 'vegetarian': 0,
 'argentine': 0,
 'bars': 1,
 'beerbar': 0,
 'bubbletea': 0,
 'chicken_wings': 0,
 'chick