# Caracterização das categorias

- https://www.yelp.com/developers/documentation/v3/all_category_list
- https://www.yelp.com/developers/documentation/v3/all_category_list/categories.json
- https://www.yelp.com/developers/documentation/v3/category

Problemas:
- Existem categorias nos dados dos negócios que não constam da lista oficial de categorias (por exemplo, "Ethnic Food").

<table class="table table-bordered">
            <thead>
                <tr>
                    <th>Name</th>
                    <th>Type</th>
                    <th>Description</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>alias</td>
                    <td>string</td>
                    <td>Category alias.</td>
                </tr>
                <tr>
                    <td>title</td>
                    <td>string</td>
                    <td>Title of this category.</td>
                </tr>
                <tr>
                    <td>parent_aliases</td>
                    <td>string[]</td>
                    <td>List of aliases of parent categories.</td>
                </tr>
                <tr>
                    <td>country_whitelist</td>
                    <td>string[]</td>
                    <td>Countries for which this category is whitelisted.</td>
                </tr>
                <tr>
                    <td>country_blacklist</td>
                    <td>string[]</td>
                    <td>Countries for which this category is blacklisted.</td>
                </tr>
            </tbody>
        </table>

In [1]:

import pandas as pd

# Load the category list from the local JSON file into a DataFrame.
df=pd.read_json("../data/categories.json")

In [2]:
# Preview the first rows of the category table.
df.head()

Unnamed: 0,alias,title,parents,country_whitelist,country_blacklist
0,3dprinting,3D Printing,[localservices],,
1,abruzzese,Abruzzese,[italian],[IT],
2,absinthebars,Absinthe Bars,[bars],[CZ],
3,acaibowls,Acai Bowls,[food],,"[MX, IT, PL, CL, TR, AR]"
4,accessories,Accessories,[fashion],,


In [3]:
# Two-way lookup kept in a single dictionary: alias -> title and title -> alias.
dict_alias_title = {}
for alias, title in zip(df['alias'], df['title']):
    dict_alias_title[alias] = title
    dict_alias_title[title] = alias

In [4]:
# criação da árvore de categorias

import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

# Build the category tree as a directed graph whose edges point from a
# category to each of its parents; a category with no parents is attached to
# a synthetic 'root' node instead.
category_tree = nx.DiGraph()
for alias, parents in zip(df['alias'], df['parents']):
    if parents:
        for parent_label in parents:
            category_tree.add_edge(alias, parent_label)
    else:
        category_tree.add_edge(alias, 'root')


In [5]:
# Number of edges between two categories when edge direction is ignored.
# A result of 2 is consistent with the two categories sharing a parent.
nx.shortest_path_length(category_tree.to_undirected(),'brazilianjiujitsu','chinesemartialarts')

2

# depth filtering

In [6]:
# Depth of a category: number of child -> parent edges on the shortest
# directed path from the category to the 'root' node.
nx.shortest_path_length(category_tree,'beaches','root')

2

In [None]:
# Depth of every category alias = length of the shortest child -> parent path
# to the 'root' node. Calling shortest_path_length with only `target` set
# returns, in one pass, the distance from every node that can reach 'root',
# instead of running a separate graph search for each row.
depth_to_root = nx.shortest_path_length(category_tree, target='root')
dict_alias_depth = dict()
for alias in df['alias']:
    if alias not in depth_to_root:
        # Keep the failure the per-alias query produces when a parent chain
        # never reaches 'root' (e.g. a parent alias that is not itself listed).
        raise nx.NetworkXNoPath("No path between %s and root." % alias)
    dict_alias_depth[alias] = depth_to_root[alias]

In [44]:
def string_to_array(string):
    """Parse the text of a Python list literal into a list.

    Used as a ``read_csv`` converter for the 'categories' column, whose cells
    hold text such as ``"['Golf', 'Active Life']"``.

    Parameters
    ----------
    string : str
        Cell text containing a Python literal.

    Returns
    -------
    list
        The parsed value; an empty list when the cell is blank.

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a valid Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    # A blank cell means "no categories" rather than a parse error.
    if not string.strip():
        return []
    # SECURITY: the text comes from an external CSV file. eval() would execute
    # any expression stored in the file; literal_eval only accepts literals
    # (lists, strings, numbers, ...), which is all this column needs.
    return ast.literal_eval(string)
# Load a 5-row sample of the business table. The first CSV column becomes the
# index and the 'categories' text is parsed by string_to_array while reading.
df_business = pd.read_csv(
    "../data/business.csv",
    nrows=5,
    index_col=0,
    converters={'categories': string_to_array},
)

Unnamed: 0,business_id,name,city,latitude,longitude,stars,categories
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,Phoenix,33.522143,-112.018481,3.0,"[Golf, Active Life]"
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,Mississauga,43.605499,-79.652289,2.5,"[Specialty Food, Restaurants, Dim Sum, Importe..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,Charlotte,35.092564,-80.859132,4.0,"[Sushi Bars, Restaurants, Japanese]"
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,Goodyear,33.455613,-112.395596,5.0,"[Insurance, Financial Services]"
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,Charlotte,35.190012,-80.887223,4.0,"[Plumbing, Shopping, Local Services, Home Serv..."


In [46]:
# Second category of the first business: getting a whole title back (not a
# single character) confirms the converter produced lists, not raw strings.
df_business.categories[0][1]

'Active Life'

In [47]:
# Preview of the depth filter: for each business print its categories, then
# only the ones whose depth below 'root' is at least 2.
for index, row in df_business.iterrows():
    tmp_cat_list = list()
    print(row['categories'])
    for category in row['categories']:
        try:
            # title -> alias -> depth; either lookup can miss.
            if dict_alias_depth[dict_alias_title[category]] >= 2:
                tmp_cat_list.append(category)
        except KeyError:
            # Only a missing dictionary entry is expected here; anything else
            # (including KeyboardInterrupt) should not be swallowed.
            print("Error: Category \""+category+"\" without alias name.")
    print(tmp_cat_list)
            

['Golf', 'Active Life']
['Golf']
['Specialty Food', 'Restaurants', 'Dim Sum', 'Imported Food', 'Food', 'Chinese', 'Ethnic Food', 'Seafood']
Error: Category "Ethnic Food" without alias name.
['Specialty Food', 'Dim Sum', 'Imported Food', 'Chinese', 'Seafood']
['Sushi Bars', 'Restaurants', 'Japanese']
['Sushi Bars', 'Japanese']
['Insurance', 'Financial Services']
['Insurance']
['Plumbing', 'Shopping', 'Local Services', 'Home Services', 'Kitchen & Bath', 'Home & Garden', 'Water Heater Installation/Repair']
['Plumbing', 'Kitchen & Bath', 'Home & Garden', 'Water Heater Installation/Repair']


In [54]:
def category_filter(categories, min_depth=2):
    """Keep only the category titles that sit deep enough in the category tree.

    Each title is translated to its alias through ``dict_alias_title`` and the
    alias depth is read from ``dict_alias_depth``. Titles missing from either
    dictionary are reported on stdout and dropped.

    Parameters
    ----------
    categories : iterable of str
        Category titles of one business.
    min_depth : int, optional
        Minimum depth (distance to 'root') a category needs to be kept.
        Defaults to 2.

    Returns
    -------
    list of str
        The titles from ``categories`` with depth >= ``min_depth``, in their
        original order.
    """
    tmp_cat_list = list()
    # The before/after lists are printed so the filtering can be inspected.
    print(categories)
    for category in categories:
        try:
            if dict_alias_depth[dict_alias_title[category]] >= min_depth:
                tmp_cat_list.append(category)
        except KeyError:
            # Only a missing dictionary entry is expected here; anything else
            # (including KeyboardInterrupt) should not be swallowed.
            print("Error: Category \""+category+"\" without alias name.")
    print(tmp_cat_list)
    return tmp_cat_list


# Replace each business's category list with its depth-filtered version.
# category_filter prints the list before and after filtering for every row.
df_business.categories=df_business.categories.apply(category_filter)

['Golf', 'Active Life']
['Golf']
['Specialty Food', 'Restaurants', 'Dim Sum', 'Imported Food', 'Food', 'Chinese', 'Ethnic Food', 'Seafood']
Error: Category "Ethnic Food" without alias name.
['Specialty Food', 'Dim Sum', 'Imported Food', 'Chinese', 'Seafood']
['Sushi Bars', 'Restaurants', 'Japanese']
['Sushi Bars', 'Japanese']
['Insurance', 'Financial Services']
['Insurance']
['Plumbing', 'Shopping', 'Local Services', 'Home Services', 'Kitchen & Bath', 'Home & Garden', 'Water Heater Installation/Repair']
['Plumbing', 'Kitchen & Bath', 'Home & Garden', 'Water Heater Installation/Repair']


In [55]:
# Display the business sample after its categories were filtered by depth.
df_business

Unnamed: 0,business_id,name,city,latitude,longitude,stars,categories
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,Phoenix,33.522143,-112.018481,3.0,[Golf]
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,Mississauga,43.605499,-79.652289,2.5,"[Specialty Food, Dim Sum, Imported Food, Chine..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,Charlotte,35.092564,-80.859132,4.0,"[Sushi Bars, Japanese]"
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,Goodyear,33.455613,-112.395596,5.0,[Insurance]
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,Charlotte,35.190012,-80.887223,4.0,"[Plumbing, Kitchen & Bath, Home & Garden, Wate..."
