In [4]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from ydc.tools import import_data, distances
from ydc.tools.supercats import add_supercats
from simplekml import Kml, Style
from colorsys import hsv_to_rgb
import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = 12, 8  # that's default image size for this interactive session

In [13]:
businesses = import_data.import_businesses(status=True)

Successfully imported businesses with columns ['attributes' 'business_id' 'categories' 'city' 'full_address' 'hours'
 'latitude' 'longitude' 'name' 'neighborhoods' 'open' 'review_count'
 'stars' 'state' 'type']


In [5]:
businesses['price'] = businesses.attributes.apply(lambda row: row.get('Price Range'))

In [14]:
(supercats_frame, names) = add_supercats(businesses)

In [15]:
cells = distances.CellCollection(15, businesses)

In [112]:
def category_neighbourhoods(businesses):
    # Map Categories in Neighbourhoods together
    cat_map = pd.DataFrame()
    for supercat in names.keys():
        this_supercategory = supercats_frame[businesses['super_category'] == supercat]
        super_cat_name = names[supercat]['name']
        for subcat in names[supercat]['sub_categories'].keys():
            cats = {} # dict of category-mappings for items of category
            this_subcategory = businesses[businesses['sub_category'] == subcat]
            sub_cat_name = super_cat_name + "/" + names[supercat]['sub_categories'][subcat]['name']
            print('analyzing {}'.format(sub_cat_name))
            for idx, in_item in this_subcategory.iterrows():
                neighbours = cells.get_neighbours(in_item, num=3)
                index = pd.DataFrame(neighbours)
                # get indizes of neighbours
                index = index['index'].tolist()
                for idx, neighbour in supercats_frame.loc[index].iterrows():
                    neighbour_cat = (names[int(neighbour['super_category'])]['name'] + "/"
                        + names[int(neighbour['super_category'])]['sub_categories'][int(neighbour['sub_category'])]['name'])
                    # we simply count the occurences
                    cats[neighbour_cat] = cats.get(neighbour_cat, 0) + 1
            # add to the map of categories
            if sub_cat_name in list(cat_map.columns.values):
                cat_map[sub_cat_name] = cat_map[sub_cat_name] + pd.Series(cats)
            else:
                cat_map[sub_cat_name] = pd.Series(cats)
    return cat_map

def neighbourhoods(businesses):
    cats = {}
    for idx, in_item in businesses.iterrows():
        neighbours = cells.get_neighbours(in_item, num=3)
        index = pd.DataFrame(neighbours)
        # get indizes of neighbours
        index = index['index'].tolist()
        for idx, neighbour in supercats_frame.loc[index].iterrows():
            neighbour_cat = (names[int(neighbour['super_category'])]['name'] + "/"
                + names[int(neighbour['super_category'])]['sub_categories'][int(neighbour['sub_category'])]['name'])
            # we simply count the occurences
            if neighbour_cat not in cats.keys():
                cats[neighbour_cat] = {'count': 0, 'stars': [], 'review_count': 0}
            cats[neighbour_cat]['count'] = cats[neighbour_cat]['count'] + 1
            cats[neighbour_cat]['stars'].append(float(neighbour['stars']))
            cats[neighbour_cat]['review_count'] = cats[neighbour_cat]['review_count'] + int(neighbour['review_count'])
    # normalize data:
    for cat, data in cats.items():
        # average stars per row-category
        data['stars'] = np.mean(data['stars'])
        # review_count is average review-number per business
        data['review_count'] = data['review_count'] / data['count']
    cats = pd.DataFrame(cats).T
    return cats['count'], cats['stars'], cats['review_count']

In [120]:
# for performane: filter the category-names for the following string:
filter = 'Restaurant'

# quality-measures for which we devide the group of businesses per sub-category
measures = ['stars']

# produce names in the form <supercat>/<subcat> for columns and rows
columns = []
rows = []
for idx, super_cat in names.items():
    for idx, sub_cat in super_cat['sub_categories'].items():
        sub_cat_name = super_cat['name'] + "/" + sub_cat['name']
        rows.append(sub_cat_name)
        if filter not in sub_cat_name:
            continue
        columns.append(sub_cat_name)

# make hierarchical index
qualities = ('good', 'bad')
fields = ('count', 'stars', 'review_count')
liste = list(set([(c, q, f) for c in columns for f in fields for q in qualities]))
liste.sort()
idx = pd.MultiIndex.from_tuples(liste, names=('type', 'quality', 'field'))

# analyze and put it all together into a DataFrame
differencies = pd.DataFrame(columns=idx, index=rows)
for super_cat in names.keys():
    this_super_cat = supercats_frame[supercats_frame['super_category'] == super_cat]
    super_cat_name = names[super_cat]['name']
    for sub_cat in names[super_cat]['sub_categories'].keys():
        this_sub_cat = this_super_cat[this_super_cat['sub_category'] == sub_cat]
        sub_cat_name = super_cat_name + "/" + names[super_cat]['sub_categories'][sub_cat]['name']
        if filter not in sub_cat_name:
            continue
        print('analyzing {}'.format(sub_cat_name))
        for measure in measures:
            mean = this_sub_cat[measure].mean()
            std = this_sub_cat[measure].std()
            best_decade = neighbourhoods(this_sub_cat[this_sub_cat[measure] > (mean + 1.4 * std)])
            worst_decade = neighbourhoods(this_sub_cat[this_sub_cat[measure] < (mean - 1.4 * std)])
            i = 0
            for field in fields:
                differencies[sub_cat_name, 'good', field] = worst_decade[i]
                differencies[sub_cat_name, 'bad', field] = best_decade[i]
                i = i + 1
                
# normalize the 'count'-columns with the sum of all neighbours
for category in set(differencies.columns.get_level_values(0)):
    for quality in set(differencies.columns.get_level_values(1)):
        differencies[category, quality, 'count'] = differencies[category, quality, 'count'] \
                                                   / differencies[category, quality, 'count'].sum()

analyzing Restaurants/American (Traditional)
analyzing Restaurants/Mediterranean
analyzing Restaurants/Nightlife
analyzing Restaurants/Chinese
analyzing Restaurants/Fast Food
analyzing Restaurants/Uncategorized


In [150]:
differencies['Restaurants/American (Traditional)'].sort([('good', 'count')], ascending=False)

quality,bad,bad,bad,good,good,good
field,count,review_count,stars,count,review_count,stars
Restaurants/Fast Food,0.114577,30.893297,3.515048,0.124605,66.894862,3.340642
Restaurants/Nightlife,0.132915,39.643868,3.568396,0.124424,91.237169,3.483466
Restaurants/American (Traditional),0.138401,39.591166,3.67214,0.117808,115.063286,3.5095
Shopping/Fashion,0.052194,9.486486,3.696697,0.068219,16.597346,3.726538
Food/Coffee & Tea,0.067555,21.331787,3.941995,0.068038,26.846638,3.674528
Restaurants/Chinese,0.065204,30.966346,3.512019,0.065816,95.772943,3.346587
Event Planning & Services/Arts & Entertainment,0.037931,47.260331,3.902893,0.058311,113.507762,3.800734
Event Planning & Services/Hotels & Travel,0.041066,24.083969,3.55916,0.05,96.778473,3.500658
Beauty & Spas/Hair Salons,0.043417,12.99278,4.027076,0.048025,18.567169,3.960932
Restaurants/Mediterranean,0.057053,29.197802,3.635989,0.037591,52.951839,3.556042


In [155]:
def difference_norm(differencies, field):
    ret = {}
    for category in set(differencies.columns.get_level_values(0)):
        difference = ((differencies[category, 'good', field] * differencies[category, 'good', 'count']) 
                      - (differencies[category, 'bad', field] * differencies[category, 'bad', 'count']) 
                     ).dropna()
        ret[category] = np.linalg.norm(difference) / (difference.max() - difference.min())
    return ret
        
print('stars:')
difference_stars = difference_norm(differencies, 'stars')
for k, v in difference_stars.items():
    print('{}: {}'.format(k, v))
print('review_count:')
difference_reviews = difference_norm(differencies, 'review_count')
for k, v in difference_reviews.items():
    print('{}: {}'.format(k, v))

stars:
Restaurants/Nightlife: 0.9842064601192956
Restaurants/Fast Food: 1.0114247968029064
Restaurants/Uncategorized: 0.8998228815196052
Restaurants/Chinese: 0.9322503525496548
Restaurants/Mediterranean: 1.1447921681207325
Restaurants/American (Traditional): 1.0334033307949162
review_count:
Restaurants/Nightlife: 0.9454613370940755
Restaurants/Fast Food: 1.910570212498913
Restaurants/Uncategorized: 1.023609474287029
Restaurants/Chinese: 1.673282909979908
Restaurants/Mediterranean: 1.2955103086353834
Restaurants/American (Traditional): 1.6498835875643603
