In [3]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from ydc.tools import import_data, distances
from ydc.tools.supercats import add_supercats
from simplekml import Kml, Style
from colorsys import hsv_to_rgb
import matplotlib.pylab as pylab
pylab.rcParams['figure.figsize'] = 12, 8  # that's default image size for this interactive session

from ydc.tools.cache import cache_result

In [4]:
businesses = import_data.import_businesses()

In [5]:
businesses['price'] = businesses.attributes.apply(lambda row: row.get('Price Range'))

In [6]:
(supercats_frame, names) = add_supercats(businesses)

In [7]:
cells = distances.CellCollection(15, businesses)

In [8]:
def category_neighbourhoods(businesses):
    # Map Categories in Neighbourhoods together
    cat_map = pd.DataFrame()
    for supercat in names.keys():
        this_supercategory = supercats_frame[businesses['super_category'] == supercat]
        super_cat_name = names[supercat]['name']
        for subcat in names[supercat]['sub_categories'].keys():
            cats = {} # dict of category-mappings for items of category
            this_subcategory = businesses[businesses['sub_category'] == subcat]
            sub_cat_name = super_cat_name + "/" + names[supercat]['sub_categories'][subcat]['name']
            print('analyzing {}'.format(sub_cat_name))
            for idx, in_item in this_subcategory.iterrows():
                neighbours = cells.get_neighbours(in_item, num=3)
                index = pd.DataFrame(neighbours)
                # get indizes of neighbours
                index = index['index'].tolist()
                for idx, neighbour in supercats_frame.loc[index].iterrows():
                    neighbour_cat = (names[int(neighbour['super_category'])]['name'] + "/"
                        + names[int(neighbour['super_category'])]['sub_categories'][int(neighbour['sub_category'])]['name'])
                    # we simply count the occurences
                    cats[neighbour_cat] = cats.get(neighbour_cat, 0) + 1
            # add to the map of categories
            if sub_cat_name in list(cat_map.columns.values):
                cat_map[sub_cat_name] = cat_map[sub_cat_name] + pd.Series(cats)
            else:
                cat_map[sub_cat_name] = pd.Series(cats)
    return cat_map

def neighbourhoods(businesses):
    cats = {}
    for idx, in_item in businesses.iterrows():
        neighbours = cells.get_neighbours(in_item, num=3)
        index = pd.DataFrame(neighbours)
        # get indizes of neighbours
        index = index['index'].tolist()
        for idx, neighbour in supercats_frame.loc[index].iterrows():
            neighbour_cat = (names[int(neighbour['super_category'])]['name'] + "/"
                + names[int(neighbour['super_category'])]['sub_categories'][int(neighbour['sub_category'])]['name'])
            # we simply count the occurences
            if neighbour_cat not in cats.keys():
                cats[neighbour_cat] = {'count': 0, 'stars': [], 'review_count': 0}
            cats[neighbour_cat]['count'] = cats[neighbour_cat]['count'] + 1
            cats[neighbour_cat]['stars'].append(float(neighbour['stars']))
            cats[neighbour_cat]['review_count'] = cats[neighbour_cat]['review_count'] + int(neighbour['review_count'])
    # normalize data:
    for cat, data in cats.items():
        # average stars per row-category
        data['stars'] = np.mean(data['stars'])
        # review_count is average review-number per business
        data['review_count'] = data['review_count'] / data['count']
    cats = pd.DataFrame(cats).T
    return cats['count'], cats['stars'], cats['review_count']

In [9]:
# for performane: filter the category-names for the following string:
filter = 'Restaurant'

# quality-measures for which we devide the group of businesses per sub-category
measures = ['stars']

# produce names in the form <supercat>/<subcat> for columns and rows
columns = []
rows = []
for idx, super_cat in names.items():
    for idx, sub_cat in super_cat['sub_categories'].items():
        sub_cat_name = super_cat['name'] + "/" + sub_cat['name']
        rows.append(sub_cat_name)
        if filter not in sub_cat_name:
            continue
        columns.append(sub_cat_name)

# make hierarchical index
qualities = ('good', 'bad')
fields = ('count', 'stars', 'review_count')
liste = list(set([(c, q, f) for c in columns for f in fields for q in qualities]))
liste.sort()
idx = pd.MultiIndex.from_tuples(liste, names=('type', 'quality', 'field'))

# analyze and put it all together into a DataFrame
differencies = pd.DataFrame(columns=idx, index=rows)
for super_cat in names.keys():
    this_super_cat = supercats_frame[supercats_frame['super_category'] == super_cat]
    super_cat_name = names[super_cat]['name']
    for sub_cat in names[super_cat]['sub_categories'].keys():
        this_sub_cat = this_super_cat[this_super_cat['sub_category'] == sub_cat]
        sub_cat_name = super_cat_name + "/" + names[super_cat]['sub_categories'][sub_cat]['name']
        if filter not in sub_cat_name:
            continue
        print('analyzing {}'.format(sub_cat_name))
        for measure in measures:
            mean = this_sub_cat[measure].mean()
            std = this_sub_cat[measure].std()
            best_decade = neighbourhoods(this_sub_cat[this_sub_cat[measure] > (mean + 1.4 * std)])
            worst_decade = neighbourhoods(this_sub_cat[this_sub_cat[measure] < (mean - 1.4 * std)])
            i = 0
            for field in fields:
                differencies[sub_cat_name, 'good', field] = worst_decade[i]
                differencies[sub_cat_name, 'bad', field] = best_decade[i]
                i = i + 1
                
# normalize the 'count'-columns with the sum of all neighbours
for category in set(differencies.columns.get_level_values(0)):
    for quality in set(differencies.columns.get_level_values(1)):
        differencies[category, quality, 'count'] = differencies[category, quality, 'count'] \
                                                   / differencies[category, quality, 'count'].sum()

analyzing Restaurants/Nightlife
analyzing Restaurants/Mediterranean
analyzing Restaurants/Chinese
analyzing Restaurants/American (Traditional)
analyzing Restaurants/Fast Food
analyzing Restaurants/Uncategorized


In [10]:
differencies['Restaurants/American (Traditional)'].sort([('good', 'count')], ascending=False)

quality,bad,bad,bad,good,good,good
field,count,review_count,stars,count,review_count,stars
Restaurants/Nightlife,0.149907,63.21095,3.570412,0.130112,94.929884,3.485019
Restaurants/Fast Food,0.099328,54.441987,3.451963,0.121612,63.991904,3.315268
Restaurants/American (Traditional),0.104692,74.410288,3.640535,0.099186,104.221232,3.485312
Food/Coffee & Tea,0.090108,26.266555,3.877002,0.082407,33.485979,3.708608
Restaurants/Chinese,0.061027,63.409813,3.429403,0.068422,94.813803,3.348018
Shopping/Fashion,0.064237,12.597921,3.71613,0.065769,16.316529,3.735564
Event Planning & Services/Arts & Entertainment,0.045216,82.954264,3.870414,0.055039,112.92406,3.807411
Event Planning & Services/Hotels & Travel,0.045,50.697942,3.571326,0.048227,89.525417,3.491667
Beauty & Spas/Hair Salons,0.04123,15.822362,4.068704,0.047785,17.761985,3.965097
Restaurants/Mediterranean,0.048318,35.040571,3.664512,0.040028,44.552711,3.545683


In [11]:
def difference_norm(differencies, field):
    ret = {}
    for category in set(differencies.columns.get_level_values(0)):
        difference = ((differencies[category, 'good', field] * differencies[category, 'good', 'count']) 
                      - (differencies[category, 'bad', field] * differencies[category, 'bad', 'count']) 
                     ).dropna()
        ret[category] = np.linalg.norm(difference) / (difference.max() - difference.min())
    return ret
        
print('stars:')
difference_stars = difference_norm(differencies, 'stars')
for k, v in difference_stars.items():
    print('{}: {}'.format(k, v))
print('review_count:')
difference_reviews = difference_norm(differencies, 'review_count')
for k, v in difference_reviews.items():
    print('{}: {}'.format(k, v))

stars:
Restaurants/American (Traditional): 0.956000372376922
Restaurants/Chinese: 0.9166427885578194
Restaurants/Fast Food: 1.0483498227209995
Restaurants/Nightlife: 0.8739806704499574
Restaurants/Mediterranean: 1.1972890100445082
Restaurants/Uncategorized: 0.8555856217971244
review_count:
Restaurants/American (Traditional): 2.1172932502198147
Restaurants/Chinese: 1.608545750381763
Restaurants/Fast Food: 1.9911322122100656
Restaurants/Nightlife: 0.7920241052559486
Restaurants/Mediterranean: 1.036049398918637
Restaurants/Uncategorized: 1.2180533183532727


In [43]:
means = supercats_frame.groupby(['super_category', 'sub_category'])['stars'].mean()
stds = supercats_frame.groupby(['super_category', 'sub_category'])['stars'].std()
factor = 1.4

def threshold(row):
    sup = row['super_category']
    sub = row['sub_category']
    if row['stars'] > means[(sup,sub)] + factor * stds[(sup,sub)]:
        return "good"
    elif row['stars'] < means[(sup,sub)] - factor * stds[(sup,sub)]:
        return "bad"
    else:
        return "semi"

In [44]:
frame = supercats_frame.copy(deep=True)
frame['quality'] = frame.apply(lambda row: threshold(row), axis=1)

In [158]:
cat_keys = []
for sup in names:
    s_names = names[sup]['sub_categories']
    for sub in s_names:
        if s_names[sub] == []:
            continue
        cat_keys.append((sup, sub))

In [188]:
catframe = frame.loc[(frame['quality']=="good"),:].groupby(['super_category', 'sub_category'])['name']
cell_col = distances.CellCollection(15, businesses)


feat_frame = supercats_frame[['business_id', 'super_category', 'sub_category', 'stars']].copy(deep=True)
for key in cat_keys:
    feat_frame[str(key)] = np.nan

cat_str_keys = [str(key) for key in cat_keys]

In [170]:
for idx_list in catframe.groups.values():
    break

In [194]:
func_frame = supercats_frame[['super_category', 'sub_category']].copy(deep=True)
n = 25
# cells need to be defined (baaad, i know)

cat_keys = []
for sup in names:
    s_names = names[sup]['sub_categories']
    for sub in s_names:
        if s_names[sub] == []:
            continue
        cat_keys.append((sup, sub))

def mean_list(l):
    if l==[]:
        return np.nan
    else:
        return np.mean(l)
        
def average_dist(business_row):
    work = {key: [] for key in cat_keys}
    neighbours = cells.get_neighbours(business_row, n)
    for item in neighbours:
        key = (func_frame.loc[item['index'], 'super_category'], func_frame.loc[item['index'], 'sub_category'])
        work[key].append(item['distance'])

    results = {key: mean_list(work[key]) for key in work}
        
    return results

In [195]:
df = supercats_frame.copy(deep=True)

features = {}

for supercat in supercats

for idx, row in df.iterrows():
    features[idx] = average_dist(row)
    break

In [160]:
timeit (frame.loc[10, 'super_category'], frame.loc[10, 'sub_category'])

1000 loops, best of 3: 439 µs per loop


In [196]:
features

{0: {(-1, -1): nan,
  (0, -1): nan,
  (0, 0): nan,
  (0, 1): 0.67902884395379015,
  (0, 2): 0.49003752480244334,
  (1, -1): nan,
  (1, 0): 0.45676232503408837,
  (1, 1): nan,
  (1, 2): nan,
  (1, 3): nan,
  (1, 4): 0.51442493232666697,
  (2, -1): nan,
  (2, 0): 0.66996833140903655,
  (2, 1): 0.55966562451771706,
  (2, 2): nan,
  (2, 3): 0.55702225202547784,
  (3, -1): nan,
  (3, 0): nan,
  (3, 1): nan,
  (3, 2): 0.50393983831777933,
  (3, 3): nan,
  (4, -1): nan,
  (4, 0): nan,
  (4, 1): nan,
  (4, 2): 0.55235926013443137,
  (4, 3): 0.49494763284507126,
  (4, 4): nan,
  (4, 5): 0.67193633334684355,
  (5, -1): 0.59215754804602527,
  (5, 0): 0.55296315911582494,
  (5, 1): 0.67902884395379015,
  (5, 2): nan,
  (5, 3): 0.61437687382929929,
  (5, 4): 0.45050228169743334,
  (6, -1): nan,
  (6, 0): nan,
  (6, 1): nan,
  (6, 2): 0.62994457119833747,
  (6, 3): 0.58733819142387667,
  (6, 4): 0.59215754804602527,
  (7, -1): nan,
  (7, 0): nan,
  (7, 1): nan,
  (7, 2): nan,
  (7, 3): nan,
  (7, 4)

In [None]:
zip(*df_r.apply(lambda row: busi_dict[row['business_id']], axis=1))