In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
sns.set_style("darkgrid")
import math

In [2]:
# Read the two input tables from the data directory.
# sep="," is already the read_csv default; it is kept explicit.
data_path = "data/"
places_csv = data_path + "places_final.csv"
reviews_csv = data_path + "reviews_final.csv"

places = pd.read_csv(places_csv, sep=",")
reviews = pd.read_csv(reviews_csv, sep=",")

## Dataframe preparation

To create a dataframe which can be used for space embedding in London and New York, the features for both cities are aggregated on a grid cell level. 

### Business per grid cell

Counting all businesses per grid cell

In [3]:
# Number of places per grid cell: count the place ids inside each 'Grid' group and
# return a two-column frame (Grid, PlaceCount).
df_grid = (
    places.groupby('Grid')['gPlusPlaceId']
    .count()
    .reset_index(name='PlaceCount')
)

Finding the average price range of the businesses in each grid cell

In [4]:
# Mean of the 'price' column within each grid cell.
# The column is selected *before* aggregating so that only 'price' is averaged.
# Calling .mean() on the whole grouped frame also tries to average every other
# column and raises a TypeError on non-numeric columns in pandas >= 2.0.
price = places.groupby('Grid')['price'].mean().reset_index()

# Inner merge on 'Grid' (the default `how`). A cell whose places all lack a price
# keeps NaN in 'price'.
df_grid = df_grid.merge(price, on='Grid')

Counting the number of businesses per category per grid cell

In [5]:
# Places per (grid cell, category), reshaped to one row per grid cell and one column
# per category. Grid/category combinations that never occur are filled with 0.
cat_cols = (
    places.groupby(['Grid', 'category'])['gPlusPlaceId']
    .count()
    .reset_index()
)
cat_cols = cat_cols.pivot_table(
    index='Grid',
    columns='category',
    values='gPlusPlaceId',
).fillna(0)

In [6]:
# Attach the per-category counts. cat_cols is indexed by Grid, so df_grid's 'Grid'
# column is matched against that index (left join, the DataFrame.join default).
df_grid = df_grid.join(cat_cols, on='Grid', how='left')

In [7]:
# Sanity check: (number of grid cells, number of columns built so far).
df_grid.shape

(508, 15)

In [8]:
# Preview the first rows of the grid-cell frame.
df_grid.head()

Unnamed: 0,Grid,PlaceCount,price,Accommodation,Bar,Cafe,Cultural,Education,Health,Other,Outdoors,Restaurant,Retail,Service,Wholesale
0,L0,8,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,0.0,1.0,0.0
1,L1,10,2.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,2.0,0.0,2.0,1.0
2,L10,3,,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,L100,74,2.235294,12.0,6.0,6.0,0.0,1.0,4.0,16.0,0.0,17.0,4.0,7.0,1.0
4,L101,163,1.953488,21.0,19.0,20.0,1.0,1.0,10.0,19.0,0.0,37.0,21.0,9.0,5.0


### Rating

Finding number of reviews per cell

In [9]:
# Combine places with their reviews (full outer join on the place id), then drop
# every row that has no 'name'.
# NOTE(review): 'name' presumably comes from `places`, in which case this removes
# reviews without a matching place - confirm against the CSV schema.
#
# dropna leaves gaps in the row labels, so the index is rebuilt afterwards. Row
# labels then equal row positions (0..n-1), which any code addressing rows by
# integer label via range(len(mergeddf)) relies on.
mergeddf = (
    places.merge(reviews, on='gPlusPlaceId', how='outer')
    .dropna(subset=['name'])
    .reset_index(drop=True)
)

In [24]:
# NOTE(review): this cell carries execution count 24 but sits above the cell that
# defines `grid_reviews`. On a fresh kernel with "Run All" it raises NameError; it
# has to run after the groupby cell that follows.
grid_reviews

Unnamed: 0,Grid,gPlusUserId
0,L0,125
1,L1,31
2,L10,8
3,L100,853
4,L101,1330
...,...,...
503,NY86,599
504,NY87,131
505,NY88,47
506,NY98,9


In [10]:
# Reviews per grid cell: count the non-null user ids in each 'Grid' group.
# The resulting column keeps the name 'gPlusUserId' although it holds a count.
grid_reviews = (
    mergeddf.groupby('Grid')['gPlusUserId']
    .count()
    .reset_index()
)

# Inner merge on 'Grid' (the default `how`).
df_grid = df_grid.merge(grid_reviews, on='Grid')

Overall average rating per cell, and average per business type

When finding the average ratings, all businesses with 3 or fewer reviews are removed (only businesses with more than 3 reviews are kept), as these tend to have a very high average rating. 
A reason for this could be that friends and family give good ratings to newly opened businesses.

In [11]:
# Cast the ratings to float before they are averaged.
mergeddf['rating'] = mergeddf['rating'].astype(float)

In [12]:
# Per-business rating statistics: number of non-null ratings and their mean.
business_stats = mergeddf.groupby(['Grid', 'gPlusPlaceId'])['rating'].agg(["count", "mean"])

# Keep only businesses with more than 3 ratings (count > 3); businesses with
# 3 or fewer ratings are dropped.
avg_reviews = business_stats.loc[business_stats['count'] > 3, "mean"].reset_index()

# Average of the per-business means within each grid cell. The 'mean' column is
# selected before aggregating so the place-id column is not averaged as well;
# averaging the whole frame raises a TypeError on non-numeric columns in pandas >= 2.0.
avg_reviews = avg_reviews.groupby('Grid')['mean'].mean().reset_index()

# Inner merge: grid cells without any business above the threshold are dropped
# from df_grid here.
df_grid = df_grid.merge(avg_reviews, on='Grid')

In [13]:
# Per-business mean rating keyed by grid cell, category and place id.
# Only businesses with more than 3 ratings are kept (count > 3).
cat_stats = mergeddf.groupby(['Grid', 'category', 'gPlusPlaceId'])['rating'].agg(["count", "mean"])
cat_rev = cat_stats.loc[cat_stats['count'] > 3, "mean"].reset_index()

In [14]:
# Column label for each row's category average: "<category>_rating".
cat_val = [category + '_rating' for category in cat_rev['category']]

cat_rev['cat_val'] = cat_val

In [15]:
# One row per grid cell and one "<category>_rating" column per category.
# pivot_table's default aggregation (mean) averages the per-business means that share
# a grid cell and category; grid/category pairs with no qualifying business become 0.
# NOTE(review): a 0 here means "no rated business", not "rated 0" - keep that in mind
# when these columns are used as features.
category_rating = cat_rev.pivot_table(
    index='Grid',
    columns='cat_val',
    values='mean',
).fillna(0)

# Inner merge on 'Grid' (category_rating carries Grid as its index).
df_grid = df_grid.merge(category_rating, on='Grid')

### Sensitivity

Counting positive and negative reviews in a grid cell

In [16]:
# Flag every review row as positive and/or negative, depending on which of the two
# percentages is the larger one:
#   * positive = 1 when posReviewPercent >= negReviewPercent
#   * negative = 1 when negReviewPercent >= posReviewPercent
# A tie sets both flags. If either percentage is NaN, neither flag is set, because
# comparisons with NaN are False.
#
# The columns are compared as a whole instead of row by row. That avoids a Python
# loop over every row and does not depend on the row labels being exactly 0..n-1
# (a lookup like mergeddf[col][i] is label-based and fails on a gapped index).
# A 'neutral' flag based on 'midReviewPercent' is intentionally not computed.
pos_pct = mergeddf['posReviewPercent']
neg_pct = mergeddf['negReviewPercent']

# Kept as float arrays (0.0 / 1.0) so the columns can be summed per grid cell.
positive = (pos_pct >= neg_pct).to_numpy(dtype=float)
negative = (neg_pct >= pos_pct).to_numpy(dtype=float)

mergeddf['positive'] = positive
mergeddf['negative'] = negative

In [17]:
# Number of positive and negative flags per grid cell. The two flag columns are
# selected before summing so that no other column of mergeddf is aggregated
# (summing the whole frame would also "sum" every text column).
sensitivity = mergeddf.groupby('Grid')[['positive', 'negative']].sum().reset_index()

# Total number of flags (positive + negative) in each grid cell.
count = sensitivity[['positive', 'negative']].sum(axis=1)

# Share of positive / negative flags per grid cell. A cell without any flag
# yields NaN (0 / 0).
sensitivity['positive'] = sensitivity['positive'] / count
sensitivity['negative'] = sensitivity['negative'] / count

In [18]:
# Add the positive/negative shares to the grid-cell frame (inner merge on 'Grid').
df_grid=df_grid.merge(sensitivity,on='Grid')

Relative review length per grid cell (average length per cell divided by the average length of all reviews)

In [19]:
# Length of every review, measured as the number of pieces obtained by splitting
# the text on single spaces. Rows without review text count as length 0.
#
# Series.map with na_action='ignore' skips missing texts and works on the column as
# a whole, so it does not rely on row labels being 0..n-1 the way a
# mergeddf['reviewText'][i] lookup does.
review_len = (
    mergeddf['reviewText']
    .map(lambda text: len(text.split(" ")), na_action='ignore')
    .fillna(0)
    .astype(int)
)

mergeddf['review_len'] = review_len

# Mean length over all rows, including the zero-length rows for missing text.
avg_review_len = np.mean(review_len)

In [20]:
# Mean review length per grid cell. 'review_len' is selected before aggregating so
# that only this column is averaged; averaging the whole grouped frame raises a
# TypeError on non-numeric columns in pandas >= 2.0.
Grid_review_len = mergeddf.groupby('Grid')['review_len'].mean().reset_index()

# Express each cell's mean relative to the mean over all reviews
# (1.0 = same as the overall average).
Grid_review_len['review_len'] = Grid_review_len['review_len'] / avg_review_len

Final dataframe for grid cell aggregated features

In [21]:
# Append the relative review length (inner merge on 'Grid') and preview the
# finished grid-cell feature table.
df_grid=df_grid.merge(Grid_review_len,on='Grid')
df_grid.head()

Unnamed: 0,Grid,PlaceCount,price,Accommodation,Bar,Cafe,Cultural,Education,Health,Other,...,Health_rating,Other_rating,Outdoors_rating,Restaurant_rating,Retail_rating,Service_rating,Wholesale_rating,positive,negative,review_len
0,L0,8,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,4.02,0.0,3.942857,0.0,0.0,0.0,0.696,0.304,0.536595
1,L1,10,2.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,...,5.0,0.0,0.0,4.777778,0.0,4.0,0.0,0.851852,0.148148,0.802238
2,L10,3,,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,4.2,0.0,0.0,0.0,0.0,0.0,0.857143,0.142857,0.999095
3,L100,74,2.235294,12.0,6.0,6.0,0.0,1.0,4.0,16.0,...,3.6,4.02446,0.0,3.711172,4.5,3.028846,0.0,0.732975,0.267025,0.895548
4,L101,163,1.953488,21.0,19.0,20.0,1.0,1.0,10.0,19.0,...,4.5,4.223132,0.0,3.748855,3.549446,4.311111,3.722222,0.801535,0.198465,0.843739


In [23]:
# List the final feature columns. 'gPlusUserId' holds the number of reviews per cell
# and 'mean' the average rating per cell; both keep the names produced by their
# aggregation step.
df_grid.columns

Index(['Grid', 'PlaceCount', 'price', 'Accommodation', 'Bar', 'Cafe',
       'Cultural', 'Education', 'Health', 'Other', 'Outdoors', 'Restaurant',
       'Retail', 'Service', 'Wholesale', 'gPlusUserId', 'mean',
       'Accommodation_rating', 'Bar_rating', 'Cafe_rating', 'Cultural_rating',
       'Education_rating', 'Health_rating', 'Other_rating', 'Outdoors_rating',
       'Restaurant_rating', 'Retail_rating', 'Service_rating',
       'Wholesale_rating', 'positive', 'negative', 'review_len'],
      dtype='object')