In [27]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from tqdm import tqdm 
sns.set_style("darkgrid")
from datetime import datetime


# Text mining
import re
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('wordnet')

# Folium maps
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster

[nltk_data] Downloading package stopwords to /Users/ann-
[nltk_data]     katrinechristiansen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ann-
[nltk_data]     katrinechristiansen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ann-
[nltk_data]     katrinechristiansen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [38]:
data_path = "data/"
places = pd.read_csv(data_path + "places.csv", sep=";")
reviews = pd.read_csv(data_path + "reviews.csv", sep=";", low_memory=False)

# 1 Data preprocessing

## 1.1 Places

In [39]:
# Quick overwiew of the attributes of the places in the Google Local file
places.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102851 entries, 0 to 102850
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name          102851 non-null  object
 1   price         30486 non-null   object
 2   address       102851 non-null  object
 3   hours         65403 non-null   object
 4   phone         98390 non-null   object
 5   closed        102851 non-null  bool  
 6   gPlusPlaceId  102851 non-null  object
 7   gps           101600 non-null  object
dtypes: bool(1), object(7)
memory usage: 5.6+ MB


Places consist of 102851 entries with 7 attributes ... We see that some of the attributes have missing values...

In [40]:
# Lets consider the 
places.isna().sum()

name                0
price           72365
address             0
hours           37448
phone            4461
closed              0
gPlusPlaceId        0
gps              1251
dtype: int64

In [41]:
# First, we have to remove places that are currently closed
places = places.loc[places.closed == False ].reset_index(drop=True)

# That leaves ud with 96497 places across NY state and London
places.shape

(96497, 8)

In [42]:
# Remove places that does not have coordinates
places = places.loc[places.gps.isna()==False].reset_index(drop=True)

# Then we clean the GPS coordinates to become latitiude and longitude
places['lat'] = [float(i.strip('][').split(', ')[0]) for i in places.gps]
places['lon'] = [float(i.strip('][').split(', ')[1]) for i in places.gps]

In [43]:
# We have some missing values in the price, we wish to predict these based on other features

# But first, we convert price ranges into three categories
# $ = £
# $$ = ££
# $$$ = £££

places['price_cat'] = np.nan

for i in range(1,4):
    indexes = places.loc[(places.price == "$"*i) |(places.price == "£"*i)].index
    places.loc[indexes, 'price_cat'] = i 

places['price_cat'] = places.price_cat.astype('category')

# Now the values have been filled in
places.price_cat.value_counts()


2.0    15484
3.0     9124
1.0     3151
Name: price_cat, dtype: int64

In [44]:
# Make a feature that can distinguish between places in London and NY city

places['city'] = np.nan

for num,adress in tqdm(enumerate(places.address)):

    # Clean adress
    i = adress.strip('][').split(',')[-1]
    # Check if in NY
    in_ny = re.findall(r"NY\s100\d{2}", i)
    # Check if in United Kingdom aka London
    in_london = re.findall(r'United Kingdom', i)

    if len(in_ny)>0:

        places.loc[num, 'city'] = 'New York'

    elif len(in_london)>0:

        places.loc[num, 'city'] = 'London' 

    else:
        places.loc[num, 'city'] = 'Remove' 

    

95443it [01:03, 1502.92it/s]


In [45]:
places = places.loc[places.city.isin(['New York','London'])]

In [46]:
# Drop features, that we are not going to use: Adress, Hours, Phone, GPS
# Rename price_cat to priceb
places = places[['gPlusPlaceId','name','price_cat','lat','lon','city', 'address']].rename(columns={'price_cat':"price"}).reset_index(drop=True)
places.head(4)

Unnamed: 0,gPlusPlaceId,name,price,lat,lon,city,address
0,102297455696045037925,HSBC Head Quarters,,51.506582,-0.016885,London,"['8 Canada Sq', 'London E14 5HQ', 'United King..."
1,107630647505995708542,Consulate General of the Republic of Poland,,40.74968,-73.981173,New York,"['233 Madison Ave', 'New York, NY 10016']"
2,104388257267586837092,Blockbuster Express,3.0,40.815224,-73.958116,New York,"['568-574 W 125th St', 'New York, NY 10027']"
3,101742583391038750118,Carpo London,,51.509499,-0.135762,London,"['16 Piccadilly', 'London W1J 0DE', 'United Ki..."


In [47]:
places.to_csv(data_path + "places_cleaned.csv", index=False)

## Reviews

In [48]:
print("Shape of reviews:", reviews.shape)
reviews.isna().sum()

Shape of reviews: (571724, 8)


rating                 0
reviewerName        2139
reviewText        146604
categories         50096
gPlusPlaceId        3885
unixReviewTime     49070
reviewTime         50817
gPlusUserId         5632
dtype: int64

In [49]:
# Categories is essential for our future analysis. Hence we remove rows with missing categories
reviews.dropna(subset=["categories"], inplace=True)

In [50]:
# Only get reviews that are in the places list
reviews = reviews.loc[lambda x: x.gPlusPlaceId.isin(places.gPlusPlaceId.values)].reset_index(drop=True)

In [51]:
reviews.dtypes

rating             object
reviewerName       object
reviewText         object
categories         object
gPlusPlaceId       object
unixReviewTime    float64
reviewTime         object
gPlusUserId        object
dtype: object

In [52]:
# Convert rating to numeric feature
reviews['rating'] = pd.to_numeric(reviews['rating'])

In [53]:
# Using unix review time since it has less missing values and more information than the review time feature
def unixToDatetime(unix_timestamp):
    if not pd.isna(unix_timestamp):
        return datetime.utcfromtimestamp(unix_timestamp).strftime("%Y-%m-%d %H:%M:%S")
    else:
        return np.NaN

reviews['unixReviewTime'] = reviews['unixReviewTime'].apply(unixToDatetime)
reviews.drop(["reviewTime"], axis=1, inplace=True)

In [54]:
# Categories are stored as a "['Category']", thus we use regex to find the word between the apostrophes
reviews['categories'] = reviews.categories.apply(lambda x: re.findall(r"[\[|\s]\'(.*?)\'[\,|\]]", x))


In [55]:
# Text processing function from Social Graphs and Interactions
def cleanString(text):
    if not pd.isna(text):
        tokens = word_tokenize(text) #Tokenizing
        punctuation = [word.lower() for word in tokens if word.isalnum()] # remove punctuation and set to lower case
        lemma = [lemmatizer.lemmatize(word) for word in punctuation] # lemmatize words
        #stopwords = stopwords.words('english')
        stopwords =  nltk.corpus.stopwords.words('english')
        return [w for w in lemma if w not in stopwords] # Finally removes stopwords
    else:
        return np.NaN

In [56]:
reviews['reviewTextClean'] = reviews['reviewText'].apply(cleanString)

In [57]:
reviews.to_csv(data_path + "reviews_cleaned.csv", index=False)

# Descriptive Statistics 

In [3]:
data_path = "data/"
reviews = pd.read_csv(data_path + 'reviews_cleaned.csv',)
places = pd.read_csv(data_path + 'places_cleaned.csv')

### Create map of places in London and New York

In [16]:
def generateBaseMap(default_location, default_zoom_start=12):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map


In [30]:
def CreateScatterMap(df_folium_map, coordinates):

    scatter_map = generateBaseMap(default_location = coordinates)
    mc = MarkerCluster()

    for index,row in df_folium_map.iterrows():  
        latlon = row[["lat","lon"]]
        labels = row['address'].strip('][').replace("'","")

        mc.add_child(FastMarkerCluster([latlon.iloc[0], latlon.iloc[1]], popup = labels))
        #mc.add_child(folium.Marker(location=[latlon.iloc[0], latlon.iloc[1]], popup = labels))

    scatter_map.add_child(mc)

    return scatter_map

In [31]:
df_folium_map_NY=places.loc[places.city == 'New York'][['gPlusPlaceId','lat','lon', 'address']].set_index('gPlusPlaceId')
scatter_map = CreateScatterMap(df_folium_map_NY,[40.76160, -73.97564])
scatter_map

TypeError: __init__() missing 1 required positional argument: 'data'

In [25]:
df_folium_map_L = places.loc[places.city == 'London'][['gPlusPlaceId','lat','lon', 'address']].set_index('gPlusPlaceId').head(5000)

scatter_map = CreateScatterMap(df_folium_map_L, [51.50762, -0.12708])
scatter_map

In [26]:
df_folium_map_NY

Unnamed: 0_level_0,lat,lon,address
gPlusPlaceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
107630647505995708542,4.074968e+01,-7.398117e+01,"['233 Madison Ave', 'New York, NY 10016']"
104388257267586837092,4.081522e+01,-7.395812e+01,"['568-574 W 125th St', 'New York, NY 10027']"
115622769881026234674,4.075322e+01,-7.397837e+01,"['40 E 43rd St', 'New York, NY 10017']"
102435850914648163950,4.076643e+07,-7.398162e+07,"['1776 Broadway', 'New York, NY 10019']"
117761296900896012322,4.073989e+01,-7.400778e+01,"['827 Washington St', 'New York, NY 10014']"
...,...,...,...
110741528261561607331,4.076499e+01,-7.397690e+01,"['1419 Avenue of the Americas', 'New York, NY ..."
101406384283536798894,4.071944e+01,-7.400888e+01,"['105 Hudson St', 'New York, NY 10013']"
107513422414028520747,4.071881e+01,-7.399764e+01,"['145 Mulberry St', 'New York, NY 10013']"
117071119139841947889,4.072469e+01,-7.399469e+01,"['47 E Houston St', 'Manhattan, NY 10012']"
