In [173]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from tqdm import tqdm 
sns.set_style("darkgrid")
from datetime import datetime


# Text mining
import re
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
nltk.download("stopwords")
_stop_words = nltk.corpus.stopwords.words('english')

# Folium maps
import folium
from folium.plugins import MarkerCluster

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akterminsprove/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [174]:
data_path = "data/"
places = pd.read_csv(data_path + "places.csv", sep=";")
reviews = pd.read_csv(data_path + "reviews.csv", sep=";", low_memory=False)

# 1 Data preprocessing

## 1.1 Places

In [175]:
# Quick overwiew of the attributes of the places in the Google Local file
places.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102851 entries, 0 to 102850
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   name          102851 non-null  object
 1   price         30486 non-null   object
 2   address       102851 non-null  object
 3   hours         65403 non-null   object
 4   phone         98390 non-null   object
 5   closed        102851 non-null  bool  
 6   gPlusPlaceId  102851 non-null  object
 7   gps           101600 non-null  object
dtypes: bool(1), object(7)
memory usage: 5.6+ MB


Places consist of 102851 entries with 7 attributes ... We see that some of the attributes have missing values...

In [176]:
# Lets consider the 
places.isna().sum()

name                0
price           72365
address             0
hours           37448
phone            4461
closed              0
gPlusPlaceId        0
gps              1251
dtype: int64

In [177]:
# First, we have to remove places that are currently closed
places = places.loc[places.closed == False ].reset_index(drop=True)

# That leaves ud with 96497 places across NY and London
places.shape

(96497, 8)

In [178]:
# Remove places that does not have coordinates
places = places.loc[places.gps.isna()==False].reset_index(drop=True)

# Then we clean the GPS coordinates to become latitiude and longitude
places['lat'] = [float(i.strip('][').split(', ')[0]) for i in places.gps]
places['lon'] = [float(i.strip('][').split(', ')[1]) for i in places.gps]

In [179]:
# We have some missing values in the price, we wish to predict these based on other features

# But first, we convert price ranges into three categories
# $ = £
# $$ = ££
# $$$ = £££

places['price_cat'] = np.nan

for i in range(1,4):
    indexes = places.loc[(places.price == "$"*i) |(places.price == "£"*i)].index
    places.loc[indexes, 'price_cat'] = i 

places['price_cat'] = places.price_cat.astype('category')

# Now the values have been filled in
places.price_cat.value_counts()


2.0    15484
3.0     9124
1.0     3151
Name: price_cat, dtype: int64

In [180]:
# Make a feature that can distinguish between places in London and NY

places['city'] = np.nan

for num,adress in tqdm(enumerate(places.address)):

    # Clean adress
    i = adress.strip('][').split(',')[-1]
    # Check if in NY
    in_ny = re.findall(r"NY\s\d{5}", i)
    # Check if in United Kingdom aka London
    #in_london = re.findall(r'United Kingdom', i)

    if len(in_ny)>0:

        places.loc[num, 'city'] = 'New York'

    else:

        places.loc[num, 'city'] = 'London' 
    

94583it [02:29, 731.27it/s]

In [None]:
# Drop features, that we are not going to use: Adress, Hours, Phone, GPS
# Rename price_cat to priceb
places = places[['gPlusPlaceId','name','price_cat','lat','lon','city']].rename(columns={'price_cat':"price"})
places.head(4)

Unnamed: 0,name,price,lat,lon,city
0,Steppingstone Day School Inc,,40.719993,-73.820745,New York
1,HSBC Head Quarters,,51.506582,-0.016885,London
2,Consulate General of the Republic of Poland,,40.74968,-73.981173,New York
3,Bagel Depot,,40.543839,-74.165041,New York


In [None]:
places.to_csv(data_path + "places_cleaned.csv", index=False)

## Reviews

In [None]:
print("Shape of reviews:", reviews.shape)
reviews.isna().sum()

Shape of reviews: (571724, 8)


rating                 0
reviewerName        2139
reviewText        146604
categories         50096
gPlusPlaceId        3885
unixReviewTime     49070
reviewTime         50817
gPlusUserId         5632
dtype: int64

In [None]:
# Categories is essential for our future analysis. Hence we remove rows with missing categories
reviews.dropna(subset=["categories"], inplace=True)

In [None]:
# Only get reviews that are in the places list
reviews = reviews.loc[lambda x: x.gPlusPlaceId.isin(places.gPlusPlaceId.values)]

AttributeError: 'DataFrame' object has no attribute 'gPlusPlaceId'

In [None]:
reviews.dtypes

rating             object
reviewerName       object
reviewText         object
categories         object
gPlusPlaceId       object
unixReviewTime    float64
reviewTime         object
gPlusUserId        object
dtype: object

In [None]:
# Convert rating to numeric feature
reviews['rating'] = pd.to_numeric(reviews['rating'])

In [None]:
# Using unix review time since it has less missing values and more information than the review time feature
def unixToDatetime(unix_timestamp):
    if not pd.isna(unix_timestamp):
        return datetime.utcfromtimestamp(unix_timestamp).strftime("%Y-%m-%d %H:%M:%S")
    else:
        return np.NaN

reviews['unixReviewTime'] = reviews['unixReviewTime'].apply(unixToDatetime)
reviews.drop(["reviewTime"], axis=1, inplace=True)

In [None]:
# Categories are stored as a "['Category']", thus we use regex to find the word between the apostrophes
reviews['categories'] = reviews.categories.apply(lambda x: re.findall(r"[\[|\s]\'(.*?)\'[\,|\]]", x))


In [None]:
# Text processing function from Social Graphs and Interactions
def cleanString(text):
    if not pd.isna(text):
        tokens = word_tokenize(text) #Tokenizing
        punctuation = [word.lower() for word in tokens if word.isalnum()] # remove punctuation and set to lower case
        lemma = [lemmatizer.lemmatize(word) for word in punctuation] # lemmatize words
        stopwords = stopwords.words('english')
        return [w for w in lemma if w not in stopwords] # Finally removes stopwords
    else:
        return np.NaN

In [None]:
reviews['reviewTextClean'] = reviews['reviewText'].apply(cleanString)

KeyboardInterrupt: 

In [None]:
reviews.to_csv(data_path + "reviews_cleaned.csv", index=False)

# Descriptive Statistics 

In [None]:
def generateBaseMap(default_location=[56.46679, -2.97028], default_zoom_start=12):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map