In [162]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's "darkgrid" style for the plots drawn later in the notebook.
sns.set_style("darkgrid")

In [163]:
# Both CSV exports are semicolon-delimited and live under data_path.
data_path = "data/"
places = pd.read_csv(f"{data_path}places.csv", sep=";")
# low_memory=False makes pandas read the file in one pass instead of chunks.
reviews = pd.read_csv(f"{data_path}reviews.csv", sep=";", low_memory=False)

## 1.2 Reviews

In [1]:
from datetime import datetime
import numpy as np
import re
import string
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
# Fetch the stop-word corpus (skipped by NLTK when already up to date),
# then keep the English list around for the text-cleaning step.
nltk.download("stopwords")
_stop_words = nltk.corpus.stopwords.words("english")
# Shared lemmatizer instance reused for every review.
# NOTE(review): word_tokenize and WordNetLemmatizer rely on additional NLTK
# resources that are not downloaded here - presumably already installed; confirm.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jesperhauch/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [165]:
# Report the frame's dimensions, then count missing values per column.
print(f"Shape of reviews: {reviews.shape}")
reviews.isnull().sum()

Shape of reviews: (571724, 8)


rating                 0
reviewerName        2139
reviewText        146604
categories         50096
gPlusPlaceId        3885
unixReviewTime     49070
reviewTime         50817
gPlusUserId         5632
dtype: int64

In [166]:
# The later analysis depends on categories, so rows without one are discarded.
reviews = reviews.dropna(subset=["categories"])

In [167]:
# Only keep reviews whose place id appears in the places table.
# .copy() detaches the filtered result from the original frame so that the
# column assignments in the following cells modify an independent DataFrame
# instead of triggering pandas' SettingWithCopyWarning.
reviews = reviews.loc[lambda x: x.gPlusPlaceId.isin(places.gPlusPlaceId.values)].copy()

In [168]:
# Show each column's dtype before converting individual features.
reviews.dtypes

rating             object
reviewerName       object
reviewText         object
categories         object
gPlusPlaceId       object
unixReviewTime    float64
reviewTime         object
gPlusUserId        object
dtype: object

In [169]:
# rating was read as object dtype; parse it into a numeric column.
reviews["rating"] = reviews["rating"].pipe(pd.to_numeric)

In [170]:
# Use the unix review time since it has fewer missing values and carries more information than the reviewTime feature
def unixToDatetime(unix_timestamp):
    """Format a Unix timestamp as a UTC ``YYYY-MM-DD HH:MM:SS`` string.

    Parameters
    ----------
    unix_timestamp : int or float
        Seconds since the Unix epoch. May be missing (NaN / None).

    Returns
    -------
    str or float
        The formatted UTC time, or ``np.nan`` when the input is missing.
    """
    # Local import: the notebook only imports ``datetime`` from this module.
    from datetime import timezone

    if pd.isna(unix_timestamp):
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    # datetime.utcfromtimestamp is deprecated since Python 3.12; an aware UTC
    # datetime formats to exactly the same string.
    utc_time = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc)
    return utc_time.strftime("%Y-%m-%d %H:%M:%S")

# Replace the raw epoch seconds with formatted timestamps, element by element.
reviews["unixReviewTime"] = reviews["unixReviewTime"].map(unixToDatetime)
# The separate reviewTime column is no longer needed once the above is in place.
reviews = reviews.drop(columns=["reviewTime"])

In [171]:
# Each categories value is the string form of a list, e.g. "['Category']".
# The regex captures the text between each pair of apostrophes, giving a list of names.
category_pattern = re.compile(r"[\[|\s]\'(.*?)\'[\,|\]]")
reviews["categories"] = reviews["categories"].apply(category_pattern.findall)


In [183]:
# Text processing function from Social Graphs and Interactions
def cleanString(text):
    """Turn a raw review text into a list of cleaned, lemmatized tokens.

    Steps: tokenize, keep only alphanumeric tokens (drops punctuation),
    lower-case, lemmatize, and finally remove English stop words.

    Parameters
    ----------
    text : str
        The review text. May be missing (NaN / None).

    Returns
    -------
    list of str or float
        The cleaned tokens, or ``np.nan`` when the input is missing.
    """
    if pd.isna(text):
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    # Use the stop-word list loaded once at notebook level. The previous
    # ``stopwords = stopwords.words('english')`` made ``stopwords`` a local
    # name and therefore raised UnboundLocalError on every non-missing text.
    # A set gives constant-time membership checks.
    stop_set = set(_stop_words)

    tokens = word_tokenize(text)  # tokenize
    words = [tok.lower() for tok in tokens if tok.isalnum()]  # drop punctuation, lower-case
    lemmas = [lemmatizer.lemmatize(word) for word in words]  # lemmatize
    return [word for word in lemmas if word not in stop_set]  # remove stop words

In [184]:
# Store the cleaned token list of every review text in a new column.
reviews["reviewTextClean"] = reviews["reviewText"].map(cleanString)

KeyboardInterrupt: 

In [None]:
# Persist the cleaned reviews next to the input files, without the index column.
reviews.to_csv(f"{data_path}reviews_cleaned.csv", index=False)

In [185]:
# Display the reviews frame as the cell output.
reviews

Unnamed: 0,rating,reviewerName,reviewText,categories,gPlusPlaceId,unixReviewTime,reviewTime,gPlusUserId
0,5.0,Jason Wagner,The Arbiter changed to be The Barrow Boy somew...,[Pub],108354284028700140190,2010-07-27 22:46:50,"Jul 27, 2010",100000196778399872657
1,3.0,elicia richardson-ellis,This is a very cute hotel with good amenities ...,[Restaurant],106689630448064755324,2013-05-22 18:28:27,"May 22, 2013",100000340778638927606
2,4.0,elicia richardson-ellis,Love this place. The Great/Good: Massage an...,"[Stores and Shopping, Beauty Salon, Day Spa]",108256990636148259283,2013-05-22 18:26:22,"May 22, 2013",100000340778638927606
3,4.0,Nakamura Zen,,[Clothing Store],104395160856690993217,2013-06-19 21:23:21,"Jun 19, 2013",100000469599933991939
4,4.0,Nakamura Zen,,[Clothing Store],116090508615311611202,2013-06-19 21:23:29,"Jun 19, 2013",100000469599933991939
...,...,...,...,...,...,...,...,...
571719,4.0,Bruno Orsini,,[Bakery],115393413807091201685,2011-01-31 05:07:21,"Jan 30, 2011",118446627871409499434
571720,4.0,Bruno Orsini,,"[Southern Restaurant (US), American Restaurant...",115740012747676843252,2011-01-31 05:09:23,"Jan 30, 2011",118446627871409499434
571721,4.0,Bruno Orsini,,"[Restaurant or Cafe, Latin American Restaurant...",116184585745702669719,2010-11-30 08:06:23,"Nov 30, 2010",118446627871409499434
571722,4.0,Bruno Orsini,,"[Restaurant, Eclectic Restaurant, American Res...",117664146467660757996,2010-11-30 08:05:37,"Nov 30, 2010",118446627871409499434
