# Exploratory Data Analysis - TripAdvisor data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import matplotlib.pyplot as plt
import requests
from wordcloud import WordCloud
from PIL import Image

%matplotlib inline
# Show every column when a DataFrame is displayed (None removes pandas' column limit).
pd.set_option('display.max_columns', None)

# Preprocessing

In [2]:
def freq_dict_generator(keywords, keywords_dict):
    """Counts the keywords of one restaurant into a shared frequency dict.

    `keywords_dict` is updated in place: a keyword seen for the first time is stored with a count of 1, a keyword
    already present has its count increased by 1. Nothing is returned. """
    for kw in keywords:
        keywords_dict[kw] = keywords_dict.get(kw, 0) + 1

def col_evaluation(ls, col_name):
    """ Membership flag: returns 1 when `col_name` is contained in `ls` (the list stored in the 'cuisine' column
    for one restaurant) and 0 otherwise. Meant to be applied element-wise over a column."""
    return 1 if col_name in ls else 0
            
def generate_cuisine_cols(df, cuisine_dict):
    """ Creation of the columns containing all of the cuisine types in a dataframe.

    For every key of `cuisine_dict`, a column of the same name is added to `df`; it holds 1 for the rows whose
    'cuisine' list contains that key and 0 for the others.

    Parameters:
        df: dataframe with a 'cuisine' column holding, for each row, the list of keywords of the restaurant.
        cuisine_dict: dict whose keys are the keywords to turn into columns (the values are not used).

    Returns:
        The same dataframe object, modified in place with the additional 0/1 columns. """
    for col_name in cuisine_dict:
        # The keyword is bound as a default argument so the lambda does not depend on the loop variable.
        df[col_name] = df['cuisine'].apply(lambda kws, name=col_name: int(name in kws))
    # The original returned the undefined name `df44`, which raised a NameError on every call.
    return df

def keywords_list_parser(ls):
    """Parses a list of keywords to determine the main cuisine (ex: Indian, British), the food specificity
    (Vegan, serving seafood...) and whether the restaurant has unique features (ex: if it is a bar, a pub...).

    The keywords are read in order and, for each category, only the first keyword found in the corresponding
    hardcoded list is kept. For the cuisine, a 'tier one cuisine' (i.e a very specific type of cuisine) is
    preferred; a 'tier two cuisine' (i.e a more global type of cuisine such as 'European') is only used when no
    tier one cuisine is present.

    Parameters:
        ls: iterable of keyword strings describing one restaurant.

    Returns:
        A list [cuisine, food_specificity, place_type]; each entry is None when no keyword of that category
        was found. """

    # Both spellings of the street-food keyword are accepted: the original list only contained 'Street food'
    # whereas the scraped keywords are spelled 'Street Food', so that place type was never detected.
    place_type_ls = ('Bar', 'Pub', 'Cafe', 'Street food', 'Street Food', 'Steakhouse')

    food_specifity = ('Vegan', 'Gluten Free Options', 'Healthy', 'Fast food', 'Barbecue', 'Seafood',
                      'Fusion', 'Contemporary', 'Halal')

    cuisine_tier_one = ('British', 'French', 'Italian', 'Indian', 'Spanish', 'Turkish',
                        'South American', 'Other European', 'Lebanese', 'Moroccan', 'American',
                        'Thai', 'African')

    cuisine_tier_two = ('Asian', 'European', 'Other European', 'Middle Eastern', 'International')

    place_type = None
    food_spec = None
    cuisine_t1 = None
    cuisine_t2 = None

    for word in ls:
        # A slot that is still None has not been filled yet: only the first match of each category is kept.
        if place_type is None and word in place_type_ls:
            place_type = word
        if food_spec is None and word in food_specifity:
            food_spec = word
        if cuisine_t1 is None and word in cuisine_tier_one:
            cuisine_t1 = word
        if cuisine_t2 is None and word in cuisine_tier_two:
            cuisine_t2 = word

    # The specific cuisine wins over the global one.
    cuisine = cuisine_t2 if cuisine_t1 is None else cuisine_t1

    return [cuisine, food_spec, place_type]
    

def simplify_keywords(keyword_ls, keyword_dict):
    """Maps every keyword of `keyword_ls` through the hardcoded renaming dict `keyword_dict` and returns the
    result as a new list. Keywords without an entry in the dict are kept unchanged. This groups close cuisines
    and features under a single name (ex : Sri Lankan and Indian are grouped)."""
    return [keyword_dict[kw] if kw in keyword_dict else kw for kw in keyword_ls]
    
def score_builder(text):
    """A unified ranking score is built by considering this formula :

    score = (total_entries - rank + 1)/total_entries

    Therefore, restaurants ranking can be compared across different locations.

    Parameters:
        text: the scraped ranking string. Its first whitespace-separated token is the rank preceded by one
            prefix character and its third token is the total number of entries (thousands separators are
            accepted in both), or the literal 'Ranking not found'.

    Returns:
        The score rounded to 3 decimals, or None when the ranking was not found. """

    if text == 'Ranking not found':
        return None

    tokens = text.split()
    # The whole first token is parsed (minus its one-character prefix): the original read only text[1], so
    # every rank of 10 or more was truncated to its first digit.
    rank = int(tokens[0][1:].replace(',', ''))
    total_entries = int(tokens[2].replace(',', ''))
    score = (total_entries - rank + 1) / total_entries
    return round(score, 3)
    
def location_extractor(text):
    """Extracts the neighborhood from the scraped ranking string: the last whitespace-separated word of `text`
    is returned, or None when `text` is the 'Ranking not found' placeholder.
    NOTE(review): only the final word is kept, so a name made of several words would be cut - confirm."""
    return None if text == 'Ranking not found' else text.split()[-1]

def identify_coordinates(address):
    """This functions performs an API call with the scrapped address of the restaurant to the OpenStreetMap
    (Nominatim) search API and reads the coordinates of the first result.

    Parameters:
        address: the full address of the restaurant, as a string.

    Returns:
        A tuple (latitude, longitude, found_coordinates). The coordinates are returned as given by the API and
        are both None when found_coordinates is False. When the request does not answer with status 200, the
        result of identify_coordinates_postcode(address) is returned instead.

    NOTE(review): the public Nominatim server has a usage policy (request rate, identification of the client);
    this function sends one request per call without any throttling - confirm this is acceptable. """

    # Let requests build the query string: besides spaces and commas (the only characters the original encoded
    # by hand), characters such as '&' or '#' in an address are escaped too.
    r = requests.get('https://nominatim.openstreetmap.org/search',
                     params={'q': address, 'format': 'json'})

    if r.status_code != 200:
        # The original message referenced an undefined `status_code` and raised a NameError on this path.
        print(f'Error {r.status_code} ocurred on the request')
        return identify_coordinates_postcode(address)

    try:
        # An empty result list (address unknown) raises IndexError; a payload that is not JSON or does not
        # have the expected structure raises ValueError, KeyError or TypeError.
        coordinates = r.json()[0]
        latitude, longitude = coordinates['lat'], coordinates['lon']
    except (ValueError, IndexError, KeyError, TypeError):
        return (None, None, False)

    return (latitude, longitude, True)

def identify_coordinates_postcode(address):
    """Fallback geocoding: the postcode found in `address` is sent to the OpenStreetMap (Nominatim) search API
    to get an approximate latitude and longitude.

    Parameters:
        address: the full address of the restaurant, as a string.

    Returns:
        A tuple (latitude, longitude, found_coordinates). The coordinates are both None and found_coordinates
        is False when no postcode could be extracted, when the request does not answer with status 200 or
        when the API returns no usable result. """
    postcode = get_postcode(address)
    # get_postcode returns the postcode as a list of tokens: the original concatenated that list to the URL
    # string, which raised a TypeError. A plain string is accepted as well.
    query = postcode if isinstance(postcode, str) else ' '.join(postcode)
    if not query:
        # Nothing to search for: skip the API call.
        return (None, None, False)

    r = requests.get('https://nominatim.openstreetmap.org/search',
                     params={'q': query, 'format': 'json'})

    if r.status_code != 200:
        # The original message referenced an undefined `status_code` and raised a NameError on this path.
        print(f'Error {r.status_code} ocurred on the request')
        return (None, None, False)

    try:
        # An empty result list raises IndexError; a payload that is not JSON or does not have the expected
        # structure raises ValueError, KeyError or TypeError.
        coordinates = r.json()[0]
        latitude, longitude = coordinates['lat'], coordinates['lon']
    except (ValueError, IndexError, KeyError, TypeError):
        return (None, None, False)

    return (latitude, longitude, True)
    
def get_postcode(address):
    """Extracts a postcode from an address by looking for two consecutive 3-character tokens that contain no
    lowercase letter (ex: 'E14 9SJ').

    Parameters:
        address: the address, as a string; it is split on single spaces.

    Returns:
        A list of tokens: both halves of the postcode when an adjacent pair is found, otherwise the last
        isolated candidate token only, or an empty list when there is no candidate at all.

    NOTE(review): only halves of exactly 3 characters are recognised, and a token followed by punctuation
    (ex: '9SJ,') is not a candidate - confirm this covers the scraped addresses. """
    postcode = []
    first_part = None

    for idx, token in enumerate(address.split(' ')):
        # Candidate tokens are exactly 3 characters long and unchanged by upper().
        if len(token) != 3 or token.upper() != token:
            continue
        if postcode and idx == first_part + 1:
            postcode.append(token)
            # Stop at the first complete pair: the original kept scanning, so a later candidate token
            # overwrote the first half of the postcode that had just been found.
            break
        # First candidate, or a candidate that does not directly follow the previous one: start over.
        postcode = [token]
        first_part = idx
    return postcode

def haversine_dist_to_bokan(lat, lon):
    """Haversine distance, in kilometers and rounded to 3 decimals, between the point (lat, lon) and the Bokan.
    `lat` and `lon` are expressed in degrees; only arithmetic and numpy functions are applied to them."""
    earth_radius_km = 6371
    bokan_lat, bokan_lon = 51.501244, -0.023363

    def to_rad(angle_deg):
        return angle_deg * np.pi/180

    # Sines of the half differences in latitude and longitude.
    sin_half_dlat = np.sin(to_rad(lat - bokan_lat)/2)
    sin_half_dlon = np.sin(to_rad(lon - bokan_lon)/2)

    a = sin_half_dlat * sin_half_dlat + np.cos(to_rad(lat)) * np.cos(to_rad(bokan_lat)) * sin_half_dlon * sin_half_dlon
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return np.round(earth_radius_km * central_angle, 3)

def process_distances(df, print_output=250):
    """Geocodes every entry of the 'address' column with identify_coordinates and returns the distances between
    the restaurants and the Bokan, as computed by haversine_dist_to_bokan on float32 coordinate arrays.

    A progress line is printed every `print_output` addresses, and a final line reports how many addresses
    could not be turned into coordinates."""
    total = len(df)
    coords = []
    not_found_counter = 0
    for counter, address in enumerate(df['address'].tolist()):
        lat, lon, found_coordinates = identify_coordinates(address)
        if not found_coordinates:
            not_found_counter += 1
        if counter % print_output == 0:
            print(f'{counter}/{total} restaurant coordinates have been processed.')
        coords.append([lat, lon])
    print(f'Finished determining coordinates. {not_found_counter}/{total} coordinates have not been found.')

    # One row per restaurant, latitude in column 0 and longitude in column 1.
    np_coords = np.asarray(coords, dtype=np.float32)
    return haversine_dist_to_bokan(np_coords[:, 0], np_coords[:, 1])

def handle_duplicate_addresses(df):
    """Appends ' bis' to the 'address' of every row whose address already appeared in an earlier row. The
    dataframe is modified in place and returned. The first occurrence is left untouched; all the later
    occurrences of a same address receive the same suffix."""
    repeated = df[['address']].duplicated()
    df.loc[repeated, 'address'] = df.loc[repeated, 'address'] + ' bis'
    return df
    

def cleaning_pipeline(df, keywords_renaming_dict, parse_distances=False):
    """Runs all of the cleaning operations on a copy of `df` and returns a tuple made of the cleaned dataframe
    and of the frequency dict of the original (not yet simplified) keywords.

    The parse_distances parameter is set on False by default as the geocoding API calls it triggers can run
    for a substantial amount of time."""
    cuisine_dict = {}
    # The scraped 'cuisine' column holds the raw keywords; a parsed 'cuisine' column is created further down.
    cleaned = df.copy().rename(columns={'cuisine': 'keywords'})

    # Occurrences of every raw keyword, counted before the keywords get simplified
    for raw_keywords in cleaned['keywords']:
        freq_dict_generator(raw_keywords, cuisine_dict)

    # Number of reviews: thousands separators are removed, then the column is converted to int
    cleaned['nb_reviews'] = cleaned['nb_reviews'].str.replace(',', '').astype('int64')

    # Ranking score used to compare restaurants competing among a different amount of restaurants: the
    # restaurant ranked i among n restaurants gets (n - i + 1) / n
    cleaned['score'] = cleaned['ranking'].apply(score_builder)

    # Neighborhood where the restaurant is located
    cleaned['neighborhood'] = cleaned['ranking'].apply(location_extractor)

    # Close keywords are grouped under a single name to enhance the analysis
    cleaned['keywords'] = cleaned['keywords'].apply(simplify_keywords, keyword_dict=keywords_renaming_dict)

    # The keywords are split into the categories used for the detailed analysis
    for category in ('cuisine', 'food_specificity', 'place_type'):
        cleaned[category] = None
    parsed = cleaned['keywords'].apply(keywords_list_parser)
    cleaned[['cuisine', 'food_specificity', 'place_type']] = pd.DataFrame(parsed.tolist(), index=cleaned.index)

    # Distance in kilometers between the Bokan and the scrapped restaurant
    if parse_distances:
        cleaned['distance_to_bokan'] = process_distances(cleaned)

    # The one-column-per-cuisine encoding (generate_cuisine_cols) is deprecated and not produced anymore
    cleaned = handle_duplicate_addresses(cleaned)

    return cleaned, cuisine_dict

def cuisine_stats(df, target_col='cuisine', stats='mean', cols=None):
    """ Generate statistics on restaurants, using a column such as the type of cuisine or the special features.

    Parameters:
        df: the cleaned restaurants dataframe.
        target_col: name of the column used to group the restaurants.
        stats: statistic computed for each group, one of 'mean', 'median' or 'std'.
        cols: names of the columns to consider. Defaults to ['cuisine', 'nb_reviews', 'min_price', 'max_price',
            'rating', 'score', 'neighborhood']. `target_col` is added when it is missing from the list.

    Returns:
        A dataframe indexed by the values of `target_col`, whose first column 'nb_restaus' is the number of
        rows of the group and whose other columns are the statistic of each numeric column, named
        '<stats>_<column>'. Non-numeric columns (ex: 'neighborhood') are left out.

    Raises:
        ValueError: if `stats` is not a supported statistic. """
    # The original fell through to an undefined `rename_dict` (NameError) for any other value.
    if stats not in ('mean', 'median', 'std'):
        raise ValueError(f"stats must be 'mean', 'median' or 'std', got {stats!r}")

    if cols is None:
        cols = ['cuisine', 'nb_reviews', 'min_price', 'max_price', 'rating', 'score', 'neighborhood']
    selected = list(cols)
    # Grouping on a column that is not part of the selection raised a KeyError.
    if target_col not in selected:
        selected.append(target_col)

    grouped = df[selected].groupby(by=target_col)
    group_sizes = grouped.size()

    # Only numeric columns are aggregated: recent pandas versions raise an error when asked for the mean of a
    # column of strings instead of silently dropping it.
    value_cols = [col for col in selected
                  if col != target_col and pd.api.types.is_numeric_dtype(df[col])]
    df_res = getattr(grouped[value_cols], stats)()
    df_res = df_res.rename(columns={col: f'{stats}_{col}' for col in df_res.columns})

    # The group sizes share the index of the aggregated frame; they are placed as the first column.
    df_res.insert(0, 'nb_restaus', group_sizes)
    return df_res

In [3]:
# Folder holding the merged reviews and users files.
root_path = os.path.join('..', '..', 'scraper', 'scraped_data', 'merged_data')

# The restaurants file is read from a different folder than the merged reviews/users files.
restaurants_path = os.path.join('..', '..', 'scraper', 'scraper_restaurants', 'scraped_data', 'restaurants', 'restaurants_run_1.json')
review_path = os.path.join(root_path, 'merged_reviews.json')
users_path = os.path.join(root_path, 'merged_users.json')

# lines=True: the file is read as one JSON object per line.
df_restaurants = pd.read_json(restaurants_path, lines=True)
# The ids are kept in a plain list before the column becomes the index of the dataframe.
restaurants = list(df_restaurants['restaurant_id'])
df_restaurants = df_restaurants.set_index('restaurant_id')

df_reviews = pd.read_json(review_path, lines=True)
df_users = pd.read_json(users_path, lines=True)


In [4]:
# Renaming table given to the cleaning pipeline: each scraped keyword (key) is replaced by the broader
# keyword (value) it is grouped with; keywords without an entry are kept as they are.
# 'Caribbean' is the spelling found in the scraped keywords: the original table only had the misspelled
# 'Carribean', so these restaurants were never grouped. The misspelled key is kept in case it occurs too.
# NOTE(review): 'Moroccan' is renamed to 'African' here although keywords_list_parser also lists 'Moroccan'
# as a cuisine of its own, and 'Deli' is grouped with 'Indian' - confirm both are intended.
keywords_renaming_dict = {
    'Southern-Italian':'Italian', 'Sicilian':'Italian', 'Tuscan':'Italian', 'Neapolitan':'Italian',
    'Central-Italian':'Italian', 'Pizza':'Italian', 'Polish':'European', 'Belgian':'European',
    'German':'European', 'Eastern European':'European', 'Portuguese':'European',
    'Greek':'European', 'Hong Kong':'Chinese','Cantonese':'Chinese', 'Sushi':'Japanese', 
    'Malaysian':'Asian', 'Tibetan':'Asian', 'Vietnamese':'Asian', 'Latin':'South American', 
    'Argentinian':'South American', 'Mexican':'South American', 'Brazilian':'South American',
    'Colombian':'South American', 'Vegetarian Friendly':'Vegan', 'Vegan Options':'Vegan', 'Persian':'Middle Eastern',
    'Afghani':'Middle Eastern', 'Deli':'Indian', 'Sri Lankan':'Indian', 'Pakistani':'Indian', 'Balti':'Indian',
    'Nepalese':'Indian', 'Jamaican':'African', 'Caribbean':'African', 'Carribean':'African',
    'Cajun & Creole':'African', 'Moroccan':'African', 'Brew Pub':'Pub', 'Gastropub':'Pub', 'Grill':'Barbecue'}

# parse_distances=True geocodes every address through the API, which takes a long time on the full dataset.
df_restaurants_cuisine, cuisine_dict = cleaning_pipeline(df_restaurants, keywords_renaming_dict, parse_distances=True)

0/13405 restaurant coordinates have been processed.
250/13405 restaurant coordinates have been processed.
500/13405 restaurant coordinates have been processed.
750/13405 restaurant coordinates have been processed.
1000/13405 restaurant coordinates have been processed.
1250/13405 restaurant coordinates have been processed.
1500/13405 restaurant coordinates have been processed.
1750/13405 restaurant coordinates have been processed.
2000/13405 restaurant coordinates have been processed.
2250/13405 restaurant coordinates have been processed.
2500/13405 restaurant coordinates have been processed.
2750/13405 restaurant coordinates have been processed.
3000/13405 restaurant coordinates have been processed.
3250/13405 restaurant coordinates have been processed.
3500/13405 restaurant coordinates have been processed.
3750/13405 restaurant coordinates have been processed.
4000/13405 restaurant coordinates have been processed.
4250/13405 restaurant coordinates have been processed.
4500/13405 resta

In [5]:
# Number of occurrences of each raw keyword, as counted by the cleaning pipeline.
print(cuisine_dict)

{'Grill': 302, 'Pakistani': 86, 'Afghani': 31, 'Indian': 1567, 'Asian': 2123, 'Vegetarian Friendly': 3163, 'Steakhouse': 212, 'Argentinian': 35, 'Latin': 64, 'Bar': 1691, 'Pizza': 1066, 'British': 3037, 'Italian': 1475, 'Deli': 88, 'Lebanese': 259, 'Cafe': 1384, 'Mediterranean': 1448, 'European': 1602, 'Balti': 295, 'Spanish': 155, 'Vegan Options': 573, 'International': 367, 'Contemporary': 159, 'Japanese': 421, 'Middle Eastern': 552, 'Sushi': 313, 'Thai': 357, 'Healthy': 212, 'Nepalese': 60, 'French': 311, 'Gluten Free Options': 205, 'Fast food': 926, 'Chinese': 638, 'Cantonese': 140, 'African': 66, 'Moroccan': 46, 'Turkish': 469, 'Persian': 97, 'Seafood': 405, 'Barbecue': 190, 'Pub': 1148, 'Caribbean': 123, 'Mexican': 106, 'Brazilian': 53, 'Bakeries': 4, 'Fusion': 79, 'Brew Pub': 47, 'Sri Lankan': 56, 'Greek': 111, 'Polish': 41, 'Eastern European': 58, 'Belgian': 12, 'German': 20, 'Sicilian': 15, 'Southern-Italian': 13, 'Tibetan': 3, 'American': 419, 'Gastropub': 51, 'Street Food': 5

## Visualisation of a restaurant's reviews

In [None]:
# Ask for a restaurant id until the value typed is one of the ids loaded in `restaurants`.
# The initial value "0" is only there to enter the loop (presumably no id is equal to it - verify).
restaurant_id = "0"
while (restaurant_id not in restaurants):
    try:
        restaurant_id = int(input("Enter restaurant id from 1 to 143 : "))
    # Only a failed int conversion is handled: the original bare `except` also caught KeyboardInterrupt,
    # which made the loop impossible to interrupt.
    except ValueError:
        print("Please enter int")

### TF-IDF matrix

In [None]:
# The word-frequency file of a restaurant is named '<restaurant_id>_word_freq.csv'.
target_csv = str(restaurant_id) + '_word_freq.csv'
target_csv_path = os.path.join('..', 'cleaned_data', 'restaurants_tfidf', target_csv)

# index_col=0: the first column of the file is used as the index.
df_tfidf = pd.read_csv(target_csv_path, index_col=0)

In [None]:
# Preview of the first rows of the matrix.
df_tfidf.head()

In [None]:
# Mean of every column of the matrix, sorted from the highest to the lowest, as a one-column dataframe.
df_tfidf_mean = df_tfidf.mean().sort_values(ascending=False).to_frame(name='tfidf mean')
df_tfidf_mean.head()

In [None]:
# Bar chart of the 15 highest means.
df_tfidf_mean[:15].plot(kind='bar')
plt.show()

### Wordcloud visualisation

In [None]:
# The wordcloud of a restaurant is an image file named '<restaurant_id>_word_cloud.png'.
target_img = str(restaurant_id) + '_word_cloud.png'
target_img_path = os.path.join('..', 'cleaned_data', 'restaurant_wordclouds', target_img)
img = plt.imread(target_img_path)

# Display the image alone, without the axes.
plt.figure(figsize=(12,16))
plt.imshow(img)
plt.axis("off")
plt.show()

The wordclouds and TF-IDF matrices of the other restaurants can be found in the dedicated folders.

# Global restaurant analysis

In [None]:
# def filtering_dataframe(df, column, terms):
    
#     df = df[df[''].str.findall()]
#     return df

# def generate_wordcloud(df, idx, directory, mask):
#     #filename = directory + str(idx) + "_word_cloud.png"

#     df_mean = df.mean().sort_values(ascending=False).to_frame(name='tfidf mean')
#     dict_words_tfidf = df_mean[df_mean['tfidf mean'] != 0].to_dict()['tfidf mean']

#     wordcloud = WordCloud(height=600, width=800, background_color="white",
#         colormap='Blues', max_words=100, mask=mask,
#         contour_width=0.5, contour_color='lightsteelblue')
#     wordcloud.generate_from_frequencies(frequencies=dict_words_tfidf)
#     wordcloud.to_file(filename)  

In [None]:
# Preview of the first 5 rows of the cleaned restaurants dataframe.
df_restaurants_cuisine.head(5)

In [None]:
# Column names, dtypes and non-null counts of the cleaned restaurants dataframe.
df_restaurants_cuisine.info()

In [None]:
# Mean statistics per group, using the default grouping column of cuisine_stats ('cuisine').
df_stats = cuisine_stats(df_restaurants_cuisine, stats='mean')
df_stats.head(20)

In [None]:
# Save the cleaned dataframe as 'cleaned_restaus.csv' in the current working directory.
df_restaurants_cuisine.to_csv('cleaned_restaus.csv')