# Social Data Science Exam - 2019

## Group 13

Nynne Bech Nielsen (btq674) \
Marcus Bjarup Thøgersen (vhp312) \
Kaiyue Xu (vsp923) \
Jakob Lauge Toft Hansen (qkr676)

### Python code for all plots and scraping

This Notebook contains the code for the data used in the final exam for group 13, in the course Social Data Science 2019.

In [None]:
## Code for the Log, made by Snorre.

import requests,os,time
def ratelimit(delay=1):
    """Pause between calls so the scraped server is not flooded.

    Keyword arguments:
    delay -- seconds to sleep before returning (default 1, the pause that was
             previously hard-coded).
    """
    time.sleep(delay)

class Connector():
  """Rate-limited, logged access to web pages through `requests` or Selenium.

  Every call made through `get` is appended as one ';'-separated row to the log file.
  """

  def __init__(self,logfile,overwrite_log=False,connector_type='requests',session=False,path2selenium='',n_tries = 5,timeout=30):
    """This Class implements a method for reliable connection to the internet and monitoring.
    It handles simple errors due to connection problems, and logs a range of information for basic quality assessments.

    Keyword arguments:
    logfile -- path to the logfile
    overwrite_log -- bool, defining if logfile should be cleared (rarely the case).
    connector_type -- use the 'requests' module or 'selenium'. The two behave differently, since the selenium webdriver does not return a response object from its get method, so monitoring cannot be automated in the same way.
    session -- requests.session object. For defining custom headers and proxies.
    path2selenium -- str, sets the path to the geckodriver needed when using selenium.
    n_tries -- int, defines the number of attempts the *get* method makes, to get past random connection errors.
    timeout -- int, seconds the get request will wait for the server to respond, again to avoid connection errors.
    """
    self.n_tries = n_tries # number of attempts per url before giving up
    self.timeout = timeout # maximum time to wait for the server to respond
    if connector_type=='selenium':
      assert path2selenium!='', "You need to specify the path to you geckodriver if you want to use Selenium"
      from selenium import webdriver # imported lazily so selenium is only required in this mode
      ## HINT: download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases

      assert os.path.isfile(path2selenium),'You need to insert a valid path2selenium the path to your geckodriver. You can download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases'
      self.browser = webdriver.Firefox(executable_path=path2selenium) # start the browser with a path to the geckodriver.

    self.connector_type = connector_type

    if session: # use the custom session (headers, proxies) when one is given
      self.session = session
    else:
      self.session = requests.session()
    self.logfilename = logfile
    ## define header for the logfile
    header = ['id','project','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
    if os.path.isfile(logfile):
      if overwrite_log==True:
        self.log = open(logfile,'w')
        self.log.write(';'.join(header))
      else:
        self.log = open(logfile,'a')
    else:
      self.log = open(logfile,'w')
      self.log.write(';'.join(header))
    self.log.flush() # make the header visible to readers of the file, including the id lookup below
    self.id = self._next_id(logfile)

  @staticmethod
  def _next_id(logfile):
    """Return the id for the next call: last logged id + 1, or 0 when the log holds no rows yet."""
    with open(logfile,'r') as f:
      rows = [line for line in f.read().split('\n') if line.strip()] # ignore blank lines, e.g. a trailing newline
    if len(rows)<=1: # nothing but the header (or an empty file)
      return 0
    # The id is the whole first field; reading only its first character would
    # restart the numbering at 2 as soon as the log reaches id 10.
    return int(rows[-1].split(';')[0])+1

  def _write_log(self,project_name,t,dt,url,redirect_url,size,response_code,success,err):
    """Append one row to the log, flush it to disk and return the call id given to the row."""
    call_id = self.id # current unique identifier for the call
    self.id+=1
    #['id','project_name','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
    row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err]
    self.log.write('\n'+';'.join(map(str,row)))
    self.log.flush() # the log is read with pandas while this handle is still open
    return call_id

  def get(self,url,project_name):
    """Fetch *url* with rate limiting, retries and logging.

    Keyword arguments:
    url -- str, url
    project_name -- str, Name used for analyzing the log. Use case could be the 'Mapping of domain','Meta_data_collection','main data collection'.

    Returns:
    requests mode -- the tuple (response, call_id) for the first attempt that does not raise.
                     Every failed attempt is logged with success=False; when all n_tries
                     attempts fail the method returns None.
    selenium mode -- only call_id; read the page from connector.browser.page_source.
    """
    project_name = project_name.replace(';','-') # make sure the default csv separator is not in the project_name.
    if self.connector_type=='requests':
      for _ in range(self.n_tries):
        ratelimit()
        t = time.time()
        try:
          response = self.session.get(url,timeout = self.timeout)
          redirect_url = response.url # current url, after potential redirects
          dt = time.time() - t # seconds spent waiting for the server and downloading (positive)
          size = len(response.text) # size of the html content of the response
          response_code = response.status_code
        except Exception as e: # deliberate catch-all: any failure is logged and the call retried
          # keep the csv separator and line breaks out of the logged error text
          err = str(e).replace(';','-').replace('\n',' ').replace('\r',' ')
          self._write_log(project_name,t,time.time() - t,url,'',0,'',False,err)
          continue
        call_id = self._write_log(project_name,t,dt,url,redirect_url,size,response_code,True,'')
        return response,call_id # return response and unique identifier.
      return None # every attempt raised
    else:
      ratelimit() # sleep before starting the clock, as in the requests branch
      t = time.time()
      self.browser.get(url) # use selenium get method
      redirect_url = self.browser.current_url
      dt = time.time() - t # NOTE: not necessarily the complete load time.
      size = len(self.browser.page_source) # NOTE: not necessarily correct, selenium could still be loading.
      # Selenium gives no response object, so response code, success and error stay blank.
      call_id = self._write_log(project_name,t,dt,url,redirect_url,size,'','','')
      return call_id

### This is the final code for our exam!

### Here we scrape the tripadvisor page for all restaurants in Copenhagen. 

In [None]:
# Importing packages 
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import pprint
import requests,os,re
from time import sleep
from urllib.request import urlopen

In [None]:
# Header: identify ourselves to the server.
session = requests.session()
session.headers['emails'] = "vhp312@alumni.ku.dk"
session.headers['names'] = "Nynne Bech Nielsen (btq674) Marcus Bjarup Thøgersen (vhp312) Kaiyue Xu (vsp923) Jakob Lauge Toft Hansen (qkr676)"
# Written on one line: a backslash continuation inside the literal would embed
# the indentation spaces in the header value.
session.headers['description'] = "Til brug for eksamen i Social Data Science, KU (https://kurser.ku.dk/course/aØkk08216u/2018-2019)"

# The session has to be handed to the Connector; otherwise the headers above are never sent.
connector = Connector('logfile_sds_trip_all_res.csv', session=session)

# Build the URLs of the 78 listing pages (30 restaurants per page, addressed by
# offset) and store them in 'links'. Each page is requested once here so that
# the call is recorded in the log file.
links=[]
for nummer in range(0, 30*78, 30):
    sleep(0.5)
    url_nr = 'https://www.tripadvisor.dk/Restaurants-g189541-oa{}-Copenhagen_Zealand.html'.format(nummer)
    response, call_id = connector.get(url_nr,'scraping restaurants')
    links.append(url_nr)
#print(links)

connector.log.flush() # make sure every logged row is on disk before reading the log
pd.read_csv('logfile_sds_trip_all_res.csv',sep=';')

In [None]:
# The function takes an url as input. 
def get_info(link):
    """Scrape one restaurant-listing page.

    Returns three equally long lists (names, ratings, review counts) with one
    entry per element of class "shortSellDetails". A field whose element is
    missing is stored as ''. When the response status is not 200 the three
    lists are empty, so callers can always unpack the result.
    """
    sleep(0.5)
    print("Slept 0.5, getting ", link)
    response = requests.get(link)
    N, B, R = [], [], []
    if response.status_code != 200:
        return N, B, R
    soup = BeautifulSoup(response.text, "lxml")
    for items in soup.find_all(class_="shortSellDetails"):
        # find() returns None for a missing element; test for that explicitly
        # instead of hiding every possible error behind a bare except.
        title = items.find(class_="property_title")
        rating = items.find(class_="ui_bubble_rating")
        count = items.find(class_="reviewCount")
        N.append(title.get_text(strip=True) if title is not None else '')  # restaurant name
        B.append(rating.get("alt") if rating is not None else '')          # rating text
        R.append(count.get_text(strip=True) if count is not None else '')  # number of reviews
    return N, B, R

# Scrape every listing page and collect one DataFrame per page.
data = []
for link in links:
    n,b,r = get_info(link)
    data.append(pd.DataFrame({'name': n, 'bubble': b, 'review':r}))
    



In [None]:
# Stack the per-page DataFrames into one table of all restaurants.
trip = pd.concat(data)
# Write the table to disk and read it back as 'trip_sorted'.
# NOTE: to_csv returns None when given a path, so 'trip_csv' is None.
trip_csv = trip.to_csv(r'/Users/marcusbjarupthogersen/Documents/Group_13/trip_csv.csv')
trip_sorted = pd.read_csv('/Users/marcusbjarupthogersen/Documents/Group_13/trip_csv.csv')
# Display the combined table.
trip


In [None]:
# Deleting rows with NaN and duplicates
trip_sorted = trip_sorted.dropna()
trip_sorted.drop_duplicates(subset ="name", inplace = True)

# to_csv returns None when given a path, so 'trip_csv2' is None.
trip_csv2 = trip_sorted.to_csv(r'/Users/marcusbjarupthogersen/Documents/Group_13/trip_csv2.csv')

# Delete the thousand separator so the review count can become an integer.
# regex=False is required: as a regular expression '.' matches every character
# and would empty all cells (the default was regex=True before pandas 2.0).
trip_sorted = trip_sorted.astype(str).apply(lambda x: x.str.replace('.','',regex=False))

# Replace the decimal comma with a dot so the rating can become a float.
trip_sorted = trip_sorted.astype(str).apply(lambda x: x.str.replace(',','.',regex=False))

# Extract the number from the review count and store it in another column as an integer.
trip_sorted['reviews int'] = trip_sorted['review'].str.extract(r'(\d+)', expand=False).astype(int)
#trip_sorted['bubbles int'] = trip_sorted['bubble'].str.extract('(\d+)').astype(float)

# Create a new column with the rating by deleting the sentence " ud af 5 bobler".
trip_sorted['bubbles float'] = trip_sorted['bubble'].replace(to_replace = r' ud af 5 bobler', value = '', regex=True)

# Convert the rating column into a float in order to sort on it later.
trip_sorted['bubbles float'] = trip_sorted['bubbles float'].astype(str).astype(float)

# Display the data sorted on review count (trip_sorted itself is left unsorted).
trip_sorted.sort_values(by = ['reviews int'], ascending = False)

### Here we scrape all the reviews for the three chosen restaurants: Mother, Kødbyens Fiskebar and Restaurant Tight.

In [None]:
# Importing packages 
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import pprint
import requests,os,re
from time import sleep
from urllib.request import urlopen

In [None]:
# First we scrape mother

# Header: identify ourselves to the server.
session = requests.session()
session.headers['emails'] = "vhp312@alumni.ku.dk"
session.headers['names'] = "Nynne Bech Nielsen (btq674) Marcus Bjarup Thøgersen (vhp312) Kaiyue Xu (vsp923) Jakob Lauge Toft Hansen (qkr676)"
session.headers['description'] = "Til brug for eksamen i Social Data Science, KU (https://kurser.ku.dk/course/aØkk08216u/2018-2019)"

# The session has to be handed to the Connector; otherwise the headers above are never sent.
connector = Connector('logfile_sds_trip_mother.csv', session=session)

# Build the URLs of the review pages for Mother: 192 pages with 10 reviews on
# each page, addressed by review offset. Stored in 'links_mother'.
links_mother=[]
for nummer in range(0, 10*192, 10): # 192 is the number of pages on tripadvisor
    sleep(0.5)
    url_nr = 'https://www.tripadvisor.com/Restaurant_Review-g189541-d1898372-Reviews-or{}-Mother-Copenhagen_Zealand.html'.format(nummer)
    response, call_id = connector.get(url_nr,'scraping restaurants') # make the log-file for data validation
    links_mother.append(url_nr)

connector.log.flush() # make sure every logged row is on disk before reading the log
pd.read_csv('logfile_sds_trip_mother.csv',sep=';') # examine the logfile results. It seems that it iterated over the same link too many times.

## Creating a function that searches for the review, rating (bubble) and location. 
### The function takes an url as input. 

In [None]:
connector = Connector('logfile_sds_trip_mother.csv')

def get_info(link):
    """Scrape one page of reviews.

    Returns three equally long lists with one entry per element of class
    "reviewSelector": the review text, the reviewer location and the rating
    tag. Missing text or location is stored as ''; the rating entry is whatever
    find() returns (the tag, or None when absent). When the response status is
    not 200 the three lists are empty, so callers can always unpack the result.
    """
    sleep(0.5)
    print("Slept 0.5, getting ", link)
    response = requests.get(link)
    N, L, M = [], [], []
    if response.status_code != 200:
        return N, L, M
    soup = BeautifulSoup(response.text, "lxml")
    for items in soup.find_all(class_="reviewSelector"):
        # find() returns None for a missing element; test for that explicitly
        # instead of hiding every possible error behind a bare except.
        text = items.find(class_="partial_entry")
        place = items.find(class_="userLoc")
        N.append(text.get_text(strip=True) if text is not None else '')
        L.append(place.get_text(strip=True) if place is not None else '')
        M.append(items.find(class_="ui_bubble_rating"))
    return N, L, M

# Scrape every review page and collect one DataFrame per page.
# NOTE(review): each link is fetched twice here - once inside get_info and once
# through the connector (only to log the call); consider parsing the connector's response.
data_mother = []
for link in links_mother:
    n, l, m = get_info(link)
    response, call_id = connector.get(link,'scraping mother')
    data_mother.append(pd.DataFrame({'name': n, 'location': l, 'bubble':m}))

connector.log.flush() # make sure every logged row is on disk before reading the log
pd.read_csv('logfile_sds_trip_mother.csv',sep=';')

In [None]:
# Stack the per-page DataFrames into one table of all reviews.
trip_mother = pd.concat(data_mother)

# The rating is read from the HTML of the bubble tag: the digits after
# "bubble_" are ten times the rating (presumably a class such as "bubble_45";
# the original fixed slice 37:-9 cut out the same digits by position).
# Matching the pattern survives changes in the surrounding markup, and a
# review without a rating tag becomes NaN instead of stopping the cell.
trip_mother["rating"] = trip_mother["bubble"].astype(str).str.extract(r'bubble_(\d+)', expand=False).astype(float) / 10

trip_mother = trip_mother.reset_index(drop=True)
trip_mother

In [None]:
# Save the reviews. The result is not assigned: to_csv returns None when given
# a path, which would replace the DataFrame in 'trip_mother' with None.
trip_mother.to_csv(r'/Users/marcusbjarupthogersen/Desktop/SDS_Eksamen/dataframe_mother2.csv')

In [None]:
# Kødbyens Fiskebar
# Header: identify ourselves to the server.
session = requests.session()
session.headers['emails'] = "vhp312@alumni.ku.dk"
session.headers['names'] = "Nynne Bech Nielsen (btq674) Marcus Bjarup Thøgersen (vhp312) Kaiyue Xu (vsp923) Jakob Lauge Toft Hansen (qkr676)"
session.headers['description'] = "Til brug for eksamen i Social Data Science, KU (https://kurser.ku.dk/course/aØkk08216u/2018-2019)"

# The session has to be handed to the Connector; otherwise the headers above are never sent.
connector = Connector('logfile_sds_trip_fiskebar.csv', session=session)

# Build the URLs of the review pages for Kødbyens Fiskebar: 242 pages with 10
# reviews on each page, addressed by review offset. Stored in 'links_fiskebar'.
links_fiskebar=[]
for nummer in range(0, 10*242, 10):
    sleep(0.5)
    url_nr = 'https://www.tripadvisor.com/Restaurant_Review-g189541-d2085491-Reviews-or{}-Kodbyens_Fiskebar-Copenhagen_Zealand.html'.format(nummer)
    response, call_id = connector.get(url_nr,'scraping kødbyens fiskebar')
    links_fiskebar.append(url_nr)

connector.log.flush() # make sure every logged row is on disk before reading the log
pd.read_csv('logfile_sds_trip_fiskebar.csv',sep=';')
print(links_fiskebar)

In [None]:
# Display the list of review-page URLs.
links_fiskebar

In [None]:
connector = Connector('logfile_sds_trip_fiskebar.csv')


def get_info(link):
    """Scrape one page of reviews.

    Returns three equally long lists with one entry per element of class
    "reviewSelector": the review text, the reviewer location and the rating
    tag. Missing text or location is stored as ''; the rating entry is whatever
    find() returns (the tag, or None when absent). When the response status is
    not 200 the three lists are empty, so callers can always unpack the result.
    """
    sleep(0.5)
    print("Slept 0.5, getting ", link)
    response = requests.get(link)
    N, L, M = [], [], []
    if response.status_code != 200:
        return N, L, M
    soup = BeautifulSoup(response.text, "lxml")
    for items in soup.find_all(class_="reviewSelector"):
        # find() returns None for a missing element; test for that explicitly
        # instead of hiding every possible error behind a bare except.
        text = items.find(class_="partial_entry")
        place = items.find(class_="userLoc")
        N.append(text.get_text(strip=True) if text is not None else '')
        L.append(place.get_text(strip=True) if place is not None else '')
        M.append(items.find(class_="ui_bubble_rating"))
    return N, L, M

# Scrape every review page and collect one DataFrame per page.
# NOTE(review): each link is fetched twice here - once inside get_info and once
# through the connector (only to log the call); consider parsing the connector's response.
data_fiskebar = []
for link in links_fiskebar:
    n, l, m = get_info(link)
    response, call_id = connector.get(link,'scraping kødbyens fiskebar reviews')
    data_fiskebar.append(pd.DataFrame({'name': n, 'location': l, 'bubble':m}))

connector.log.flush() # make sure every logged row is on disk before reading the log
pd.read_csv('logfile_sds_trip_fiskebar.csv',sep=';')

### Kødbyens fiskebar

In [None]:
# Stack the per-page DataFrames into one table of all reviews.
trip_fiskebar = pd.concat(data_fiskebar)

# The rating is read from the HTML of the bubble tag: the digits after
# "bubble_" are ten times the rating (presumably a class such as "bubble_45";
# the original fixed slice 37:-9 cut out the same digits by position).
# Matching the pattern survives changes in the surrounding markup, and a
# review without a rating tag becomes NaN instead of stopping the cell.
trip_fiskebar["rating"] = trip_fiskebar["bubble"].astype(str).str.extract(r'bubble_(\d+)', expand=False).astype(float) / 10

trip_fiskebar = trip_fiskebar.reset_index(drop=True)
trip_fiskebar

In [None]:
# Restaurant Tight
# Header: identify ourselves to the server.
session = requests.session()
session.headers['emails'] = "vhp312@alumni.ku.dk"
session.headers['names'] = "Nynne Bech Nielsen (btq674) Marcus Bjarup Thøgersen (vhp312) Kaiyue Xu (vsp923) Jakob Lauge Toft Hansen (qkr676)"
session.headers['description'] = "Til brug for eksamen i Social Data Science, KU (https://kurser.ku.dk/course/aØkk08216u/2018-2019)"

# The session has to be handed to the Connector; otherwise the headers above are never sent.
connector = Connector('logfile_sds_trip_tight.csv', session=session)

# Build the URLs of the review pages for Restaurant Tight: 287 pages with 10
# reviews on each page, addressed by review offset. Stored in 'links_tight'.
links_tight=[]
for nummer in range(0, 10*287, 10):
    sleep(0.5)
    url_nr = 'https://www.tripadvisor.dk/Restaurant_Review-g189541-d1528309-Reviews-or{}-Restaurant_Tight-Copenhagen_Zealand.html'.format(nummer)

    response, call_id = connector.get(url_nr,'scraping Restaurant Tight')
    # Without this append the list stays empty and the review scrape that
    # loops over 'links_tight' has nothing to fetch.
    links_tight.append(url_nr)

connector.log.flush() # make sure every logged row is on disk before reading the log
pd.read_csv('logfile_sds_trip_tight.csv',sep=';')
#print(links)

In [None]:
connector = Connector('logfile_sds_trip_tight.csv')

def get_info(link):
    """Scrape one page of reviews.

    Returns three equally long lists with one entry per element of class
    "reviewSelector": the review text, the reviewer location and the rating
    tag. Missing text or location is stored as ''; the rating entry is whatever
    find() returns (the tag, or None when absent). When the response status is
    not 200 the three lists are empty, so callers can always unpack the result.
    """
    sleep(0.5)
    print("Slept 0.5, getting ", link)
    response = requests.get(link)
    N, L, M = [], [], []
    if response.status_code != 200:
        return N, L, M
    soup = BeautifulSoup(response.text, "lxml")
    for items in soup.find_all(class_="reviewSelector"):
        # find() returns None for a missing element; test for that explicitly
        # instead of hiding every possible error behind a bare except.
        text = items.find(class_="partial_entry")
        place = items.find(class_="userLoc")
        N.append(text.get_text(strip=True) if text is not None else '')
        L.append(place.get_text(strip=True) if place is not None else '')
        M.append(items.find(class_="ui_bubble_rating"))
    return N, L, M


# Scrape every review page and collect one DataFrame per page.
# NOTE(review): each link is fetched twice here - once inside get_info and once
# through the connector (only to log the call); consider parsing the connector's response.
data_tight = []
for link in links_tight:
    n, l, m = get_info(link)
    response, call_id = connector.get(link,'Restaurant Tight reviews')
    data_tight.append(pd.DataFrame({'name': n, 'location': l, 'bubble':m}))

connector.log.flush() # make sure every logged row is on disk before reading the log
pd.read_csv('logfile_sds_trip_tight.csv',sep=';')

In [None]:
# Stack the per-page DataFrames into one table of all reviews.
trip_tight = pd.concat(data_tight)

# The rating is read from the HTML of the bubble tag: the digits after
# "bubble_" are ten times the rating (presumably a class such as "bubble_45";
# the original fixed slice 37:-9 cut out the same digits by position).
# Matching the pattern survives changes in the surrounding markup, and a
# review without a rating tag becomes NaN instead of stopping the cell.
trip_tight["rating"] = trip_tight["bubble"].astype(str).str.extract(r'bubble_(\d+)', expand=False).astype(float) / 10

# Reset the index of the Tight data itself; resetting 'trip_fiskebar' here
# replaced the Tight reviews with the Fiskebar reviews.
trip_tight = trip_tight.reset_index(drop=True)
trip_tight


In [None]:
# Load the request log written while scraping Mother (';'-separated).
log_mother = pd.read_csv('logfile_sds_trip_mother.csv', sep=';')
# Save summary statistics of the log columns for the report.
log_mother.describe().to_csv('description_log.csv')
# Plot the logged 'delta_t' column, one point per logged call.
log_mother.plot(y='delta_t')


# Sentiment analysis

### Now we do the sentiment analysis. We have included the code for the different sentiment analyses presented in the course. The final choice was the VADER analysis.

In [None]:
## importing packages
import numpy as np
seaborn as sns
pandas as pd

## For text classification:
import nltk, nltk.sentiment, sklearn
%matplotlib inline

### Download data as pandas dataframe
import requests
path2data = 'https://raw.githubusercontent.com/snorreralund/scraping_seminar/master/english_review_sample.csv'
df = pd.read_csv(path2data)

In [None]:
# Positive and negative sentiment lexicons, downloaded as plain text.
# Only the text after the last ';\n' is kept (presumably this skips a
# ';'-prefixed comment header - TODO confirm); it is then split into one
# entry per line and stored as a set for fast membership tests.
negative = set(requests.get('http://ptrckprry.com/course/ssd/data/negative-words.txt').text.split(';\n')[-1].split('\n'))
positive = set(requests.get('http://ptrckprry.com/course/ssd/data/positive-words.txt').text.split(';\n')[-1].split('\n'))
# Number of entries in each lexicon.
print(len(negative),len(positive))

In [None]:
# Importing the respective csv files (one per restaurant).

df1 = pd.read_csv (r'C:\Users\jtoft\Downloads\dataframe_tight_v2.csv')
df2 = pd.read_csv (r'C:\Users\jtoft\Downloads\dataframe_fiskebar.csv')
df3 = pd.read_csv (r'C:\Users\jtoft\Downloads\dataframe_mother.csv')

frames = [df1, df2, df3]

# ignore_index gives the combined table a fresh 0..n-1 index. Previously the
# result of reset_index was discarded, so the three frames kept overlapping
# index labels and the following "drop duplicated index" filter removed the
# rows of the second and third frame that shared a label with an earlier one.
df = pd.concat(frames, ignore_index=True)

In [None]:
# Take the text after the last comma of 'location' as the country. The value
# keeps any whitespace that followed the comma (e.g. ' Denmark').
df['country'] = df['location'].str.rsplit(',').str[-1] # isolating the country as a variable in column 
# Display the table with a fresh index; the result is not assigned, so df keeps its index.
df.reset_index(drop=True)

In [None]:
# One shared tokenizer instance, reused for every document.
tokenizer = nltk.tokenize.TweetTokenizer()

def preprocessing(string):
    """Lower-case *string* and return its tokens from the shared TweetTokenizer."""
    lowered = string.lower()
    tokens = tokenizer.tokenize(lowered)
    return tokens

In [None]:
# Tokenize every review text (column 'name').
documents = df['name'].apply(preprocessing)

In [None]:
# Count function written as an explicit loop.
def count_dictionary(tokenized_doc,dictionary):
    """Return how many tokens of *tokenized_doc* are found in *dictionary* (repeats counted)."""
    hits = 0
    for token in tokenized_doc:
        if token in dictionary:
            hits += 1
    return hits

In [None]:
# Number of lexicon hits per review, for the positive and the negative word list.
df['positive_liu'] = documents.apply(lambda tokens: count_dictionary(tokens, dictionary=positive))
df['negative_liu'] = documents.apply(lambda tokens: count_dictionary(tokens, dictionary=negative))

In [None]:
# import nltk.sentiment
import nltk
nltk.download('vader_lexicon')
import nltk.sentiment
# initialize the vader function
vader = nltk.sentiment.vader.SentimentIntensityAnalyzer()
# Apply the function and convert to a dataframe. The scores get df's own index:
# with the default 0..n-1 index the column-wise concat below pairs rows by
# label, which misplaces the scores whenever df's labels are not exactly 0..n-1.
vader_df = pd.DataFrame(list(df['name'].apply(vader.polarity_scores)), index=df.index)
# rename columns adding the 'vader_' prefix using a list comprehension
vader_df.columns = ['vader_'+col for col in vader_df.columns]
# merge with original dataframe
df = pd.concat([df,vader_df],axis=1)

In [None]:
from afinn import Afinn # one of the methods for sentiment analysis
# Scorer created with its default settings.
afinn = Afinn()
# Score every review text (column 'name') and store the result per row.
df['afinn'] = df.name.apply(afinn.score)

In [None]:
# Columns holding the scores of the three sentiment methods.
sentiment_columns = ['afinn','positive_liu','negative_liu']+[col for col in df.columns if 'vader_' in col]
hue = 'rating'
# Plot at most 2000 randomly drawn reviews; asking sample() for more rows than
# df holds raises a ValueError.
sample_size = min(2000, len(df))
sns.pairplot(df.sample(sample_size)[sentiment_columns+[hue]],hue=hue)

In [None]:
# Reviews with more positive than negative lexicon hits.
positive_liu_reviews = df[(df.positive_liu-df.negative_liu)>0]

# For each of them, ordered by ascending VADER compound score, print the
# positive lexicon words found in the review.
for idx in positive_liu_reviews.vader_compound.sort_values().index:
    print(idx,set(documents[idx])&positive)


In [None]:
import pandas as pd
# Keep one row per distinct review text.
df2 = df.drop_duplicates(subset=['name'])
# Count the rows per category.
# NOTE(review): the 'tourism' column is only created in the following cell, so
# this cell raises a KeyError unless that cell has been run first.
df2.groupby('tourism').count()

In [None]:
# Classify each review as written by a local or by a tourist. The country
# value carries a leading space because it is the text after the last comma of
# 'location'. Every row whose country is not ' Denmark' becomes 'Tourist'.
# ('dft' was an undefined name; the column lives in 'df'.)
df['tourism'] = np.where(df['country'] == ' Denmark', 'Local', 'Tourist')

# Number of rows in each category.
df.groupby('tourism').count()

In [None]:
# Quick exploratory plot: VADER compound score per rating as a swarm plot, coloured by local/tourist.
sns.catplot(x="rating", y="vader_compound", hue="tourism", kind="swarm", data=df)



In [None]:
# Same comparison as a box plot; the trailing ';' suppresses the notebook's text output.
sns.catplot(x="rating", y="vader_compound", hue="tourism", kind="box", data=df);



In [None]:
%matplotlib inline 
from matplotlib.patches import Polygon
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.style.use(['ggplot']) # optional: switch matplotlib to the ggplot-like style

In [None]:
# Making the final plot, a boxplot to illustrate rating and reviews between categories.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Draw Plot
plt.figure(figsize=(13,10), dpi= 80)
sns.boxplot(x='rating', y='vader_compound', data=df, hue='tourism')
#sns.stripplot(x='rating', y='vader_compound', data=df, color='black', size=2, jitter=1)

# NOTE(review): ymin and ymax are both 1, so these lines have zero length.
for i in range(len(df['tourism'].unique())-1):
    plt.vlines(i+.1, 1, 1, linestyles='solid', colors='gray', alpha=0.4)

# Decoration
plt.title('Reviews - local vs. tourist', fontsize=22)
plt.legend(title='Category', fontsize=12)
fig1 = plt.gcf()
fig1.tight_layout() # 'fig' was not defined at this point; the figure handle is 'fig1'
# Save before showing: once plt.show() has returned the figure may already be
# closed, which can leave the saved image empty.
fig1.savefig('boxplot_category.png', dpi=100)
plt.show()




In [None]:
# Statistics describing the data.

# Columns at positions 9 and 14 of df (slice 9:16 with step 5); not used further in this cell.
stat = df.iloc[:,9:16:5] 
# Descriptive statistics of every numeric column, per tourism category.
stat2 = df.groupby('tourism').describe()

# Save the table for the report.
stat2.to_csv(r'C:\Users\jtoft\Downloads\statestik.csv')

## The following sections make the graphs.  

### Most frequent words

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import numpy as np
%matplotlib inline
warnings.filterwarnings("ignore")


# Load the reviews written by locals, keeping the five columns used below.
df = pd.read_csv("local_freq.csv")[["name", "location", "bubble", "dollars", "rating"]]

# Country = the text after the last comma of 'location' (it keeps its leading space).
df['country'] = df['location'].str.rsplit(',').str[-1]

# lande = {' Denmark', ' Sweden', ' Norway', ' Finland', ' Iceland'}

# Label each row as 'Local' (country is ' Denmark') or 'Tourist' (anything else).
df['tourism'] = np.where(df['country'].eq(' Denmark'), 'Local', 'Tourist')
df

In [None]:
from collections import Counter

# Flatten all review texts into one list of whitespace-separated words.
out = [word for text in df['name'] for word in text.split()]

# Number of occurrences of each word, as a plain dict.
word_freq  = dict(Counter(out))
word_freq

In [None]:
# Sort and keep x highest values
# sorted(word_freq, key= word_freq.get, reverse=True)
from collections import OrderedDict
from operator import itemgetter

# Words ranked by descending frequency; keep ranks 19 to 50 (slice 18:50).
wordlist = dict(sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[18:50])
wordlist

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Horizontal bar chart with one bar per word in 'wordlist'.
plt.subplots(figsize=(18,10))
values = wordlist.values()
plt.barh(range(len(wordlist)), list(values), align='center')
plt.yticks(range(len(wordlist)), list(wordlist))
plt.xlabel('frequency')
plt.ylabel('words')
plt.title('Frequent words of locals')
plt.show()

In [None]:
# Load the reviews written by tourists, keeping the five columns used below.
df = pd.read_csv("tourist_freq.csv")[["name", "location", "bubble", "dollars", "rating"]]

# Country = the text after the last comma of 'location' (it keeps its leading space).
df['country'] = df['location'].str.rsplit(',').str[-1]

# lande = {' Denmark', ' Sweden', ' Norway', ' Finland', ' Iceland'}

# Label each row as 'Local' (country is ' Denmark') or 'Tourist' (anything else).
df['tourism'] = np.where(df['country'].eq(' Denmark'), 'Local', 'Tourist')
df

In [None]:
from collections import Counter

# Flatten all review texts into one list of whitespace-separated words.
out = [word for text in df['name'] for word in text.split()]

# Number of occurrences of each word, as a plain dict.
word_freq  = dict(Counter(out))
word_freq

In [None]:
from collections import OrderedDict
from operator import itemgetter

# Words ranked by descending frequency; keep ranks 19 to 50 (slice 18:50).
wordlist = dict(sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[18:50])
wordlist

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Horizontal bar chart with one bar per word in 'wordlist'.
plt.subplots(figsize=(18,10))
values = wordlist.values()
plt.barh(range(len(wordlist)), list(values), align='center')
plt.yticks(range(len(wordlist)), list(wordlist))
plt.xlabel('frequency')
plt.ylabel('words')
plt.title('Frequent words of tourists')
plt.show()

### Wordcloud

In [None]:
import numpy as np # linear algebra
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

from subprocess import check_output
from wordcloud import WordCloud, STOPWORDS

# Figure defaults for this section.
mpl.rcParams['figure.figsize']=(6.0,4.0)    #(6.0,4.0)
mpl.rcParams['font.size']=10               #10 
mpl.rcParams['savefig.dpi']= 72        #72 
mpl.rcParams['figure.subplot.bottom']=.1 

# Words left out of the cloud: a few project-specific ones plus the wordcloud defaults.
stopwords = ["restaurant", "Pizza", "Tight","els","back","wi","pi","ve","ano","delish"] + list(STOPWORDS)

# Join the full text of every review. str(series) would pass the printed
# representation instead: row numbers, a '...' where pandas truncates long
# output, and the trailing "Name: ..., dtype: ..." line.
# NOTE(review): 'data' must be a DataFrame with a 'name' column here - confirm
# which table this cell is meant to use.
text = ' '.join(data['name'].astype(str))

wordcloud = WordCloud(
                          background_color='white',
                          stopwords=stopwords,
                          max_words=100,
                          max_font_size=40, 
                          random_state=42
                         ).generate(text)

print(wordcloud)

fig = plt.figure(1, figsize = (12,6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
# Save before showing: once plt.show() has returned the figure may already be
# closed, which can leave the saved image empty.
fig.savefig("word1.png", dpi=900)
plt.show()



### Top 5

In [None]:
import csv
import pandas as pd

In [None]:
# # Deleting rows with NaN and duplicates 
# trip_sorted = trip_sorted.dropna()
# trip_sorted.drop_duplicates(subset ="name"
#                             , inplace = True) #

trip_sorted = pd.read_csv(r'/Users/KaiyueXu/Desktop/test.csv')

# Delete the thousand separator so the review count can become an integer.
# regex=False is required: as a regular expression '.' matches every character
# and would empty all cells (the default was regex=True before pandas 2.0).
trip_sorted = trip_sorted.astype(str).apply(lambda x: x.str.replace('.','',regex=False))

# Replace the decimal comma with a dot so the rating can become a float.
trip_sorted = trip_sorted.astype(str).apply(lambda x: x.str.replace(',','.',regex=False))

In [None]:
# Extract the number from the review count and store it in another column as an integer.
# The pattern is a raw string: '\d' in a normal string is an invalid escape
# sequence, which newer Python versions warn about.
trip_sorted['reviews int'] = trip_sorted['review'].str.extract(r'(\d+)', expand=False).astype(int)
#trip_sorted['bubbles int'] = trip_sorted['bubble'].str.extract('(\d+)').astype(float)

# Create a new column with the rating by deleting the sentence " ud af 5 bobler".
trip_sorted['bubbles int'] = trip_sorted['bubble'].replace(to_replace = r' ud af 5 bobler', value = '', regex=True)

# Convert the rating column into a float in order to sort on it later
# (despite its name, the 'bubbles int' column holds floats).
trip_sorted['bubbles int'] = trip_sorted['bubbles int'].astype(str).astype(float)

In [None]:
# Sort on review count (largest first), drop the two 'Unnamed' columns and
# keep the five most-reviewed restaurants.
trip_rank = trip_sorted.sort_values(by = ['reviews int'], ascending = False).drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1).head(5)
# Display the top five.
trip_rank

In [None]:
import matplotlib.pyplot as plt

# Review counts of the five most-reviewed restaurants and their names, in the same order.
Reviews = [1919,2021,2135,2417,2826]
x = ['Mother','Höst','The Olive Kitchen &Bar','Kodbyens Fiskebar','Restaurant Tight']

positions = range(len(Reviews))
plt.barh(positions, Reviews, 0.4,color='b', alpha = 0.8)

plt.ylabel('Restaurant')
plt.xlabel('Reviews')
plt.title('Top 5 most reviews restaurants')

# Reuse the label list instead of repeating it.
plt.yticks(positions, x)

plt.xlim([1500,3000])

# Write the count next to each bar. The loop uses its own names so that it no
# longer overwrites the label list 'x'.
for position, count in enumerate(Reviews):
    plt.text(count+0.2,position,'%s' %count,va='center')

plt.show()

## This is the code that gets the Danish reviews in the expanded version. The data is not analyzed in the research paper.


In [None]:
## SOLUTION

import requests,os,time
def ratelimit(delay=1):
    """Pause between calls so the scraped server is not flooded.

    Keyword arguments:
    delay -- seconds to sleep before returning (default 1, the pause that was
             previously hard-coded).
    """
    time.sleep(delay)

class Connector():
  """Makes web requests through `requests` or Selenium and logs every call.

  Each call made with `get` is appended as one ';'-separated row to the logfile,
  using the columns: id, project, connector_type, t, delta_t, url, redirect_url,
  response_size, response_code, success, error.
  """

  def __init__(self,logfile,overwrite_log=False,connector_type='requests',session=False,path2selenium='',n_tries = 5,timeout=30):
    """Set up the connection method and open the logfile.

    Keyword arguments:
    logfile -- path to the logfile. It is created (with a header row) if it does not exist.
    overwrite_log -- bool, defining if an existing logfile should be cleared (rarely the case).
    connector_type -- 'requests' uses the session's get method; any other value makes `get` use the selenium browser, which is only started when the value is 'selenium'.
    session -- requests.session object. For defining custom headers and proxies. A new session is created when omitted.
    path2selenium -- str, sets the path to the geckodriver needed when using selenium.
    n_tries -- int, defines how many times the *get* method will try a request before giving up.
    timeout -- int, seconds the get request will wait for the server to respond.
    """
    self.n_tries = n_tries # number of attempts per url in the requests branch.
    self.timeout = timeout # maximum time to wait for the server to respond.
    if connector_type=='selenium':
      assert path2selenium!='', "You need to specify the path to you geckodriver if you want to use Selenium"
      from selenium import webdriver
      ## HINT download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases

      assert os.path.isfile(path2selenium),'You need to insert a valid path2selenium the path to your geckodriver. You can download the latest geckodriver here: https://github.com/mozilla/geckodriver/releases'
      self.browser = webdriver.Firefox(executable_path=path2selenium) # start the browser with a path to the geckodriver.

    self.connector_type = connector_type

    if session: # use the custom session when one is given
      self.session = session
    else:
      self.session = requests.session()
    self.logfilename = logfile
    ## define header for the logfile
    header = ['id','project','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
    if os.path.isfile(logfile):
      if overwrite_log==True:
        self.log = open(logfile,'w')
        self.log.write(';'.join(header))
      else:
        self.log = open(logfile,'a')
    else:
      self.log = open(logfile,'w')
      self.log.write(';'.join(header))
    self.log.flush() # make sure a freshly written header is on disk.

    ## Continue the id sequence from the last logged call.
    with open(logfile,'r') as f:
      lines = [line for line in f.read().split('\n') if line.strip()]
    self.id = 0
    # Walk backwards to the last row whose first field is a number. The whole
    # field is parsed (not just its first character), so ids of 10 and above
    # continue correctly; the header and any blank lines are skipped.
    for line in reversed(lines):
      try:
        self.id = int(line.split(';')[0])+1
      except ValueError:
        continue
      break

  def _log_call(self,project_name,t,dt,url,redirect_url,size,response_code,success,err):
    """Append one row to the log, flush it to disk and return the id given to the call."""
    call_id = self.id # unique identifier for this call
    self.id+=1
    #['id','project_name','connector_type','t', 'delta_t', 'url', 'redirect_url','response_size', 'response_code','success','error']
    row = [call_id,project_name,self.connector_type,t,dt,url,redirect_url,size,response_code,success,err]
    self.log.write('\n'+';'.join(map(str,row)))
    self.log.flush() # keeps the file on disk complete if it is read while the connector is alive.
    return call_id

  def get(self,url,project_name):
    """Fetch a url and log the call.

    Keyword arguments:
    url -- str, url
    project_name -- str, Name used for analyzing the log. Use case could be the 'Mapping of domain','Meta_data_collection','main data collection'.

    Returns:
    With connector_type 'requests': a tuple (response, call_id) for the first
    attempt that succeeds. Every failed attempt is logged as its own row; if all
    n_tries attempts fail, None is returned.
    Otherwise (selenium): only the call_id. The html is available through
    connector.browser.page_source.
    """
    project_name = project_name.replace(';','-') # make sure the csv separator is not in the project_name.
    if self.connector_type=='requests':
      for _ in range(self.n_tries):
        ratelimit()
        t = time.time()
        try:
          response = self.session.get(url,timeout = self.timeout)
          redirect_url = response.url # current url, after potential redirects
          dt = time.time() - t # seconds spent waiting for the server and downloading content.
          size = len(response.text) # size of the content of the response.
          response_code = response.status_code
        except Exception as e:
          # Log the failed attempt and move on to the next try.
          self._log_call(project_name,t,time.time() - t,url,'',0,'',False,str(e))
        else:
          call_id = self._log_call(project_name,t,dt,url,redirect_url,size,response_code,True,'')
          return response,call_id
    else:
      t = time.time()
      ratelimit()
      self.browser.get(url) # use selenium get method
      redirect_url = self.browser.current_url
      dt = time.time() - t # NOTE: includes the ratelimit pause, and is not necessarily the complete load time.
      size = len(self.browser.page_source) # NOTE: not necessarily correct, since the page could still be loading.
      # Selenium gives no response object, so success and response code are logged blank.
      call_id = self._log_call(project_name,t,dt,url,redirect_url,size,'','','')
      return call_id
    

In [None]:
import pandas as pd
from selenium import webdriver
from time import sleep
# No browser is started in this cell: the pages are requested through the
# connector only, and the scraping cell below starts its own driver.
connector = Connector('logfile_sds_trip_mother3.csv')

# 27 pages of Danish written reviews; the page offset in the url moves in steps of 10.
links_mother=[]
for nummer in range(0, 10*27, 10):
    sleep(0.5)
    url_nr = 'https://www.tripadvisor.com/Restaurant_Review-g189541-d1898372-Reviews-or{}-Mother-Copenhagen_Zealand.html'.format(nummer)
    # The request is made so that the call ends up in the log. The result is not
    # unpacked, because connector.get returns None when every retry has failed.
    connector.get(url_nr,'scraping restaurants')
    links_mother.append(url_nr)

# Write buffered log rows to disk so the log shown below is complete.
connector.log.flush()
pd.read_csv('logfile_sds_trip_mother3.csv',sep=';')

In [None]:
from selenium import webdriver
driver = webdriver.Firefox(executable_path=r'/Users/marcusbjarupthogersen/Documents/geckodriver')

# Collects one DataFrame of review texts per page in links_mother.
data = []
try:
    for link in links_mother:
        sleep(0.7)
        driver.get(link)
        # Presumably the "more" link that expands truncated reviews - confirm against the page.
        driver.find_element_by_xpath("//span[contains(@class, 'ulBlueLinks')]").click()
        sleep(0.7) # give the page time to update after the click.
        N = [item.find_element_by_class_name('partial_entry').text
             for item in driver.find_elements_by_class_name('reviewSelector')]
        data.append(pd.DataFrame({'review': N}))
finally:
    # Close the browser even if a page fails, so no Firefox process is left running.
    driver.quit()

In [None]:
# Stacks the per-page frames into one DataFrame of reviews.
trip_motherDK = pd.concat(data)
# to_csv returns None when it is given a path, so its result must not be
# assigned back: the DataFrame stays bound to the name and is displayed below.
trip_motherDK.to_csv(r'/Users/marcusbjarupthogersen/Documents/review_motherDK.csv')
trip_motherDK

In [None]:
import pandas as pd
from selenium import webdriver
from time import sleep
# No browser is started in this cell: the pages are requested through the
# connector only, and the scraping cell below starts its own driver.
connector = Connector('logfile_sds_trip_fiskebar3.csv')

# 16 pages of Danish written reviews; the page offset in the url moves in steps of 10.
links_fiskebar=[]
for nummer in range(0, 10*16, 10):
    sleep(0.5)
    url_nr = 'https://www.tripadvisor.com/Restaurant_Review-g189541-d2085491-Reviews-or{}-Kodbyens_Fiskebar-Copenhagen_Zealand.html'.format(nummer)
    # The request is made so that the call ends up in the log. The result is not
    # unpacked, because connector.get returns None when every retry has failed.
    connector.get(url_nr,'scraping restaurants')
    links_fiskebar.append(url_nr)

# Write buffered log rows to disk so the log shown below is complete.
connector.log.flush()
pd.read_csv('logfile_sds_trip_fiskebar3.csv',sep=';')

In [None]:
from selenium import webdriver
driver = webdriver.Firefox(executable_path=r'/Users/marcusbjarupthogersen/Documents/geckodriver')

# Collects one DataFrame of review texts per page in links_fiskebar.
data_fiskebar = []
try:
    for link in links_fiskebar:
        sleep(0.7)
        driver.get(link)
        # Presumably the "more" link that expands truncated reviews - confirm against the page.
        driver.find_element_by_xpath("//span[contains(@class, 'ulBlueLinks')]").click()
        sleep(0.7) # give the page time to update after the click.
        N = [item.find_element_by_class_name('partial_entry').text
             for item in driver.find_elements_by_class_name('reviewSelector')]
        data_fiskebar.append(pd.DataFrame({'review': N}))
finally:
    # Close the browser even if a page fails, so no Firefox process is left running.
    driver.quit()

In [None]:
# Stacks the per-page frames into one DataFrame of reviews.
trip_fiskebarDK = pd.concat(data_fiskebar)
# to_csv returns None when it is given a path, so its result must not be
# assigned back: the DataFrame stays bound to the name and is displayed below.
trip_fiskebarDK.to_csv(r'/Users/marcusbjarupthogersen/Documents/review_fiskebarDK.csv')
trip_fiskebarDK

In [None]:
import pandas as pd
from selenium import webdriver
from time import sleep
# No browser is started in this cell: the pages are requested through the
# connector only, and the scraping cell below starts its own driver.
connector = Connector('logfile_sds_trip_tight3.csv')

# 20 pages of Danish written reviews; the page offset in the url moves in steps of 10.
links_tight=[]
for nummer in range(0, 10*20, 10):
    sleep(0.5)
    url_nr = 'https://www.tripadvisor.dk/Restaurant_Review-g189541-d1528309-Reviews-or{}-Restaurant_Tight-Copenhagen_Zealand.html'.format(nummer)
    # The request is made so that the call ends up in the log. The result is not
    # unpacked, because connector.get returns None when every retry has failed.
    connector.get(url_nr,'scraping restaurants')
    links_tight.append(url_nr)

# Write buffered log rows to disk so the log shown below is complete.
connector.log.flush()
pd.read_csv('logfile_sds_trip_tight3.csv',sep=';')

In [None]:
from selenium import webdriver
driver = webdriver.Firefox(executable_path=r'/Users/marcusbjarupthogersen/Documents/geckodriver')

# Collects one DataFrame of review texts per page in links_tight.
data_tight = []
try:
    for link in links_tight:
        sleep(1)
        driver.get(link)
        # Presumably the "more" link that expands truncated reviews - confirm against the page.
        driver.find_element_by_xpath("//span[contains(@class, 'ulBlueLinks')]").click()
        sleep(1) # give the page time to update after the click.
        N = [item.find_element_by_class_name('partial_entry').text
             for item in driver.find_elements_by_class_name('reviewSelector')]
        data_tight.append(pd.DataFrame({'review': N}))
finally:
    # Close the browser even if a page fails, so no Firefox process is left running.
    driver.quit()

In [None]:
# Stacks the per-page frames into one DataFrame of reviews.
trip_tightDK = pd.concat(data_tight)
# to_csv returns None when it is given a path, so its result must not be
# assigned back: the DataFrame stays bound to the name and is displayed below.
trip_tightDK.to_csv(r'/Users/marcusbjarupthogersen/Documents/review_tightDK.csv')
trip_tightDK