# Imports

In [1]:
import csv
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

from datetime import datetime

# TripAdvisor Script 0 Version 4
- Fixed filtering native reviews
- Unable to retrieve reviews that have no date just below the username

In [38]:
# Version 4

def scrape_review_v4_s0(attraction, link, page_load_wait=3):
  """Scrape the filtered ("native") TripAdvisor reviews of one attraction into a CSV file.

  Opens Chrome on ``link``, reads the total review count, then walks the review
  pages. On every page it collects the review cards that contain a
  'Google Terjemahan' span (this script counts those as the native reviews) and
  appends one row ``[YYYY-MM, user, review text]`` per card to
  ``data/raw/<YYYY_MM_DD>_<attraction lower-cased, spaces -> underscores>.csv``.
  Scraping stops when a page yields no such card, when there is no "Next page"
  control left, or when the computed number of pages has been visited.

  Args:
    attraction: Display name of the attraction; used in log output and in the
      CSV file name.
    link: URL of the attraction's TripAdvisor review page.
    page_load_wait: Seconds to sleep before reading each page (default 3, the
      value that used to be hard-coded).

  Returns:
    None. Results are written to the CSV file and progress is printed.

  Raises:
    selenium NoSuchElementException: if the total-review-count element is not found.
    ValueError: if the review count is not an integer or a date does not parse
      as '%b %Y'.
    IndexError: if a page yields fewer user/date elements than review elements.
    The CSV file and the browser session are released in every case.
  """
  print(f"[INFO] Currently Scrapping: {attraction}")

  start_time = datetime.now()

  # TripAdvisor lists 10 reviews per page (the divisor the original code used).
  reviews_per_page = 10

  # Indonesian month abbreviations that differ from the English ones '%b' is parsed with.
  date_dict = {'Okt':'Oct', 'Des':'Dec', 'Agt':'Aug', 'Mei':'May'}

  # Set review and page number counter
  review_count = 0
  page_num = 1

  # The file is opened in append mode: re-running on the same day adds rows to the same file.
  file_path = start_time.strftime('%Y_%m_%d') + '_' + attraction.replace(" ", "_").lower() + '.csv'
  with open(f'data/raw/{file_path}', 'a', encoding="utf-8", newline='') as file:
    csv_writer = csv.writer(file)

    # Open up browser and navigate to page
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
      driver.get(link)

      # Total review count; '.' (and ',') are thousands separators, never decimals.
      total_text = driver.find_element(By.XPATH, "//*[@href='#REVIEWS']/div[1]/span[1]/span[1]").text
      total_reviews = int(total_text.replace('.', '').replace(',', ''))

      # Ceiling division: an exact multiple of the page size must not add an extra page.
      num_pages = (total_reviews + reviews_per_page - 1) // reviews_per_page

      for _ in range(num_pages):

        # Let page load (change to until element exist later)
        time.sleep(page_load_wait)

        # Extract elements
        users = driver.find_elements(By.XPATH, "//*[@data-automation='reviewCard']//span[contains(text(),'Google Terjemahan')]//parent::button//parent::div[1]/div[1]/div[1]/div[2]/span")
        reviews = driver.find_elements(By.XPATH, "//*[@data-automation='reviewCard']//span[contains(text(),'Google Terjemahan')]//parent::button//parent::div/div[5]/div[1]")
        # NOTE(review): unlike users/reviews, dates are NOT restricted to the filtered
        # cards, so dates[j] only matches reviews[j] if the filtered cards come first
        # on the page and every card has a date element -- confirm against the live DOM.
        dates = driver.find_elements(By.XPATH, "//*[@class='eRduX']")

        # If no. of native reviews on page is 0, stop before paging any further
        if not reviews:
          break

        for j in range(len(reviews)):
          user = users[j].text
          review = reviews[j].text.replace('\n', ' ')
          date = dates[j].text[:9].strip()

          # Replace a leading Indonesian month abbreviation with its English form
          date = date_dict.get(date[:3], date[:3]) + date[3:]

          date = datetime.strptime(date, '%b %Y').strftime('%Y-%m')

          # Write reviews and date into csv file
          csv_writer.writerow([date, user, review])

        review_count += len(reviews)

        # Click next page; find_elements returns [] instead of raising on the last page
        next_buttons = driver.find_elements(By.XPATH, "//a[@aria-label='Next page']//*[name()='svg']")
        if not next_buttons:
          break
        next_buttons[0].click()
        page_num += 1

        print(f"Review Count: {review_count}")
        print(f"Current page: {page_num}")

      print('-------------- Scrape Completed --------------')
      print(f"Total Reviews: {total_reviews}")
      print(f"Total Native Reviews: {review_count}")
      end_time = datetime.now()
      print('Runtime: {}'.format(end_time - start_time))
    finally:
      # quit() ends the whole WebDriver session, not just the current window
      driver.quit()

In [39]:
# Load the attraction list into a dict: column 0 of each CSV row becomes the key
# and column 1 the value (used by the scraping cell as attraction name -> link).
attractions_dict = {}

# encoding='utf-8-sig' drops a leading BOM if present; newline='' is how the csv
# module documentation says files passed to csv.reader should be opened.
with open('attractions_list_script0.csv', mode='r', encoding='utf-8-sig', newline='') as inp:
    reader = csv.reader(inp)
    # Blank lines come back from csv.reader as [] and would raise IndexError on
    # rows[1]; skip any row that does not have both columns.
    attractions_dict = {rows[0]: rows[1] for rows in reader if len(rows) >= 2}

print(attractions_dict)

{'iFly Singapore': 'https://www.tripadvisor.co.id/Attraction_Review-g294264-d2180413-Reviews-IFly_Singapore-Sentosa_Island.html', 'Wave House Sentosa': 'https://www.tripadvisor.co.id/Attraction_Review-g294264-d2005250-Reviews-Wave_House_Sentosa-Sentosa_Island.html', 'KidZania Singapore': 'https://www.tripadvisor.co.id/Attraction_Review-g294264-d7789437-Reviews-KidZania_Singapore-Sentosa_Island.html', 'Wings of Time': 'https://www.tripadvisor.co.id/Attraction_Review-g294264-d1371247-Reviews-Wings_of_Time-Sentosa_Island.html', 'Tiger Sky Tower': 'https://www.tripadvisor.co.id/Attraction_Review-g294264-d1892452-Reviews-Tiger_Sky_Tower-Sentosa_Island.html', 'Sentosa 4D AdventureLand': 'https://www.tripadvisor.co.id/Attraction_Review-g294264-d1936430-Reviews-Sentosa_4D_Adventureland-Sentosa_Island.html', 'Madame Tussauds': 'https://www.tripadvisor.co.id/Attraction_Review-g294264-d7178019-Reviews-Madame_Tussauds_Singapore-Sentosa_Island.html', 'Butterfly Park & Insect Kingdom': 'https://www.

In [40]:
# Run the scraper once per entry of attractions_dict (key = attraction, value = link).
for attraction_name in attractions_dict:
  scrape_review_v4_s0(attraction_name, attractions_dict[attraction_name])





[INFO] Currently Scrapping: iFly Singapore


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 2
Current page: 2
-------------- Scrape Completed --------------
Total Reviews: 678
Total Native Reviews: 2
Runtime: 0:00:10.581246






[INFO] Currently Scrapping: Wave House Sentosa


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 1
Current page: 2
-------------- Scrape Completed --------------
Total Reviews: 222
Total Native Reviews: 1
Runtime: 0:00:10.444347






[INFO] Currently Scrapping: KidZania Singapore


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 2
Current page: 2
-------------- Scrape Completed --------------
Total Reviews: 561
Total Native Reviews: 2
Runtime: 0:00:10.480106






[INFO] Currently Scrapping: Wings of Time


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 10
Current page: 2
Review Count: 20
Current page: 3
Review Count: 30
Current page: 4
Review Count: 38
Current page: 5
Review Count: 48
Current page: 6
Review Count: 57
Current page: 7
Review Count: 59
Current page: 8
-------------- Scrape Completed --------------
Total Reviews: 2124
Total Native Reviews: 59
Runtime: 0:00:31.290450






[INFO] Currently Scrapping: Tiger Sky Tower


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 10
Current page: 2
Review Count: 14
Current page: 3
-------------- Scrape Completed --------------
Total Reviews: 546
Total Native Reviews: 14
Runtime: 0:00:13.760156






[INFO] Currently Scrapping: Sentosa 4D AdventureLand


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 1
Current page: 2
-------------- Scrape Completed --------------
Total Reviews: 478
Total Native Reviews: 1
Runtime: 0:00:10.445200






[INFO] Currently Scrapping: Madame Tussauds


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 10
Current page: 2
Review Count: 20
Current page: 3
Review Count: 30
Current page: 4
Review Count: 40
Current page: 5
Review Count: 50
Current page: 6
Review Count: 60
Current page: 7
Review Count: 70
Current page: 8
Review Count: 80
Current page: 9
Review Count: 90
Current page: 10
Review Count: 100
Current page: 11
Review Count: 108
Current page: 12
-------------- Scrape Completed --------------
Total Reviews: 2103
Total Native Reviews: 108
Runtime: 0:00:46.064908






[INFO] Currently Scrapping: Butterfly Park & Insect Kingdom


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 3
Current page: 2
-------------- Scrape Completed --------------
Total Reviews: 604
Total Native Reviews: 3
Runtime: 0:00:10.377565






[INFO] Currently Scrapping: Adventure Cove Waterpark


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 10
Current page: 2
Review Count: 15
Current page: 3
-------------- Scrape Completed --------------
Total Reviews: 2480
Total Native Reviews: 15
Runtime: 0:00:14.940556






[INFO] Currently Scrapping: S.E.A Aquarium


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 10
Current page: 2
Review Count: 20
Current page: 3
Review Count: 30
Current page: 4
Review Count: 40
Current page: 5
Review Count: 50
Current page: 6
Review Count: 60
Current page: 7
Review Count: 70
Current page: 8
Review Count: 80
Current page: 9
Review Count: 90
Current page: 10
Review Count: 100
Current page: 11
Review Count: 110
Current page: 12
Review Count: 119
Current page: 13
Review Count: 129
Current page: 14
Review Count: 138
Current page: 15
Review Count: 141
Current page: 16
-------------- Scrape Completed --------------
Total Reviews: 6745
Total Native Reviews: 141
Runtime: 0:00:58.957818






[INFO] Currently Scrapping: The Maritime Experiential Museum


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


Review Count: 10
Current page: 2
-------------- Scrape Completed --------------
Total Reviews: 305
Total Native Reviews: 10
Runtime: 0:00:11.570840
