In [7]:
import csv
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

from datetime import datetime
import re

# TripAdvisor Script 1
Features:
- Grab all reviews based on TripAdvisor language filtered amount
- Remove special characters and symbols for NLP use

Issue:
- Need to fix code if total native reviews is only on first page

Bugs:
- Some attractions the amount of filtered reviews can be inaccurate.

In [8]:
def fix_date(date):
  '''
  Clean date for consistency
  - Replaces Indonesian month names to English
  - Remove extra words
  - Reformat to YYYY-MM
  '''
  date_dict = {'Januari':'January', 'Februari':'February', 'Maret':'March', 'Mei':'May', 'Juni':'June',
                'Juli':'July', 'Agustus':'August', 'Oktober':'October', 'Desember':'December'}
  
  date = date.replace("Tanggal pengalaman: ", "")
      
  for key, val in date_dict.items():
    if key in date:
      date = date.replace(key, val)
    else:
      break
  
  date = datetime.strptime(date, '%B %Y')
  date = date.strftime('%Y-%m')
  
  return date

In [9]:
def clean_text(text):
  '''
  Remove special characters and symbols from text data
  '''
  text = re.sub('[^A-Za-z0-9]+', ' ', text).strip()
  return text

In [10]:
def scrape_review_s1(attraction, link): 
  
  print(f"[INFO] Currently Scrapping: {attraction}")
  
  start_time = datetime.now()
  
  # Create filename with date scrapped and attraction name
  file_path = str(datetime.now().strftime('%Y_%m_%d')) + '_' + attraction.replace(" ", "_").lower() + '.csv'
  file = open(f'data/raw/{file_path}', 'w', encoding="utf-8", newline='')
  csv_writer = csv.writer(file)
  
  # CSV Header
  csv_writer.writerow(['date', 'user', 'review', 'attraction', 'source'])
  
  driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
  driver.get(link)

  # Get number of Bahasa Indonesia reviews
  # Using this number to get number of pages to loop. 5 reviews/page
  native_num = driver.find_element(By.XPATH, "//label[@for='LanguageFilter_3']/span[2]").text
  native_num = int(re.search(r'\d+', native_num).group())
  num_pages = native_num // 5 + 1

  # Total review count
  total_reviews = driver.find_element(By.XPATH, "//span[@class='reviewCount siNVx S4 H3 Ci']").text
  total_reviews = int(''.join(i for i in total_reviews if i.isdigit()))
  
  review_count = 0
  page_num = 1
  
  # Only take remaining amount of reviews on last page
  rem = native_num % 5
  
  for i in range(num_pages-1):
    
    # Let page load (change to until element exist later)
    time.sleep(3) 
    
    # Expand the review
    driver.find_element(By.XPATH, ".//div[contains(@data-test-target, 'expand-review')]").click()

    # Extract elements
    users = driver.find_elements(By.XPATH, "//div[@class='bcaHz']/span[1]/a[1]")
    reviews = driver.find_elements(By.XPATH, "//q[@class='XllAv H4 _a']")
    dates = driver.find_elements(By.XPATH, "//span[@class='euPKI _R Me S4 H3']")
    
    for j in range(len(reviews)):
      user = clean_text(users[j].text)
      review = clean_text(reviews[j].text)
      date = fix_date(dates[j].text)
      csv_writer.writerow([date, user, review, attraction, 'tripadvisor']) 
 
    review_count += len(reviews)
    
    # Click next page
    next = driver.find_element(By.XPATH, "//a[@class='ui_button nav next primary ']")
    next.click()
    page_num += 1
    
    print(f"Review Count: {review_count}")
    print(f"Current page: {page_num}")
    
  else:
    # Let page load (change to until element exist later)
    time.sleep(3) 
    
    # Expand the review
    driver.find_element(By.XPATH, ".//div[contains(@data-test-target, 'expand-review')]").click()
    
    users = driver.find_elements(By.XPATH, "//div[@class='bcaHz']/span[1]/a[1]")
    reviews = driver.find_elements(By.XPATH, "//q[@class='XllAv H4 _a']")
    dates = driver.find_elements(By.XPATH, "//span[@class='euPKI _R Me S4 H3']")
    
    for k in range(rem):
      user = clean_text(users[k].text)
      review = clean_text(reviews[k].text)
      date = fix_date(dates[k].text)
      csv_writer.writerow([date, user, review, attraction, 'tripadvisor']) 
 
    review_count += rem
    
    # print(f"Review Count: {review_count}")
    # print(f"Current page: {page_num}")
  
  # ----- Scrapping Info -----
  print('-------------- Scrape Completed --------------')
  print(f"Total Reviews: {total_reviews}")
  print(f"Total Native Reviews: {review_count}")
  end_time = datetime.now()
  print('Runtime: {}'.format(end_time - start_time))

  file.close()
  driver.close()

In [11]:
df = pd.read_csv('attractions_list_script1.csv', header=None, names=["Attraction", "Link"])
df

Unnamed: 0,Attraction,Link
0,Sentosa Cable Car,https://www.tripadvisor.co.id/Attraction_Revie...
1,Mega Adventure Park,https://www.tripadvisor.co.id/Attraction_Revie...
2,SkyPark Sentosa by AJ Hackett,https://www.tripadvisor.co.id/Attraction_Revie...
3,Dolphin Island,https://www.tripadvisor.co.id/Attraction_Revie...
4,Segway Fun Ride (Gogreen Segway Eco Adventure),https://www.tripadvisor.co.id/Attraction_Revie...
5,Royal Albatross,https://www.tripadvisor.co.id/Attraction_Revie...


In [12]:
for index, row in df.iterrows():
    try:
        scrape_review_s1(row['Attraction'], row['Link'])
    except:
        print(f"No Native Reviews Found on {row['Attraction']}")
    finally:
        continue





[INFO] Currently Scrapping: Sentosa Cable Car


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


No Native Reviews Found on Sentosa Cable Car






[INFO] Currently Scrapping: Mega Adventure Park


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


-------------- Scrape Completed --------------
Total Reviews: 1382
Total Native Reviews: 4
Runtime: 0:00:07.342927






[INFO] Currently Scrapping: SkyPark Sentosa by AJ Hackett


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


-------------- Scrape Completed --------------
Total Reviews: 214
Total Native Reviews: 1
Runtime: 0:00:07.646540






[INFO] Currently Scrapping: Dolphin Island


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


No Native Reviews Found on Dolphin Island






[INFO] Currently Scrapping: Segway Fun Ride (Gogreen Segway Eco Adventure)


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


-------------- Scrape Completed --------------
Total Reviews: 294
Total Native Reviews: 0
Runtime: 0:00:07.221382






[INFO] Currently Scrapping: Royal Albatross


Current google-chrome version is 100.0.4896
Get LATEST chromedriver version for 100.0.4896 google-chrome
Driver [C:\Users\Luqman\.wdm\drivers\chromedriver\win32\100.0.4896.60\chromedriver.exe] found in cache


-------------- Scrape Completed --------------
Total Reviews: 565
Total Native Reviews: 1
Runtime: 0:00:07.624516
