In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import re
import logging
import logging.config


class TripAdvisorCollector(object):
    """Selenium-driven helper that walks a TripAdvisor hotel listing.

    Creating an instance launches a Chrome session (the driver binary is
    resolved through ``webdriver_manager``) and configures the root logger to
    append to ``TripAdvisorCrawler.txt``.  Call :meth:`quit_driver` when done
    so the browser process is released.
    """

    # Locators shared by several methods; kept in one place so they cannot drift.
    NEXT_BUTTON_XPATH = '//div/span[text()="Next" and contains(@onclick,"widget")]'
    SEE_ALL_XPATH = '//button/span[text()="Lihat semua" or text()="See all"]'
    HOTEL_ANCHOR_XPATH = '//div[@class="listing_title "]/a'

    # Hotel links carry the id as "-d<digits>-"; capture only the digits.
    HOTEL_ID_PATTERN = re.compile(r'-d(\d+)-')

    def __init__(self):
        """Start Chrome with the crawler's options and set up file logging."""
        self.option = webdriver.ChromeOptions()
        self.option.add_argument('--no-sandbox')
        self.option.add_argument('--disable-dev-shm-usage')
        self.option.add_argument('--start-maximized')
        self.option.add_argument('--disable-infobars')
        self.option.add_argument('--disable-extensions')
        self.option.add_argument('--disable-gpu')
        self.option.add_argument('--ignore-certificate-errors')
        self.option.add_argument('--lang=en')
        # Content-setting value 2 blocks notification prompts.
        self.option.add_experimental_option("prefs", {
            "profile.default_content_setting_values.notifications": 2
            })
        #self.option.add_argument('--headless')
        self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=self.option)

        logging.config.dictConfig({
            'version':1,
            'disable_existing_loggers':True,
        })
        # NOTE: basicConfig does nothing when the root logger already has
        # handlers, so only the first instance in a kernel sets up the file.
        logging.basicConfig(
            level=logging.INFO,
            format= '{asctime} {levelname:<8} {message}',
            style='{',
            filename='TripAdvisorCrawler.txt',
            filemode='a+'
        )

    def go_to_link(self, destination_link):
        """Navigate the browser to ``destination_link``."""
        self.driver.get(destination_link)

    def get_hotel_link(self):
        """Collect the hotel links shown on the currently loaded listing page.

        Returns:
            list[dict]: one ``{'hotel_id': str, 'link': str}`` entry per
            listing anchor whose ``href`` contains a ``-d<digits>-`` id.
            Anchors without an ``href`` or without a recognisable id are
            logged and skipped instead of raising.
        """
        driver = self.driver

        # Jump to the end of the page (presumably so late-rendered controls
        # such as the "See all" button exist before we look for them).
        page_root = driver.find_element(By.XPATH, "/html")
        page_root.send_keys(Keys.END)
        try:
            WebDriverWait(driver, 3).until(
                EC.presence_of_element_located(
                    (By.XPATH, self.SEE_ALL_XPATH))).click()
        except (NoSuchElementException, TimeoutException):
            # The expand button is optional; carry on with what is visible.
            pass

        all_hotel_data = []
        for hotel_element in driver.find_elements(By.XPATH, self.HOTEL_ANCHOR_XPATH):
            hotel_link = hotel_element.get_attribute('href')
            id_match = self.HOTEL_ID_PATTERN.search(hotel_link) if hotel_link else None
            if id_match is None:
                logging.warning('Skipping listing without a parsable hotel id: %r', hotel_link)
                continue
            all_hotel_data.append({'hotel_id': id_match.group(1), 'link': hotel_link})

        return all_hotel_data

    def is_next_present(self, timeout=10):
        """Return True when the "Next" pagination control appears in time.

        Args:
            timeout: seconds to wait for the control (default 10).

        Any failure while waiting is treated as "not present"; failures other
        than a plain timeout are logged so they are not lost silently.
        """
        try:
            WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, self.NEXT_BUTTON_XPATH)))
        except TimeoutException:
            return False
        except Exception:
            logging.exception('Unexpected error while probing for the next-page control')
            return False
        return True

    def next_page(self, timeout=30):
        """Click the "Next" pagination control.

        Args:
            timeout: seconds to wait for the control (default 30).

        Returns:
            bool: True when the click was dispatched, False when the control
            did not show up within ``timeout`` seconds.
        """
        driver = self.driver
        try:
            # WebDriverWait.until signals "not found" with TimeoutException,
            # so that is the exception that must be handled here.
            next_button = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, self.NEXT_BUTTON_XPATH)))
        except (NoSuchElementException, TimeoutException):
            return False
        # Click through JavaScript, as the original implementation did.
        driver.execute_script("arguments[0].click();", next_button)
        return True

    def quit_driver(self):
        """Shut down the browser session owned by this collector."""
        self.driver.quit()
        

In [11]:
import json


# Crawl every page of the Indonesia hotel listing and accumulate the
# {'hotel_id', 'link'} records returned by the collector into `hotel_list`.
hotelCollector = TripAdvisorCollector()

# NOTE(review): these two values are not read anywhere in this cell; kept in
# case another cell relies on them.
num_of_prop = 264
offset = 0

hotel_list = list()
try:
    hotelCollector.go_to_link('https://www.tripadvisor.com/Hotels-g294225-zfc5-Indonesia-Hotels.html')
    while True:
        json_list = hotelCollector.get_hotel_link()
        hotel_list += json_list
        print(f'\rTotal data: {len(hotel_list)}', end='')
        # Stop when there is no "Next" control, or when clicking it fails.
        if not hotelCollector.is_next_present():
            break
        if not hotelCollector.next_page():
            break
finally:
    # Always release the browser, even when the crawl is interrupted;
    # whatever was gathered so far stays available in `hotel_list`.
    hotelCollector.quit_driver()


Total data: 264

In [None]:
import json

# Persist the collected hotel records as indented JSON.
with open('hotel_list.json', 'w') as output_handle:
    output_handle.write(json.dumps(hotel_list, indent=1))

In [3]:
import json
# Batched crawl: walk the listing page by page, buffer the records, and write
# them out in files of BATCH_SIZE entries (json.json, json1.json, ...).
PAGE_SIZE = 30           # number of listings a full results page is expected to hold
BATCH_SIZE = 1000        # records written per JSON file
MAX_PAGE_ATTEMPTS = 5    # scrape attempts per page before accepting a short page
MAX_NEXT_RETRIES = 5     # highest retry index when re-probing the "Next" control

json_file_name = 'json.json'
hotelCollector = TripAdvisorCollector()

json_num = 0 #json file suffix
go_to_next = True
temp_list = list()

successful_attemps = 0
failed_attemps = 0
total_data_captured = 0

try:
    hotelCollector.go_to_link('https://www.tripadvisor.com/Hotels-g294225-zfc5-Indonesia-Hotels.html')

    while go_to_next:

        json_list = []
        is_next_page_present = None  # stays None until one scrape completes

        # Bounded retries: the page is re-scraped until it yields a full page,
        # but never forever (a short page that is not the last one, or a
        # persistent error, used to spin this loop endlessly).
        for attempt in range(1, MAX_PAGE_ATTEMPTS + 1):
            try:
                json_list = hotelCollector.get_hotel_link()
                is_next_page_present = hotelCollector.is_next_present()
            except Exception as error:
                # Report instead of swallowing; KeyboardInterrupt still propagates.
                failed_attemps += 1
                print(f'scrape attempt {attempt} failed: {error!r}')
                continue

            if len(json_list) == PAGE_SIZE:
                successful_attemps += 1
            else:
                failed_attemps += 1

            retry = 0
            while not is_next_page_present and retry <= MAX_NEXT_RETRIES:
                print(f'next page not detected, retrying... {retry}', end='\r')
                is_next_page_present = hotelCollector.is_next_present()
                retry += 1

            if not is_next_page_present:
                go_to_next = False
                print('LAST PAGE DETECTED')
                break

            if len(json_list) == PAGE_SIZE:
                break

        if is_next_page_present is None:
            # Every attempt raised: nothing reliable is known about this page.
            print(f'giving up after {MAX_PAGE_ATTEMPTS} failed scrape attempts')
            go_to_next = False

        total_data_captured += len(json_list)

        # Add this page's records to the buffer.
        temp_list.extend(json_list)
        print(f"Sucessful attemps = {successful_attemps:<4} | Failed attemps = {failed_attemps:<3} | Temporal list count = {len(temp_list):<4} | Total data captured = {total_data_captured:<10}", end='\r')

        # Write out complete batches as soon as they are available.
        while len(temp_list) >= BATCH_SIZE:
            with open(json_file_name, 'w') as json_file:
                json.dump(temp_list[:BATCH_SIZE], json_file, indent=1)
            temp_list = temp_list[BATCH_SIZE:]
            json_num += 1
            json_file_name = f'json{json_num}.json'

        # Stop as well when the "Next" click could not be dispatched.
        if go_to_next and not hotelCollector.next_page():
            go_to_next = False
finally:
    try:
        # Flush the final partial batch; without this the last (< BATCH_SIZE)
        # records of the crawl were never written to disk.
        if temp_list:
            with open(json_file_name, 'w') as json_file:
                json.dump(temp_list, json_file, indent=1)
    finally:
        # Release the browser whether the crawl finished or was interrupted.
        hotelCollector.quit_driver()


next page not detected, retrying... 2emps = 0   | Temporal list count = 240  | Total data captured = 240       