# Tripadvisor Crawler

## Libraries

In [None]:
import time
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

## Functions

In [None]:
def get_review_page(href_list):
    """Find the current target hotel's review URL among a page's anchors.

    Besides its argument, this function reads three module-level names:
    ``base_url_review`` (URL prefix of review pages), ``hotel_ids`` (table
    with ``hotel`` and ``id`` columns) and ``target_hotel`` (the hotel
    currently being crawled).

    Args:
        href_list: elements exposing ``get_attribute('href')``.

    Returns:
        A ``(url, isMultiple)`` tuple: the first matching review URL, and
        1 if more than two links matched, otherwise 0.

    Raises:
        IndexError: if ``target_hotel`` has no row in ``hotel_ids`` or no
            link matches; callers treat the latter as a removed review.
    """
    hrefs = [anchor.get_attribute('href') for anchor in href_list]

    # keep only non-empty hrefs that point at a review page
    review_hrefs = [href for href in hrefs if href and base_url_review in href]

    hotel_row_ids = hotel_ids.loc[hotel_ids['hotel'] == target_hotel]['id']
    hotel_id = hotel_row_ids.values[0]

    matches = [href for href in review_hrefs if base_url_review + hotel_id in href]

    # multiple review user?
    # NOTE(review): the threshold is "> 2", presumably because one review is
    # linked more than once on the page -- confirm against the live markup.
    isMultiple = 1 if len(matches) > 2 else 0

    # first match; presumably the latest review (depends on page ordering)
    return matches[0], isMultiple


def get_review_tab(ui_tab_list):
    """Return the URL of the first tab whose href contains ``tab=reviews``.

    Args:
        ui_tab_list: elements exposing ``get_attribute('href')``.

    Returns:
        The first non-empty href that contains ``'tab=reviews'``.

    Raises:
        IndexError: if no element carries such an href.
    """
    hrefs = (tab.get_attribute('href') for tab in ui_tab_list)

    # skip elements without an href, then keep only the reviews tab(s)
    matching = [href for href in hrefs if href and 'tab=reviews' in href]

    return matching[0]


def is_partnership():
    """Tell whether the review page currently open carries a partnership note.

    Uses the module-level ``driver``. The fourth ``-``-separated piece of the
    current URL, minus its first character, is used to build the id of the
    review container (presumably the numeric review id -- confirm against
    the URL scheme).

    Returns:
        1 if the located element's text contains
        'Review collected in partnership with', otherwise 0.
    """
    review_number = driver.current_url.split('-')[3][1:]
    container_xpath = '//*[@id="review_{}"]/div/div[2]/div[6]/div'.format(review_number)

    container_text = driver.find_element_by_xpath(container_xpath).text

    return int('Review collected in partnership with' in container_text)

## Data

In [None]:
# target hotel list (one entry per line), without the Hanbee hotel at row label 3
target_hotels = (
    pd.read_csv('data/new_targets.txt', sep='\n', names=['hotel'])
    .drop(labels=3, axis=0)
    .reset_index(drop=True)
)

# hotel -> id lookup table; rows without an id are discarded
hotel_ids = (
    pd.read_excel('data/hotel_id.xlsx', index_col=0)
    .dropna(axis=0, how='any', subset=['id'])
    .reset_index(drop=True)
)

# base urls
base_user_profile = 'https://www.tripadvisor.com/MemberProfile-a_uid.'
base_url_review = 'https://www.tripadvisor.com/ShowUserReviews-'

# hotel_list: entries of target_hotels with the trailing '_trgt_inds.txt' cut off
hotel_list = target_hotels['hotel'].apply(lambda file_name: file_name[:-len('_trgt_inds.txt')])

## Crawling

In [None]:
# webdriver
# NOTE: `driver` has to stay a module-level name because is_partnership()
# reads it as a global.
driver = webdriver.Chrome(executable_path='E:/chromedriver.exe')
driver.implicitly_wait(15)

# placeholder stored in date/title/review when the target review is gone
_REMOVED_TEXT = 'The review has been removed'

# column order of the exported csv
_COLUMNS = ['id', 'date', 'title', 'review', 'isMultiple', 'isPartner']


def _removed_record(user_id):
    """Build the placeholder row for a user whose target review was not found.

    isMultiple is 0 here: no matching review link was seen for this user, so
    there is nothing to count (previously a stale or undefined value was used).
    """
    return {'id': user_id[4:], # remove 'UID_'
            'date': _REMOVED_TEXT,
            'title': _REMOVED_TEXT,
            'review': _REMOVED_TEXT,
            'isMultiple': 0,
            'isPartner': 0}


def _scroll_to_bottom(pause=7):
    """Keep scrolling until the page height stops growing."""
    prev_height = driver.execute_script('return document.body.scrollHeight')

    while True:
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause) # give the page time to load more entries

        curr_height = driver.execute_script('return document.body.scrollHeight')

        if curr_height == prev_height:
            return

        prev_height = curr_height


def _scrape_review(user_id, url, multiple):
    """Open a review page and return its row (id, date, title, text, flags)."""
    driver.get(url=url)
    time.sleep(2)

    # click the "more" link when there is one so that the full text is shown
    more_buttons = driver.find_elements_by_css_selector('.ulBlueLinks')
    if more_buttons:
        driver.execute_script('arguments[0].click();', more_buttons[0])

    # the row is built in one go, so a failing lookup cannot leave the
    # columns with different lengths
    return {'id': user_id[4:], # remove 'UID_'
            'date': driver.find_element_by_class_name('ratingDate').text,
            'title': driver.find_element_by_id('HEADING').text,
            'review': driver.find_element_by_class_name('fullText').text,
            'isMultiple': multiple,
            'isPartner': is_partnership()}


def _crawl_user(user_id):
    """Crawl one user's review of the current target hotel and return its row."""
    # get user_profile
    driver.get(url=base_user_profile + user_id[4:]) # remove 'UID_'
    time.sleep(2)

    # review tab
    try:
        ui_tab_list = driver.find_elements_by_class_name('ui_tab')
        review_tab_url = get_review_tab(ui_tab_list)

    except (NoSuchElementException, IndexError):
        # find_elements_* returns an empty list instead of raising, so a
        # missing reviews tab shows up as IndexError from get_review_tab
        return _removed_record(user_id)

    driver.get(url=review_tab_url)
    time.sleep(2)

    # check whether the button exists; if so, click it and load every entry
    try:
        button = driver.find_element_by_xpath('//*[@id="content"]/div[2]/button')

    except NoSuchElementException:
        pass

    else:
        button.click()
        _scroll_to_bottom()

    # user's target review page
    try:
        href_list = driver.find_elements_by_tag_name('a')
        url, multiple = get_review_page(href_list=href_list)

    except IndexError:
        # deleted review
        return _removed_record(user_id)

    return _scrape_review(user_id, url, multiple)


# crawling
# NOTE: this loop must run at module level: get_review_page() reads the loop
# variable `target_hotel` as a global.
try:
    for i, target_hotel in enumerate(target_hotels['hotel']): # target_hotels['hotel'][:index] -> number of hotels to be crawled
        user_ids = pd.read_csv('data/target_indivs/' + target_hotel, sep='\n', names=['user_id'])

        print(target_hotel)
        print(str(i + 1) + 'th hotel out of ' + str(len(target_hotels['hotel'])) + ' hotels.') # target_hotels['hotel'][:index] -> number of hotels to be crawled

        records = []

        for j, user_id in enumerate(user_ids['user_id']): # user_ids['user_id'][:index] -> number of users to be crawled
            print(str(j + 1) + 'th user out of ' + str(len(user_ids['user_id'])) + ' users.') # user_ids['user_id'][:index] -> number of users to be crawled

            records.append(_crawl_user(user_id))

        print('='*150)
        crawled_data = pd.DataFrame(records, columns=_COLUMNS)

        crawled_data.to_csv('E:/jupyter/project/gtm/crawled_data/' + target_hotel[:-len('_trgt_inds.txt')] + '.csv', sep='|', index=False)

finally:
    # close the browser even when the crawl stops with an error
    driver.quit()