# Tripadvisor Crawler

## Libraries

In [None]:
import time
import pandas as pd

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

## Functions

In [None]:
def id_parser(user_id):
    """Split a raw user-id string into its reviewer id and review id.

    The first four characters of ``user_id`` are skipped. From what is
    left, the reviewer id is everything before the first '-', and the
    review id is the second '_'-separated piece.

    Returns:
        A ``(reviewer_id, review_id)`` tuple of strings.

    Raises:
        IndexError: if the remainder contains no '_' character.
    """
    tail = user_id[4:]

    # Text ahead of the first '-' (the whole tail when there is no '-').
    reviewer_id = tail.partition('-')[0]
    # Second underscore-delimited piece; indexing fails when there is none.
    review_id = tail.split('_')[1]

    return reviewer_id, review_id


def get_url(hotel_id, review_id):
    """Build the review page URL for one review of one hotel.

    The result is the module-level ``base_url`` followed by ``hotel_id``,
    the literal '-r', and ``review_id``. All pieces must be strings.
    """
    pieces = (base_url, hotel_id, '-r', review_id)
    return ''.join(pieces)
    
    
def is_partnership():
    """Tell whether the review shown in ``driver`` carries the partnership label.

    The review number is taken from the fourth '-'-separated piece of the
    current URL with its first character dropped. The text of one
    fixed-position ``div`` inside the ``review_<number>`` element is then
    checked for the partnership phrase.

    Returns:
        1 if the phrase is present in that element's text, otherwise 0.
    """
    review_number = driver.current_url.split('-')[3][1:]
    label_xpath = '//*[@id="review_' + review_number + '"]/div/div[2]/div[6]/div'

    label_text = driver.find_element_by_xpath(label_xpath).text

    return 1 if 'Review collected in partnership with' in label_text else 0

## Data

In [None]:
# Crawl targets: each line of the text file becomes one row of the 'hotel' column.
target_hotels = pd.read_csv('data/new_targets.txt', sep='\n', names=['hotel'])
# Hotel id table; the first spreadsheet column is read as the index.
hotel_ids = pd.read_excel('data/hotel_id.xlsx', index_col=0)

# Remove the row labelled 3 (the Hanbee hotel, per the original note) and renumber.
target_hotels = target_hotels.drop(index=3).reset_index(drop=True)
# Keep only the rows that have a value in 'id', then renumber.
hotel_ids = hotel_ids.dropna(subset=['id']).reset_index(drop=True)

# Prefix that get_url() puts in front of every review page address.
base_url = 'https://www.tripadvisor.com/ShowUserReviews-'

## Crawling

In [None]:
# Text stored in the date/title/review columns when the page does not show them.
_REMOVED_TEXT = 'The review has been removed'

# Column order of the per-hotel output file.
_COLUMNS = ['uid', 'reviewer_id', 'review_id', 'date', 'title', 'review', 'isPartner']


def _scrape_current_review():
    """Read one review from the page currently loaded in ``driver``.

    Returns:
        A ``(date, title, review, is_partner)`` tuple. When the date, title
        or full text element cannot be found, all three text fields are the
        removed-review placeholder and ``is_partner`` is 0.

    The values are gathered first and returned together, so a lookup that
    fails halfway can never leave the output columns with different lengths.
    """
    # Expand the review when a "more" link exists; an empty list means there is none.
    more_buttons = driver.find_elements_by_css_selector('.ulBlueLinks')
    if more_buttons:
        driver.execute_script('arguments[0].click();', more_buttons[0])

    try:
        review_date = driver.find_element_by_class_name('ratingDate').text
        review_title = driver.find_element_by_id('HEADING').text
        review_text = driver.find_element_by_class_name('fullText').text
    except NoSuchElementException:
        return _REMOVED_TEXT, _REMOVED_TEXT, _REMOVED_TEXT, 0

    # The partnership label is looked up separately: when its element is
    # missing the review itself was still found, so keep the text and record 0.
    try:
        partner_flag = is_partnership()
    except NoSuchElementException:
        partner_flag = 0

    return review_date, review_title, review_text, partner_flag


# webdriver
driver = webdriver.Chrome(executable_path='E:/chromedriver.exe')
driver.implicitly_wait(10)

# crawling
try:
    # Slice here (e.g. target_hotels['hotel'][:n]) to crawl only the first n hotels.
    hotels_to_crawl = target_hotels['hotel']

    for i, target_hotel in enumerate(hotels_to_crawl):
        user_ids = pd.read_csv('data/target_indivs/' + target_hotel, sep='\n', names=['user_id'])
        hotel_id = hotel_ids.loc[hotel_ids['hotel'] == target_hotel]['id'].values[0]

        # Slice here (e.g. user_ids['user_id'][:n]) to crawl only the first n users.
        users_to_crawl = user_ids['user_id']

        print(target_hotel)
        print(str(i + 1) + 'th hotel out of ' + str(len(hotels_to_crawl)) + ' hotels.')

        # One tuple per review, in _COLUMNS order.
        rows = []

        for j, user_id in enumerate(users_to_crawl):
            print(str(j + 1) + 'th user out of ' + str(len(users_to_crawl)) + ' users.')

            rvwer_id, rvw_id = id_parser(user_id=user_id)

            driver.get(url=get_url(hotel_id=hotel_id, review_id=rvw_id))
            time.sleep(2)

            rows.append((user_id, rvwer_id, rvw_id) + _scrape_current_review())

        print('='*150)

        # Write each hotel as soon as it is finished so earlier results
        # survive a failure on a later hotel.
        crawled_data = pd.DataFrame(rows, columns=_COLUMNS)

        crawled_data.to_csv('E:/jupyter/project/gtm/crawled_data/' + target_hotel[:-len('_trgt_inds.txt')] + '.csv', sep='|', index=False)

finally:
    # Always shut the browser session down, even when the crawl fails.
    driver.quit()