# Import Dependencies

In [None]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from random import randint
from time import sleep

# Load and clean TripAdvisor URLs

In [None]:
# Read the scraped TripAdvisor URL export from disk and preview the first rows
data_to_load = "Resources/TripAdvisor_NatPark_Scraped_URLs.csv"
TA_URLs = pd.read_csv(filepath_or_buffer=data_to_load)
TA_URLs.head(n=5)

In [None]:
# Work on an independent copy so the raw URL table is left untouched
Clean_URLs_df = TA_URLs.copy(deep=True)
# Summary statistics before any cleaning is applied
Clean_URLs_df.describe()

In [None]:
# Drop rows that are exact repeats, keeping the first occurrence of each
Clean_URLs_df = Clean_URLs_df.drop_duplicates(keep='first')
# Summary statistics after de-duplication
Clean_URLs_df.describe()

In [None]:
# Keep only attraction review pages; any URL that does not contain the
# attraction path (hotels, restaurants, vacations, etc.) is dropped.
# regex=False matches the text literally (as a regex, "." would match any
# character) and na=False treats missing URLs as non-matches, which replaces
# the former "== True" comparison.
is_attraction_url = Clean_URLs_df['Direct_URL'].str.contains(
    "tripadvisor.com/Attraction_Review", regex=False, na=False
)
Clean_URLs_df = Clean_URLs_df[is_attraction_url]
Clean_URLs_df.head()

In [None]:
# Summary statistics of the cleaned URL table
Clean_URLs_df.describe()

In [None]:
# Flatten the URL column into a plain Python list; the scraping loop iterates over it
direct_url_column = Clean_URLs_df['Direct_URL']
Clean_URLs_list = direct_url_column.tolist()
print(Clean_URLs_list)

In [None]:
# Accumulator for the scraped reviews: the scraping loop appends one dict per review
TA_Reviews_List = []

# Scrape TripAdvisor Reviews

In [None]:
# Install (or reuse) a ChromeDriver binary and open a visible Chrome window through Splinter
driver_location = ChromeDriverManager().install()
executable_path = {'executable_path': driver_location}
browser = Browser('chrome', headless=False, **executable_path)

In [None]:
# Visit every cleaned TripAdvisor URL, parse the rendered page, and append one
# dict per review card to TA_Reviews_List.
# NOTE(review): no pause is taken between page visits; confirm whether a
# randomized delay between requests was intended.


def _text_or_fallback(tag, fallback="could not scrape"):
    """Return ``tag.text``, or ``fallback`` when the lookup found nothing.

    BeautifulSoup's ``find`` returns ``None`` when no element matches, so
    reading ``.text`` directly would raise ``AttributeError``.
    """
    return tag.text if tag is not None else fallback


try:
    for x in Clean_URLs_list:

        # Visit the TripAdvisor page
        browser.visit(x)
        # Optional delay for loading the page
        browser.is_element_present_by_css('div.list_text', wait_time=1)

        # Parse the HTML
        html = browser.html
        html_soup = soup(html, 'html.parser')

        # Grab page title
        page_title = browser.title

        # Trail name; falls back to a marker string when the heading is missing
        Trail = _text_or_fallback(html_soup.find('h1', class_='WlYyy cPsXC GeSzT'))

        # Retrieve the parent divs for all reviews
        reviews = html_soup.find_all('div', class_='ffbzW _c')

        # Loop through the review cards to get the review data
        for reviewcard in reviews:
            # Scrape the review title; a card without one no longer aborts the run
            title = _text_or_fallback(
                reviewcard.find('div', class_='WlYyy cPsXC bLFSo cspKb dTqpp')
            )

            # Scrape the review text, with the same fallback
            review = _text_or_fallback(
                reviewcard.find('div', class_='WlYyy diXIH dDKKM')
            )

            # Print review data as a progress trace
            print('-----------------')
            print(Trail)
            print(title)

            # Create a dictionary for this review
            review_dict = {
                'Page Title': page_title,
                'Trail Name': Trail,
                'Review Title': title,
                'Review': review,
            }

            # Append data to the reviews list
            TA_Reviews_List.append(review_dict)
finally:
    # Always close the browser, even if a page visit or parse step fails,
    # so the Chrome/driver process is not left running.
    browser.quit()

In [None]:
# Turn the list of review dicts into a table with a fixed column order, then preview it
review_columns = ['Page Title', 'Trail Name', 'Review Title', 'Review']
Reviews_df = pd.DataFrame(data=TA_Reviews_List, columns=review_columns)
Reviews_df.head(n=10)

In [None]:
# Write the reviews table to CSV, leaving out the DataFrame index column
Reviews_df.to_csv(r"Resources/TripAdvisor_Reviews.csv", index=False)