In [1]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from collections import defaultdict
from bs4 import BeautifulSoup
import time
import pandas as pd
import pickle

In [18]:
def scrape_state_trails(state, browser):
    """Scrape every hike listed for one state and write the results to ../data.

    Steps: load the state's listing page, visit each trail card's link, parse
    the hike page, clean the collected records, assign hike/user ids, and
    write the CSV and pickle outputs.

    Args:
        state: State name; used in the listing URL, as the hike-id prefix and
            as the file name of the per-state CSV.
        browser: Selenium WebDriver shared by all page loads.

    Outputs (all relative to the working directory):
        ../data/<state>.csv       one row per hike
        ../data/allRatings.csv    long-format (hike_id, variable, value) ratings
        ../data/all_hike_ids.pkl  hike id -> hike name
        ../data/all_user_ids.pkl  username -> user id

    NOTE: the ratings CSV and both pickle files use fixed names, so every call
    overwrites what a previous state wrote, and user ids are built from this
    state's data only. Scraping several states needs ids created from the
    combined data of all states.
    """
    # URL of the base site
    base_url = "https://www.alltrails.com"
    listing_doc = get_all_hikes(base_url, state, browser)

    # One card per hike on the listing page
    cards = listing_doc.find_all("div", {"class": "styles-module__trailCard___2oHiP"})
    print("total num of hikes: ")
    print(len(cards))

    records = []
    for card in cards:
        link = card.findChild('a')
        if link is None:
            # Card without a link: nothing to visit
            continue

        trail_url = base_url + link['href']
        trail_doc = get_all_hike_ratings(trail_url, browser)
        records.append(parse_html_for_data(trail_doc, trail_url))

    # Clean the data and assign ids
    hike_df = clean_data(pd.DataFrame(records))
    hike_df, hike_ids = create_hike_ids(hike_df, state)
    user_ids = create_user_ids(hike_df)
    ratings_by_user = create_user_hike_rating_dict(hike_df, user_ids)

    # These columns were only needed to build the ids and ratings above
    hike_df = hike_df.drop(columns=['address', 'user_ratings'])

    # Wide table (rows = hike ids, columns = user ids) -> long table
    ratings_wide = pd.DataFrame.from_dict(ratings_by_user)
    ratings_wide['hike_id'] = ratings_wide.index
    ratings_long = ratings_wide.melt(id_vars='hike_id').dropna()

    # Create CSVs
    hike_df.to_csv("../data/" + state + ".csv", index=False)
    ratings_long.to_csv("../data/allRatings.csv", index=False)

    # Save hike ids and user ids
    pickle_targets = (
        ('../data/all_hike_ids.pkl', hike_ids),
        ('../data/all_user_ids.pkl', user_ids),
    )
    for path, payload in pickle_targets:
        with open(path, 'wb') as f:
            pickle.dump(payload, f)
    

In [19]:
def get_all_hikes(all_trails_site, state, browser, wait_timeout=20, click_pause=2):
    """Open a state's listing page and keep expanding it until nothing more loads.

    Args:
        all_trails_site: Base site URL; the page requested is
            ``<all_trails_site>/us/<state>``.
        state: State path segment appended to the URL.
        browser: Selenium WebDriver used to load and click through the page.
        wait_timeout: Seconds to wait for the "load more" button to become
            visible before concluding that the list is fully expanded.
        click_pause: Seconds to sleep after each click so new cards can render.

    Returns:
        A BeautifulSoup document built from the browser's final page source.
    """
    state_url = all_trails_site + "/" + "us" + "/" + state
    browser.get(state_url)

    button_locator = (By.CLASS_NAME, 'styles-module__button___1nuva')
    while True:
        try:
            load_more_hikes = WebDriverWait(browser, wait_timeout).until(
                EC.visibility_of_element_located(button_locator)
            )
            load_more_hikes.click()
            time.sleep(click_pause)
        except Exception:
            # The wait timing out (button gone) or a failed click ends the
            # expansion. `Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit still stop the scrape instead of
            # silently returning a half-expanded page.
            break

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed; pass an explicit parser if results must match across machines.
    html_doc = BeautifulSoup(browser.page_source)

    return html_doc

In [4]:
def get_all_hike_ratings(hike_url, browser, wait_timeout=20, click_pause=2):
    """Open one hike page and keep expanding it until nothing more loads.

    Args:
        hike_url: Absolute URL of the hike page.
        browser: Selenium WebDriver used to load and click through the page.
        wait_timeout: Seconds to wait for the "load more" button to become
            visible before concluding that everything has been loaded.
        click_pause: Seconds to sleep after each click so new content can render.

    Returns:
        A BeautifulSoup document built from the browser's final page source.
    """
    browser.get(hike_url)

    button_locator = (By.CLASS_NAME, 'styles-module__button___1nuva')
    while True:
        try:
            load_more_ratings = WebDriverWait(browser, wait_timeout).until(
                EC.visibility_of_element_located(button_locator)
            )
            load_more_ratings.click()
            time.sleep(click_pause)
        except Exception:
            # The wait timing out (button gone) or a failed click ends the
            # expansion. `Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit still stop the scrape instead of
            # silently returning a half-expanded page.
            break

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
    # installed; pass an explicit parser if results must match across machines.
    hike_html_doc = BeautifulSoup(browser.page_source)

    return hike_html_doc

In [5]:
def _icon_stat(hike_html, selector):
    """Return the stripped text of the second <span> inside the first element
    matching `selector`, or None when that element/span is not on the page."""
    try:
        return hike_html.select(selector)[0].findChildren('span')[1].text.strip()
    except (AttributeError, IndexError):
        return None


def parse_html_for_data(hike_html, hike_url):
    """Extract one hike's attributes, tags and user ratings from its parsed page.

    Args:
        hike_html: Parsed hike page (BeautifulSoup document).
        hike_url: URL the page was loaded from; stored unchanged in the result.

    Returns:
        dict with keys hike_name, hike_url, difficulty, overall_rating, area,
        traffic_level, distance, elevation, route_type, address, tags and
        user_ratings. area, traffic_level, distance, elevation and route_type
        are None when their element is missing. tags is a list of tag texts
        and user_ratings a list of single-entry ``{username: rating}`` dicts;
        both are empty when the page has no tag cloud / review feed.

    Raises:
        AttributeError / TypeError / KeyError: when the title box (h1, span,
        meta) or the directions link is missing, since those are required.
    """
    # Failures that mean "this optional element is not on the page"
    lookup_errors = (AttributeError, IndexError, KeyError, TypeError, ValueError)

    # Required header fields: left unguarded so a broken page fails loudly
    header = hike_html.find('div', id='title-and-menu-box')
    hike_name = header.findChild('h1').text
    difficulty = header.findChild('span').text
    stars_ranking = header.findChild('meta')['content']

    try:
        area = header.findChild('a').text
    except lookup_errors:
        area = None

    try:
        # The traffic level is the word right before "trafficked" in the overview
        overview = hike_html.find('section', id='trail-top-overview-text').findChild('p').text
        words = overview.split(" ")
        traffic = words[words.index("trafficked") - 1]
    except lookup_errors:
        traffic = None

    distance = _icon_stat(hike_html, 'span.distance-icon')
    elevation = _icon_stat(hike_html, 'span.elevation-icon')
    route_type = _icon_stat(hike_html, 'span.route-icon')

    # Last path segment of the directions link, without its query string
    address_link = hike_html.find("li", {"class": "trail-directions"}).findChild('a')['href']
    address = address_link.split("/")[-1].split("?")[0]

    # Get tags (a page without a tag cloud simply has no tags)
    tag_sections = hike_html.select("section.tag-cloud")
    if tag_sections:
        hike_tags = [tag.text for tag in tag_sections[0].findChildren('h3')]
    else:
        hike_tags = []

    # Get user reviews (a page without a review feed simply has no ratings)
    user_ratings = []
    feeds = hike_html.select('div.feed-items')
    reviews = feeds[0].findChildren('div', itemprop="review") if feeds else []
    for review in reviews:
        author = review.find('span', itemprop='author')
        if author is None:
            continue
        username = author.text.replace('.', ' ')
        try:
            user_rating = review.find('span', itemprop='reviewRating').findChildren('meta')[0]['content']
        except lookup_errors:
            # Review without a star rating: nothing to record
            continue
        user_ratings.append({username: user_rating})

    print("num of ratings: ")
    print(len(user_ratings))

    # Make dictionary containing information about hike
    hike_info = {
        "hike_name": hike_name,
        "hike_url": hike_url,
        "difficulty": difficulty,
        "overall_rating": stars_ranking,
        "area": area,
        "traffic_level": traffic,
        "distance": distance,
        "elevation": elevation,
        "route_type": route_type,
        "address": address,
        "tags": hike_tags,
        "user_ratings": user_ratings,
    }

    return hike_info
    

In [6]:
def clean_data(hike_df):
    """Drop incomplete hikes and convert the scraped text columns to numbers.

    Args:
        hike_df: One row per hike with (at least) the columns difficulty,
            overall_rating, traffic_level, distance, elevation, route_type,
            address and tags.

    Returns:
        A new DataFrame in which rows containing any missing value have been
        removed, difficulty and traffic_level are mapped to 1/2/3, the rating,
        distance and elevation columns are floats, and the route-type,
        coordinate and tag indicator columns have been added.
    """
    # Drop NAs. dropna() returns a new frame, so the result has to be kept;
    # the index is renumbered so row labels and positions stay aligned.
    hike_df = hike_df.dropna().reset_index(drop=True)

    # Map easy, moderate, and hard to 1, 2, and 3 respectively
    # (any other value becomes NaN)
    hike_df['difficulty'] = hike_df['difficulty'].map({'easy': 1, 'moderate': 2, 'hard': 3})
    hike_df['overall_rating'] = hike_df['overall_rating'].astype(float)

    # Map lightly, moderately, and heavily to 1, 2, and 3
    hike_df['traffic_level'] = hike_df['traffic_level'].map({'lightly': 1, 'moderately': 2, 'heavily': 3})

    # Strip the unit suffix / thousands separator; these are literal strings,
    # not regular expressions
    hike_df['distance'] = hike_df['distance'].str.replace(' miles', '', regex=False).astype(float)
    hike_df['elevation'] = (
        hike_df['elevation']
        .str.replace(' feet', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

    hike_df = add_route_type_and_address(hike_df)
    hike_df = add_tags(hike_df)

    return hike_df
    

In [14]:
def add_route_type_and_address(hike_df):
    """Replace route_type with indicator columns and split address into coordinates.

    The frame is modified in place and also returned.

    Args:
        hike_df: DataFrame with a 'route_type' column and an 'address' column
            holding "<latitude>,<longitude>" strings.

    Returns:
        The same DataFrame with int columns loop, out_and_back and
        point_to_point (1 where route_type matches, else 0), float columns
        latitude and longitude, and without the route_type column.
    """
    route_type = hike_df['route_type']
    indicator_labels = (
        ('loop', 'Loop'),
        ('out_and_back', 'Out & Back'),
        ('point_to_point', 'Point to Point'),
    )
    # Whole-column assignment instead of `df[col].iloc[i] = v`: the chained
    # form writes to a temporary and is a no-op under pandas Copy-on-Write.
    for column, label in indicator_labels:
        hike_df[column] = (route_type == label).astype('int64')

    # "lat,lon" -> two float columns; a missing part becomes NaN
    coordinates = hike_df['address'].str.split(',')
    hike_df['latitude'] = coordinates.str[0].astype(float)
    hike_df['longitude'] = coordinates.str[1].astype(float)

    hike_df.drop('route_type', axis=1, inplace=True)
    return hike_df

In [8]:
def add_tags(hike_df):
    """Replace the 'tags' list column with one 0/1 indicator column per feature.

    The frame is modified in place and also returned.

    Args:
        hike_df: DataFrame whose 'tags' column holds, per hike, a list of tag
            strings.

    Returns:
        The same DataFrame with int columns dog_friendly, kid_friendly,
        camping, waterfall, river, lake, wildflowers, wildlife, backpacking
        and bird_watching, and without the tags column. Tags that match none
        of the known labels are ignored.
    """
    # Output column -> tag texts that switch it on (in output column order)
    tag_columns = (
        ('dog_friendly', {'dog friendly', 'dogs on leash'}),
        ('kid_friendly', {'kid friendly'}),
        ('camping', {'camping'}),
        ('waterfall', {'waterfall'}),
        ('river', {'river'}),
        ('lake', {'lake'}),
        ('wildflowers', {'wild flowers'}),
        ('wildlife', {'wildlife'}),
        ('backpacking', {'backpacking'}),
        ('bird_watching', {'bird watching'}),
    )

    # Whole-column assignment instead of `df[col].iloc[i] = 1`: the chained
    # form writes to a temporary and is a no-op under pandas Copy-on-Write.
    for column, labels in tag_columns:
        flags = [int(not labels.isdisjoint(tags)) for tags in hike_df['tags']]
        hike_df[column] = pd.Series(flags, index=hike_df.index, dtype='int64')

    hike_df.drop('tags', axis=1, inplace=True)
    return hike_df

In [9]:
def create_hike_ids(hike_df, state):
    """Give every hike an id of the form "<state>hike<row position>".

    The frame is modified in place (a 'hike_id' column is added) and returned.

    Args:
        hike_df: DataFrame with a 'hike_name' column.
        state: String prefix for the ids.

    Returns:
        (hike_df, hike_ids) where hike_ids maps each hike id to its hike name.
    """
    names = list(hike_df['hike_name'])
    ids = [state + 'hike{}'.format(position) for position in range(len(names))]

    # Assign the whole column at once: `df[col].iloc[i] = v` writes to a
    # temporary and is a no-op under pandas Copy-on-Write.
    hike_df['hike_id'] = ids

    hike_ids = dict(zip(ids, names))
    return hike_df, hike_ids
        

In [10]:
def create_user_ids(hike_df):
    """Assign a "user<N>" id to every distinct username found in the ratings.

    Ids are numbered from 1 in the order usernames are first encountered, so
    the same input always yields the same mapping (numbering by iterating a
    set would change from run to run).

    Args:
        hike_df: DataFrame whose 'user_ratings' column holds, per hike, a list
            of ``{username: rating}`` dicts.

    Returns:
        dict mapping username -> user id.
    """
    user_ids = {}
    for reviews in hike_df['user_ratings']:
        for review in reviews:
            for username in review:
                if username not in user_ids:
                    user_ids[username] = 'user{}'.format(len(user_ids) + 1)
    return user_ids

In [11]:
def create_user_hike_rating_dict(hike_df, user_ids):
    """Build a nested mapping user id -> {hike id -> rating}.

    Args:
        hike_df: DataFrame with a 'hike_id' column and a 'user_ratings' column
            holding, per hike, a list of ``{username: rating}`` dicts.
        user_ids: dict mapping username -> user id; every username found in
            the ratings must be present.

    Returns:
        defaultdict(dict) keyed by user id. If a user rated the same hike more
        than once, the last rating encountered is kept.
    """
    ratings_by_user = defaultdict(dict)

    # Flatten hike -> review -> (username, rating) into one stream of triples
    flattened = (
        (row, username, stars)
        for row, reviews in enumerate(hike_df['user_ratings'])
        for review in reviews
        for username, stars in review.items()
    )

    for row, username, stars in flattened:
        uid = user_ids[username]
        hid = hike_df['hike_id'].iloc[row]
        ratings_by_user[uid][hid] = stars

    return ratings_by_user

In [12]:
def scrape_all_trails(states):
    """Scrape every state in `states` using one shared Chrome session.

    Args:
        states: Iterable of state names, each passed to scrape_state_trails.
    """
    # Create browser
    browser = webdriver.Chrome()
    try:
        for state in states:
            scrape_state_trails(state, browser)
    finally:
        # Always shut the driver down, even when a scrape fails part-way,
        # so no browser process is left running.
        browser.quit()

In [17]:
# States to scrape; each entry is passed to scrape_state_trails by
# scrape_all_trails, which opens a Chrome session to do the scraping.
states = ["Texas"]
scrape_all_trails(states)

total num of hikes: 
20
num of ratings: 
1061
num of ratings: 
704
