In [80]:
# load the packages needed to run the code
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import re
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import ChromeType

In [81]:
# specify the path where you want to save the .csv file
#outpath = '/Users/spk/Documents/Code/Python/webscrape_proj/outputs/banff_national_park_trailforks.csv'
out_name = 'canmore.csv'
# specify the url that you want to scrape from
url = 'https://www.trailforks.com/region/canmore/trails/'

In [82]:
options = webdriver.ChromeOptions()
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument('--no-sandbox') 
options.add_argument('--disable-dev-shm-usage')
options.add_argument('window-size=1920,1080')

driver = webdriver.Chrome(ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install(),
                         options=options)

#driver.maximize_window()
#driver.execute_script("document.body.style.zoom='50%'")
timeout = 5

driver.get(url)

print('Prepping data:')
df_out = pd.DataFrame(columns=['title', 'trail_url', 'region', 'region_url', 'star_rating', 'star_votes',
                                   'descent', 'climb'])

get_meta = True
while get_meta:
    trail_table_elem = WebDriverWait(driver, timeout)\
        .until(ec.presence_of_element_located((By.ID, 'trails_table')))

    col_names = []
    tbl_heads = driver.find_elements(By.TAG_NAME, 'thead')
    for th in tbl_heads:
        for th_i in th.find_elements(By.TAG_NAME, 'th'):
            col_names.append(th_i.text)
    tbl_rows = trail_table_elem.find_elements(By.TAG_NAME, "tr")
    if tbl_heads is not None:
        print('Found table')
        total = driver.find_element(By.CLASS_NAME, 'resultTotal')
    print('Looping through table on page: ' + total.text)
    idx=0
    for idx_row, row in enumerate(tbl_rows):
        display(str(idx_row + 1) + " of " + str(len(tbl_rows) + 1))
        try:
            row_cols = row.find_elements(By.TAG_NAME, "td")
            title = row_cols[col_names.index('title')].text
            trail_url = row_cols[col_names.index('title')].find_element(By.TAG_NAME, 'a').get_attribute('href')
            region = row_cols[col_names.index('riding area')].text
            region_url = row_cols[col_names.index('riding area')].find_element(By.TAG_NAME, 'a').get_attribute('href')
            star_votes = row_cols[col_names.index('rating')].text
            star_rating = row_cols[col_names.index('rating')]\
                .find_element(By.CLASS_NAME, 'hovertip').get_attribute('data-score')
            distance = row_cols[col_names.index('distance')].text
            descent = row_cols[col_names.index('descent')].text
            climb = row_cols[col_names.index('climb')].text

            df_tmp = pd.DataFrame(dict(title=title,
                                       trail_url=trail_url,
                                       region=region,
                                       region_url=region_url,
                                  star_rating=star_rating,
                                       star_votes=star_votes,
                                  distance=distance,
                                  descent=descent,
                                  climb=climb), index=[idx])
            df_out = df_out.append(df_tmp)
            del [title, trail_url, region, region_url, star_rating, distance, descent,
                 df_tmp]
            idx += 1
        except IndexError:
            display('skipping row ' + str(idx))
    next_page = driver.find_element(By.CLASS_NAME, 'next-page')
    if next_page is not None:
        try:
            next_url = next_page.find_element(By.TAG_NAME, 'a').get_attribute('href')
            print('Page complete. Opening next page.\n')
            driver.get(next_url)
        except NoSuchElementException:
            get_meta=False
    else:
        get_meta=False

print('finshed!')
print('\n\nLooping through found trails to get details')
for idx_row, row in df_out.iterrows():
    display(str(idx_row + 1) + " of " + str(len(df_out) + 1))
    # Open a new window
    driver.execute_script("window.open('');")
    # Switch to the new window
    driver.switch_to.window(driver.window_handles[1])
    driver.get(row['trail_url'])

    trail_deets = driver.find_element(By.ID, 'traildetails_display')
    trail_spec_terms = trail_deets.find_elements(By.XPATH, "//li//div[@class='term']")
    trail_spec_defs = trail_deets.find_elements(By.XPATH, "//li//div[contains(@class, 'def')]")
    for idx_spec, spec in enumerate(trail_spec_terms):
        #print(spec.text)
        if spec.text == "Activities":
            trail_activities = trail_spec_defs[idx_spec].find_elements(By.CLASS_NAME, 'badgesquare')
            df_out.loc[idx_row, 'activities'] = ','.join([x.text for x in trail_activities])
        if spec.text == 'Difficulty Rating':
            df_out.loc[idx_row, 'difficulty_rating'] = trail_spec_defs[idx_spec]\
                .find_element(By.XPATH, "//span[contains(@class, 'dicon')]")\
                .get_attribute('title')
        if spec.text == 'Voted Difficulty':
            df_out.loc[idx_row, 'voted_difficulty'] = trail_spec_defs[idx_spec]\
                .find_element(By.XPATH, "//span[contains(@class, 'dicon')]")\
                .get_attribute('title')
        if spec.text == 'Trail Type':
            df_out.loc[idx_row, 'trail_type'] = trail_spec_defs[idx_spec].text
        if spec.text == 'Bike Type':
            df_out.loc[idx_row, 'bike_type'] = trail_spec_defs[idx_spec].text.replace(" ", "")
        if spec.text == 'Trail Usage':
            df_out.loc[idx_row, 'trail_usage'] = trail_spec_defs[idx_spec].text
        if spec.text == 'Direction':
            df_out.loc[idx_row, 'direction'] = trail_spec_defs[idx_spec].text
        if spec.text == 'Climb Difficulty':
            df_out.loc[idx_row, 'climb_difficulty'] = trail_spec_defs[idx_spec] \
                .find_element(By.XPATH, "//span[contains(@class, 'dicon')]") \
                .get_attribute('title')
        if spec.text == 'Physical Rating':
            df_out.loc[idx_row, 'physical_rating'] = trail_spec_defs[idx_spec].text
        if spec.text == 'Dogs Allowed':
            df_out.loc[idx_row, 'dogs_allowed'] = trail_spec_defs[idx_spec].text
        if spec.text == 'eBike Allowed':
            df_out.loc[idx_row, 'ebike_allowed'] = trail_spec_defs[idx_spec].text
        if spec.text == 'Global Ranking':
            df_out.loc[idx_row, 'global_ranking'] = re.findall('(?<=#)[0-9]+', trail_spec_defs[idx_spec].text)[0]
            df_out.loc[idx_row, 'global_ranking_type'] = re.findall('(?<=in )(.*)', trail_spec_defs[idx_spec].text)[0]
        if spec.text == 'Local Popularity':
            df_out.loc[idx_row, 'local_popularity'] = re.findall('^([^in]*) in*', trail_spec_defs[idx_spec].text)[0]
            lp_xtra = re.findall('[+]', trail_spec_defs[idx_spec].text)
            if len(lp_xtra) != 0:
                df_out.loc[idx_row, 'local_popularity_type'] = re.sub(' \[\+\]', '', re.findall('(?<=in )(.*)',
                                                                          trail_spec_defs[idx_spec].text)[0])
                lp_xtras1 = trail_spec_defs[idx_spec].find_element(By.ID, 'popularity_activity')
                lp_xtras2 = lp_xtras1.find_elements(By.TAG_NAME, 'li')
                for idx_lp_xtra, lp_xtra in enumerate(lp_xtras2):
                    lp_pop = lp_xtra.find_elements(By.TAG_NAME, "span")
                    df_out.loc[idx_row, 'local_popularity_' + str(idx_lp_xtra + 1)] = \
                        lp_pop[0].get_attribute('innerHTML')
                    df_out.loc[idx_row, 'local_popularity_type' + str(idx_lp_xtra + 1)] = \
                        re.findall('(?<=in )(.*)', lp_pop[1].get_attribute('innerHTML'))[0]
            else:
                df_out.loc[idx_row, 'local_popularity_type'] = re.findall('(?<=in )(.*)',
                                                                          trail_spec_defs[idx_spec].text)[0]
    driver.close()
    driver.switch_to.window(driver.window_handles[0])


with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_out)

df_out.to_csv(out_name, index_label='index')

[WDM] - Current google-chrome version is 87.0.4280
[WDM] - Get LATEST driver version for 87.0.4280
[WDM] - Driver [/Users/spk/.wdm/drivers/chromedriver/mac64/87.0.4280.88/chromedriver] found in cache


 
Prepping data:
Found table
Looping through table on page: Displaying 1 - 100 of 142
1 of 102
skipping row
2 of 102
3 of 102
4 of 102
5 of 102
6 of 102
7 of 102
8 of 102
9 of 102
10 of 102
11 of 102
12 of 102
13 of 102
14 of 102
15 of 102
16 of 102
17 of 102
18 of 102
19 of 102
20 of 102
21 of 102
22 of 102
23 of 102
24 of 102
25 of 102
26 of 102
27 of 102
28 of 102
29 of 102
30 of 102
31 of 102
32 of 102
33 of 102
34 of 102
35 of 102
36 of 102
37 of 102
38 of 102
39 of 102
40 of 102
41 of 102
42 of 102
43 of 102
44 of 102
45 of 102
46 of 102
47 of 102
48 of 102
49 of 102
50 of 102
51 of 102
52 of 102
53 of 102
54 of 102
55 of 102
56 of 102
57 of 102
58 of 102
59 of 102
60 of 102
61 of 102
62 of 102
63 of 102
64 of 102
65 of 102
66 of 102
67 of 102
68 of 102
69 of 102
70 of 102
71 of 102
72 of 102
73 of 102
74 of 102
75 of 102
76 of 102
77 of 102
78 of 102
79 of 102
80 of 102
81 of 102
82 of 102
83 of 102
84 of 102
85 of 102
86 of 102
87 of 102
88 of 102
89 of 102
90 of 102
91 of 102


KeyboardInterrupt: 

In [77]:
next_page = driver.find_element(By.CLASS_NAME, 'next-page')

In [78]:
next_page

<selenium.webdriver.remote.webelement.WebElement (session="319d73f98933782d7a0bdac3184e1113", element="d0f03e15-c8d2-4601-bbeb-3836aa436ea9")>