## Web Scrapping
Scraping the TFL website: (https://cycling.data.tfl.gov.uk) in order to get the links of the files we are interested in.

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Set up Selenium options
options = ChromeOptions()
options.headless = True

# Set up the Selenium service
service = Service(ChromeDriverManager().install())

# Choose the URL to scrape
url = "https://cycling.data.tfl.gov.uk"

# Create a new instance of the Chrome driver
driver = webdriver.Chrome(service=service, options=options)

# Navigate to the URL
driver.get(url)

# Wait until at least a single element of the table exists
wait = WebDriverWait(driver, 20)
html = wait.until(EC.presence_of_element_located((By.XPATH, '/html/body/div[2]/table/tbody/tr[1]/td[1]')))

# Get the page source
html_element = driver.page_source

# Close the browser
driver.quit()

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html_element, "html.parser")

# Find the relevant files with their links
table = soup.find('table')
tbody = table.find('tbody')
folder_name = "usage-stats/"
capture_files = False
year = 2022
filetype = 'csv'
extracted_files = {}

for row in tbody.find_all('tr'):
    columns = row.find_all('td')

    if capture_files == False:
        col_values = [col.text.strip() for col in columns]

        if col_values[0] == folder_name:
            capture_files = True
            continue

    else:
        col = columns[0]
        filename = col.text.strip()

        if not filename.endswith(f'{year}.{filetype}'):
            continue

        # Extract the date
        filename_without_extension = filename.replace(f'.{filetype}', '')
        filename_last_date = filename_without_extension.split('-')[-1]
        extracted_files[filename_last_date] = col.a['href']

print('Display 2 items in the dictionary')
print(dict(list(extracted_files.items())[0:2]))

  options.headless = True


Display 2 items in the dictionary
{}
