In [None]:
# Switch the kernel's working directory to the parent folder; presumably this is
# so the relative "./assets/..." paths in the configuration cell resolve — TODO confirm.
# NOTE(review): not idempotent — every re-run of this cell climbs one more level.
%cd ..

In [None]:
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, TimeoutException

from os.path import dirname, abspath
from tqdm.auto import tqdm

import urllib.request
import zipfile
import uuid
import os


**Assuming that the browser object is already set up with:**

    opts = Options()
    opts.set_headless()
    assert opts.headless  
    browser = Firefox(options=opts)
    browser.get('HTTPS_ADDRESS_OF_YOUR_CHOICE')

**EXAMPLE 1:**

Find search form field:

    search_form = browser.find_element_by_id('element_id')

Fill in search form field:

    search_form.send_keys('real python')
    
Submit search form field:

    search_form.submit()
    
Check out the top result with:

    results = browser.find_elements_by_class_name('result')
    print(results[0].text)

Close browser object before exiting your Python session:

    browser.close()
    quit()

**EXAMPLE 2:**

Click on a specific playbutton:

    browser.find_element_by_class_name('playbutton').click()
    
**EXAMPLE 3:**

Find elements with a specific class attribute, e.g. "discover-item":

    list_of_items = browser.find_elements_by_class_name('discover-item')

In [None]:
# Firefox options for the Selenium session created below.
opts = Options()
# Headless mode is switched off here, i.e. the session is meant to run with a
# visible browser window.
# NOTE(review): confirm the installed Selenium version still honours `Options.headless`.
opts.headless = False
# Start the Firefox instance that every later cell drives through `browser`.
browser = Firefox(options=opts)

In [None]:
# Configuration
# Product names to fetch; each one is looked up among the option values of the
# "#productSelect" dropdown in the download cell.
DESIRED_PRODUCT_LIST = ['National LIDAR Programme Point Cloud']
# Year to download, as a string; compared against the "#yearSelect" option values.
DESIRED_YEAR = '2021'
# When True and DESIRED_YEAR is not offered, the first entry of the year dropdown
# is downloaded instead (presumably the most recent year — TODO confirm ordering).
LATEST = True
# Directory the downloaded files are written into.
OUTPUT_DIR = "./assets/output_tiles"
# The .shp must be accompanied by its respective .dbf, .shx and .prj files
AOI_FILE_PATH = "./assets/aoi/large-wrington.shp" 

In [None]:
# utility functions

def compress_as_zip(aoi_file_path: str=None):
    """Bundle a shapefile and its sidecar files into a zip stored next to them.

    Every regular file in the shapefile's directory whose name starts with the
    shapefile's stem followed by a dot (``.shp``, ``.dbf``, ``.shx``, ``.prj``,
    ...) is added to ``<stem>.zip`` in that same directory. An archive left
    over from an earlier run is overwritten and is never added to itself.

    Args:
        aoi_file_path: Path to the ``.shp`` file of the area of interest.

    Returns:
        Absolute path of the zip archive that was written.
    """
    aoi_path = os.path.abspath(aoi_file_path)
    aoi_dir = os.path.dirname(aoi_path)

    # splitext copes with any extension length and with stems that contain dots.
    aoi_name = os.path.splitext(os.path.basename(aoi_path))[0]

    zip_file_name = aoi_name + ".zip"
    zip_file_path = os.path.join(aoi_dir, zip_file_name)

    list_of_aoi_files = []

    # Sorted so the member order does not depend on the directory listing order.
    for entry in sorted(os.listdir(aoi_dir)):
        if entry == zip_file_name:
            # The output archive shares the stem; writing it into itself would
            # corrupt the result on every re-run.
            continue
        if not entry.startswith(aoi_name + "."):
            continue
        candidate = os.path.join(aoi_dir, entry)
        if os.path.isfile(candidate):
            list_of_aoi_files.append(candidate)

    with zipfile.ZipFile(zip_file_path, 'w') as archive:
        for member in list_of_aoi_files:
            # Member names inside the archive follow ZipFile.write's default,
            # which derives them from the path passed in.
            archive.write(member, compress_type=zipfile.ZIP_DEFLATED)

    return zip_file_path

class DownloadProgressBar(tqdm):
    """tqdm bar whose ``update_to`` is used as a ``urlretrieve`` report hook."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """Advance the bar to the absolute position ``b * bsize``.

        Args:
            b: Number of blocks transferred so far.
            bsize: Size of one block.
            tsize: Total size; when it is not None it replaces the bar's ``total``.
        """
        if tsize is not None:
            self.total = tsize

        # tqdm.update takes an increment, so convert the absolute position
        # into the distance from the bar's current counter.
        transferred = b * bsize
        self.update(transferred - self.n)

def download_url(file_url, file_path):
    """Download ``file_url`` to ``file_path`` while showing a progress bar.

    The payload is first written to ``file_path + ".part"`` and only moved to
    ``file_path`` once the transfer has completed, so an interrupted download
    never leaves a truncated file under the final name. The destination
    directory is created when it does not exist yet.

    Args:
        file_url: Address of the file to fetch.
        file_path: Location the downloaded file is stored at.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` or the final
        rename; the partial file is removed before the exception propagates.
    """
    final_path = os.fspath(file_path)

    target_dir = os.path.dirname(final_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    partial_path = final_path + ".part"

    try:
        with DownloadProgressBar(
            unit="B", unit_scale=True, miniters=1, desc=file_url.split("/")[-1]
        ) as t:
            urllib.request.urlretrieve(file_url, filename=partial_path, reporthook=t.update_to)
        # Publish the file under its real name only after a complete transfer.
        os.replace(partial_path, final_path)
    except BaseException:
        # Also covers KeyboardInterrupt, so a cancelled cell leaves nothing behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise


# class: LidarTileScraper

In [None]:
# Zip the AOI shapefile together with the files sharing its name, then echo the
# archive path as the cell output.
aoi_zip_path = compress_as_zip(AOI_FILE_PATH)
aoi_zip_path

In [None]:
print("...waiting for page to load.")

# Open the Defra survey data download page in the Selenium-driven browser.
browser.get("https://environment.data.gov.uk/DefraDataDownload/?Mode=survey")

# Explicit-wait helper reused by the later cells; 300 is the timeout handed to
# WebDriverWait (seconds, per the Selenium API).
wait = WebDriverWait(browser, 300)

print("...page has loaded.")

In [None]:
print("...waiting for shapefile to load")

# The file input only has to be present: the archive path is typed into it
# directly, it is never clicked.
file_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#fileid")))

# send_keys takes plain strings, so hand over the path itself rather than a
# list wrapping it.
file_input.send_keys(aoi_zip_path)

# Keep a handle (E1) on the ".grid-item-container" element because the next
# cell clicks it. Waiting for "clickable" already covers presence and
# visibility, and the wait returns the element, so no second lookup is needed.
E1 = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".grid-item-container")))

print("...shapefile has loaded")

In [None]:
print("...waiting for available products to load")

# Bind the name up front: if the very first click below is rejected the loop
# exits immediately and the final print would otherwise raise NameError.
list_of_products = []

# Known hack: E1 is clicked repeatedly until Selenium reports it as no longer
# interactable. The product dropdown is re-read after every successful click,
# so the values from the last successful pass are the ones that are kept.
while True:

    try:
        E1.click()
    except ElementNotInteractableException:
        break

    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#productSelect")))

    product_dropdown = Select(
        browser.find_element(by=By.CSS_SELECTOR, value="#productSelect")
    )
    list_of_products = [
        option.get_attribute("value") for option in product_dropdown.options
    ]

print(f"List of products: {list_of_products}")

In [None]:
# For every requested product: select it, pick the year to fetch, then download
# each file linked in the ".data-ready-container" element into OUTPUT_DIR.

# Make sure the download target exists before the first file is written.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Prefix shared by all downloaded files: the AOI file name without its extension.
aoi_stem = os.path.splitext(os.path.basename(AOI_FILE_PATH))[0]

for desired_product in DESIRED_PRODUCT_LIST:

    if desired_product not in list_of_products:
        print(f"Desired product: {desired_product} is not available.")
        continue

    # XPath positions are 1-based while list indices are 0-based, hence the +1.
    product_xpath = (
        f'//*[@id="productSelect"]/option[{list_of_products.index(desired_product) + 1}]'
    )

    wait.until(EC.presence_of_element_located((By.XPATH, product_xpath)))
    browser.find_element(by=By.XPATH, value=product_xpath).click()

    # NOTE(review): the year options are read right after the click without an
    # explicit wait — confirm the dropdown is already populated at this point.
    year_dropdown = Select(
        browser.find_element(by=By.CSS_SELECTOR, value="#yearSelect")
    )
    available_years_list = [
        option.get_attribute("value") for option in year_dropdown.options
    ]

    if DESIRED_YEAR in available_years_list:
        year_index = available_years_list.index(DESIRED_YEAR)
    elif LATEST and available_years_list:
        # Fall back to the first entry of the dropdown.
        year_index = 0
    else:
        print(f"Desired year: {DESIRED_YEAR} is not available for {desired_product}.")
        continue

    year_to_be_downloaded = available_years_list[year_index]
    year_xpath = f'//*[@id="yearSelect"]/option[{year_index + 1}]'

    print(f"Year to be downloaded: {year_to_be_downloaded}")
    print("***")
    print(f"Desired year: {DESIRED_YEAR}")
    print("***")
    print(f"Available years: {available_years_list}")

    wait.until(EC.presence_of_element_located((By.XPATH, year_xpath)))
    browser.find_element(by=By.XPATH, value=year_xpath).click()

    linki = 1
    files_available = 0  # links whose file is on disk (fetched now or earlier)

    while True:
        # Only the lookup decides when the list of links is exhausted; any
        # other WebDriver error propagates instead of being retried forever.
        try:
            link = browser.find_element(
                by=By.CSS_SELECTOR,
                value=f".data-ready-container > a:nth-child({linki})",
            )
        except NoSuchElementException:
            print(f"{files_available} files downloaded for {year_to_be_downloaded}")
            break

        # Advance before doing any work so a failing link can never be retried
        # in an endless loop.
        linki += 1

        file_url = link.get_attribute("href")
        if not file_url:
            # Anchor without a target: nothing to download.
            continue

        file_path = os.path.join(
            OUTPUT_DIR,
            aoi_stem + "-" + file_url.split("/")[-1],
        )

        if os.path.isfile(file_path):
            files_available += 1
            continue

        try:
            download_url(file_url, file_path)
        except Exception as err:
            # Report the failure and carry on with the remaining links.
            print(f"Failed to download {file_url}: {err}")
            continue

        print(f"Saved to: {file_path}")
        files_available += 1

# quit() ends the whole WebDriver session, whereas close() only closes the
# current window.
browser.quit()