In [1]:
# imports
import os
import re
from urllib.parse import unquote, urljoin, urlsplit

import requests
import scrapy
from scrapy.crawler import CrawlerProcess

In [None]:
# folder where downloaded files are saved: "ExampleDownloadSpot", next to the current working directory
DOWNLOAD_DIR = os.path.normpath(
    os.path.join(os.getcwd(), os.pardir, "ExampleDownloadSpot")
)

In [None]:
# make sure the download folder is present; an already existing folder is left untouched
os.makedirs(name=DOWNLOAD_DIR, exist_ok=True)

In [3]:
class FileDownloaderSpider(scrapy.Spider):
    '''
    Spider that follows matching folder links on the main page and downloads the files they list.

    parse() reads the links inside the "pre" element of the main page and follows those whose
    text starts with "DP" and matches folder_pattern. parse_project_page() then passes every
    file link of a followed page to download_file(), which saves the file below DOWNLOAD_DIR
    in a subfolder named after the link text from the main page.

    Note: the files themselves are fetched with requests, not through Scrapy's downloader.
    '''
    name = "file_downloader"
    start_urls = ["https://gq.mines.gouv.qc.ca/documents/EXAMINE/"]  # main page
    dry_run = True  # toggle a no-download run of the code to test that the code is working
    # link text must begin with 2 letters, followed by any characters, then at least 6 digits in a row
    folder_pattern = re.compile(r'^[a-zA-Z]{2}.*\d{6,}')
    request_timeout = 60  # seconds to wait on the server before a download is abandoned
    chunk_size = 64 * 1024  # bytes written to disk per chunk while streaming a download

    # go through main page links
    def parse(self, response):
        '''
        Parses the main page and requests every subfolder page whose link text meets the criteria.

        Args:
            self (class): the instance of the FileDownloaderSpider.
            response (scrapy.http.Response): the response from start_urls.

        Returns:
            Yields one scrapy.Request per matching link. The request is handled by
            parse_project_page and carries the link text as "subfolder" in its meta.
        '''
        for link in response.css("pre a"):
            link_text = link.css("::text").get()
            href = link.css("::attr(href)").get()

            # a link without text cannot be matched, and one without a target cannot be followed
            if not link_text or not href:
                continue

            # only go through links whose text starts with the desired prefix ...
            if not link_text.startswith("DP"):
                continue

            # ... and matches the specified pattern
            if not self.folder_pattern.match(link_text):
                continue

            yield scrapy.Request(
                urljoin(response.url, href),
                callback=self.parse_project_page,
                meta={"subfolder": link_text},
            )

    # go through the results of main page
    # only pages requested by parse() arrive here, so every page already meets the pattern
    def parse_project_page(self, response):
        '''
        Iterates over the file links of a subfolder page and calls download_file for each of them.

        Header links, links without a target and links that point to a directory are skipped.

        Args:
            self (class): the instance of the FileDownloaderSpider.
            response (scrapy.http.Response): the response of a given subfolder page that fits the criteria.

        Returns:
            None. Calls download_file with the full file URL and the subfolder name passed in parse()
            ("misc" when the request carried no subfolder).
        '''
        header_strings = {"Name", "Last modified", "Size", "Description"}
        subfolder = response.meta.get("subfolder", "misc")

        for link in response.css("pre a"):
            file_link_text = link.css("::text").get()
            href = link.css("::attr(href)").get()

            # exclude header strings
            if file_link_text in header_strings:
                continue

            if not href:
                continue

            full_url = urljoin(response.url, href)

            # a path ending in "/" names a directory (e.g. a parent or nested folder), not a file
            if urlsplit(full_url).path.endswith("/"):
                self.logger.debug(f"Skipping directory link: {full_url}")
                continue

            self.download_file(full_url, subfolder)

    def download_file(self, file_url, subfolder):
        '''
        Downloads one file into a folder named after the relevant subfolder, creating the folder if needed.

        With dry_run enabled nothing is fetched or written; the intended download is only logged.
        Failures are logged and never raised, so one bad file does not stop the remaining downloads.

        Args:
            self (class): the instance of the FileDownloaderSpider.
            file_url (string): the full URL to a file that should be downloaded.
            subfolder (string): comes from the link name on the main page. used to organize downloaded files.

        Returns:
            None. On success the file is stored in DOWNLOAD_DIR/<subfolder>/<file name>.
        '''
        # dry_run = True
        if self.dry_run:
            self.logger.info(f"This is a dry run. Would download: {file_url} into folder: {subfolder}")
            return

        # dry_run = False
        # name the local file after the URL path only, so that query strings and
        # percent-escapes do not end up in the file name
        filename = os.path.basename(unquote(urlsplit(file_url).path))
        if not filename:
            self.logger.error(f"Failed to download {file_url}: the URL does not name a file")
            return

        # subfolder is based on the text that was clicked through on the main page
        subfolder_path = os.path.join(DOWNLOAD_DIR, subfolder)
        os.makedirs(subfolder_path, exist_ok=True)
        local_filename = os.path.join(subfolder_path, filename)
        # written first and renamed on success, so an interrupted download never looks complete
        partial_filename = local_filename + ".part"

        try:
            with requests.get(file_url, stream=True, timeout=self.request_timeout) as r:
                # an HTTP error page must not be saved as if it were the file
                r.raise_for_status()
                with open(partial_filename, "wb") as f:
                    for chunk in r.iter_content(chunk_size=self.chunk_size):
                        f.write(chunk)
            os.replace(partial_filename, local_filename)
        except Exception as e:
            # deliberately broad: this is a best-effort, per-file boundary
            self.logger.error(f"Failed to download {file_url}: {e}")
            try:
                os.remove(partial_filename)
            except OSError:
                # nothing to clean up (the partial file was never created or is already gone)
                pass
        else:
            self.logger.info(f"Downloaded: {file_url} -> {local_filename}")

In [None]:
# crawl settings: a single request at a time with a randomized delay between requests
crawler_settings = {
    "DOWNLOAD_DELAY": 2,
    "RANDOMIZE_DOWNLOAD_DELAY": True,
    "CONCURRENT_REQUESTS": 1,
    # use autothrottle to dynamically change crawling speed based on server speeds
    "AUTOTHROTTLE_ENABLED": True,
    "AUTOTHROTTLE_START_DELAY": 1,
    "AUTOTHROTTLE_MAX_DELAY": 10,
    "AUTOTHROTTLE_TARGET_CONCURRENCY": 0.5,
    "USER_AGENT": "Mozilla/5.0 (compatible; mybot/0.1)",
}

# register the spider with a crawler process built from those settings, then run it
process = CrawlerProcess(settings=crawler_settings)
process.crawl(FileDownloaderSpider)
process.start()