In [None]:
# imports
import os
from urllib.parse import urljoin, urlparse

import requests
import scrapy
from scrapy.crawler import CrawlerProcess

In [None]:
# root folder (relative to the working directory) that receives every
# downloaded file; one subfolder per project is created underneath it
DOWNLOAD_DIR = "downloaded_files"

In [None]:
# make sure the download root exists before the crawl starts;
# exist_ok=True keeps re-running this cell from raising when it already does
os.makedirs(name=DOWNLOAD_DIR, exist_ok=True)

In [None]:
class FileDownloaderSpider(scrapy.Spider):
    """Download the files of every "DP" project listed on the EXAMINE index.

    The crawl has two levels:

    1. ``parse`` reads the main index page and follows every link inside a
       ``<pre>`` element whose text starts with ``"DP"``.
    2. ``parse_project_page`` reads each followed page and hands every file
       link found there to ``download_file``.

    Files are written to ``DOWNLOAD_DIR/<subfolder>/<file name>``, where
    ``<subfolder>`` is the text of the link that was followed on the main page.
    """

    name = "file_downloader"
    start_urls = ["https://gq.mines.gouv.qc.ca/documents/EXAMINE/"]  # main page
    # toggle a no-download run of the code to test that the code is working
    dry_run = False
    # (connect, read) timeouts in seconds for a file download, so that one
    # stalled connection cannot hang the whole crawl
    file_timeout = (10, 120)
    # number of bytes written to disk at a time while streaming a file
    chunk_size = 64 * 1024
    # link texts of the listing's column headers; these are not files
    header_strings = ("Name", "Last modified", "Size", "Description")

    # go through main page links
    def parse(self, response):
        """Yield one request per main-page link whose text starts with "DP".

        The link text travels with the request in ``meta["subfolder"]`` so the
        project page callback knows which folder its files belong to.
        """
        for link in response.css("pre a"):
            link_text = link.css("::text").get()
            href = link.css("::attr(href)").get()

            # an anchor without text or without a target cannot be followed
            if not link_text or not href:
                continue

            # only go through links with text starting with DP
            if link_text.startswith("DP"):
                yield scrapy.Request(
                    urljoin(response.url, href),
                    callback=self.parse_project_page,
                    meta={"subfolder": link_text},
                )

    # go through the results of main page
    # this code only goes through results starting with DP because of the yield above
    def parse_project_page(self, response):
        """Download every file linked from a project page.

        Column-header links are skipped here; links that do not point at a
        file (directories, the parent directory, sort links) are skipped by
        ``download_file``.
        """
        subfolder = response.meta.get("subfolder", "misc")

        for link in response.css("pre a"):
            file_link_text = link.css("::text").get()
            href = link.css("::attr(href)").get()

            # exclude header strings and anchors without a target
            if not href or file_link_text in self.header_strings:
                continue

            self.download_file(urljoin(response.url, href), subfolder)

    @staticmethod
    def _filename_from_url(file_url):
        """Return the last path segment of *file_url*, or "" if there is none.

        Only the URL path is considered, so a query string or fragment never
        ends up in the file name. URLs whose path ends with "/" (directories,
        or a sort link resolved against the listing page) yield "".
        """
        filename = os.path.basename(urlparse(file_url).path)
        # "." and ".." are directory references, never file names
        return "" if filename in (".", "..") else filename

    def download_file(self, file_url, subfolder):
        """Save *file_url* under ``DOWNLOAD_DIR/<subfolder>/``.

        In dry-run mode the download is only logged. Otherwise the response is
        streamed to a temporary ``.part`` file that is renamed once complete,
        so an interrupted download never leaves a truncated file under the
        final name. Network, HTTP-status and file-system errors are logged and
        do not abort the crawl.

        NOTE: ``requests.get`` is a blocking call made outside Scrapy's
        downloader, so the crawl settings (delays, AutoThrottle, user agent)
        do not apply to these file downloads.
        """
        filename = self._filename_from_url(file_url)
        if not filename:
            # directory / parent-directory / sort links have nothing to save
            self.logger.debug(f"Skipping non-file link: {file_url}")
            return

        # dry_run = True
        if self.dry_run:
            self.logger.info(f"This is a dry run. Would download: {file_url} into folder: {subfolder}")
            return

        # dry_run = False
        # subfolder is based on the text that was clicked through on the main page
        subfolder_path = os.path.join(DOWNLOAD_DIR, subfolder)
        local_filename = os.path.join(subfolder_path, filename)
        partial_filename = local_filename + ".part"

        try:
            os.makedirs(subfolder_path, exist_ok=True)
            with requests.get(file_url, stream=True, timeout=self.file_timeout) as r:
                # never store a 404/500 error page as if it were the file
                r.raise_for_status()
                with open(partial_filename, "wb") as f:
                    for chunk in r.iter_content(chunk_size=self.chunk_size):
                        f.write(chunk)
            os.replace(partial_filename, local_filename)
        except (requests.RequestException, OSError) as e:
            self.logger.error(f"Failed to download {file_url}: {e}")
            # best-effort cleanup: the partial file may not exist at all
            try:
                os.remove(partial_filename)
            except OSError:
                pass
        else:
            self.logger.info(f"Downloaded: {file_url} -> {local_filename}")

In [3]:
# Configure the crawler: a base delay between requests, at most one request
# in flight, and AutoThrottle on top of that.
process = CrawlerProcess(
    settings={
        "DOWNLOAD_DELAY": 2,
        "RANDOMIZE_DOWNLOAD_DELAY": True,
        "CONCURRENT_REQUESTS": 1,
        # use autothrottle to dynamically change crawling speed based on server speeds
        "AUTOTHROTTLE_ENABLED": True,
        "AUTOTHROTTLE_START_DELAY": 1,
        "AUTOTHROTTLE_MAX_DELAY": 10,
        "AUTOTHROTTLE_TARGET_CONCURRENCY": 0.5,
        "USER_AGENT": "Mozilla/5.0 (compatible; mybot/0.1)",
    }
)

# register the spider, then run the crawl
process.crawl(FileDownloaderSpider)
process.start()

2025-04-11 14:29:34 [scrapy.utils.log] INFO: Scrapy 2.12.0 started (bot: scrapybot)
2025-04-11 14:29:34 [scrapy.utils.log] INFO: Versions: lxml 5.3.2.0, libxml2 2.11.9, cssselect 1.3.0, parsel 1.10.0, w3lib 2.3.1, Twisted 24.11.0, Python 3.9.20 (main, Oct  3 2024, 07:38:01) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 25.0.0 (OpenSSL 3.4.1 11 Feb 2025), cryptography 44.0.2, Platform Windows-10-10.0.26100-SP0
2025-04-11 14:29:34 [scrapy.addons] INFO: Enabled addons:
[]
2025-04-11 14:29:34 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2025-04-11 14:29:34 [scrapy.extensions.telnet] INFO: Telnet Password: 7861ee91ec3e63fe
2025-04-11 14:29:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats',
 'scrapy.extensions.throttle.AutoThrottle']
2025-04-11 14:29:34 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'AUTOTHRO