In [0]:
%pip install tqdm

In [0]:
# Unity Catalog location (catalog.schema.volume) used by the cells below, both in
# the CREATE SCHEMA / CREATE VOLUME statements and in the /Volumes/... paths.
catalog = "lucasbruand_catalog"
schema = "ign_bdtopo"
volume = "bronze_volume"

In [0]:
# Create the target schema first, then the volume inside it. Both statements use
# IF NOT EXISTS, so they do nothing when the objects are already there.
# NOTE(review): `spark` is not defined in this notebook; presumably the session
# object injected by the Databricks runtime — confirm.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}")
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog}.{schema}.{volume}")

In [0]:
# Parameters selecting which archive to download; each one is substituted into
# the URL template of build_download_url.
version_date = "2025-09-15"  # date suffix of the archive name
version = "3-5"
package = "GPKG"  # presumably the GeoPackage delivery format — TODO confirm
projection = "LAMB93"
dept = "D001"  # presumably a département code — TODO confirm

In [0]:
def build_download_url(version_date,
                       version,
                       package,
                       projection,
                       dept):
    """
    Build the download URL of a BDTOPO "TOUSTHEMES" .7z archive on data.geopf.fr.

    The archive name is used twice in the URL: once as the enclosing folder and
    once (with the '.7z' suffix) as the file name.
    Args:
        version_date: Date suffix of the archive name (e.g., '2025-09-15').
        version: Product version as written in the archive name (e.g., '3-5').
        package: Package code as written in the archive name (e.g., 'GPKG').
        projection: Projection code as written in the archive name (e.g., 'LAMB93').
        dept: Department code as written in the archive name (e.g., 'D001').
    Returns:
        str: The full HTTPS URL of the archive.
    """
    archive_name = f"BDTOPO_{version}_TOUSTHEMES_{package}_{projection}_{dept}_{version_date}"
    return f"https://data.geopf.fr/telechargement/download/BDTOPO/{archive_name}/{archive_name}.7z"

In [0]:
# Resolve the archive URL for the parameters chosen above and show it, so it can
# be checked by eye before the download starts.
url = build_download_url(
    version_date=version_date,
    version=version,
    package=package,
    projection=projection,
    dept=dept,
)
print(url)

In [0]:
import requests
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from tqdm import tqdm


def download_file_with_retries(url, dest_path, max_retries=5, backoff_factor=1.0, chunk_size=1024*1024):
    """
    Download a file from a URL to a Databricks volume with retry logic and progress bar.

    The retry policy is the one urllib3 applies when issuing the GET request
    (listed 5xx statuses); an error raised while the body is being streamed is
    not retried. If the transfer fails or ends short of the announced
    Content-Length, the partially written file is deleted before the exception
    propagates, so a truncated archive is never left at dest_path.
    Args:
        url (str): The URL to download from.
        dest_path (str): The full destination path in the Databricks volume (e.g., '/Volumes/catalog/schema/volume/filename').
        max_retries (int): Maximum number of retries for failed requests.
        backoff_factor (float): Backoff factor for retries.
        chunk_size (int): Size of chunks to write at a time (default 1MB).
    Raises:
        OSError: If the file cannot be written, or if fewer/more bytes than the
            announced Content-Length were received.
        Exception: Any error raised by the HTTP session (bad status, connection
            failure, timeout) is printed and re-raised unchanged.
    """
    # Local import: this function's cell runs before the notebook's `import os` cell.
    import os

    retries = Retry(
        total=max_retries,
        backoff_factor=backoff_factor,
        status_forcelist=[500, 502, 503, 504],
        allowed_methods=["GET"]
    )

    file_created = False  # only delete dest_path if THIS call (re)created it
    bytes_written = 0
    try:
        # The session is a context manager so its connection pool is released.
        with requests.Session() as session:
            session.mount('https://', HTTPAdapter(max_retries=retries))
            session.mount('http://', HTTPAdapter(max_retries=retries))

            with session.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                total_size = int(r.headers.get('content-length', 0))
                # total=None makes tqdm show a plain counter when the size is unknown.
                with open(dest_path, 'wb') as f, tqdm(total=total_size or None, unit='B', unit_scale=True, desc=dest_path.split('/')[-1]) as pbar:
                    file_created = True
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        if chunk:
                            f.write(chunk)
                            bytes_written += len(chunk)
                            pbar.update(len(chunk))

                # Content-Length counts bytes on the wire; iter_content yields decoded
                # bytes, so the comparison is only meaningful without Content-Encoding.
                if total_size and not r.headers.get('content-encoding') and bytes_written != total_size:
                    raise IOError(
                        f"Incomplete download: expected {total_size} bytes, got {bytes_written}"
                    )
        print(f"Download completed: {dest_path}")
    except Exception as e:
        print(f"Download failed: {e}")
        if file_created:
            # Best-effort cleanup; the original error is the one worth reporting.
            try:
                os.remove(dest_path)
            except OSError:
                pass
        raise


In [0]:
import os

def create_volume_file_from_url(url, volume_prefix):
    """
    Build the destination path inside a Databricks volume for the file behind a URL.

    Despite its name, this function only computes a path: nothing is downloaded
    and nothing is created on disk. The file name is the last segment of the
    URL's path component, so a query string or fragment never ends up in it.
    Args:
        url (str): The URL the file will be downloaded from.
        volume_prefix (str): The volume path prefix (e.g., '/Volumes/catalog/schema/volume').
    Returns:
        str: The full destination path (volume_prefix joined with the file name).
    Raises:
        ValueError: If the URL path does not end with a file name (e.g., it ends with '/').
    """
    from urllib.parse import urlsplit

    # Use only the path component: 'file.7z?token=x#part' must yield 'file.7z'.
    filename = os.path.basename(urlsplit(url).path)
    if not filename:
        # Without this guard the result would be the prefix directory itself.
        raise ValueError(f"Cannot derive a file name from URL: {url!r}")
    return os.path.join(volume_prefix, filename)


In [0]:
# Destination of the archive: the URL's file name placed under the 'fileprefix'
# folder of the volume.
dest_path = create_volume_file_from_url(
    url=url,
    volume_prefix=f"/Volumes/{catalog}/{schema}/{volume}/fileprefix/",
)

In [0]:
from pathlib import Path

def create_base_directories(filepath):
    """
    Make sure the directory that will contain `filepath` exists.

    Missing intermediate directories are created as well; an already existing
    directory is left untouched. The file itself is not created.
    Args:
        filepath (str): The full file path for which to create base directories.
    Returns:
        str: The parent directory of `filepath`.
    """
    parent_dir = Path(filepath).parent
    parent_dir.mkdir(exist_ok=True, parents=True)
    return str(parent_dir)


In [0]:
# The destination folder must exist before the download opens the file for writing.
create_base_directories(filepath=dest_path)
download_file_with_retries(url=url, dest_path=dest_path)

In [0]:
# Derive the checksum file's URL and volume path from the archive URL by swapping
# the '.7z' extension for '.md5'.
# NOTE: the download below is commented out, so at this point the checksum is
# neither fetched nor compared with the archive.
md5_url = url.replace(".7z", ".md5")
md5_dest_path = create_volume_file_from_url(
    url=md5_url,
    volume_prefix=f"/Volumes/{catalog}/{schema}/{volume}/fileprefix/",
)
#download_file_with_retries(md5_url, md5_dest_path)