# Download from Swift Bucket functions

This notebook defines the functions used to list, open, and download files stored in a Swift bucket:

* [list_data_on_bucket](#list_data_on_bucket)
* [open_data_from_bucket](#open_data_from_bucket)
* [download_data_from_bucket](#download_data_from_bucket)

In [25]:
import requests
import io
import os

import numpy as np
import xarray as xr

from pprint import pprint

import tempfile

### <a id='list_data_on_bucket'></a>`list_data_on_bucket`

In [1]:
def list_data_on_bucket(base_url):
    """
    Sends a GET request to the specified base URL and returns the response body as a list of paths.

    The response text is split on newline characters, one list element per line.
    If the response ends with a newline, the last element of the list is an empty string.
    Prints 'Status ok' when the server answers with HTTP 200.

    Parameters:
    base_url (str): The base URL of the bucket or endpoint from which the data is retrieved.

    Returns:
    list of str: A list containing all paths (as strings) obtained from the response.

    Raises:
    requests.exceptions.HTTPError: If the server answers with a 4xx or 5xx status code.
    """

    # Send a GET request to the base URL
    response = requests.get(base_url)

    # Raises only for 4xx/5xx answers; other statuses fall through
    response.raise_for_status()
    if response.status_code == requests.codes.ok:
        print('Status ok\n')

    return response.text.split('\n')

### <a id='open_data_from_bucket'></a>`open_data_from_bucket`

In [3]:
def open_data_from_bucket(data_path, group=None):
    """
    Fetches a NetCDF file from a specified data path (typically from a cloud bucket), and returns it as an xarray dataset.

    The `data_path` must be a full URL combining the base URL, the path to the folder, and the file name.
    The function retrieves the file over HTTP in chunks, writes it to a temporary file, loads it into
    memory using the netCDF4 engine, and removes the temporary file again. Because the dataset is fully
    loaded, it does not depend on the temporary file after this function returns.

    Parameters:
    data_path (str): The full URL path to the NetCDF file to be opened.
                     Example format: base_url + path_to_folder + filename
                     Example:
                     https://s3.waw3-2.cloudferro.com/swift/v1/act6/data/s3/frp/
                     S3A_SL_2_FRP____20230821T080123_20230821T080423_20240821T020918_0180_102_249______MAR_D_NR_003.SEN3/
                     FRP_Merged_MWIR1kmStandard_SWIR1km.nc
    group (str, optional): Path of the NetCDF group to open, passed through to xarray.
                           Defaults to None.

    Returns:
    ds (xarray.Dataset): The loaded NetCDF dataset, or None if the server did not answer
                         with HTTP 200 (a failure message is printed in that case).

    Example Usage:
    - base_url = "https://s3.waw3-2.cloudferro.com/swift/v1/act6/"
    - path_to_folder = "data/s3/frp/S3A_SL_2_FRP____20230821T080123_20230821T080423_20240821T020918_0180_102_249______MAR_D_NR_003.SEN3/"
    - filename = "FRP_Merged_MWIR1kmStandard_SWIR1km.nc"
    - data_path = base_url + path_to_folder + filename

    """
    tmp_path = None
    try:
        # The context manager releases the connection even if writing fails
        with requests.get(data_path, stream=True) as response:
            if response.status_code != 200:
                print(f"Failed to fetch file: {response.status_code}")
                return None

            # Stream the body straight to disk instead of buffering it in memory first
            with tempfile.NamedTemporaryFile(delete=False, suffix='.nc') as tmp_file:
                tmp_path = tmp_file.name
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # Filter out keep-alive new chunks
                        tmp_file.write(chunk)

        # load_dataset reads everything into memory and closes the file,
        # so the temporary file can be removed safely afterwards
        ds = xr.load_dataset(tmp_path, engine='netcdf4', group=group)
    finally:
        # Clean up the temporary file on success and on failure alike
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)

    print(f"File opened: {data_path}")
    return ds

### <a id='download_data_from_bucket'></a>`download_data_from_bucket`

In [None]:
def download_data_from_bucket(base_url, data_paths, download_path):
    """
    Downloads files from a base URL using a list of data paths and saves them locally.
    Skips downloading if the file already exists.

    Each file is first written to '<file name>.part' and renamed to its final name only
    after the download has finished, so an interrupted download is never mistaken for an
    existing file on the next run. Paths without a file name component (empty strings or
    paths ending in '/') are skipped.

    Args:
        base_url (str): The base URL where the files are stored.
        data_paths (list): A list of file paths (relative to the base URL) to download.
        download_path (str): The local directory where the files should be saved.
            It is created, including missing parent directories, if it does not exist.

    Returns:
        list: A list of file names that were successfully downloaded or already exist.
    """
    # makedirs handles nested directories and an already existing directory
    os.makedirs(download_path, exist_ok=True)

    file_names = []
    for path in data_paths:
        base_name = path.split("/")[-1]
        if not base_name:
            # Would otherwise resolve to the download directory itself
            print(f"Skipping path without a file name: {path!r}")
            continue

        file_name = os.path.join(download_path, base_name)

        # Check if file already exists
        if os.path.exists(file_name):
            print(f"File already exists: {file_name}")
            file_names.append(file_name)
            continue

        data_url = base_url + path
        partial_name = file_name + ".part"

        # The context manager releases the connection on every exit path
        with requests.get(data_url, stream=True) as response:
            if response.status_code != 200:
                print(f"Failed to download file: {response.status_code}")
                continue

            completed = False
            try:
                # Open the file in write mode and save the streamed content
                with open(partial_name, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # Filter out keep-alive new chunks
                            f.write(chunk)
                # Publish under the final name only once the download is complete
                os.replace(partial_name, file_name)
                completed = True
            finally:
                # Do not leave a truncated file behind after an error or interrupt
                if not completed and os.path.exists(partial_name):
                    os.remove(partial_name)

        print(f"Downloaded file: {file_name}")
        file_names.append(file_name)

    return file_names