In [None]:
import requests
import os
from calendar import monthrange

In [None]:
# Remote source: root folder on the file server, plus the filename stem
# that every daily file name starts with
base_url = "https://www.ncei.noaa.gov/thredds/fileServer/OisstBase/NetCDF/V2.1/AVHRR"
url_cont = "oisst-avhrr-v02r01"

# Local destination for the downloaded files
download_dir = "../data/raw"

# Create the destination (including missing parents); a no-op when it already exists
os.makedirs(name=download_dir, exist_ok=True)

In [None]:
# Download nc files and log errors
def download_nc_files(base_url, url_cont, download_dir, years, months, days,
                      log_path="failed_downloads.log", timeout=60):
    """Download one ``.nc`` file per valid (year, month, day) combination.

    For every date, the URL ``{base_url}/{YYYYMM}/{url_cont}.{YYYYMMDD}.nc`` is
    requested and the body is stored in ``download_dir`` under
    ``{url_cont}.{YYYYMMDD}.nc``. Combinations that are not real calendar dates
    (e.g. February 30, month 13, day 0) are skipped silently.

    Failures never abort the loop: a non-200 status or any exception raised
    while requesting/writing is printed and the URL is appended to ``log_path``.

    Args:
        base_url: URL prefix that contains the per-month ``YYYYMM`` folders.
        url_cont: Filename stem placed before the date in every file name.
        download_dir: Existing directory that receives the files.
        years: Iterable of integer years.
        months: Iterable of integer months.
        days: Iterable of integer days of the month.
        log_path: File to which failed URLs are appended, one per line.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the loop indefinitely.

    Returns:
        None.
    """
    def is_valid_date(y, m, d):
        # monthrange raises a ValueError subclass for months outside 1..12;
        # non-integer input surfaces as TypeError. Both mean "not a date".
        try:
            return 1 <= d <= monthrange(y, m)[1]
        except (TypeError, ValueError):
            return False

    def log_failure(url):
        with open(log_path, "a") as log_file:
            log_file.write(f"{url}\n")

    def discard(path):
        # Best-effort cleanup of a partially written temporary file
        try:
            os.remove(path)
        except OSError:
            pass

    for year in years:
        for month in months:
            for day in days:
                if not is_valid_date(year, month, day):
                    continue

                file_name = f'{url_cont}.{year}{month:02d}{day:02d}.nc'
                nc_url = f'{base_url}/{year}{month:02d}/{file_name}'
                file_path = os.path.join(download_dir, file_name)
                # Stream into a sibling temp file so the final name only ever
                # refers to a completely written download
                part_path = file_path + ".part"

                try:
                    # The context manager releases the connection on every path
                    with requests.get(nc_url, stream=True, timeout=timeout) as response:
                        if response.status_code == 200:
                            with open(part_path, 'wb') as f:
                                for chunk in response.iter_content(chunk_size=8192):
                                    f.write(chunk)
                            os.replace(part_path, file_path)
                            print(f"Downloaded {file_name}")
                        else:
                            print(f"Failed to download {file_name}: {response.status_code}")
                            log_failure(nc_url)
                except Exception as e:
                    discard(part_path)
                    print(f"Error downloading {file_name}: {e}")
                    log_failure(nc_url)

# First pass: fetch days 1-5 of January-March for 2024 and 2025
download_nc_files(
    base_url,
    url_cont,
    download_dir,
    years=[2024, 2025],
    months=[1, 2, 3],
    days=[1, 2, 3, 4, 5],
)

In [None]:
def retry_failed_downloads(log_file_path, download_dir, timeout=60):
    """Re-request every URL listed in a failure log.

    Reads ``log_file_path`` (one URL per line; blank lines and surrounding
    whitespace are ignored, repeated URLs are tried once). Each URL is saved in
    ``download_dir`` under the last path segment of the URL. URLs that fail
    again (non-200 status or any exception) are written to a fresh retry log
    whose name is the original log name with ``_retry`` inserted before the
    extension, e.g. ``failed_downloads.log`` -> ``failed_downloads_retry.log``.

    A URL whose target file already exists is skipped without any check of the
    file's contents, so a truncated file left there by an interrupted earlier
    download is not repaired by this function.

    Args:
        log_file_path: Path of the log holding the failed URLs.
        download_dir: Existing directory that receives the files.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot block the loop indefinitely.

    Returns:
        None. Prints a message and returns early when the log does not exist.
    """
    if not os.path.exists(log_file_path):
        print("No failed downloads log found.")
        return

    with open(log_file_path, "r") as log_file:
        stripped = (line.strip() for line in log_file)
        # dict.fromkeys drops repeated URLs while keeping first-seen order
        failed_urls = list(dict.fromkeys(url for url in stripped if url))

    # Only the final extension is touched; str.replace would also rewrite
    # ".log" inside directory names and do nothing for other extensions
    root, ext = os.path.splitext(log_file_path)
    new_log_file_path = f"{root}_retry{ext}"

    # Start every retry pass with an empty retry log
    with open(new_log_file_path, "w"):
        pass

    def log_failure(url):
        with open(new_log_file_path, "a") as new_log_file:
            new_log_file.write(f"{url}\n")

    def discard(path):
        # Best-effort cleanup of a partially written temporary file
        try:
            os.remove(path)
        except OSError:
            pass

    for url in failed_urls:
        file_name = url.split("/")[-1]
        file_path = os.path.join(download_dir, file_name)

        if os.path.exists(file_path):
            print(f"{file_name} already exists, skipping.")
            continue

        # Stream into a sibling temp file so the final name only ever refers
        # to a completely written download
        part_path = file_path + ".part"

        try:
            # The context manager releases the connection on every path
            with requests.get(url, stream=True, timeout=timeout) as response:
                if response.status_code == 200:
                    with open(part_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    os.replace(part_path, file_path)
                    print(f"Successfully downloaded {file_name} on retry.")
                else:
                    print(f"Failed to download {file_name} again: {response.status_code}")
                    log_failure(url)
        except Exception as e:
            discard(part_path)
            print(f"Error downloading {file_name} on retry: {e}")
            log_failure(url)

# Second pass: re-request every URL recorded in the failure log
retry_failed_downloads(
    log_file_path="failed_downloads.log",
    download_dir=download_dir,
)