In [1]:
import os
import re
import glob
import time
import numpy as np
import pandas as pd
import xarray as xr 
import requests
from bs4 import BeautifulSoup
import dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
dotenv.load_dotenv(dotenv.find_dotenv()) #.env file in the same directory as this script

True

### Download FLDAS and GLDAS netcdf files

In [2]:
# Credentials fo LP DAAC
# you will require an EarthData login
fldas_host = fr"https://hydro1.gesdisc.eosdis.nasa.gov/data/FLDAS/FLDAS_NOAH01_C_GL_M.001/"
gldas_host = fr"https://hydro1.gesdisc.eosdis.nasa.gov/data/GLDAS/GLDAS_CLSM025_DA1_D.2.2/"

login = os.getenv('user')
password = os.getenv('pwd')

hosts=[fldas_host,gldas_host]

out_dir =r"D:/VUB/_data/gldas_monthly_data"
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

#### FLDAS Files

In [77]:
# list folders from which to download
r = requests.get(gldas_host, verify=True, stream=True,auth=(login,password))

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(r.text, "html.parser")
folders = list()
for link in soup.findAll('a', attrs={'href': re.compile("\d{4}/")}): #e.g. 2000/ #fldas
    # Append unique links to the list
    href = link.get('href')
    if href not in folders:
        folders.append(href)
print(f"{len(folders)} folders found")

0 folders found


In [73]:
#subset of data to download
subset_folders=folders[0:2]
subset_folders

['1982/', '1983/']

#### GLDAS files with concurrent features

In [3]:
# Year range
year_folders = np.arange(2004, 2024)  # ['2004/', '2005/', ... '2023/']

# Manually decode the months
months = [f"{str(i).zfill(2)}/" for i in range(1, 13)]  # ['01/', '02/', ... '12/']

exceptions = list()

def download_file(url, local_path):
    try:
        r = requests.get(url, verify=True, stream=True, auth=(login, password), timeout=60)
        r.raise_for_status()
        with open(local_path, "wb") as f:
            f.write(r.content)
        print(f"Downloaded {local_path}", end='\r')
    except Exception as error:
        print(f"Error downloading {url}: {error}")
        exceptions.append(url)

# Use ThreadPoolExecutor for concurrent downloads
with ThreadPoolExecutor(max_workers=6) as executor:  # Adjust max_workers based on your bandwidth
    futures = []
    
    for year_folder in year_folders:
        for month in months:
            month_url = f"{gldas_host}{year_folder}/{month}"
            r = requests.get(month_url, verify=True, stream=True, auth=(login, password))
            
            if r.status_code == 200:
                soup = BeautifulSoup(r.text, "html.parser")
                
                for link in soup.findAll('a', attrs={'href': re.compile(r'\.nc4$')}):
                    file_name = link.get('href')
                    full_file_path = f"{month_url}{file_name}"
                    local_filename = file_name.split("/")[-1]
                    local_path = f"{out_dir}/{local_filename}"
                    
                    # Submit the download task to the ThreadPoolExecutor
                    futures.append(executor.submit(download_file, full_file_path, local_path))
            else:
                print(f"Failed to access {month_url}, status code: {r.status_code}", end='\r')
        
        # Pause after processing each year
        print(f"Processed year {year_folder}, pausing for 5 seconds...", end='\r')
        time.sleep(5)

    # Wait for all futures to complete
    for future in as_completed(futures):
        future.result()  # To raise any exceptions that occurred during download

# Optional: print or log exceptions
if exceptions:
    print(f"Some files could not be downloaded. Check the following paths:")
    for path in exceptions:
        print(path)

Error downloading https://hydro1.gesdisc.eosdis.nasa.gov/data/GLDAS/GLDAS_CLSM025_DA1_D.2.2/2004/06/GLDAS_CLSM025_DA1_D.A20040612.022.nc4: HTTPSConnectionPool(host='hydro1.gesdisc.eosdis.nasa.gov', port=443): Max retries exceeded with url: /data/GLDAS/GLDAS_CLSM025_DA1_D.2.2/2004/06/GLDAS_CLSM025_DA1_D.A20040612.022.nc4 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error downloading https://hydro1.gesdisc.eosdis.nasa.gov/data/GLDAS/GLDAS_CLSM025_DA1_D.2.2/2004/06/GLDAS_CLSM025_DA1_D.A20040613.022.nc4: HTTPSConnectionPool(host='hydro1.gesdisc.eosdis.nasa.gov', port=443): Max retries exceeded with url: /data/GLDAS/GLDAS_CLSM025_DA1_D.2.2/2004/06/GLDAS_CLSM025_DA1_D.A20040613.022.nc4 (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1006)')))
Error downloading https://hydro1.gesdisc.eosdis.nasa.gov/data/GLDAS/GLDAS_CLSM025_DA1_D.2.2/2004/06/

#### FLDAS files

In [None]:
# Initialize a list to keep track of any exceptions
exceptions = list()

# Loop through folders and process files
for folder in subset_folders:
    folder_url = f"{fldas_host}/{folder}"
    r = requests.get(folder_url, verify=True, stream=True, auth=(login, password))
    soup = BeautifulSoup(r.text, "html.parser")
    
    for link in soup.findAll('a', attrs={'href': re.compile(".nc4$")}):
        file_name = link.get('href')
        full_file_path = f"{folder_url}/{file_name}"  # Concatenate folder and file

        local_filename = full_file_path.split('/')[-1]
        print(f"Downloading {local_filename}", end='\r')
        
        try:
            # Make the request to download the file
            r = requests.get(full_file_path, verify=True, stream=True, auth=(login, password))
            
            # Save the file with its original name in the specified directory
            local_path = f"{out_dir}/{local_filename}"
            
            with open(local_path, "wb") as f:
                f.write(r.content)
        
        except Exception as error:
            print(f"Error downloading {full_file_path}: {error}", end='\r')
            exceptions.append(full_file_path)
    
    # Pause every 10 folders to avoid overloading the server
    if (folders.index(folder) + 1) % 10 == 0:
        print(f"Processed {folders.index(folder) + 1} folders, pausing for 5 seconds...", end='\r')
        time.sleep(5)

print("\n" f"Download process completed with {len(exceptions)} exceptions")

#### Extract ROI


In [50]:
#Read files
files = glob.glob(r'D:\VUB\_data\fldas_monthly_data/*.nc')

min_lat = -1.75
max_lat = 5.55
min_lon = 33.75
max_lon = 40.75

ds_list=[]
for file in files[:]:
    files.sort()
    ds=xr.open_dataset(file)
    #extract the region of interest
    ds_roi=ds.sel(X=slice(min_lon, max_lon),Y=slice(min_lat,max_lat)) #this is the order of coordinates that works to clip the data

    #rename coordinates
    ds_roi=ds_roi.rename({'X':'lon','Y':'lat'})

    ds_list.append(ds_roi)

ds_all=xr.concat(ds_list,dim='time')

In [None]:
ds_all.to_netcdf(r'D:\VUB\_data\fldas_monthly_data\fldas_all_vars_2000_2023.nc')

In [70]:
ds_all