In [1]:
import os
import re
from collections import defaultdict
import glob
import time
# import numpy as np
import pandas as pd
#import rasterio as rio
from osgeo import gdal
import requests
from bs4 import BeautifulSoup
import dotenv
dotenv.load_dotenv(dotenv.find_dotenv()) #.env file in the same directory as this script

True

> This script is intended to download MODIS data from the LP DAAC archive.
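> The files to be downloaded are .hdf files. This description refers to MODIS ET and PET (MOD16A2GF.061, https://e4ftl01.cr.usgs.gov/MOLT/MOD16A2GF.061/). Note that the `host` configured below currently points to MOD17A2HGF.061 and the output directory is named MODIS_GPP, so update `host` and `out_dir` together when switching products.
> Each hdf file contains both of these variables, and the files are stored in timestamped folders.
> To download the data, you need an Earthdata Login account; the credentials are read from a `.env` file.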

> Reference: https://github.com/bpostance/learn_data_engineering/blob/main/earth.observation/modis/MCD64A1v061-burned-areas/00.ETL-MODIS.ipynb

Earth data login credentials

In [2]:
# Credentials for LP DAAC.
# An Earthdata Login account is required. The username and password are read
# from the `user` and `pwd` environment variables, which the import cell loads
# from the .env file via dotenv.
host = 'https://e4ftl01.cr.usgs.gov/MOLT/MOD17A2HGF.061/'
login = os.getenv('user')
password = os.getenv('pwd')

# Fail early with a clear message: os.getenv returns None for unset variables,
# and every later request would otherwise be sent without usable credentials.
if not login or not password:
    raise RuntimeError(
        "Earthdata credentials not found: set 'user' and 'pwd' in the .env file "
        "or in the environment before running this notebook."
    )

# Local folder that receives the downloaded .hdf files.
out_dir = r"E:\_data\vegetation_dynamics_EA\HDF_FILES\MODIS_GPP"

# exist_ok=True makes this a no-op when the folder already exists and avoids
# the race between checking for the folder and creating it.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# List the dated sub-folders (e.g. "2000.03.05/") available under `host`.
# stream=True is not used: reading r.text pulls the whole body anyway.
r = requests.get(host, verify=True, auth=(login, password), timeout=60)

# Stop on an HTTP error (401, 404, ...) instead of parsing the error page and
# silently reporting "0 folders found".
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")

# Raw string so that \d is a regex token rather than an invalid string escape;
# the dots are escaped so only literal YYYY.MM.DD/ hrefs match.
folder_pattern = re.compile(r"\d{4}\.\d{2}\.\d{2}/")
folders = [link.get('href') for link in soup.find_all('a', href=folder_pattern)]
print(f"{len(folders)} folders found")

In [16]:
# # list files in folders
# for f in folders[:]:
#     file_list = list()
#     folder_url = f"{host}/{f}"
#     r = requests.get(folder_url, verify=True, stream=True,auth=(login,password))
#     soup = BeautifulSoup(r.text, "html.parser")
#     for link in soup.findAll('a', attrs={'href': re.compile(".hdf$")}):
#         file_list.append(link.get('href'))    
# print(f"{len(file_list)} files found in folder n")

In [17]:
# hreg = re.compile("h21") # Kenya is in h21 and h22
# vreg = re.compile("v0[8-9]") # Kenya is in v08 and v09 ("v0[8-9]")
# ken_files = list()
# for fl in file_list:
#     h = hreg.search(fl)
#     if h:
#         v = vreg.search(h.string)
#         if v:
#             ken_files.append(v.string)
# print(f"{len(ken_files)} tiles found covering the area of interest")

> Since MODIS data is stored in numbered grids, select tiles corresponding to areas of interest.  
> Kenya lies in 4 grids: h21v08, h22v08, h21v09, h22v09 so select only these tiles for download.  
> https://modis-land.gsfc.nasa.gov/MODLAND_grid.html

In [None]:
# Fetch the list of dated folders (e.g. "2000.03.05/") under `host`.
r = requests.get(host, verify=True, auth=(login, password), timeout=60)
r.raise_for_status()  # an error page would otherwise parse to "0 folders found"
soup = BeautifulSoup(r.text, "html.parser")
folders = [
    link.get('href')
    for link in soup.find_all('a', href=re.compile(r"\d{4}\.\d{2}\.\d{2}/"))
]
print(f"{len(folders)} folders found")

# Compile regex for Kenyan tiles. Together these select the four tiles
# h21v08, h22v08, h21v09 and h22v09; narrow the character classes to fetch
# the tiles in separate batches.
hreg = re.compile(r"h2[12]")   # Kenya is in h21 and h22
vreg = re.compile(r"v0[89]")   # Kenya is in v08 and v09
hdf_reg = re.compile(r"\.hdf$")  # escaped dot: match the real ".hdf" extension only

# List files in folders and filter for Kenyan tiles
ken_files = list()
failed_folders = list()  # folder URLs whose listing could not be retrieved

# `host` ends with "/" and the folder hrefs end with "/"; strip them so the
# joined URLs contain single slashes only.
base_url = host.rstrip('/')
for f in folders:
    folder_url = f"{base_url}/{f.strip('/')}"
    try:
        r = requests.get(f"{folder_url}/", verify=True, auth=(login, password), timeout=60)
        r.raise_for_status()
    except requests.RequestException as error:
        # Record the failure instead of silently treating the folder as empty.
        print(f"Error listing {folder_url}: {error}")
        failed_folders.append(folder_url)
        continue

    soup = BeautifulSoup(r.text, "html.parser")
    for link in soup.find_all('a', href=hdf_reg):
        file_name = link.get('href')
        # Match the tile id against the file name only, not the whole URL.
        if hreg.search(file_name) and vreg.search(file_name):
            full_file_path = f"{folder_url}/{file_name}"  # Concatenate folder and file
            ken_files.append(full_file_path)
            print(f"Found Kenyan tile: {full_file_path}", end="\r")

# The summary ends with a normal newline so later output cannot overwrite it.
print("\n" f"The loop has found {len(ken_files)} tiles altogether")
if failed_folders:
    print(f"{len(failed_folders)} folder listings failed; see failed_folders")

#### Download the files

In [None]:
# Download every URL in `ken_files` into `out_dir`, collecting failed URLs in
# `exceptions`.
exceptions = list()
for e, file_url in enumerate(ken_files):
    local_filename = file_url.split('/')[-1]
    # Save the file with its original name in the specified directory
    local_path = os.path.join(out_dir, local_filename)
    # Write to a temporary name first so an interrupted transfer never leaves a
    # truncated file under the final .hdf name.
    partial_path = local_path + ".part"
    print(f"Downloading {local_filename}", end='\r')
    # Pause every 10 files to avoid overloading the server
    if (e + 1) % 10 == 0:
        print(f"Processed {e + 1} files, pausing for 5 seconds...", end='\r')
        time.sleep(5)
    try:
        # The context manager closes the connection even when an error occurs.
        with requests.get(file_url, verify=True, stream=True,
                          auth=(login, password), timeout=60) as response:
            # Without this check a 401/404 body would be written to disk as if
            # it were the HDF file and counted as a success.
            response.raise_for_status()
            if response.headers.get('Content-Type', '').startswith('text/html'):
                raise ValueError("received an HTML page instead of an HDF file "
                                 "(check the Earthdata credentials)")

            # Stream in chunks; response.content would hold the whole file in memory.
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, local_path)

    except Exception as error:
        print(f"Error downloading {file_url}: {error}")
        exceptions.append(file_url)
        # Drop the incomplete temporary file, if one was started.
        if os.path.exists(partial_path):
            os.remove(partial_path)

print("\n" f"Downloaded {len(ken_files) - len(exceptions)} files with {len(exceptions)} exceptions")

#### Combined listing and download

In [3]:
# Fetch the list of dated folders (e.g. "2000.03.05/") under `host`.
r = requests.get(host, verify=True, auth=(login, password), timeout=60)
r.raise_for_status()  # an error page would otherwise parse to "0 folders found"
soup = BeautifulSoup(r.text, "html.parser")
folders = [
    link.get('href')
    for link in soup.find_all('a', href=re.compile(r"\d{4}\.\d{2}\.\d{2}/"))
]
print(f"{len(folders)} folders found")

# Compile regex for Kenyan tiles. Together these select the four tiles
# h21v08, h22v08, h21v09 and h22v09; narrow the character classes to fetch
# the tiles in separate batches.
hreg = re.compile(r"h2[12]")   # Kenya is in h21 and h22
vreg = re.compile(r"v0[89]")   # Kenya is in v08 and v09
hdf_reg = re.compile(r"\.hdf$")  # escaped dot: match the real ".hdf" extension only

# Initialize a list to keep track of any exceptions. It collects the URLs of
# files that failed to download and of folders whose listing failed.
exceptions = list()

# `host` ends with "/" and the folder hrefs end with "/"; strip them so the
# joined URLs contain single slashes only.
base_url = host.rstrip('/')

# Loop through folders and process files. enumerate supplies the running
# position directly instead of searching the list with folders.index().
for folder_number, folder in enumerate(folders, start=1):
    folder_url = f"{base_url}/{folder.strip('/')}"
    try:
        r = requests.get(f"{folder_url}/", verify=True, auth=(login, password), timeout=60)
        r.raise_for_status()
        links = BeautifulSoup(r.text, "html.parser").find_all('a', href=hdf_reg)
    except requests.RequestException as error:
        # Record the failure instead of silently treating the folder as empty.
        print(f"Error listing {folder_url}: {error}")
        exceptions.append(folder_url)
        links = []

    for link in links:
        file_name = link.get('href')
        # Match the tile id against the file name only, not the whole URL.
        if not (hreg.search(file_name) and vreg.search(file_name)):
            continue

        full_file_path = f"{folder_url}/{file_name}"  # Concatenate folder and file
        local_filename = full_file_path.split('/')[-1]
        # Save the file with its original name in the specified directory
        local_path = os.path.join(out_dir, local_filename)
        # Write to a temporary name first so an interrupted transfer never
        # leaves a truncated file under the final .hdf name.
        partial_path = local_path + ".part"
        print(f"Downloading {local_filename}", end='\r')

        try:
            # The context manager closes the connection even when an error occurs.
            with requests.get(full_file_path, verify=True, stream=True,
                              auth=(login, password), timeout=60) as response:
                # Without this check a 401/404 body would be written to disk as
                # if it were the HDF file and counted as a success.
                response.raise_for_status()
                if response.headers.get('Content-Type', '').startswith('text/html'):
                    raise ValueError("received an HTML page instead of an HDF file "
                                     "(check the Earthdata credentials)")

                # Stream in chunks; response.content would hold the whole file in memory.
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
            os.replace(partial_path, local_path)

        except Exception as error:
            # Print on its own line so the next progress message cannot overwrite it.
            print(f"Error downloading {full_file_path}: {error}")
            exceptions.append(full_file_path)
            # Drop the incomplete temporary file, if one was started.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # Pause every 10 folders to avoid overloading the server
    if folder_number % 10 == 0:
        print(f"Processed {folder_number} folders, pausing for 5 seconds...", end='\r')
        time.sleep(5)

print("\n" f"Download process completed with {len(exceptions)} exceptions")

1104 folders found
Downloading MOD17A2HGF.A2023361.h22v09.061.2024021050114.hdf
Download process completed with 0 exceptions


In [None]:
# Display the URLs recorded in `exceptions` by whichever download cell was run last.
exceptions

#### Check for missing files in the directory

In [None]:
# Count the downloaded .hdf files per calendar month and report the months in
# the expected range for which no file is present.

# Inclusive range of months (YYYY-MM) that should be covered by the download.
expected_start = '2000-01'
expected_end = '2020-12'

# List of file paths
file_paths = glob.glob(os.path.join(out_dir, "*.hdf"))

# Dictionary to hold counts of files keyed by "YYYY-MM"
file_count_by_date = defaultdict(int)

# Regular expression to extract the year and day of year (DOY),
# e.g. ".A2023361." -> ("2023", "361")
pattern = re.compile(r'\.A(\d{4})(\d{3})\.')

for file_path in file_paths:
    # Search the file name only, so directory names can never produce a match.
    match = pattern.search(os.path.basename(file_path))
    if not match:
        continue
    year, doy = match.groups()

    # Convert DOY to month
    month = pd.to_datetime(f"{year}-{doy}", format='%Y-%j').strftime('%Y-%m')

    # Increment count for the month
    file_count_by_date[month] += 1

# Print the results
for month, count in sorted(file_count_by_date.items()):
    print(f"{month}: {count} file(s)")

# To identify missing months.
# period_range includes both end points. The previous
# date_range(..., end='2020-12', freq='M') produced month-end dates up to
# 2020-11-30 only, so December 2020 was never checked; the 'M' alias is also
# deprecated for date_range in recent pandas releases.
all_months = pd.period_range(start=expected_start, end=expected_end, freq='M').strftime('%Y-%m')
missing_months = set(all_months) - set(file_count_by_date)

print("\nMissing months:")
for month in sorted(missing_months):
    print(month)
