In [2]:
import os
import numpy as np
import pandas as pd
from osgeo import gdal
import glob
import xarray as xr
import matplotlib.pyplot as plt
from datetime import datetime
import subprocess

In [3]:
# Source and destination directories: input_hdf_dir is the folder scanned for
# *.hdf files and pp_output_dir is the root folder the GeoTIFFs are written under.
input_hdf_dir = r"D:\VUB\_data\modis_terra_LAI\hdf"
pp_output_dir = r"D:\VUB\_data\modis_terra_LAI\tif"

In [14]:
# Sample file used to inspect which subdatasets an HDF container holds.
fl=r"D:\VUB\_data\fldas_monthly_data\MOD11A1.A2000055.h21v08.061.2020043121008.hdf"

# Open the container once and fail loudly if GDAL cannot read it
# (gdal.Open returns None on failure unless GDAL exceptions are enabled).
sample_ds = gdal.Open(fl, gdal.GA_ReadOnly)
if sample_ds is None:
    raise RuntimeError(f"GDAL could not open {fl}")

# HDF files contain multiple subdatasets; each entry is a (name, description) tuple.
subdatasets = sample_ds.GetSubDatasets()
sample_ds = None  # drop the reference so GDAL closes the file

# The short layer name is the last ':'-separated field of the full subdataset
# name, so there is no need to open every subdataset just to list them.
subdataset_list = [entry[0].split(':')[-1] for entry in subdatasets]
subdataset_list

In [12]:
# Full GDAL name of the first subdataset (first element of its tuple)
subdatasets[0][0]

'HDF4_EOS:EOS_GRID:"D:\\VUB\\_data\\fldas_monthly_data\\MOD11A1.A2000055.h21v02.061.2020043121022.hdf":MODIS_Grid_Daily_1km_LST:LST_Day_1km'

In [15]:
# Open the HDF container and its first subdataset.
# gdal.Open returns None on failure unless GDAL exceptions are enabled,
# so check explicitly instead of failing later with an AttributeError.
dataset = gdal.Open(fl,gdal.GA_ReadOnly)
if dataset is None:
    raise RuntimeError(f"GDAL could not open {fl}")
subdataset =  gdal.Open(dataset.GetSubDatasets()[0][0], gdal.GA_ReadOnly)
if subdataset is None:
    raise RuntimeError(f"GDAL could not open the first subdataset of {fl}")

# gdalwarp: reproject the subdataset to EPSG:4326 and write it as a GeoTIFF
kwargs = {'format': 'GTiff', 'dstSRS': 'EPSG:4326'}
ds = gdal.Warp(destNameOrDestDS='example.tif',srcDSOrSrcDSTab=subdataset, **kwargs)
if ds is None:
    raise RuntimeError("gdal.Warp failed to create example.tif")

# Releasing the reference closes the output dataset so it is flushed to disk
# (same intent as the `del ds` used by the other conversion cells).
ds = None

### Convert HDF files to GeoTIFFs in the EPSG:4326 CRS

In [3]:
# Consult this reference: https://bpostance.github.io/posts/processing-large-spatial-datasets.md/
def convert_hdf_tiff_epsg4326(input_hdf_dir,pp_vars, out_file):
    """
    Convert MODIS HDF files to GeoTIFF files with EPSG:4326 projection.

    For every entry of ``pp_vars`` the folder ``<input_hdf_dir>/<pp_var>`` is
    scanned for ``*.hdf`` files. The first subdataset of each file (or the file
    itself when it exposes no subdatasets) is warped to EPSG:4326 and written to
    ``<pp_output_dir>/<pp_var>_500/<name>.tif``.

    Args:
        input_hdf_dir (str): Root directory holding one sub-folder per variable.
        pp_vars (list): Variable names; each is used as input sub-folder name
            and as prefix of the output sub-folder.
        out_file: Unused. Kept only so existing calls keep working; output
            paths are derived from the module-level ``pp_output_dir``.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    for pp_var in pp_vars:
        hdf_files = glob.glob(os.path.join(input_hdf_dir, f"{pp_var}","*.hdf"))
        for fl in hdf_files:
            # Use a separate local name so the `out_file` parameter is not clobbered
            out_file_name = os.path.splitext(os.path.basename(fl))[0] + '.tif'
            out_path = os.path.join(pp_output_dir, f"{pp_var}_500",out_file_name)
            os.makedirs(os.path.dirname(out_path), exist_ok=True)

            # open dataset
            dataset = gdal.Open(fl,gdal.GA_ReadOnly)
            if dataset is None:
                print(f"Failed to open {fl}")
                continue

            # Pick the source from THIS file. The previous code read a name
            # `subdataset` that is not defined in this function, so it either
            # raised NameError or re-warped whatever global happened to exist.
            sub_entries = dataset.GetSubDatasets()
            source = sub_entries[0][0] if sub_entries else dataset

            # gdalwarp
            kwargs = {'format': 'GTiff', 'dstSRS': 'EPSG:4326'}
            ds = gdal.Warp(destNameOrDestDS=out_path,srcDSOrSrcDSTab=source, **kwargs)
            if ds is None:
                print(f"Failed to convert {fl}")
            else:
                print(f"Converted {fl} to \n {out_path}",end='\r')

            # Release the handles so GDAL closes (and flushes) the files
            ds = None
            dataset = None
        print("Conversion Completed!")

In [None]:
# Reproject one existing GeoTIFF to EPSG:4326
fl = r"D:\VUB\_data\fldas_monthly_data\MODIS_2000_01_25_LE.tif"
out_file = r"D:\VUB\_data\fldas_monthly_data\MODIS_2000_01_25_LE_4326.tif"

# gdalwarp settings: GeoTIFF output in geographic coordinates
kwargs = dict(format='GTiff', dstSRS='EPSG:4326')

# The file has no subdatasets to pick from, so the opened dataset is warped directly
dataset = gdal.Open(fl, gdal.GA_ReadOnly)
ds = gdal.Warp(out_file, dataset, **kwargs)

# Remove both references (output first, then input)
del ds, dataset
print(f"Converted {fl} to \n {out_file}",end='\r')

#### For multiple subdatasets

In [22]:
def _pick_subdataset_name(subdatasets, pp_var, fallback_index=1):
    """Return the GDAL name of the subdataset whose layer is ``pp_var``.

    Falls back to the entry at ``fallback_index`` (the position that used to be
    hard-coded) when no layer name matches, and to None when that is missing too.
    """
    names = [entry[0] for entry in subdatasets]
    for name in names:
        # Full names look like driver:type:"file":grid:layer
        if name.split(':')[-1] == pp_var:
            return name
    if len(names) > fallback_index:
        return names[fallback_index]
    return None


def convert_hdf_tiff_epsg4326(input_hdf_dir, pp_vars, output_dir, max_files=12):
    """
    Convert MODIS HDF files to GeoTIFF files with EPSG:4326 projection.

    Every ``*.hdf`` file directly inside ``input_hdf_dir`` is processed once per
    entry of ``pp_vars``; the result is written LZW-compressed to
    ``<output_dir>/<pp_var>/<name>.tif``.

    Args:
        input_hdf_dir (str): Directory containing the HDF files.
        pp_vars (list): Layer names to extract (e.g., ['Lai_500m']). A name that
            matches no subdataset falls back to the second subdataset, which is
            what was always used before.
        output_dir (str): Directory to save the output GeoTIFF files.
        max_files (int | None): Convert only the first ``max_files`` files in
            sorted order. Defaults to 12 to preserve the previous behaviour;
            pass None to convert every file.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # Loop through each product variable
    for pp_var in pp_vars:
        # All HDF files in the input directory, in sorted order
        hdf_files = sorted(glob.glob(os.path.join(input_hdf_dir, "*.hdf")))
        selected = hdf_files if max_files is None else hdf_files[:max_files]
        if len(selected) < len(hdf_files):
            # Make the truncation visible instead of silently skipping files
            print(f"Converting the first {len(selected)} of {len(hdf_files)} HDF files (max_files={max_files})")

        for fl in selected:
            # Define output file path
            out_file_name = os.path.basename(fl).replace('.hdf', '.tif')
            out_file = os.path.join(output_dir, f"{pp_var}", out_file_name)

            # Create output directory if it doesn't exist
            os.makedirs(os.path.dirname(out_file), exist_ok=True)

            # Open the HDF file
            dataset = gdal.Open(fl, gdal.GA_ReadOnly)
            if dataset is None:
                print(f"Failed to open {fl}")
                continue

            # Get the subdataset according to the product variable
            subdataset = _pick_subdataset_name(dataset.GetSubDatasets(), pp_var)
            dataset = None  # only the name is needed from here on
            if not subdataset:
                print(f"No subdatasets found in {fl}")
                continue

            # Reproject with gdal.Warp. `gdal_translate -a_srs` only overrides
            # the CRS label and leaves the pixels and geotransform untouched,
            # so a grid stored in another projection would merely be mislabelled.
            try:
                warped = gdal.Warp(
                    destNameOrDestDS=out_file,
                    srcDSOrSrcDSTab=subdataset,
                    format='GTiff',
                    dstSRS='EPSG:4326',
                    creationOptions=['COMPRESS=LZW'],
                )
            except Exception as e:
                # Raised instead of returning None when GDAL exceptions are enabled
                print(f"Error converting {fl}: {e}")
                continue

            if warped is None:
                print(f"Failed to convert {fl}")
                continue

            warped = None  # close the output so it is flushed to disk
            print(f"Converted {fl} to {out_file}", end='\r')

    print("\n Conversion Completed!")

In [None]:
# Run the HDF -> GeoTIFF conversion for the 'Lai_500m' variable, reading from
# input_hdf_dir and writing under pp_output_dir.
pp_vars=['Lai_500m']
convert_hdf_tiff_epsg4326(input_hdf_dir,pp_vars, pp_output_dir)

### Extract subset of files by date threshold

In [None]:
def extract_date_from_filename(filename):
    """
    Extract the acquisition date from a filename whose third '_'-separated
    field is 'A' followed by the date as 'YYYYDDD' (year + day of year).

    Parameters:
    filename (str): The name of the file (no directory part).

    Returns:
    datetime: A datetime object representing the extracted date.

    Raises:
    IndexError: If the name has fewer than three '_'-separated fields.
    ValueError: If the characters after 'A' are not a valid 'YYYYDDD' date.
    """
    date_field = filename.split('_')[2]
    # Skip the leading 'A' and keep at most the seven YYYYDDD characters, so a
    # trailing extension or suffix in the same field (e.g. 'A2000032.tif')
    # no longer makes strptime fail with "unconverted data remains".
    date_str = date_field[1:8]
    return datetime.strptime(date_str, '%Y%j')

def filter_files_by_date(files, date_threshold):
    """
    Keep only the files whose filename date is strictly after a threshold.

    Parameters:
    files (list): List of file paths.
    date_threshold (datetime): Files dated on or before this are dropped.

    Returns:
    list: The matching file paths, in their original order.
    """
    # The date is parsed from the base name only, never from the directory part
    return [
        path
        for path in files
        if extract_date_from_filename(os.path.basename(path)) > date_threshold
    ]

In [None]:
# Collect the GeoTIFFs matching the GLASS12E01_V60 pattern and keep only those
# whose filename date is strictly later than 1 February 2000.
files = glob.glob(r"D:/VUB/_data/modis_gpp_npp/gpp_500/GLASS12E01_V60*.tif")
subset_tiles=filter_files_by_date(files, datetime(2000, 2, 1))

### Merge and clip MODIS GeoTIFF tiles by date

In [18]:
def merge_and_compress_tiff_files(input_files,bbox, output_file, compression='LZW'):
    """
    Merges multiple GeoTIFF files into a single GeoTIFF file, clips the result
    to a bounding box and compresses it.

    Args:
        input_files (list): List of input GeoTIFF file paths.
        bbox (list): Bounding box coordinates [minx, miny, maxx, maxy].
        output_file (str): Path to the output merged and compressed GeoTIFF file.
        compression (str): Compression type for the output file. Default is 'LZW'.

    Returns:
        None. If ``output_file`` already exists nothing is done.

    Raises:
        ValueError: If ``input_files`` is empty or an input cannot be opened.
        RuntimeError: If gdal.Warp does not produce an output dataset.
    """
    # Check the cheap skip condition first, before opening any input
    if os.path.exists(output_file):
        print(f"Output file {output_file} already exists. Skipping...", end='\r')
        return

    if not input_files:
        raise ValueError("No input files were provided")

    # Open the input files
    input_datasets = [gdal.Open(file) for file in input_files]

    # Check if all files were opened successfully and report the ones that were not
    unreadable = [path for path, ds in zip(input_files, input_datasets) if ds is None]
    if unreadable:
        raise ValueError(f"One or more input files could not be opened: {unreadable}")

    # Use gdal.Warp to merge the files, clip to the bounding box and apply compression
    merged = gdal.Warp(
        srcDSOrSrcDSTab=input_datasets,
        destNameOrDestDS=output_file,
        format='GTiff',
        outputBounds=[bbox[0], bbox[1], bbox[2], bbox[3]],
        creationOptions=['COMPRESS={}'.format(compression)],
    )
    # gdal.Warp returns None on failure unless GDAL exceptions are enabled;
    # do not report success in that case.
    if merged is None:
        raise RuntimeError(f"gdal.Warp failed to create {output_file}")
    merged = None  # close the output so it is flushed to disk

    print(f"Successfully merged and compressed files into {output_file}", end='\r')

In [None]:
# All per-tile GeoTIFFs, the clip window and the folder for the merged output
tiff_tiles=glob.glob(r'D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\PsNet_500m/*.tif')
bbox=[33.8, 0.0007, 39.5, 5.0]
output_dir = r"D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\subset_psnet"
# tiff_tiles = glob.glob(r'D:\VUB\_data\modis_gpp_npp\gpp_500/*.tif')

# Candidate years (2000-2023) and days of year (1-366). Every day of year is
# tried; combinations with no matching tile are simply skipped below.
years = np.arange(2000, 2024)
days = np.arange(1,367, 1)

# One merged, clipped file per acquisition date
for year in years:
    yr = str(year)
    for day in days:
        formatted_day = str(day).rjust(3, '0')
        date_token = f'A{yr}{formatted_day}'

        # Tiles belonging to this date are recognised by the 'A<YYYY><DDD>' token
        files_to_merge = [file for file in tiff_tiles if date_token in file]
        if not files_to_merge:
            continue

        output_file = os.path.join(output_dir,f'MOD17A2HGF.A{yr}{formatted_day}.tif')

        # Dates whose output already exists are skipped inside the function
        merge_and_compress_tiff_files(files_to_merge,bbox, output_file)

In [59]:
# Source tile and destination path for a single-file clipping example.
# NOTE(review): `input` shadows the Python builtin of the same name for the rest
# of the session; consider renaming it together with the cell that uses it.
input=r"D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\PsNet_500m\MOD17A2HGF.A2023329.h21v08.061.2024020184752.tif"
dest=r"D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\PsNet_500m\subset\MOD17A2HGF.A2023329.tif"

In [None]:
# Clip the example tile to bbox and write it as an LZW-compressed GeoTIFF
gdal.Warp(
    destNameOrDestDS=dest,
    srcDSOrSrcDSTab=input,
    format='GTiff',
    outputBounds=[bbox[0], bbox[1], bbox[2], bbox[3]],
    creationOptions=['COMPRESS={}'.format('LZW')],
)

In [None]:
# Pull year and day of year out of a merged file name of the form
# <product>.A<YYYY><DDD>.tif
file=r"D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\subset_Gpp\MOD17A2HGF.A2000177.tif"
basename=os.path.basename(file)

# Second '.'-separated field of the name without its extension, e.g. 'A2000177'
date_token = os.path.splitext(basename)[0].split('.')[1]
year = date_token[1:5]
doy = date_token[5:8]
doy

### Merge files to netCDF

In [16]:
def merge_Gtiffs_to_NetCDF(files, output_file):
    """
    Stack single-band GeoTIFFs along a time axis and save them as one NetCDF file.

    Band 1 of every file is read, the band's nodata value (if any) is replaced
    by NaN, and the arrays are concatenated in sorted filename order into a
    float32 DataArray named 'PsNet' with dims (time, lat, lon).

    Args:
        files (list): GeoTIFF paths. The second '.'-separated field of each
            base name must be 'A' + 'YYYYDDD' (e.g. 'MOD17A2HGF.A2000177.tif').
            The list itself is not modified.
        output_file (str): Path of the NetCDF file to write.

    Returns:
        None.

    Raises:
        ValueError: If ``files`` is empty or a file cannot be opened by GDAL.
    """
    if not files:
        raise ValueError("No GeoTIFF files were provided to merge")

    array_list=[]
    # sorted() works on a copy, so the caller's list keeps its order
    for file in sorted(files):

        #extract the year and DOY from the filename
        basename=os.path.basename(file)
        date_token = os.path.splitext(basename)[0].split('.')[1]
        year = date_token[1:5]
        doy = date_token[5:8]

        #convert year and DOY to datetime
        date = pd.to_datetime(year + doy, format='%Y%j')

        #read band 1 plus the georeferencing, then release the file
        ds = gdal.Open(file)
        if ds is None:
            raise ValueError(f"Could not open {file}")
        band = ds.GetRasterBand(1)
        arr = band.ReadAsArray()
        nodata = band.GetNoDataValue()
        b = ds.GetGeoTransform() #origin and pixel size
        # Everything needed is copied out, so both handles can be dropped now
        band = None
        ds = None

        #get the size and coordinates
        nlat,nlon = np.shape(arr)
        #pixel index times pixel size plus origin
        # NOTE(review): b[0]/b[3] are the raster origin, so these values mark a
        # pixel edge rather than its centre. Left unchanged so new files stay
        # aligned with ones already written by this function.
        lon = np.arange(nlon)*b[1]+b[0]
        lat = np.arange(nlat)*b[5]+b[3]

        #assign the coordinates to the array
        arr = xr.DataArray(arr,coords=[lat,lon],dims=['lat','lon'])

        #assign the date to the array
        arr = arr.expand_dims('time')
        arr['time'] = [date]

        #mask the nodata value; bands without one are left untouched
        if nodata is not None:
            arr = arr.where(arr!=nodata)

        #to reduce file size, convert to float32
        arr=arr.astype('float32')

        #set projection
        arr.attrs['crs'] = 'EPSG:4326'

        #assign variable properties
        # NOTE(review): 'scale' is stored as metadata only; the values are not
        # multiplied by it here.
        arr.attrs['units'] = 'kg*C/m^2'
        arr.attrs['long_name'] = 'MODIS Net Photosynthesis'
        arr.attrs['source'] = 'MODIS'
        arr.attrs['scale'] = 0.0001
        arr.attrs['Description'] = "The MOD17A2HGF Version 6 Gross Primary Productivity (GPP) product is a cumulative 8-day composite of values with 500 meter (m) pixel size. \n The data product includes information about GPP and Net Photosynthesis (PSN). The PSN band values are the GPP less the Maintenance Respiration (MR). The data product also contains a PSN Quality Control (QC) layer."

        array_list.append(arr)
        print(f"Processed {file}", end='\r')

    #concatenate the list of arrays into a single xarray DataArray
    print(f"\n Saving the dataset to {output_file}")
    npp_xr=xr.concat(array_list,dim='time')
    npp_xr.name = 'PsNet'

    #save the dataset to a netcdf file
    npp_xr.to_netcdf(output_file)

    print(f"NetCDF file saved to {output_file}")
In [None]:
# Stack every merged GeoTIFF in the subset_psnet folder into a single NetCDF file
tiffs_to_convert = glob.glob(r"D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\subset_psnet\*.tif")
output_dir=r"D:\VUB\_data\nc_files"
# output_file is also read later by the statistical-analysis cell
output_file=os.path.join(output_dir,'MODIS_8day_PsNet_2000_2023.nc')

merge_Gtiffs_to_NetCDF(tiffs_to_convert,output_file)

### Check missing files


In [None]:
import os
import glob

def get_filenames_without_extension(directory):
    """
    Return the set of second '.'-separated name fields of the *.tif files in
    ``directory`` (e.g. 'A2000177' for 'MOD17A2HGF.A2000177.h21v08.tif').

    The field is taken from the base name only, so a '.' anywhere in the
    directory path does not shift which field is picked.
    """
    files=glob.glob(os.path.join(directory,'*.tif'))
    return {os.path.basename(file).split(".")[1] for file in files}

def compare_directories(dir1, dir2):
    """
    Compare the name tokens found in two directories.

    Returns a tuple ``(missing_in_dir1, missing_in_dir2)``: the tokens present
    only in ``dir2`` and the tokens present only in ``dir1``, each as a set.
    """
    first = get_filenames_without_extension(dir1)
    second = get_filenames_without_extension(dir2)
    return second.difference(first), first.difference(second)

# Directories to compare (replace with your actual directories):
# directory1 holds the per-tile GeoTIFFs, directory2 the merged/clipped ones.
directory1 = r'D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\PsNet_500m'
directory2 = r'D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\subset_psnet'

# Each result is the set of name tokens found in one directory but not the other
missing_in_dir1, missing_in_dir2 = compare_directories(directory1, directory2)

print("Files missing in directory 1:", missing_in_dir1)
print("Files missing in directory 2:", missing_in_dir2)


In [41]:
#count file numbers for each year
directory1 = r'D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\PsNet_500m'
directory2 = r'D:\VUB\_data\modis_8day_gpp_2000_2023\tiffs\subset_psnet'  # was misspelled "direftory2"
files = sorted(glob.glob(os.path.join(directory1, '*.tif')))

# Parse the date token (second '.'-separated field, e.g. 'A2000177') once per file
date_tokens = [os.path.basename(file).split('.')[1] for file in files]

#extract doy, year and year+doy from the token
doy_list = [token[-3:] for token in date_tokens]
year_list = [token[1:5] for token in date_tokens]
ydoy_list = [token[1:] for token in date_tokens]

#count the number of files for each year / day of year
# dict.fromkeys keeps first-seen order and removes repeats, so .count runs once
# per distinct key instead of once per file
year_count = {year: year_list.count(year) for year in dict.fromkeys(year_list)}
day_count = {doy: doy_list.count(doy) for doy in dict.fromkeys(doy_list)}


In [None]:
ydoy_list.sort()

# Keep the acquisitions on day of year 001. Entries are 'YYYYDDD', so the day
# is the suffix: a plain substring test would also match every 2001 date
# (e.g. '2001017' contains '001').
ydoy_list_ = [ydoy for ydoy in ydoy_list if ydoy.endswith('001')]

# Finding duplicates in ydoy_list_ (sorted so the output order is stable)
duplicates = sorted(ydoy for ydoy in set(ydoy_list_) if ydoy_list_.count(ydoy) > 1)
duplicates


#### Statistical Analysis

In [150]:
# Open the NetCDF file that output_file currently points to
ds=xr.open_dataset(output_file)
# Optional masking of values >= 2000 (disabled)
#ds=ds.where(ds<2000,np.nan)

In [152]:
# Open the annual GPP NetCDF; its 'GPP' variable is read by the cells below
ds_gpp=xr.open_dataset(r"D:\VUB\_data\modis_gpp_npp_annual\tiff\Gpp_500m\clipped\MODIS_annual_GPP_2000_2023.nc")

In [None]:
# Plot the time series of the grid cell nearest to lat=1, lon=36.5.
# NOTE(review): merge_Gtiffs_to_NetCDF names its variable 'PsNet', so ds['NPP']
# only resolves if output_file points to a file that has an 'NPP' variable - confirm.
fig, ax = plt.subplots(figsize=(12, 3.5))
ds_map=ds['NPP'].sel(lat=1, lon=36.5, method='nearest')
# ds_gpp_map is extracted at the same point but is not plotted in this cell
ds_gpp_map=ds_gpp['GPP'].sel(lat=1, lon=36.5, method='nearest')
ds_map.plot(ax=ax)


In [None]:
# Ratio of NPP to GPP, labelled 'CUE' (presumably carbon-use efficiency - confirm)
cue = ds['NPP'] / ds_gpp['GPP']
cue.attrs['variable'] = 'CUE'

# Series at the grid cell nearest to lat=1, lon=36.5, zoomed to 0.45-0.55
cue_ts = cue.sel(lat=1, lon=36.5, method='nearest')
fig, ax = plt.subplots(figsize=(12, 3.5))
cue_ts.plot(ax=ax)
ax.set_ylim(0.45, 0.55)