# Data Loader - H8 and MODIS

In [1]:
# TODO: consider splitting script into a module

import os
import pandas as pd
import numpy as np
import xarray as xr
import datetime

def read_H8_MODIS_byDate(date, modis_res, modis_sat, data_path):
    '''
    Load one H8 NetCDF file (the one named after a MODIS product) as a DataFrame.

    The file is expected at
    {data_path}/H8_MODIS_2019-2020/H8_{modis_sat}_{modis_res}/
        H8_{modis_sat}_{modis_res}_Aus_0.05_{date}.nc

    Parameters
    ----------
    date : str
        Date token exactly as it appears in the file name
        (callers in this file pass 'YYYY-MM-DD').
    modis_res : str
        Token used in the directory and file name (this file passes 'L2').
    modis_sat : str
        MODIS product prefix used in the directory and file name
        (this file passes 'MOD04' and 'MYD04').
    data_path : str
        Directory that contains the 'H8_MODIS_2019-2020' folder.

    Returns
    -------
    pandas.DataFrame
        The dataset converted with ``to_dataframe()``, without the
        'solar_azimuth_angle' and 'solar_zenith_angle' columns.
    '''

    parent_dir = 'H8_MODIS_2019-2020'
    product_dir = f'H8_{modis_sat}_{modis_res}'
    file_name = f'H8_{modis_sat}_{modis_res}_Aus_0.05_{date}.nc'
    nc_path = os.path.join(data_path, parent_dir, product_dir, file_name)

    # Use the dataset as a context manager so the underlying file is closed
    # as soon as its contents have been copied into the DataFrame.
    with xr.open_dataset(nc_path) as ds:
        df = ds.to_dataframe()

    return df.drop(columns=['solar_azimuth_angle', 'solar_zenith_angle'])

def read_MODIS_byDate(date, modis_res, modis_sat, data_path):
    '''
    Load one MODIS NetCDF file as a DataFrame.

    The file is expected at
    {data_path}/MODIS_L2_Aus_2019-2020/{modis_sat}_{modis_res}_Aus_0.05_{date}.nc

    Parameters
    ----------
    date : str
        Date token exactly as it appears in the file name
        (callers in this file pass 'YYYY-MM-DD').
    modis_res : str
        Token used in the file name (this file passes 'L2').
    modis_sat : str
        MODIS product prefix used in the file name
        (this file passes 'MOD04' and 'MYD04').
    data_path : str
        Directory that contains the 'MODIS_L2_Aus_2019-2020' folder.

    Returns
    -------
    pandas.DataFrame
        The dataset converted with ``to_dataframe()`` after the 'corner' and
        'file' dimensions (and every variable using them) have been dropped.
    '''

    parent_dir = 'MODIS_L2_Aus_2019-2020'
    file_name = f'{modis_sat}_{modis_res}_Aus_0.05_{date}.nc'
    nc_path = os.path.join(data_path, parent_dir, file_name)

    # Use the dataset as a context manager so the underlying file is closed
    # as soon as its contents have been copied into the DataFrame.
    with xr.open_dataset(nc_path) as ds:
        df = ds.drop_dims(['corner', 'file']).to_dataframe()

    return df

def concat_rows(df_list):
    '''
    Stack DataFrames on top of each other after checking their columns agree.

    Every non-empty frame must have the same columns, in the same order, as
    the first non-empty frame in the list. Empty frames are exempt from the
    check (so an empty accumulator can be used as a starting point) but are
    still handed to ``pd.concat``.

    Parameters
    ----------
    df_list : list of pandas.DataFrame
        Frames to concatenate, in order.

    Returns
    -------
    pandas.DataFrame
        Result of ``pd.concat(df_list, axis=0)``.

    Raises
    ------
    ValueError
        If two non-empty frames have different columns or column order
        (ValueError is a subclass of Exception, so ``except Exception``
        handlers keep working). ``pd.concat`` itself raises ValueError for
        an empty ``df_list``.
    '''

    reference_cols = None
    for df in df_list:
        if df.empty:
            continue
        cols = list(df.columns)
        if reference_cols is None:
            # The first non-empty frame defines the expected layout; using
            # df_list[0] unconditionally would disable the check whenever
            # the list starts with an empty frame.
            reference_cols = cols
        elif cols != reference_cols:
            raise ValueError('ERROR: Columns do not line up in the dataframes that are to be concatenated. Data may need to be fixed before proceeding.')

    return pd.concat(df_list, axis=0)

def check_lat_lon(df_H8, df_MODIS):
    '''
    Verify that both frames list the same coordinates, row by row.

    ``df_H8.lat`` is compared with ``df_MODIS.Latitude`` and, only if those
    match, ``df_H8.lon`` is compared with ``df_MODIS.Longitude``. Values are
    compared as plain Python lists, so both length and order must agree.

    Returns None when everything matches; raises Exception otherwise.
    '''

    if list(df_H8.lat) == list(df_MODIS.Latitude):
        if list(df_H8.lon) == list(df_MODIS.Longitude):
            # Both coordinate columns agree - nothing to report.
            return

    raise Exception('ERROR: Lat/Lon rows do not line up between the two dataframes.')

def load_data_byDate(date_str, modis_res, data_dir):
    '''
    Load the H8 and MODIS frames for a single date.

    For each of the two product prefixes ('MOD04', then 'MYD04') the H8 file
    and the MODIS file are read; the two H8 frames are stacked into one frame
    and the two MODIS frames into another. The stacked frames must then pass
    ``check_lat_lon``.

    Parameters: ``date_str`` is the date token used in the file names,
    ``modis_res`` the token forwarded to the readers (this file passes 'L2'),
    ``data_dir`` the directory forwarded to the readers.

    Returns the tuple ``(df_H8, df_MODIS)``.
    '''

    products = ('MOD04', 'MYD04')

    # H8 side first, then MODIS - same read order for both products.
    h8_frames = [read_H8_MODIS_byDate(date_str, modis_res, product, data_dir)
                 for product in products]
    h8_combined = concat_rows(h8_frames)

    modis_frames = [read_MODIS_byDate(date_str, modis_res, product, data_dir)
                    for product in products]
    modis_combined = concat_rows(modis_frames)

    # Raises if the stacked rows do not describe the same coordinates.
    check_lat_lon(h8_combined, modis_combined)

    return h8_combined, modis_combined

def generate_dates(years, days):
    '''
    Yield 'YYYY-MM-DD' strings for the given days of every month.

    Iterates years in the given order, months 1-12, and days in the given
    order. A day that a particular month does not have (29-31 in shorter
    months) is skipped for that month instead of aborting the whole
    generator; a day that no month can have (e.g. 0 or 32) still raises.

    Parameters
    ----------
    years : iterable of int
        Calendar years to cover.
    days : iterable of int
        Days of the month to emit for each month. Iterated once per month,
        so pass a re-iterable such as a list.

    Yields
    ------
    str
        One ISO-style date string per (year, month, day) combination that
        exists in the calendar.

    Raises
    ------
    ValueError
        From ``datetime.date`` when a day or year is outside the range the
        calendar supports.
    '''
    import calendar  # local import: only this function needs it

    for year in years:
        for month in range(1, 13):
            days_in_month = calendar.monthrange(year, month)[1]
            for day in days:
                # 29/30/31 are legitimate days-of-month that simply do not
                # occur in every month; skip them rather than crash mid-run.
                if days_in_month < day <= 31:
                    continue
                yield datetime.date(year, month, day).strftime("%Y-%m-%d")

def load_data(dates_str, modis_res, data_dir, sample_frac=1):
    '''
    Load and stack the H8 / MODIS frames for a sequence of dates.

    Each date is loaded with ``load_data_byDate``. Unless ``sample_frac`` is
    exactly 1, a random subset of ``int(n * sample_frac)`` row positions
    (without replacement, drawn from NumPy's global random state) is taken
    per day, and the same positions are applied to both frames so their rows
    stay paired. The daily results are appended to running frames with
    ``concat_rows``.

    Parameters: ``dates_str`` is an iterable of date tokens, ``modis_res``
    and ``data_dir`` are forwarded to ``load_data_byDate``, ``sample_frac``
    is the fraction of each day's rows to keep.

    Returns the tuple ``(df_H8, df_MODIS)``; both are empty DataFrames when
    ``dates_str`` yields nothing.
    '''

    h8_all, modis_all = pd.DataFrame(), pd.DataFrame()

    for day in dates_str:
        h8_day, modis_day = load_data_byDate(day, modis_res, data_dir)

        if sample_frac != 1:
            row_count = len(h8_day)
            keep = np.random.choice(row_count,
                                    size=int(row_count * sample_frac),
                                    replace=False)
            # One set of positions for both frames keeps the rows aligned.
            h8_day, modis_day = h8_day.iloc[keep], modis_day.iloc[keep]

        h8_all = concat_rows([h8_all, h8_day])
        modis_all = concat_rows([modis_all, modis_day])

    return h8_all, modis_all

In [2]:
# Which dates go into which split: every month of each year below contributes
# the listed days of the month.
PARAM_DATA_YEARS = [2019]
PARAM_TRAIN_DAYSOFMONTH = [1, 10, 20]
PARAM_VAL_DAYSOFMONTH = [8, 16]
PARAM_HOLDOUT_DAYSOFMONTH = [28]

# File lookup settings.
PARAM_MODIS_RES = 'L2'
PARAM_DATA_DIR = '../../'  # set to the directory that holds your H8 and MODIS files

# Fraction of each day's rows kept in every split; use 1 to keep all rows.
PARAM_SAMPLE_FRAC = 0.03

# Row sampling draws from NumPy's global random state, so fix the seed once
# before loading; the three splits below consume it in this order.
np.random.seed(111)

df_H8_train, df_MODIS_train = load_data(
    dates_str=generate_dates(PARAM_DATA_YEARS, PARAM_TRAIN_DAYSOFMONTH),
    modis_res=PARAM_MODIS_RES,
    data_dir=PARAM_DATA_DIR,
    sample_frac=PARAM_SAMPLE_FRAC,
)

df_H8_val, df_MODIS_val = load_data(
    dates_str=generate_dates(PARAM_DATA_YEARS, PARAM_VAL_DAYSOFMONTH),
    modis_res=PARAM_MODIS_RES,
    data_dir=PARAM_DATA_DIR,
    sample_frac=PARAM_SAMPLE_FRAC,
)

df_H8_holdout, df_MODIS_holdout = load_data(
    dates_str=generate_dates(PARAM_DATA_YEARS, PARAM_HOLDOUT_DAYSOFMONTH),
    modis_res=PARAM_MODIS_RES,
    data_dir=PARAM_DATA_DIR,
    sample_frac=PARAM_SAMPLE_FRAC,
)

In [3]:
# Report how many rows ended up in the training and validation splits.
print(f'train size: {len(df_H8_train)}')
print(f'val size: {len(df_H8_val)}')

train size: 49193
val size: 32656


# Sample usage - e.g. for modelling

In [4]:
# Predictor columns: the scaled radiance of channels 1-6 followed by the
# brightness temperature of channels 7-16. The 'channel_000N_brf' names for
# channels 1-6 are intentionally not selected; append them here to try them.
feature_cols = (
    [f'channel_{channel:04d}_scaled_radiance' for channel in range(1, 7)]
    + [f'channel_{channel:04d}_brightness_temperature' for channel in range(7, 17)]
)

# Target column to predict.
response_col = 'AOD_550_Dark_Target_Deep_Blue_Combined'

# Features come from the H8 frames, the response from the matching MODIS frames.
X_train, y_train = df_H8_train[feature_cols], df_MODIS_train[response_col]
X_val, y_val = df_H8_val[feature_cols], df_MODIS_val[response_col]