# Imports

In [1]:
import os
import h5py
import shutil
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

# from moviepy.editor import ImageSequenceClip
# from moviepy.video.io.bindings import mplfig_to_npimage


# Processing Satellite Data

Windowed data across all three years  
Using 2020 for training and 2021 for testing  
Using 2022 for validation  

## Utility Functions

In [2]:
def find_by_date(
    start_datetime, end_datetime,
    root_path, fn_pattern, fn_ext,
    timestep, error=1,
    upper=True, lower=False, verbose=False
):
    """
    Walk a regular time grid and look up one file per grid point.

    start_datetime -> datetime object, first grid point (inclusive)
    end_datetime   -> datetime object, last grid point (inclusive, widened by 'error' minutes)
    root_path      -> folder in which all the files are stored
    fn_pattern     -> filename (without extension) with the strftime directives embedded
    fn_ext         -> extension of the file, WITHOUT the leading '.'
    timestep       -> in minutes, distance between two consecutive grid points
    error          -> in minutes, tolerance around a grid point (2 means -1, -2 min AND +1, +2 min are also tried)
    upper          -> convert the generated filename to upper case before the lookup
    lower          -> convert the generated filename to lower case before the lookup
    verbose        -> print every grid point for which no file was found

    returns        -> (filenames, timestamps), two lists of equal length;
                      filenames[i] is the path found for timestamps[i], or None when nothing matched
    raises         -> ValueError  if 'upper' and 'lower' are both set, or 'timestep' is a float / not positive
                      TypeError   if either datetime bound is not a datetime object
                      IOError     if 'root_path' does not exist, or not a single file was found

    **note** -> defaults are set up to work with satellite data from MOSDAC
    """

    # argument validation
    if upper and lower:
        raise ValueError("Cannot have both arguments, 'upper' and 'lower' be True!")

    if isinstance(timestep, float) or timestep <= 0:
        raise ValueError("Argument 'timestep' can only be a positive integer")

    if not isinstance(start_datetime, datetime) or not isinstance(end_datetime, datetime):
        raise TypeError("Both 'start_datetime', 'end_datetime' arguments must be datetime objects")

    if not os.path.exists(root_path):
        raise IOError("Path provided in the 'root_path' argument does not exist")

    filenames, timestamps = [], []

    # the end bound is stretched by the tolerance, same as the per-file lookup
    last_allowed = end_datetime + timedelta(minutes=error)
    moment = start_datetime

    while moment <= last_allowed:
        # a missing file is recorded as None so both lists stay aligned
        match = _find_matching_fn(
            moment, root_path, fn_pattern, fn_ext, error,
            upper, lower, verbose
        )
        filenames.append(match)
        timestamps.append(moment)

        moment = moment + timedelta(minutes=timestep)

    # needed because with verbose=False a completely empty result would go unnoticed
    if not any(path is not None for path in filenames):
        raise IOError(f"No input data found in {root_path} with the given datetime range")

    return filenames, timestamps


def _find_matching_fn(
    cur_datetime, root_path, fn_pattern, fn_ext, error,
    upper, lower, verbose
):
    """
    Return the path of the first existing file for 'cur_datetime', or None.

    Candidates are tried in this order: the exact timestamp, then 1..error
    minutes earlier, then 1..error minutes later. When nothing exists and
    'verbose' is set, the formatted timestamp is printed.
    """

    # exact timestamp first
    candidate = _gen_fn_path(
        cur_datetime,
        root_path, fn_pattern, fn_ext,
        upper, lower
    )
    if os.path.exists(candidate):
        return candidate

    # then the tolerance band: all earlier offsets before any later offset
    for direction in (-1, 1):
        for minutes in range(1, error + 1):
            candidate = _gen_fn_path(
                cur_datetime + timedelta(minutes=direction * minutes),
                root_path, fn_pattern, fn_ext,
                upper, lower
            )
            if os.path.exists(candidate):
                return candidate

    # nothing matched
    if verbose:
        missing_label = cur_datetime.strftime(fn_pattern)
        if upper:
            missing_label = missing_label.upper()
        if lower:
            missing_label = missing_label.lower()

        print(f"file not found with datetime: {missing_label}")

    return None

 
def _gen_fn_path(
    cur_datetime, root_path, fn_pattern, fn_ext,
    upper, lower
):
    """
    Build the full path of the file expected for 'cur_datetime'.

    The datetime is formatted with 'fn_pattern', optionally case-converted,
    given the extension 'fn_ext' and joined onto 'root_path'.
    """
    stem = cur_datetime.strftime(fn_pattern)

    # upper is applied first, so lower wins if both flags are set
    for enabled, convert in ((upper, str.upper), (lower, str.lower)):
        if enabled:
            stem = convert(stem)

    # the case conversion never touches the extension
    return os.path.join(root_path, stem + '.' + fn_ext)


In [3]:
def h5ToNumpyMatrix(file_path):
    """
    Read one OLR .h5 file and average it onto a 400 x 400 grid
    (0-40N latitude, 60-100E longitude, 0.1 degree cells).

    file_path -> path to the .h5 file, or None when there is no file for that timestep

    returns   -> (400, 400) array indexed [lat_index, lon_index];
                 each cell holds the mean of the OLR samples falling into it, NaN where none fell.
                 When file_path is None an all-zero (400, 400) placeholder is returned instead.
    raises    -> IOError if file_path is given but does not exist
    """
    # 400 x 400 is the working size; padding up to 512 x 512 was considered but is not applied
    grid_shape = (400, 400)

    if file_path is None:
        return np.zeros(grid_shape)

    # guards against a bug in the file lookup functions
    if not os.path.exists(file_path):
        raise IOError(f"File {file_path} not found")

    # extracting information; '[()]' copies each dataset into memory, so the
    # file can be closed as soon as the 'with' block ends (no leaked handle)
    with h5py.File(file_path, 'r') as cur_file:
        # raw coordinates are multiplied by 0.01 (presumably stored as scaled integers -- confirm)
        lat = cur_file['Latitude'][()] * 0.01
        lon = cur_file['Longitude'][()] * 0.01

        olr = np.squeeze(cur_file['OLR'][()])

    # Filter data within location of interest (mask built once, reused three times)
    in_region = (lat >= 0) & (lat < 40) & (lon >= 60) & (lon < 100)
    lat_filtered = lat[in_region]
    lon_filtered = lon[in_region]
    olr_filtered = olr[in_region]

    # Grid data from 0-40N Latitude, 60-100E Longitude with 0.1 deg resolution
    olr_grid = np.zeros(grid_shape)
    cnt = np.zeros(grid_shape)

    # Generating index values: offset from the grid origin, 10 cells per degree
    lat_ind = np.int32((lat_filtered - 0) * 10)
    lon_ind = np.int32((lon_filtered - 60) * 10)

    # np.add.at accumulates correctly even when several samples hit the same cell
    np.add.at(olr_grid, (lat_ind, lon_ind), olr_filtered)
    np.add.at(cnt, (lat_ind, lon_ind), 1)

    # sum -> mean for filled cells, NaN for cells that received no sample
    filled = cnt > 0
    olr_grid[filled] = olr_grid[filled] / cnt[filled]
    olr_grid[~filled] = np.nan

    return olr_grid


In [4]:
### make changes to the paths HERE if you want to use it to suit your needs ###
def process_by_year(year: int, window_size: int = 12):
    """
    Build sliding data windows of consecutive OLR grids for May-Sep of 'year'
    and save each window to disk as a .npy file.

    year        -> year to process; raw .h5 files are read from ./data/May-Sep{year}
    window_size -> number of consecutive 30-minute frames in one window (default 12)

    Every window whose frames all have a file is saved to
    ./data/processed_May-Sep{year}/ (created if missing) as an array of shape
    (window_size, 400, 400). The filename states the datetime at which the
    window starts (X starts). Windows touching a missing timestep are skipped.
    """
# looping through the folder as per datetime
    olr_load = {
        'start_datetime': datetime(year, 5, 1, 0, 0),
        'end_datetime': datetime(year, 9, 30, 23, 30),
        'root_path': f'./data/May-Sep{year}',
        'fn_pattern': '3DIMG_%d%b%Y_%H%M_L2B_OLR_V01R00',
        'fn_ext': 'h5',
        'timestep': 30,
    }
    filenames, timestamps = find_by_date(**olr_load)

# utility to handle the missing files: keep only window starts whose frames all exist
# ("+ 1" so that the window ending on the very last timestep is not dropped)
    window_start_indices = [
        idx for idx in range(len(filenames) - window_size + 1)
        if None not in filenames[idx:idx + window_size]
    ]

# saving all the data windows into disk
    save_path = f'./data/processed_May-Sep{year}/'
    save_pattern = '%d%b%Y_%H%M_L2B_OLR'
    os.makedirs(save_path, exist_ok=True)

# grids are loaded on demand and forgotten once no later window can use them,
# so at most about window_size grids are held in memory instead of the whole season
    grid_cache = {}
    for start_idx in window_start_indices:
        for stale_idx in [key for key in grid_cache if key < start_idx]:
            del grid_cache[stale_idx]

        frame_indices = range(start_idx, start_idx + window_size)
        for idx in frame_indices:
            if idx not in grid_cache:
                grid_cache[idx] = h5ToNumpyMatrix(filenames[idx])

        window = np.stack([grid_cache[idx] for idx in frame_indices])
        out_name = timestamps[start_idx].strftime(save_pattern)
        # np.save appends the '.npy' extension itself
        np.save(os.path.join(save_path, out_name), window, allow_pickle=False)

##### NOTE: when loading the saved .npy files with find_by_date, pass **upper=False** -- the saved filenames keep the mixed-case month abbreviation produced by '%b' #####


## Processing Raw Data

In [None]:
# done (already executed) -- builds and saves the data windows for 2020
process_by_year(2020)

In [28]:
# done (already executed) -- builds and saves the data windows for 2021
process_by_year(2021)

In [None]:
# done (already executed) -- builds and saves the data windows for 2022
process_by_year(2022)

# Further Processing

This step reduces the size of the input data so that it can be used in Google Colab.  
Will probably merge 2020 and 2021 for training and use 2022 for testing.

## Utility Functions

In [9]:
# uses utility functions from the processing raw data
def find_by_date_further(
    start_datetime, end_datetime,
    root_path, fn_pattern, fn_ext,
    timestep, error=1,
    upper=False, lower=False, verbose=False,
    window=12
):
    """
    Pick data-window files whose frames do not overlap.

    Walks the time grid like find_by_date, but after a file is found it jumps
    'window' timesteps ahead, so the next file picked starts where the
    previous window ends. Grid points without a file are skipped (advance by
    one timestep) and are NOT recorded as None.

    start_datetime -> datetime object, first grid point (inclusive)
    end_datetime   -> datetime object, last grid point (inclusive, widened by 'error' minutes)
    root_path      -> folder in which all the files are stored
    fn_pattern     -> filename (without extension) with the strftime directives embedded
    fn_ext         -> extension of the file, WITHOUT the leading '.'
    timestep       -> in minutes, distance between two consecutive grid points
    error          -> in minutes, tolerance around a grid point when looking for a file
    upper          -> convert the generated filename to upper case before the lookup
    lower          -> convert the generated filename to lower case before the lookup
    verbose        -> print every grid point for which no file was found
    window         -> number of datapoints in one data window (default 12)

    returns        -> (filenames, timestamps), two lists of equal length;
                      filenames[i] is the existing path matched for grid point timestamps[i]
    raises         -> ValueError  if 'upper' and 'lower' are both set, or 'timestep' / 'window' is a float / not positive
                      TypeError   if either datetime bound is not a datetime object
                      IOError     if 'root_path' does not exist, or no file was found at all
    """

    # error handling
    if upper and lower:
        raise ValueError("Cannot have both arguments, 'upper' and 'lower' be True!")

    if timestep <= 0 or isinstance(timestep, float):
        raise ValueError("Argument 'timestep' can only be a positive integer")

    # a non-positive window would never advance past a found file (endless loop)
    if window <= 0 or isinstance(window, float):
        raise ValueError("Argument 'window' can only be a positive integer")

    if not (isinstance(start_datetime, datetime) and isinstance(end_datetime, datetime)):
        raise TypeError("Both 'start_datetime', 'end_datetime' arguments must be datetime objects")

    if not os.path.exists(root_path):
        raise IOError("Path provided in the 'root_path' argument does not exist")

    filenames = []
    timestamps = []

    cur_datetime = start_datetime
# the main changes are in the loop
    while cur_datetime <= end_datetime + timedelta(minutes=error):
        fn = _find_matching_fn(
            cur_datetime, root_path, fn_pattern, fn_ext, error,
            upper, lower, verbose
        )

        # no window starts here: try the next grid point
        if fn is None:
            cur_datetime += timedelta(minutes=timestep)
            continue

        filenames.append(fn)
        timestamps.append(cur_datetime)

        # skip a whole window so the next picked file shares no frame with this one
        cur_datetime += timedelta(minutes=timestep * window)

# this error exists for verbose equals false; misses are never appended,
# so an empty list is the only way to have found nothing
    if not filenames:
        raise IOError(f"No input data found in {root_path} with the given datetime range")

    return filenames, timestamps

In [63]:
### make changes to the paths HERE if you want to use it to suit your needs ###
def further_process_by_year(year: int):
    """
    Copy a non-overlapping subset of the processed data windows of 'year'
    into a separate folder.

    year -> year to process; windows are read from ./data/processed_May-Sep{year}
            and copied to ./data/further_processed_May-Sep{year}/ (created if missing)

    The files to copy are chosen by find_by_date_further, and each one keeps
    its original filename in the target folder.
    """
# looping through processed folder such that all data windows have no overlapping frames
    further_process_params = {
        'start_datetime': datetime(year, 5, 1, 0, 0),
        'end_datetime': datetime(year, 9, 30, 23, 30),
        'root_path': f'./data/processed_May-Sep{year}',
        'fn_pattern': '%d%b%Y_%H%M_L2B_OLR',
        'fn_ext': 'npy',
        'timestep': 30,
    }
    filenames, _ = find_by_date_further(**further_process_params)

# copy pasting the found files into new folder
    target_path = f'./data/further_processed_May-Sep{year}/'
    os.makedirs(target_path, exist_ok=True)

    for source_fn in filenames:
        # use the path that was actually matched: a file found through the
        # 'error' tolerance is not named after the grid timestamp
        shutil.copyfile(source_fn, os.path.join(target_path, os.path.basename(source_fn)))


## Processing for Colab

In [None]:
# done (already executed) -- copies the non-overlapping windows for 2020
further_process_by_year(2020)

In [None]:
# done (already executed) -- copies the non-overlapping windows for 2021
further_process_by_year(2021)

In [None]:
# done (already executed) -- copies the non-overlapping windows for 2022
further_process_by_year(2022)