Function description:

1. Select the appropriate list of blobs from the GCP file system: the most recent hour's worth of data from the GOES MCMIPC bucket. This should be 12 blobs, because there is a scan every 5 minutes.
2. Download the set of blobs, pruning unneeded data such as data-quality-flag arrays and unused bands, and return a list of datasets.
3. Concatenate the 12 datasets into one, effectively creating a dataset with a time dimension
4. Take median over the time dimension, so each pixel has median value of the last hour for each band
5. Feature engineer the median dataset, adding more informative bands that are ratios of the spectral channels
6. Reproject this dataset to epsg 4326
7. Download the preprocessed LANDFIRE layers. These have been reproject-matched to a GOES CONUS 'template' image, which has itself been reprojected to EPSG:4326. This is intended to match the slightly convoluted preprocessing routine applied to the training data that the PyTorch model was trained on.
8. Stack the GOES ds with the preprocessed landfire layers into a dataset.
9. Chunk the stack into PyTorch-manageable sizes and upload the chunks to a bucket. The result is a large list of dataset files containing the stacked raster imagery, along with the metadata needed to project the PyTorch container's inference output.




In [3]:
from google.cloud import storage
from datetime import datetime, timedelta
import pandas as pd
import rioxarray
import xarray as xr
import fsspec
import os
import tempfile
from rasterio.enums import Resampling



In [20]:
def _scan_start_token(blob_name):
    """
    Extract the scan-start timestamp from a GOES blob name.
    Returns: The digits following the 's' marker in the file name
    (e.g. '20240230551170'), or None when the name has no such token.
    """
    filename = blob_name.rsplit('/', 1)[-1]
    for token in filename.split('_'):
        if token.startswith('s') and token[1:].isdigit():
            return token[1:]
    return None


def select_blobs(bucket_name='gcp-public-data-goes-16', n_blobs=12, lookback_hours=2):
    """
    Selects the most recent blobs from the GOES MCMIPC product in a GCS bucket.

    The bucket is organised as ABI-L2-MCMIPC/<year>/<day-of-year>/<hour>/, so the
    current UTC hour directory and the preceding ones are listed and the newest
    scans are kept.

    bucket_name: Name of the bucket to list.
    n_blobs: How many of the newest blobs to return (default 12).
    lookback_hours: How many hourly directories to list, starting with the
        current UTC hour and walking backwards (default 2).
    Returns: List of the n_blobs most recent blob names, newest first.
    Raises: RuntimeError if fewer than n_blobs blobs were found.
    """
    # Local import: the file-level import only brings in datetime and timedelta.
    from datetime import timezone

    # Timezone-aware replacement for the deprecated datetime.utcnow().
    now_utc = datetime.now(timezone.utc)

    # Set up Google Cloud Storage client
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)

    # Collect (scan start, blob name) pairs from each hourly directory.
    candidates = []
    for hours_back in range(lookback_hours):
        hour = now_utc - timedelta(hours=hours_back)
        prefix = f'ABI-L2-MCMIPC/{hour.year}/{hour.timetuple().tm_yday:03d}/{hour.hour:02d}/'
        for blob in bucket.list_blobs(prefix=prefix):
            scan_start = _scan_start_token(blob.name)
            # Skip anything that is not named like a GOES scan file instead of
            # failing while sorting.
            if scan_start is not None:
                candidates.append((scan_start, blob.name))

    # The timestamps are fixed-width digit strings, so a plain string sort
    # orders them chronologically; newest first.
    candidates.sort(reverse=True)

    if len(candidates) < n_blobs:
        raise RuntimeError(f"Only {len(candidates)} blobs found")

    return [name for _, name in candidates[:n_blobs]]


def create_fs():
    """
    Builds an fsspec file system object for Google Cloud Storage.

    The value of the GOOGLE_APPLICATION_CREDENTIALS environment variable is
    passed as the token; a KeyError is raised if the variable is not set.
    Returns: The 'gcs' file system object, usable much like a local file system.
    """
    credentials = os.environ['GOOGLE_APPLICATION_CREDENTIALS']
    return fsspec.filesystem('gcs', token=credentials)


def create_median_image(blob_list, fs, bucket_name='gcp-public-data-goes-16', step=6):
    """
    Creates a temporal-median dataset from a list of blob names.

    Each selected blob is opened as a full dataset and loaded into memory, the
    datasets are concatenated along a new 'time' dimension, and the median is
    taken over that dimension, so every variable keeps its own per-pixel median.

    blob_list: Blob names relative to bucket_name.
    fs: File system object providing open(path).
    bucket_name: Bucket that holds the blobs.
    step: Stride used when picking blobs from blob_list. The default of 6 keeps
        the fast testing behaviour (every 6th blob); pass 1 to use every blob.
    Returns: Dataset of per-pixel medians, with attributes kept.
    Raises: ValueError if no blobs are selected.
    """
    selected = blob_list[::step]
    if not selected:
        raise ValueError("blob_list is empty; nothing to build a median from")

    datasets = []
    for blob in selected:
        path = f'{bucket_name}/{blob}'
        # Both the remote file handle and the dataset are closed as soon as the
        # data is in memory, so no handles stay open if a later step fails.
        with fs.open(path) as f:
            print(f'Opening: {path}')
            with xr.open_dataset(f) as ds:
                datasets.append(ds.load())

    # Concatenate the datasets along a new 'time' dimension
    concated = xr.concat(datasets, dim='time')

    # Compute the median along the 'time' dimension
    return concated.median(dim='time', keep_attrs=True)

def reproject_dataset(dataset, dst_crs="EPSG:5070", resolution=None):
    """
    Reprojects the dataset to dst_crs (EPSG:5070 by default).

    Note that the technique used here is writing the dataset to a temporary
    netCDF file and then opening it with rioxarray. This is a hack, but no other
    way was found to get rioxarray to recognize the spatial metadata that is
    necessary for reprojection.

    dataset: Dataset to reproject; it must be writable with to_netcdf.
    dst_crs: Target coordinate reference system passed to rio.reproject.
    resolution: Optional output resolution in dst_crs units. With the default
        of None no resolution is requested and rioxarray chooses one.
    Returns: Reprojected dataset.

    NOTE(review): the pipeline description at the top of this file says the data
    is reprojected to EPSG:4326, while the default here is EPSG:5070 - confirm
    which one matches the training data.
    """
    # NOTE: the temporary file is reopened by name while it is still open, which
    # is not supported on Windows.
    with tempfile.NamedTemporaryFile(suffix='.nc') as tmpfile:
        # Save the dataset to the temporary file
        dataset.to_netcdf(tmpfile.name)

        # Open the temporary file with rioxarray
        ds_rio = rioxarray.open_rasterio(tmpfile.name)

        # Reproject while the temporary file still exists
        reprojected_dataset = ds_rio.rio.reproject(dst_crs, resolution=resolution)

    return reprojected_dataset

def engineer_features(dataset):
    """
    Adds band-ratio features to the dataset.

    Four ratios of the CMI spectral channels are computed and attached as new
    variables named '<numerator>_<denominator>': '6_5', '7_5', '7_6' and '14_7'.
    Returns: The result of dataset.assign with the four ratio variables added.
    """
    # (new variable name, numerator channel, denominator channel)
    ratio_specs = (
        ('6_5', 'CMI_C06', 'CMI_C05'),
        ('7_5', 'CMI_C07', 'CMI_C05'),
        ('7_6', 'CMI_C07', 'CMI_C06'),
        ('14_7', 'CMI_C14', 'CMI_C07'),
    )

    ratios = {}
    for name, numerator, denominator in ratio_specs:
        ratios[name] = dataset[numerator] / dataset[denominator]

    return dataset.assign(ratios)


def download_landfire_layers():
    """
    Downloads the preprocessed landfire layers. These have been reproject_matched to a GOES CONUS 'template' image,
    which has itself been reprojected to epsg 4326.

    Not implemented yet: the download step is still a placeholder. The previous
    body returned an undefined name and therefore failed with a NameError on
    every call; the failure is now explicit.

    Returns: Preprocessed landfire layers. stack_datasets passes this value to
    xr.open_dataset, so it should be something open_dataset accepts.
    Raises: NotImplementedError, always, until the download is implemented.
    """
    raise NotImplementedError(
        "download_landfire_layers is a placeholder: the landfire download is not implemented yet"
    )

def stack_datasets(goes_ds, landfire_layers):
    """
    Stacks the GOES ds with the preprocessed landfire layers into a dataset.

    goes_ds: GOES dataset to merge into.
    landfire_layers: Anything xr.open_dataset accepts that holds the landfire layers.
    Returns: Stacked dataset produced by xr.merge.
    """
    # Open the landfire layers dataset
    with xr.open_dataset(landfire_layers) as landfire_ds:
        # open_dataset reads lazily and the source is closed when this block
        # exits, so pull the landfire data into memory before merging;
        # otherwise the merged result could point at a closed file.
        stacked_dataset = xr.merge([goes_ds, landfire_ds.load()])

    return stacked_dataset

def chunk_dataset(dataset, chunk_size=64):
    """
    Splits the dataset into square, non-overlapping tiles along 'y' and 'x'.

    Tiles are produced row by row (all x positions for the first y band, then
    the next y band, and so on). Only whole tiles are returned: pixels beyond
    the last full multiple of chunk_size along either axis are dropped.

    dataset: Object exposing .sizes with 'x' and 'y' entries and .isel(y=..., x=...).
    chunk_size: Tile edge length in pixels (default 64).
    Returns: List of tiles, each the result of dataset.isel on one y/x window.
    Raises: ValueError if chunk_size is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

    # .sizes is the supported name -> length mapping; indexing .dims by name is
    # deprecated for Datasets and does not work for DataArrays.
    nx = dataset.sizes['x'] // chunk_size
    ny = dataset.sizes['y'] // chunk_size

    return [
        dataset.isel(
            y=slice(row * chunk_size, (row + 1) * chunk_size),
            x=slice(col * chunk_size, (col + 1) * chunk_size),
        )
        for row in range(ny)
        for col in range(nx)
    ]

In [5]:
# List the most recent GOES MCMIPC blob names from the default bucket.
selected_blobs = select_blobs()


In [6]:

# Build the GCS file system handle, then reduce the selected scans to a
# per-pixel median dataset.
fs = create_fs()
median_ds = create_median_image(selected_blobs, fs)

Opening: gcp-public-data-goes-16/ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230551170_e20240230553543_c20240230554064.nc
Opening: gcp-public-data-goes-16/ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230521170_e20240230523555_c20240230524073.nc


In [26]:
# Add the band-ratio variables to the median dataset (before any reprojection).
engineered_ds = engineer_features(median_ds)


  dataset.to_netcdf(tmpfile.name)
  dataset.to_netcdf(tmpfile.name)


(3270.9038690767325, -3270.9038690767325)

In [29]:

# Reproject the median dataset first, then compute the band ratios on the
# reprojected result.
# NOTE(review): the function description at the top lists feature engineering
# before reprojection; here the order is reversed - confirm which one matches
# the training data.
reprojected_median_ds = reproject_dataset(median_ds)
reprojected_median_ds = engineer_features(reprojected_median_ds)



  dataset.to_netcdf(tmpfile.name)
  dataset.to_netcdf(tmpfile.name)


(3270.9038690767325, -3270.9038690767325)