In [14]:
import os
import fsspec
import xarray as xr
import numpy as np
import geojson
from google.cloud import storage
from datetime import datetime, timedelta
import pandas as pd
from google.cloud import bigquery
import pandas as pd
import concurrent.futures
from tqdm.notebook import tqdm



In [15]:
##functions for selecting which data to download

def get_blob_names(attime=None, bucket_name='gcp-public-data-goes-16', client=None):
    """
    Return the ABI-L2-MCMIPC blob names for the hour containing ``attime`` and the hour before it.
    The blob names can be passed to a function that selects the blobs that form complete images.

    Parameters
    ----------
    attime : datetime, str or None
        Reference time. Strings are parsed with ``pd.to_datetime``. ``None``
        (the default) means "now" in UTC, resolved on every call.
    bucket_name : str
        Name of the Google Cloud Storage bucket to list.
    client : object, optional
        An existing storage client exposing ``get_bucket``. When omitted a new
        ``storage.Client()`` is created.

    Returns
    -------
    list of str
        Blob names for the reference hour first, followed by the previous hour.
    """
    if attime is None:
        # Resolve "now" at call time. A `datetime.utcnow()` default in the
        # signature is evaluated only once, when the function is defined, so
        # later calls in a long-running kernel would keep using a stale hour.
        attime = datetime.utcnow()
    elif isinstance(attime, str):
        attime = pd.to_datetime(attime)

    # Set up Google Cloud Storage client (unless the caller supplied one)
    if client is None:
        client = storage.Client()
    bucket = client.get_bucket(bucket_name)

    # The prefixes below are organized by year / day of year / hour of day.
    blob_names = []
    for hours_back in range(2):  # reference hour, then the previous hour
        current_time = attime - timedelta(hours=hours_back)
        prefix = (
            f'ABI-L2-MCMIPC/{current_time.year}/'
            f'{current_time.timetuple().tm_yday:03d}/{current_time.hour:02d}/'
        )
        blob_names.extend(blob.name for blob in bucket.list_blobs(prefix=prefix))

    return blob_names

def extract_band_number(blob_name):
    """
    Extracts the band number from a blob name.

    The name is split on underscores and the last two characters of the
    second field are parsed as an integer.

    Returns
    -------
    int or None
        The parsed number, or ``None`` when the name has no second
        underscore-separated field or its last two characters are not an
        integer.
    """
    try:
        return int(blob_name.split('_')[1][-2:])
    except (ValueError, IndexError):
        # ValueError: the two characters are not numeric.
        # IndexError: the name contains no underscore, so there is no second field.
        return None

def select_blobs_single_band(blob_names, n_blobs=12):
    """
    Selects the most recent blobs that form a complete set of images for a single band.

    Blob names are ordered newest-first by the start timestamp found in the
    fourth underscore-separated field of the name (its leading character is
    skipped), and the first ``n_blobs`` are returned.

    Parameters
    ----------
    blob_names : list of str
        Candidate blob names. The list is not modified.
    n_blobs : int
        Number of blobs required (default 12).

    Returns
    -------
    list of str
        The ``n_blobs`` newest blob names, newest first.

    Raises
    ------
    Exception
        If fewer than ``n_blobs`` blob names are available.
    """
    # sorted() rather than list.sort() so the caller's list is left untouched.
    newest_first = sorted(
        blob_names,
        key=lambda name: name.split('_')[3][1:],
        reverse=True,
    )

    selected_blobs = newest_first[:n_blobs]

    # Check that a full set was found
    if len(selected_blobs) < n_blobs:
        raise Exception(f"Only {len(selected_blobs)} blobs found")

    return selected_blobs

def create_median_image(blob_list, fs, bucket_name='gcp-public-data-goes-16', step=6):
    """
    Build a per-pixel median dataset from a list of blob names.

    Each selected blob is opened through ``fs``, loaded fully into memory as an
    xarray dataset, and the datasets are concatenated along a dimension named
    'band' before the median is taken over that dimension.

    Parameters
    ----------
    blob_list : list of str
        Blob names inside ``bucket_name``.
    fs : file system object
        Object exposing ``open(path)`` that returns a file-like handle.
    bucket_name : str
        Bucket that the blob names belong to.
    step : int
        Stride used to subsample ``blob_list``. The default of 6 keeps the
        fast testing behaviour (every 6th blob); pass 1 to use every blob.

    Returns
    -------
    xarray.Dataset
        The median over the concatenated datasets, with attributes kept.

    Raises
    ------
    ValueError
        If no blobs are selected (empty ``blob_list``).
    """
    selected = blob_list[::step]
    if not selected:
        raise ValueError("blob_list is empty; at least one blob is needed to build a median image")

    open_files = []
    datasets = []
    try:
        # Open each blob as a full dataset and load it into memory
        for blob in selected:
            f = fs.open(f'{bucket_name}/{blob}')
            open_files.append(f)
            print(f'Opening: {bucket_name}/{blob}')
            ds = xr.open_dataset(f)
            datasets.append(ds)
            ds.load()

        # Concatenate the datasets along the 'band' dimension
        concated = xr.concat(datasets, dim='band')

        # Compute the median along the 'band' dimension
        median_ds = concated.median(dim='band', keep_attrs=True)
    finally:
        # Release datasets and the underlying file handles even if a
        # download or the reduction fails part-way through.
        for ds in datasets:
            ds.close()
        for f in open_files:
            f.close()

    # Return the median dataset
    return median_ds

def feature_engineer(img):
    """
    Add band-ratio features to an image dataset.

    Four new variables are computed as ratios of existing channels and
    assigned to the image under these names:

        '6_5'  = CMI_C06 / CMI_C05
        '7_5'  = CMI_C07 / CMI_C05
        '7_6'  = CMI_C07 / CMI_C06
        '14_7' = CMI_C14 / CMI_C07

    The ratios were chosen through experimentation to find features with the
    most predictive power. The CRS read from the input via the ``.rio``
    accessor is written back onto the returned object.

    NOTE(review): the ``.rio`` accessor comes from rioxarray, which this
    notebook imports in a later cell; confirm it is imported before calling.
    """
    # (new variable name, numerator channel, denominator channel)
    ratio_specs = (
        ('6_5', 'CMI_C06', 'CMI_C05'),
        ('7_5', 'CMI_C07', 'CMI_C05'),
        ('7_6', 'CMI_C07', 'CMI_C06'),
        ('14_7', 'CMI_C14', 'CMI_C07'),
    )
    ratios = {
        label: img[numerator] / img[denominator]
        for label, numerator, denominator in ratio_specs
    }

    # Remember the source CRS so it can be re-attached after assign().
    source_crs = img.rio.crs

    augmented = img.assign(ratios)
    return augmented.rio.write_crs(source_crs)


In [16]:
# Set env variable for google cloud credentials, used behind the scenes by a couple functions.
# setdefault() keeps any GOOGLE_APPLICATION_CREDENTIALS already configured in the
# environment; the absolute path below is only a fallback for the original author's machine.
DEFAULT_CREDENTIALS_PATH = '/Users/adamhunter/Documents/school projs/firenet/data/credentials/firenet-99-writer.json'
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', DEFAULT_CREDENTIALS_PATH)
# Use fsspec to create a file system
fs = fsspec.filesystem('gcs', token=os.environ['GOOGLE_APPLICATION_CREDENTIALS'])

In [17]:
# Test the get_blob_names function (called with its defaults: default time and bucket)
blob_names = get_blob_names()
print(blob_names)

# Test the select_blobs_single_band function
selected_blobs = select_blobs_single_band(blob_names)
print(len(selected_blobs))
# Bare expression: display the selected blob names as the cell output
selected_blobs

['ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230501170_e20240230503543_c20240230504056.nc', 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230506170_e20240230508543_c20240230509062.nc', 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230511170_e20240230513555_c20240230514072.nc', 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230516170_e20240230518543_c20240230519062.nc', 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230521170_e20240230523555_c20240230524073.nc', 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230526170_e20240230528549_c20240230529060.nc', 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230531170_e20240230533555_c20240230534062.nc', 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230536170_e20240230538543_c20240230539069.nc', 'ABI-L2-MCMIPC/2024/023/04/OR_ABI-L2-MCMIPC-M6_G16_s20240230401170_e20240230403543_c20240230404074.nc', 'ABI-L2-MCMIPC/2024/023/04/OR_ABI-L2-MCMIPC-M6_G16_s20240230406

['ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230536170_e20240230538543_c20240230539069.nc',
 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230531170_e20240230533555_c20240230534062.nc',
 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230526170_e20240230528549_c20240230529060.nc',
 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230521170_e20240230523555_c20240230524073.nc',
 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230516170_e20240230518543_c20240230519062.nc',
 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230511170_e20240230513555_c20240230514072.nc',
 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230506170_e20240230508543_c20240230509062.nc',
 'ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230501170_e20240230503543_c20240230504056.nc',
 'ABI-L2-MCMIPC/2024/023/04/OR_ABI-L2-MCMIPC-M6_G16_s20240230456170_e20240230458554_c20240230459055.nc',
 'ABI-L2-MCMIPC/2024/023/04/OR_ABI-L2-MCMIPC-M6_G16_s20

In [19]:
# Open the selected blobs through the GCS file system and reduce them to one median dataset
median_ds = create_median_image(selected_blobs, fs)

Opening: gcp-public-data-goes-16/ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230536170_e20240230538543_c20240230539069.nc
Opening: gcp-public-data-goes-16/ABI-L2-MCMIPC/2024/023/05/OR_ABI-L2-MCMIPC-M6_G16_s20240230506170_e20240230508543_c20240230509062.nc


In [29]:
# Copy the dataset before CRS information is attached to it in the following cells
median_ds_copy = median_ds.copy()

In [35]:
import rioxarray  # noqa: F401 -- imported for its side effect: registers the `.rio` accessor used below
from rasterio.crs import CRS

# WKT definition of a geostationary-satellite projection (central meridian -75,
# satellite height 35786023 m, sweep=x); it is attached to the dataset as its CRS.
wkt = 'PROJCS["unnamed",GEOGCS["unknown",DATUM["unnamed",SPHEROID["Spheroid",6378137,298.2572221]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]]],PROJECTION["Geostationary_Satellite"],PARAMETER["central_meridian",-75],PARAMETER["satellite_height",35786023],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],EXTENSION["PROJ4","+proj=geos +lon_0=-75 +h=35786023 +x_0=0 +y_0=0 +ellps=GRS80 +units=m +no_defs +sweep=x"]]'
crs = CRS.from_wkt(wkt)

# Write the CRS onto the copy in place, then print it back as a sanity check
median_ds_copy.rio.write_crs(crs, inplace=True)
print(median_ds_copy.rio.crs)

PROJCS["unnamed",GEOGCS["unknown",DATUM["unnamed",SPHEROID["Spheroid",6378137,298.2572221]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]]],PROJECTION["Geostationary_Satellite"],PARAMETER["central_meridian",-75],PARAMETER["satellite_height",35786023],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],EXTENSION["PROJ4","+proj=geos +lon_0=-75 +h=35786023 +x_0=0 +y_0=0 +ellps=GRS80 +units=m +no_defs +sweep=x"]]


In [36]:
# Reproject median_ds_copy to EPSG:5070
# NOTE: this rebinds median_ds_copy, so re-running the cell reprojects the already-reprojected dataset.
median_ds_copy = median_ds_copy.rio.reproject("EPSG:5070")
# Print the CRS to confirm the reprojection took effect
print(median_ds_copy.rio.crs)


EPSG:5070


In [44]:
# Bare expression: display the dataset for inspection
median_ds_copy

In [39]:
# Remove the 'grid_mapping' attribute from every data variable (no-op where it is absent),
# then write the dataset to a local NetCDF file.
# NOTE(review): presumably the attribute left by rio.write_crs interferes with to_netcdf; confirm.
for data_var in median_ds_copy.data_vars:
    median_ds_copy[data_var].attrs.pop('grid_mapping', None)
median_ds_copy.to_netcdf('median_image.nc')


Realized that to get the .crs info to appear in expected form, it is necessary to reopen using rioxarray.
This is the method used in Sean's scratch.

I further realized that the landfire data is reprojected to the bool_img before the goes data gets reprojected to it. It's also worth noting that the bool_img has a crs written to it, not sure why. So if we are to copy the pipeline I think we need to get that .nc, write the crs to it, reproject_match landfire data to that, then reproject_match goes data to that.

In [45]:
import xarray as xr
import rioxarray

# Open the NetCDF file
# Re-open the file written earlier through rioxarray and print the CRS it reports
median_riox = rioxarray.open_rasterio('median_image.nc')
print(median_riox.rio.crs)

PROJCS["unnamed",GEOGCS["unknown",DATUM["unnamed",SPHEROID["Spheroid",6378137,298.2572221]],PRIMEM["Greenwich",0],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]]],PROJECTION["Geostationary_Satellite"],PARAMETER["central_meridian",-75],PARAMETER["satellite_height",35786023],PARAMETER["false_easting",0],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],EXTENSION["PROJ4","+proj=geos +lon_0=-75 +h=35786023 +x_0=0 +y_0=0 +ellps=GRS80 +units=m +no_defs +sweep=x"]]


In [46]:

# Reproject median_riox to EPSG:5070
# NOTE(review): the written plan in this notebook mentions EPSG:4326; confirm which target CRS is intended.
median_riox = median_riox.rio.reproject("EPSG:5070")


In [47]:
# Bare expression: display the object for inspection
median_riox

In [48]:

# Write the reprojected data to a local NetCDF file
median_riox.to_netcdf('reprojected.nc')


In [2]:

# Load the tif file
# NOTE: absolute path on the original author's machine; adjust when running elsewhere.
# NOTE(review): judging by the file name this is an elevation layer; confirm.
tif_path = '/Users/adamhunter/Documents/school projs/firenet/data/static_layers/LF2020_Elev_220_CONUS/Tif/LC20_Elev_220.tif'
elev = rioxarray.open_rasterio(tif_path)
# Reproject elev to match median_riox
# (this rebinds elev, so the original raster is no longer referenced afterwards)
elev = elev.rio.reproject_match(median_riox)


In [18]:

# Add a new coordinate 'band' to elev
# (sets the values of the 'band' coordinate to [1])
elev = elev.assign_coords(band=[1])

# Add elev as a new variable to median_riox
# NOTE(review): assumes median_riox supports variable assignment by key (i.e. is a Dataset); confirm.
median_riox['elev'] = elev



In [21]:
# Write the combined data (including the added 'elev' variable) to a local NetCDF file
median_riox.to_netcdf('median_with_elev.nc')


Full description of preprocessing function:
1. download last hour of goes data
2. create median
3. feature engineer
4. remove unneeded data from nc
5. Reproject to epsg 4326
6. stack with preprocessed landfire layers
    -Preprocessing of these involves reprojecting to 4326 and matching resolution to goes data
    -Preprocessed landfire stack stored in bucket
7. chunk the stacks to pytorch manageable size and upload
8. save chunks as separate files in bucket - presently not clear if these chunks should be simple multidimensional arrays or nc datasets



Higher detail description:

1. Select appropriate list of blobs from gcp fs, most recent hour's worth of data from GOES MCMIPC bucket. This should be 12 blobs because there are scans every 5 mins.
2. Download the set of blobs, pruning unneeded data such as data quality flag arrays and unused bands, return list of data sets
3. Concatenate the 12 datasets into one, effectively creating a dataset with a time dimension
4. Take median over the time dimension, so each pixel has median value of the last hour for each band
5. Feature engineer the median dataset, adding more informative bands that are ratios of the spectral channels
6. Reproject this dataset to epsg 4326
7. Download the preprocessed landfire layers. These have been reproject_matched to a GOES CONUS 'template' image, which has itself been reprojected to epsg 4326. This is intended to match the slightly convoluted preprocessing routine of training data the pytorch model was trained on.
8. Stack the GOES ds with the preprocessed landfire layers into a dataset.
9. Chunk the stacks to a size PyTorch can manage and upload them to a bucket. This will be a large list of dataset files, each holding the stacked raster imagery along with the metadata that can be used to project the PyTorch container's inference output.


