In [None]:
import os
import fsspec
import xarray as xr
import numpy as np
import geojson
from google.cloud import storage
from datetime import datetime, timedelta
import pandas as pd
from google.cloud import bigquery
import pandas as pd

In [None]:
##functions for selecting which data to download

def _hourly_prefixes(start, end, product='ABI-L2-CMIPC'):
    """Return one '<product>/<year>/<day-of-year>/<hour>/' prefix per clock hour touched by [start, end]."""
    # Floor to the top of the hour: stepping from an un-floored start can jump
    # past `end` and skip the hour `end` falls in (e.g. 10:50 -> 11:50 > 11:30).
    current_time = start.replace(minute=0, second=0, microsecond=0)
    prefixes = []
    while current_time <= end:
        prefixes.append(
            f'{product}/{current_time.year}/{current_time.timetuple().tm_yday:03d}/{current_time.hour:02d}/'
        )
        current_time += timedelta(hours=1)
    return prefixes


def get_blob_names(attime=None, within="1H", bucket_name='gcp-public-data-goes-16', product='ABI-L2-CMIPC'):
    """
    List the names of all blobs stored in the hourly folders that overlap a time window.

    The window is [attime - within, attime + within]. The bucket is organized by
    year / day-of-year / hour, so every blob in each hour folder the window
    touches is returned; names are NOT filtered any finer than the hour. The
    result can be passed to a function that selects the blobs that form
    complete images.

    Parameters
    ----------
    attime : datetime, str or None
        Centre of the window. Strings are parsed with ``pd.to_datetime``.
        None (the default) means "now" in UTC, evaluated on every call.
    within : timedelta or str
        Half-width of the window. Strings are parsed with ``pd.to_timedelta``.
    bucket_name : str
        Name of the Google Cloud Storage bucket to list.
    product : str
        Top-level product folder inside the bucket.

    Returns
    -------
    list of str
        Blob names, in the order the hour folders are listed.
    """
    # Resolve "now" at call time. A datetime.utcnow() default in the signature
    # would be evaluated once, when the def statement runs, and go stale.
    if attime is None:
        attime = datetime.utcnow()
    elif isinstance(attime, str):
        attime = pd.to_datetime(attime)
    if isinstance(within, str):
        within = pd.to_timedelta(within)

    # Parameter Setup
    start = attime - within
    end = attime + within

    # Set up Google Cloud Storage client
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)

    # Collect the names found under every hourly folder in the window
    blob_names = []
    for prefix in _hourly_prefixes(start, end, product):
        blob_names.extend(blob.name for blob in bucket.list_blobs(prefix=prefix))

    return blob_names

def extract_band_number(blob_name):
    """
    Extracts the band number from a blob name.

    The number is read from the last two characters of the second
    '_'-separated field of the name (e.g. '..._ABI-L2-CMIPC-M6C07_...' -> 7).

    Returns the band number as an int, or None when the name has no second
    field or those two characters are not an integer.
    """
    try:
        return int(blob_name.split('_')[1][-2:])
    except (ValueError, IndexError):
        # IndexError: the name contains no '_' at all (for example a bare
        # folder path), so there is no second field to read.
        return None

def find_sequences(blob_names, unique_band_numbers, max_sequences=12):
    """
    Finds runs of consecutive blob names whose band numbers match the expected band order.

    The list is scanned from the front with a sliding window as long as
    `unique_band_numbers`; every window whose band numbers equal
    `unique_band_numbers` is collected. Scanning stops once `max_sequences`
    windows have been found, so the result holds the FIRST matches in the
    order `blob_names` is given (the newest ones only if the caller sorted
    newest-first).

    Parameters
    ----------
    blob_names : list of str
        Blob names to scan, in the order they should be considered.
    unique_band_numbers : list of int
        Expected band numbers of one complete sequence, in order.
    max_sequences : int or None
        Maximum number of sequences to return; None means no limit.

    Returns
    -------
    list of list of str
        The matching slices of `blob_names`.
    """
    window = len(unique_band_numbers)

    # An empty expected order would "match" every empty slice and produce
    # meaningless empty sequences, so there is nothing to find.
    if window == 0:
        return []

    selected_sequences = []
    for i in range(len(blob_names) - window + 1):
        if max_sequences is not None and len(selected_sequences) >= max_sequences:
            break
        selected = blob_names[i:i + window]
        band_order = [extract_band_number(name) for name in selected]
        if band_order == unique_band_numbers:
            selected_sequences.append(selected)
    return selected_sequences

def select_blobs(blob_names):
    """
    Selects the blobs that form complete images.

    Names are ordered newest-first by the timestamp in their fourth
    '_'-separated field (its first character is dropped), then scanned for
    consecutive runs that contain every band number present in the input, in
    ascending order.

    The input list is not modified. Names lacking a fourth '_'-separated field
    or a parsable band number are ignored.

    Returns
    -------
    list of list of str
        The 12 sequences found, newest first.

    Raises
    ------
    ValueError
        If fewer than 12 complete sequences are found.
    """
    # Keep only names that carry both fields parsed below. The length check
    # runs first so malformed names never reach the band or timestamp parsing.
    candidates = [
        name for name in blob_names
        if len(name.split('_')) > 3 and extract_band_number(name) is not None
    ]

    # Sort a copy by timestamp, newest first. sorted() leaves the caller's list
    # untouched and, like list.sort, keeps equal-timestamp names in input order.
    ordered = sorted(candidates, key=lambda name: name.split('_')[3][1:], reverse=True)

    # Get unique band numbers and sort them
    unique_band_numbers = sorted({extract_band_number(name) for name in ordered})

    # Find the continuous sequences that match the expected band order
    if unique_band_numbers:
        selected_sequences = find_sequences(ordered, unique_band_numbers)
    else:
        # No usable names at all: report zero sequences rather than searching.
        selected_sequences = []

    if len(selected_sequences) < 12:
        raise ValueError(f"Only {len(selected_sequences)} continuous sequences found that match the expected band order")

    return selected_sequences

##Downloading function

def get_datasets(selected_sequences, fs, bucket_name='gcp-public-data-goes-16'):
    """
    Open every blob of every selected sequence as an xarray Dataset.

    Returns a list with one dictionary per sequence, in input order. Each
    dictionary maps a channel identifier (the second '_'-separated field of
    the blob name) to the Dataset opened from that blob.
    """

    def open_blob(blob_name):
        # Open the object through the supplied filesystem and hand the file
        # object to xarray.
        handle = fs.open(f'{bucket_name}/{blob_name}')
        return xr.open_dataset(handle, engine='h5netcdf')

    # One dict per sequence; a repeated channel identifier within a sequence
    # keeps the Dataset opened last.
    return [
        {blob_name.split('_')[1]: open_blob(blob_name) for blob_name in sequence}
        for sequence in selected_sequences
    ]

##Processing functions

def form_multiband_images(datasets):
    """
    Build one multi-band image per sequence from a list of dictionaries of xarray Datasets.

    Each dictionary corresponds to one sequence of blobs: keys are channel
    identifiers, values are the matching Datasets. The Datasets of a sequence
    are concatenated along the 'band' dimension in sorted key order.

    Returns a list of the concatenated Datasets, one per input dictionary.
    """

    def merge_bands(sequence_datasets):
        # Sorting the channel identifiers fixes the order along 'band'.
        in_channel_order = [sequence_datasets[key] for key in sorted(sequence_datasets.keys())]
        return xr.concat(in_channel_order, dim='band')

    return [merge_bands(sequence_datasets) for sequence_datasets in datasets]

def create_hourly_composite(multiband_images):
    """
    Reduce a list of multi-band images to a single hourly composite.

    The images (each expected to correspond to a 5-minute interval) are
    stacked along a new 'time' dimension, and the median over that dimension
    is returned.
    """
    return xr.concat(multiband_images, dim='time').median(dim='time')

def combine_arrays_from_composite(composite):
    """
    Stack the numpy arrays behind every variable of an xarray Dataset into one numpy array.

    The variables are taken in iteration order and joined along a new last
    axis. All variables are assumed to be bands of the same image, i.e. to
    share one shape, so that they can be stacked.
    """
    per_variable = [composite[var_name].values for var_name in composite]
    return np.stack(per_variable, axis=-1)

In [None]:
# Point the Google client libraries at a service-account key file. setdefault keeps a
# GOOGLE_APPLICATION_CREDENTIALS value that is already exported in the environment, so the
# machine-specific path below is only a fallback and the notebook can run elsewhere unchanged.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/adamhunter/Documents/school projs/firenet/data/credentials/firenet-99-writer.json',
)
# Use fsspec to create a GCS file system authenticated with that same key file
fs = fsspec.filesystem('gcs', token=os.environ['GOOGLE_APPLICATION_CREDENTIALS'])

In [None]:
# Pass the current UTC time explicitly so every run of this cell queries around "now" as of
# the call, independent of how the function's default argument is evaluated.
blob_names = get_blob_names(attime=datetime.utcnow())
len(blob_names)

In [None]:
# Preview a few names instead of dumping the whole listing into the notebook output
blob_names[:10]

In [None]:

# Hand select_blobs its own copy so this cell can never reorder `blob_names`,
# which earlier cells have already displayed.
selected_blobs = select_blobs(list(blob_names))
# Show one sequence as a sample rather than every selected name
selected_blobs[0]

In [None]:
# Reuse the `fs` created in the credentials cell; rebuilding it here without the token
# would silently replace the explicitly configured file system.
datasets = get_datasets(selected_blobs, fs)

In [None]:
multiband_images = form_multiband_images(datasets)

# Display a single image; the repr of the whole list floods the output
multiband_images[0]


In [None]:

# Collapse the per-scan multi-band images into a single composite
hourly_composite = create_hourly_composite(multiband_images)

# Bare last expression so the notebook renders the result
hourly_composite


In [None]:

combined_array = combine_arrays_from_composite(hourly_composite)

# Quick sanity checks on the stacked array: contents, shape and dtype
print(combined_array)
print(combined_array.shape)
print(combined_array.dtype)

# NaN-aware extremes: plain .min()/.max() return nan as soon as one element is NaN,
# which would hide the actual value range if the composite has missing pixels.
print(np.nanmin(combined_array))
print(np.nanmax(combined_array))
