In [None]:
import xarray as xr
import glob
import pandas as pd
import datetime
import requests
import netCDF4
import boto3
import os
import numpy as np
import rioxarray

# Module-level S3 client; it is handed to get_s3_keys further down to list the
# GOES bucket.
s3client = boto3.client('s3')

In [None]:
files = glob.glob('temp/goes/*.nc')

In [None]:
xr_files = [xr.open_dataset(f) for f in files]

In [None]:
concated = xr.concat(xr_files, dim = 'band')

In [None]:
median_composite = concated.median(dim = 'band', keep_attrs = True)

In [None]:
median_composite.to_netcdf("temp/goes/median_composite.nc")

  median_composite.to_netcdf("temp/goes/median_composite.nc")
  median_composite.to_netcdf("temp/goes/median_composite.nc")


# Access VIIRS dates via AWS (no longer current; superseded by the local-file method below)

In [None]:
import awswrangler as wr

labeled_bucket = 'viirs-labeled-data'

# Every object path in the labeled-data bucket.
labeled_contents = wr.s3.list_objects('s3://' + labeled_bucket)

# Keep only the BOOL images: drop macOS .DS_Store artefacts and the
# fire-radiative-power ('_frp.tif') rasters.
viirs_keys = [
    key for key in labeled_contents
    if not key.endswith(('.DS_Store', '_frp.tif'))
]




In [None]:
# Define function that takes the list of viirs keys and returns a list of datetime objects
import datetime

def strip_keys(viirs_list):
    """
    Convert a list of VIIRS S3 keys into a list of datetime objects.

    Each key is split on '-': the third piece (after its first '/') supplies
    the year, the fourth the month, and the last piece, split on '_', supplies
    the day and hour. Keys that do not contain enough '-' or '_' separated
    pieces raise IndexError.

    :param viirs_list: iterable of key strings.
    :return: list of datetime.datetime, one per key, in input order.
    """

    def _key_to_datetime(key):
        dash_parts = key.split('-')

        year = int(dash_parts[2].split('/')[1])
        month = int(dash_parts[3])

        tail = dash_parts[-1].split('_')

        # A couple of rasterized files (out of ~2000) carry no hour; their
        # second field is not numeric, so treat them as midnight.
        try:
            day = int(tail[0])
            hour = int(tail[1])
        except ValueError:
            day = int(tail[0])
            hour = 0

        return datetime.datetime(year, month, day, hour)

    return [_key_to_datetime(key) for key in viirs_list]



In [None]:
date_list = strip_keys(viirs_keys)

IndexError: list index out of range

# New method for local VIIRS files

In [None]:
viirs_parent = '../Data/Training/VIIRS/Rasterized/Individual_Fires/variable_size/'
viirs_suffix = '_bool.nc'


def get_files(parent_dir, suffix):
    """
    Recursively list the files under ``parent_dir`` whose name ends with ``suffix``.

    :param parent_dir: directory to walk.
    :param suffix: filename ending to match (e.g. '_bool.nc').
    :return: list of paths, each joined onto the directory it was found in,
             in os.walk order.
    """
    return [
        os.path.join(root, name)
        for root, _subdirs, names in os.walk(parent_dir)
        for name in names
        if name.endswith(suffix)
    ]

viirs_files = get_files(viirs_parent, viirs_suffix)

import datetime

def strip_dirs(viirs_list):
    """
    Convert local VIIRS file paths into datetime objects.

    The file name (text after the last '/') is split on '_'. Its third field
    is read as 'YYYY-MM-DD' and its fourth field as the hour.

    :param viirs_list: iterable of path strings.
    :return: list of datetime.datetime, one per path, in input order.
    :raises IndexError: if a file name has too few '_' or '-' separated fields.
    :raises ValueError: if the year, month or day field is not an integer.
    """
    date_list = []

    for obj in viirs_list:
        name_parts = obj.split('/')[-1].split('_')
        date_parts = name_parts[2].split('-')

        year, month, day = int(date_parts[0]), int(date_parts[1]), int(date_parts[2])

        # A couple of rasterized files (out of ~2000) carry no hour, so the
        # fourth field is not numeric. Treat those as midnight. Only the hour
        # is allowed to fall back: the day always comes from the date field
        # (the previous fallback mistakenly reused the year as the day).
        try:
            hour = int(name_parts[3])
        except ValueError:
            hour = 0

        date_list.append(datetime.datetime(year, month, day, hour))

    return date_list

In [None]:
viirs_files

In [None]:
viirs_date_list = np.unique(strip_dirs(viirs_files))

# Need to find all VIIRS files from a specific date

In [None]:
def get_viirs_path(viirs_date, viirs_path_list):
    """
    Return the VIIRS file paths that belong to a given date and hour.

    A path prefix is built from the first entry of ``viirs_path_list`` by
    replacing its 8th '/'-separated component with the year and its file name
    with 'VIIRS_Rasterized_<YYYY>-<MM>-<DD>_<H>_Num'. Every path starting with
    that prefix is returned.

    :param viirs_date: datetime with the date and hour to look up.
    :param viirs_path_list: sequence of VIIRS file paths.
    :return: list of matching paths (empty if there are none or the input is empty).
    """
    if len(viirs_path_list) == 0:
        return []

    year, month, date = viirs_date.strftime("%Y,%m,%d").split(',')
    # Unpadded hour. str(hour) keeps midnight as '0', whereas stripping the
    # zeros from '00' produced an empty string that could never match.
    hour = str(viirs_date.hour)

    og_path = viirs_path_list[0].split('/')
    # NOTE(review): index 7 is assumed to be the year directory, which holds
    # for paths rooted at viirs_parent — confirm if the directory depth changes.
    og_path[7] = year
    og_path[-1] = f'VIIRS_Rasterized_{year}-{month}-{date}_{hour}_Num'

    prefix = '/'.join(og_path)
    return [img for img in viirs_path_list if img.startswith(prefix)]

# Cull date_list to only include dates / hours that have yet to be pulled

In [None]:
# Recursively list every '.nc' file already present under the GOES directory.
# NOTE(review): the compositing loop later writes under '../Data/training/Goes/...'
# (different letter case); the two only refer to the same tree on a
# case-insensitive filesystem — confirm.
goes_files = []
goes_parent = '../Data/Training/GOES'
goes_suffix = '.nc'

goes_files = get_files(goes_parent,goes_suffix)

In [None]:
def get_goes_time(goes_list):
    """
    Parse the date and hour out of local GOES file paths.

    The file name (text after the last '/') is split on '-': the first piece
    ends with '_<year>', the second and third pieces are month and day, and
    the last piece, up to its first '.', is the hour.

    :param goes_list: iterable of path strings.
    :return: list of datetime.datetime, one per path, in input order.
    """

    def _path_to_datetime(path):
        fields = path.split('/')[-1].split('-')

        year = int(fields[0].split('_')[-1])
        month = int(fields[1])
        day = int(fields[2])
        hour = int(fields[-1].split('.')[0])

        return datetime.datetime(year, month, day, hour)

    return [_path_to_datetime(path) for path in goes_list]

In [None]:
goes_date_list = get_goes_time(goes_files)

In [None]:
# VIIRS date/hours for which no GOES file exists locally yet.
# NOTE(review): leftover_dates is not referenced again in the visible notebook;
# the key pull below is given viirs_date_list instead — confirm which is intended.
leftover_dates = [d for d in viirs_date_list if d not in goes_date_list]

# Pull keys

In [None]:
def get_date_hour(single_key):

    '''
    Convert an S3 key into the datetime of the hour it belongs to.

    single_key (str): key string returned from get_s3_keys; its 2nd, 3rd and
                      4th '/'-separated fields are read as year, day-of-year
                      and hour.

    Returns a datetime.datetime at that year, day-of-year and hour.
    '''

    parts = single_key.split('/')

    year = int(parts[1])
    doy = int(parts[2])
    hour = int(parts[3])

    # Start at 1 January (at the given hour) and step forward to the day-of-year.
    first_of_year = datetime.datetime(year, 1, 1, hour)

    return first_of_year + datetime.timedelta(days=doy - 1)



In [None]:
def get_s3_keys(bucket, s3_client, date_list, prefix = ''):
    """
    Generate the keys in an S3 bucket whose date/hour appears in ``date_list``.

    :param bucket: Name of the S3 bucket.
    :param s3_client: Object exposing ``list_objects_v2(**kwargs)`` (a boto3 S3 client).
    :param date_list: Collection of datetimes to keep; each key's datetime is
                      obtained with get_date_hour and tested for membership.
    :param prefix: Only fetch keys that start with this prefix (optional).
    :return: generator of matching key strings.
    """

    kwargs = {'Bucket': bucket}

    if isinstance(prefix, str):
        kwargs['Prefix'] = prefix

    # Hash-based membership instead of scanning date_list for every key.
    # Fall back to the collection itself if its elements are not hashable.
    # NOTE(review): assumes date_list holds datetime.datetime objects, like
    # the values get_date_hour returns — confirm for other element types.
    try:
        wanted_dates = set(date_list)
    except TypeError:
        wanted_dates = date_list

    while True:
        resp = s3_client.list_objects_v2(**kwargs)

        # A page with no objects has no 'Contents' entry at all.
        for obj in resp.get('Contents', []):
            key = obj['Key']

            if not key.startswith(prefix):
                continue

            # Keep the key only if its day-of-year/hour is associated with a
            # VIIRS observation.
            if get_date_hour(key) in wanted_dates:
                yield key

        # Follow the pagination token until the listing is exhausted.
        try:
            kwargs['ContinuationToken'] = resp['NextContinuationToken']
        except KeyError:
            break

In [None]:
bucket_name = 'noaa-goes16'
product_name = 'ABI-L2-MCMIPC'

# Lazily list the product's keys, keeping those whose date/hour matches a
# VIIRS observation.
# NOTE(review): this passes the full viirs_date_list, not the culled
# leftover_dates computed earlier — confirm that is intended.
key_gen = get_s3_keys(
    bucket_name,
    s3client,
    viirs_date_list,
    prefix = product_name,
)

# Drain the generator; this is where the bucket listing actually happens.
keys = list(key_gen)

In [None]:
# Scratch check: download the first matching key over HTTPS and open it with
# xarray straight from the response bytes (nothing is written to disk here).
resp = requests.get(f'https://{bucket_name}.s3.amazonaws.com/{keys[0]}')

# The name is only a label for the in-memory dataset; the data comes from `memory=`.
fname = 'GOES_' #+ goes_hour.strftime('%Y-%m-%d-%H')
nc4_ds = netCDF4.Dataset(fname, memory = resp.content)
store = xr.backends.NetCDF4DataStore(nc4_ds)
DS = xr.open_dataset(store)

Split keys by year

In [None]:
goes_dates = [get_date_hour(key) for key in keys]

In [None]:
def get_keys_in_hour(key_list, date):
    """
    Select the keys that fall within the year, day-of-year and hour of ``date``.

    :param key_list: iterable of S3 key strings.
    :param date: datetime identifying the hour of interest.
    :return: list of keys starting with 'ABI-L2-MCMIPC/<YYYY>/<DDD>/<HH>',
             in input order.
    """
    hour_prefix = 'ABI-L2-MCMIPC/' + '/'.join(
        date.strftime(fmt) for fmt in ('%Y', '%j', '%H')
    )

    return [k for k in key_list if k.startswith(hour_prefix)]


In [None]:
2021-10-30

In [None]:
list(np.unique(goes_dates)).index(datetime.datetime(2021,10, 30, 20,0,0))

2433

In [None]:
list(np.unique(goes_dates))[1560]

datetime.datetime(2020, 8, 30, 19, 0)

In [None]:
np.unique(goes_dates)[1560]

datetime.datetime(2020, 8, 30, 19, 0)

In [None]:
# Position in np.unique(goes_dates) at which to resume; 2433 is the index of
# 2021-10-30 20:00, so every earlier hour is skipped.
RESUME_INDEX = 2433

# NOTE(review): these paths use 'training/Goes' while goes_parent uses
# 'Training/GOES'; they only coincide on a case-insensitive filesystem — confirm.
parent_path = '../Data/training/Goes/clipped/'
temp_path = '../Data/training/Goes/clipped/temp/'

# makedirs also creates missing parent directories and tolerates existing ones.
os.makedirs(temp_path, exist_ok=True)

for goes_hour in np.unique(goes_dates)[RESUME_INDEX:]:
    print(str(goes_hour) + " started.")

    year = goes_hour.strftime('%Y')
    # Defined up front so it is always valid for this hour, even when the
    # download loop below has nothing to iterate over.
    fname = 'GOES_' + goes_hour.strftime('%Y-%m-%d-%H')

    key_in_that_hour = get_keys_in_hour(keys, goes_hour)
    if not key_in_that_hour:
        # xr.concat cannot build a composite from an empty list.
        print("No keys found for " + str(goes_hour) + "; skipping.")
        continue

    # Download every scan of this hour and open it from memory.
    rasters = []
    for i, k in enumerate(key_in_that_hour):
        print("Key " + str(i) + " started.")
        resp = requests.get(f'https://{bucket_name}.s3.amazonaws.com/{k}')
        # Fail here on an HTTP error instead of handing an error page to netCDF4.
        resp.raise_for_status()

        nc4_ds = netCDF4.Dataset(fname, memory = resp.content)
        store = xr.backends.NetCDF4DataStore(nc4_ds)
        DS = xr.open_dataset(store)

        rasters.append(DS)

    # Median of the hour's scans, stacked along the 'band' dimension.
    concated = xr.concat(rasters, dim = 'band')
    median_composite = concated.median(dim = 'band', keep_attrs = True)

    os.makedirs(parent_path + year, exist_ok=True)

    # Round-trip through a temporary NetCDF file so rioxarray can open the
    # composite; the file is removed again whatever happens below.
    temp_file = f'{temp_path}{fname}.nc'
    median_composite.to_netcdf(temp_file)

    try:
        median_riox = rioxarray.open_rasterio(temp_file)

        if not median_riox.rio.crs:
            print("CRS NOT FOUND!")
            continue

        # Clip this composite to VIIRS observations
        viirs_images = get_viirs_path(goes_hour, viirs_files)

        for viirs_path in viirs_images:
            # Second-to-last '_' field of the path with its first three
            # characters dropped (presumably 'Num<k>' -> '<k>' — confirm).
            num = viirs_path.split('_')[-2][3:]

            with xr.open_dataset(viirs_path) as viirs_ds:
                img = viirs_ds.rio.write_crs('EPSG:4326')

                # Reproject the composite onto the VIIRS image's grid.
                clipped = median_riox.rio.reproject_match(img)
                export_name = f'{parent_path}{year}/{fname}_Num{num}'

                clipped.to_netcdf(export_name + '.nc')
    finally:
        os.remove(temp_file)

    print(str(goes_hour) + ' done.')

2021-10-30 20:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-10-30 20:00:00 done.
2021-11-06 19:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-06 19:00:00 done.
2021-11-08 09:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-08 09:00:00 done.
2021-11-08 20:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-08 20:00:00 done.
2021-11-09 18:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-09 18:00:00 done.
2021-11-10 18:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-10 18:00:00 done.
2021-11-11 10:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-11 10:00:00 done.
2021-11-17 19:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-17 19:00:00 done.
2021-11-19 09:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-19 09:00:00 done.
2021-11-19 18:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-19 18:00:00 done.
2021-11-19 20:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-19 20:00:00 done.
2021-11-22 19:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-22 19:00:00 done.
2021-11-29 19:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-29 19:00:00 done.
2021-11-30 18:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-11-30 18:00:00 done.
2021-12-01 08:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-12-01 08:00:00 done.
2021-12-04 09:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-12-04 09:00:00 done.
2021-12-05 18:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-12-05 18:00:00 done.
2021-12-10 20:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-12-10 20:00:00 done.
2021-12-15 19:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-12-15 19:00:00 done.
2021-12-15 20:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-12-15 20:00:00 done.
2021-12-30 19:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


2021-12-30 19:00:00 done.


In [None]:
# Ad-hoc inspection of one previously stacked file.
# NOTE(review): absolute, user-specific path; replace with a path relative to
# the data directory before sharing.
test = xr.open_dataset('/Users/seancarter/Documents/Data/Training/Stacked/bool/take2/2017/2017-03-01-17_Num1.nc')

In [None]:
test

In [None]:
# Scratch re-run of the clipping step against a hand-made 'test_goes.nc' in the
# temp directory. This cell defines none of temp_path, parent_path, year, fname
# or goes_hour: it relies on whatever values an earlier cell left in the kernel.
median_riox = rioxarray.open_rasterio(f'{temp_path}test_goes.nc')

# Only reports a missing CRS; execution continues into the clipping below.
if not median_riox.rio.crs:
    print("CRS NOT FOUND!")


# Clip this composite to VIIRS observations
viirs_images = get_viirs_path(goes_hour, viirs_files)
export_name = f'{parent_path}{year}/{fname}'


for viirs_path in viirs_images:
    # Second-to-last '_' field of the path with its first three characters
    # dropped (presumably 'Num<k>' -> '<k>' — confirm).
    num = viirs_path.split('_')[-2][3:]
    print(f'Num {num} started.')

    img = xr.open_dataset(viirs_path)
    img = img.rio.write_crs('EPSG:4326')
    # Check if works**: bounds = img.rio.reproject(**DS.rio.crs).rio.bounds()

    # Reproject the composite onto the VIIRS image's grid, then export it.
    clipped = median_riox.rio.reproject_match(img)
    export_name = f'{parent_path}{year}/{fname}_Num{num}'

    clipped.to_netcdf(export_name + '.nc')

print(str(goes_hour) + ' done.')

2018-08-22 09:00:00 done.


In [None]:
# Earlier version of the composite-and-clip loop. It walks every hour in
# goes_dates (no resume offset), has no CRS check, keeps the temporary file,
# and names its outputs '<fname>Num<k>.nc' (no underscore before 'Num').
# The recorded run of this cell stops with MissingCRS on its first hour.

# Variable names 'CMI_C01' ... 'CMI_C16'. Only the commented-out selection
# below refers to this list.
bands = ['CMI_C0' + str(i) for i in range(1,10)]
bands.extend(['CMI_C' + str(i) for i in range(10,17)])

for goes_hour in np.unique(goes_dates):
    print(str(goes_hour) + " started.")
    rasters = []

    key_in_that_hour = get_keys_in_hour(keys, goes_hour)
    year = goes_hour.strftime('%Y')

    # Download every scan of this hour and open it from the response bytes.
    i = 0
    for k in key_in_that_hour:
        print("Key " + str(i) + " started.")
        resp = requests.get(f'https://{bucket_name}.s3.amazonaws.com/{k}')

        # NOTE(review): fname is only assigned inside this loop, so an hour
        # with no keys would reuse the previous hour's name (or be undefined).
        fname = 'GOES_' + goes_hour.strftime('%Y-%m-%d-%H')
        nc4_ds = netCDF4.Dataset(fname, memory = resp.content)
        store = xr.backends.NetCDF4DataStore(nc4_ds)
        DS = xr.open_dataset(store)
#         DS = DS[bands]

        # Do magic so that it can be exported
#         vars_list = list(DS.data_vars)
#         bads = []
#         for var in vars_list:
#             try:
#                 del DS[var].attrs['grid_mapping']
#             except KeyError:
#                 bads.append(var)

        rasters.append(DS)
        i += 1

    # Median of the hour's scans, stacked along the 'band' dimension.
    concated = xr.concat(rasters, dim = 'band')

    median_composite = concated.median(dim = 'band', keep_attrs = True)

    parent_path = '../Data/training/Goes/clipped/'

    temp_path = '../Data/training/Goes/clipped/temp/'

    if not os.path.exists(parent_path + year):
        os.mkdir(parent_path + year)


    if not os.path.exists(temp_path):
        os.mkdir(temp_path)


    # Clip this composite to VIIRS observations
    viirs_images = get_viirs_path(goes_hour, viirs_files)
    export_name = f'{parent_path}{year}/{fname}'

    # Absolutely absurd method to clip GOES imagery!
    # (Write the composite to a temporary NetCDF file and reopen it with rioxarray.)

    median_composite.to_netcdf(f'{temp_path}{fname}.nc')
    median_riox = rioxarray.open_rasterio(f'{temp_path}{fname}.nc')
#     os.remove(f'{temp_path}/{fname}.nc')


    for viirs_path in viirs_images:
        # Second-to-last '_' field of the path with its first three characters
        # dropped (presumably 'Num<k>' -> '<k>' — confirm).
        num = viirs_path.split('_')[-2][3:]

        img = xr.open_dataset(viirs_path)
        img = img.rio.write_crs('EPSG:4326')
        # Check if works**: bounds = img.rio.reproject(**DS.rio.crs).rio.bounds()

        # Reproject the composite onto the VIIRS image's grid, then export it.
        clipped = median_riox.rio.reproject_match(img)
        export_name = f'{parent_path}{year}/{fname}Num{num}'

        clipped.to_netcdf(export_name + '.nc')

    print(str(goes_hour) + ' done.')

2017-02-28 09:00:00 started.
Key 0 started.
Key 1 started.
Key 2 started.
Key 3 started.
Key 4 started.
Key 5 started.
Key 6 started.
Key 7 started.
Key 8 started.
Key 9 started.
Key 10 started.
Key 11 started.


  median_composite.to_netcdf(f'{temp_path}{fname}.nc')
  median_composite.to_netcdf(f'{temp_path}{fname}.nc')


MissingCRS: CRS not found. Please set the CRS with 'rio.write_crs()'. Data variable: CMI_C01