In [29]:
import xarray as xr
import fsspec
import s3fs
import os
import matplotlib.pyplot as plt
import dask
import rasterio
from dask.distributed import Client, LocalCluster, progress
import datetime
import tempfile
import boto3
import geoviews as gv
from geoviews import opts

gv.extension('matplotlib')

gv.output(size=150)

run conda install -c pyviz geoviews-core dask-ml

In [30]:
env = dict(GDAL_DISABLE_READDIR_ON_OPEN='EMPTY_DIR', 
           AWS_NO_SIGN_REQUEST='YES',
           GDAL_MAX_RAW_BLOCK_CACHE_SIZE='200000000',
           GDAL_SWATH_SIZE='200000000',
           VSI_CURL_CACHE_SIZE='200000000')
os.environ.update(env)

In [31]:
def convert_full_date_to_continous_day(year, month, day):
    """
    Helper function if you wish to use month, day vs julian day
    """
    return datetime.datetime(year, month, day).timetuple().tm_yday

def get_geo_uri(year, day):
    """
    returns list of geo uris
    """
    fs = s3fs.S3FileSystem(anon=True)
    files = []
    
    filepath = "s3://noaa-goes17/ABI-L2-FDCC/%s/%s/*/*.nc" % (str(year).zfill(4), str(day).zfill(3)) 
    files = fs.glob(filepath)
    
    if len(files) < 1:
        raise Exception("No files found")
    
    return files

def download_to_xarray(uri):
    """
    Downloads file and directly loads it into xarray in memory
    """
    s3 = boto3.client("s3")
    
    with tempfile.NamedTemporaryFile() as temp_file:
        s3.download_file(Bucket=uri[:11], Key=uri[12:], Filename=temp_file.name)
        datastore = xr.open_dataset(temp_file.name)
        
    return datastore

def download_to_disk(uri):
    s3 = boto3.client("s3")
    filename = uri[12:].replace("/", "-")
    if not os.path.exists(filename):
        s3.download_file(Bucket=uri[:11], Key=uri[12:], Filename=filename)
        
    return filename

In [32]:
#TUBBS = {"year": 2017, "day1":220, "day2":243}
#CAMP = {"year": 2018, "day1":312, "day2":329}
#WOOLSEY = {"year": 2018, "day1":312, "day2":325}

#below are incorrect to deal with data source
CAMP = {"year": 2018, "day1":317, "day2":318}
WOOLSEY = {"year": 2018, "day1":317, "day2":319}

Downloading a single goes file from s3 takes 1.7s so we first download to disk the portions of time we're interested in 

In [33]:
tubbs_uris = []
camp_uris = []
woolsey_uris = []

In [34]:
for i in range(TUBBS["day2"] - TUBBS["day1"]):
    day = i + TUBBS["day1"]
    tubbs_uris += get_geo_uri(TUBBS["year"], day)

NameError: name 'TUBBS' is not defined

In [35]:
for i in range(CAMP["day2"] - CAMP["day1"]):
    day = i + CAMP["day1"]
    camp_uris += get_geo_uri(CAMP["year"], day)

In [36]:
for i in range(WOOLSEY["day2"] - WOOLSEY["day1"]):
    day = i + WOOLSEY["day1"]
    woolsey_uris += get_geo_uri(WOOLSEY["year"], day)

In [37]:
local_filepaths = []

In [38]:
for key in camp_uris:
    local_filepaths.append(download_to_disk(key))
for key in woolsey_uris:
    local_filepaths.append(download_to_disk(key))

KeyboardInterrupt: 

In [40]:
def load_local_file_into_xarray(year, day, hour, localfilepaths):
    """
    Returns an xarray of a single hour of data
    """
    files = []
    for element in localfilepaths:
        split_file = element.split("-")
        if split_file[3] == str(year) and split_file[4] == str(day) and split_file[5] == str(hour).zfill(2):
            files.append(element)
    
    if len(files) < 1:
        raise Exception("File with that date is not found")
        
    return xr.open_mfdataset(files,combine='nested',concat_dim='time')

In [None]:
def visualize_xarray(data, vdims):
    kdims = ['t', 'x', 'y']
    xr_dataset = gv.Dataset(data, kdims=kdims, vdims=vdims)
    image = xr_dataset.to(gv.Image, ['x', 'y'])
    return image

Now we have the data quickly and easily accessible on disk, and any section of it can be visualized.
To visualize each fire, you can simply stack the functions like so:

In [45]:
# To see part of the Camp Fire:
visualize_xarray(load_local_file_into_xarray(2018, 318, 8, local_filepaths), ['Mask'])





To see multiple hours or days of the fire, you can concatenate them and feed it into the visualizer:

In [1]:
#To see several consecutive hours:
fires = []
for i in range(5, 8):
    fires.append(load_local_file_into_xarray(2018, 318, i, local_filepaths))

visualize_xarray(xr.concat(fires, dim="time"), ['Mask'])

NameError: name 'load_local_file_into_xarray' is not defined

Note that these are very large data sets, and so trying to load weeks into memory will quickly crash

Next, we'll train an ML model on this data. I am going to first simply do binary classification o the entire image, whether or not the date is during the fire.

If I were to do binary classification for every pixel individiually, it would at core look very similar. Using a model without locally correlated information, you can take the 2 dimensional image and convert it into a single vector, aligning it with the similarly transformed binary pixels.

If the data does have locally correlated information, you want to use models that utilize that information, the canonical example being a convolutional neural net. In this case, you could use a U-Net with a binary loss function on each pixel, so that it directly creates the segmentation of each patch of images.

The data I'm going to gather here is a toy example: it's much smaller than what will get actual results. The purpose is just to show the outline of how this would be done, and would be expanded and systemtized for a product.

In [12]:
non_fire_day1 = 200
non_fire_day2 = 205
non_fire_uris = []

for i in range(non_fire_day1, non_fire_day2):
    non_fire_uris += (get_geo_uri(2019, i))
    
for key in non_fire_uris:
    local_filepaths.append(download_to_disk(key))

KeyboardInterrupt: 

In [None]:
import dask_ml

Select subset of days from fire and non-fire for training and testing

In [58]:
local_filepaths

['ABI-L2-FDCC-2018-317-11-OR_ABI-L2-FDCC-M3_G17_s20183171117219_e20183171119592_c20183171120130.nc',
 'ABI-L2-FDCC-2018-317-11-OR_ABI-L2-FDCC-M3_G17_s20183171122219_e20183171124592_c20183171125124.nc',
 'ABI-L2-FDCC-2018-317-11-OR_ABI-L2-FDCC-M3_G17_s20183171127219_e20183171129532_c20183171130055.nc',
 'ABI-L2-FDCC-2018-317-11-OR_ABI-L2-FDCC-M3_G17_s20183171132219_e20183171134592_c20183171135119.nc',
 'ABI-L2-FDCC-2018-317-11-OR_ABI-L2-FDCC-M3_G17_s20183171137219_e20183171139592_c20183171140125.nc',
 'ABI-L2-FDCC-2018-317-11-OR_ABI-L2-FDCC-M3_G17_s20183171142219_e20183171144532_c20183171145059.nc',
 'ABI-L2-FDCC-2018-317-11-OR_ABI-L2-FDCC-M3_G17_s20183171147219_e20183171149592_c20183171150122.nc',
 'ABI-L2-FDCC-2018-317-11-OR_ABI-L2-FDCC-M3_G17_s20183171152219_e20183171154592_c20183171155123.nc',
 'ABI-L2-FDCC-2018-317-12-OR_ABI-L2-FDCC-M3_G17_s20183171157219_e20183171159532_c20183171200054.nc',
 'ABI-L2-FDCC-2018-317-12-OR_ABI-L2-FDCC-M3_G17_s20183171202219_e20183171204592_c2018317120

In [59]:
def load_subset_of_data(year, day1, day2, y, data, labels):
    for i in range(day1, day2):
        for j in range(24):
            try:
                data.append(load_local_file_into_xarray(year, i, j, local_filepaths))
                labels.append(y)
            except:
                continue

In [66]:
train_data = []
train_labels = []
test_data = []
test_labels = []

load_subset_of_data(2019, non_fire_day1, non_fire_day1+2, 0, train_data, train_labels)
load_subset_of_data(2019, non_fire_day1+2, non_fire_day1+2, 0, test_data, test_labels)
load_subset_of_data(2018, 317, 318, 1, train_data, train_labels)
load_subset_of_data(2018, 318, 319, 1, test_data, test_labels)

For a toy example, let's just use the mean fire temperature

In [81]:
trainX, trainY = [], []

In [95]:
for data, label in zip(train_data, train_labels):
    print(data.mean_fire_area)



<xarray.DataArray 'mean_fire_area' (time: 12)>
array([26271.4140625 , 39758.2890625 , 34748.38671875, 17884.97851562,
       41281.76953125, 36443.76953125, 41596.5078125 , 37608.4296875 ,
       42858.0859375 , 29056.40234375, 53272.2890625 , 46513.66796875])
Coordinates:
    t                   (time) datetime64[ns] 2019-07-19T00:02:38.365116928 ....
    y_image             float32 0.08624
    x_image             float32 0.0
    sunglint_angle      float32 10.0
    local_zenith_angle  float32 80.0
    solar_zenith_angle  float32 10.0
Dimensions without coordinates: time
Attributes:
    long_name:      mean fire area
    standard_name:  fire_area
    valid_range:    [   4000. 4000000.]
    units:          m2
    grid_mapping:   goes_imager_projection
    cell_methods:   sunglint_angle: sum (no pixel produced) local_zenith_angl...
<xarray.DataArray 'mean_fire_area' (time: 12)>
array([42203.47265625, 37820.19140625, 58419.8515625 , 35895.3359375 ,
       28980.25976562, 28260.53515625, 



In [20]:
from sklearn.linear_model import LogisticRegression

In [13]:
from dask.distributed import Client, progress
client = Client(processes=False, threads_per_worker=4,
                n_workers=1, memory_limit='4GB')

0,1
Client  Scheduler: inproc://192.168.99.39/87/1  Dashboard: http://192.168.99.39:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 4.00 GB


In [72]:
test_data[0].Mask.values.shape

(12, 1500, 2500)