# Data Processing for CNN

This notebook takes fire-specific data saved in tif or pickle files, along with weather data covering the period the fire burned, and produces data that can be used to train a CNN. The resulting data is saved to S3.

In [23]:
# Load packages
import boto3
import io
import csv
import math
import numpy as np
import os
import pandas as pd
import pickle
import random

from datetime import datetime as dt
from matplotlib import pyplot as plt
from PIL import Image

## Variables and Hyperparameters

In [24]:
###############
## VARIABLES ##
###############

# S3 configuration: destination bucket and the client used for uploads
bucket_name = 'hotzone'
s3_client = boto3.client('s3')

# local directory holding the fire tif files
tif_directory = "data"

# local directory holding the weather data
weather_directory = 'weather_data'

# base name (no extension) of the fire direction raster
direction_file = 'BBdirectionCA'

# base name (no extension) of the fire speed raster
speed_file = 'BBspeedCA'

# switches for the weather and fire features to include in the model
rainint = True
raintot = False
high_t = True
low_t = True
humidity = True
wind_speed = True
wind_direction = True
cloud_cover = False
fire_direction = False
fire_speed = False

# feature name (as used in the weather pickle) -> whether it is enabled
weather_variables = {
    'rainint': rainint,
    'raintot': raintot,
    'High T': high_t,
    'Low T': low_t,
    'Humidity': humidity,
    'Wind Speed': wind_speed,
    'Wind Direction': wind_direction,
    'Cloud Cover': cloud_cover,
    'Fire Direction': fire_direction,
    'Fire Speed': fire_speed
}

# names of the enabled features, in declaration order
weather_vars = [name for name, enabled in weather_variables.items() if enabled]

#####################
## HYPERPARAMETERS ##
#####################

# scale the weather data by its per-feature maximum - yea or nay
normalized_weather = True

# the desired height and width (in pixels) of the matrix to feed into the CNN
# 1 pixel side = 500 meters = 0.310686 miles
matrix_dim = 32

# how many zero-labeled ("no fire") samples to add per fire-labeled sample
labeled_multiplier = 24

## Dataset Preprocessing Pipeline

### Download files from S3 to local directory

In [25]:
def pull_fire_data_from_s3(year, tif_directory):
    '''
    Pull the fire files for the provided year from S3 and save them to a local directory.

    Args:
        - year: year whose files are fetched (used in the S3 key "GlobalFire/<year>/<file>")
        - tif_directory: local directory the files are written to; it is resolved with
        os.path.abspath, the same way the processing functions resolve it when reading
    Returns:
        - Nothing
    '''

    file_names = [
        "BBdayofburnCA.tif",
        "BBdirectionCA.tif",
        "BBfireid.tif",
        "BBfirelineCA.tif",
        "BBspeedCA.png",
        "BBspeedCA.tif",
        "ignitioncrop.pickle",
        "polygoncrop.pickle"
    ]

    # write where the readers will look, instead of a hard-coded absolute path
    local_dir = os.path.abspath(tif_directory)
    os.makedirs(local_dir, exist_ok=True)

    bucket = boto3.resource('s3').Bucket('hotzone')

    for f in file_names:
        key = "GlobalFire/" + str(year) + "/" + f
        bucket.download_file(key, os.path.join(local_dir, f))

In [26]:
def pull_weather_data_from_s3(year, weather_directory):
    '''
    Pull the weather pickle for the provided year from S3 and save it to a local directory.

    Args:
        - year: year whose file is fetched (used in the S3 key "BayAreaWeather/<year>/<file>")
        - weather_directory: local directory the file is written to; it is resolved with
        os.path.abspath, the same way create_weather_dict resolves it when reading
    Returns:
        - Nothing
    '''

    file_name = "weather_data.pickle"

    # write where the reader will look, instead of a hard-coded absolute path
    local_dir = os.path.abspath(weather_directory)
    os.makedirs(local_dir, exist_ok=True)

    key = "BayAreaWeather/" + str(year) + "/" + file_name

    s3 = boto3.resource('s3')
    s3.Bucket('hotzone').download_file(key, os.path.join(local_dir, file_name))

### Fire Dataset Preprocessing Functions

In [27]:
def data_processing(directory):
    '''
    Process the dataset in the supplied directory and return matrices of which pixels belong to which fire and 
    which day of the year the pixel was on fire.
    
    Args: 
        - directory: name of directory with tif files (must contain BBfireid.tif and BBfirelineCA.tif)
    Returns: 
        - fire_data_dict: a dictionary where the key is the fire id (as a string) and the value is a matrix 
        of pixels triggered by that fire (0, 1); keys are in ascending fire id order
        - fireline: matrix denoting what day of year that pixel was on fire (0 where the raster held -9999)
    '''
    
    path = os.path.abspath(directory)

    # map "<file name without extension>" -> full path for every tif in the directory
    tiff_dict = {
        os.path.splitext(f)[0]: os.path.join(path, f)
        for f in os.listdir(path)
        if f.endswith('.tif')
    }

    # convert to np array; the context manager releases the file handle once the pixels are copied
    with Image.open(tiff_dict['BBfireid']) as img:
        fire_id = np.array(img)
    fire_id[fire_id == -9999] = 0

    with Image.open(tiff_dict['BBfirelineCA']) as img:
        fireline = np.array(img)
    fireline[fireline == -9999] = 0

    # unique fire ids; 0 is skipped because it does not denote a fire (and may legitimately be absent)
    fire_ids = [fid for fid in np.unique(fire_id) if fid != 0]

    # one 0/1 mask per fire id
    fire_data_dict = {str(fid): np.where(fire_id == fid, 1, 0) for fid in fire_ids}

    return fire_data_dict, fireline

In [28]:
def create_one_hot_matrices(data_dict, fireline):
    '''
    Create matrices for each fire_id that show where the fire was on a given day during the year.
    
    Args:
        - data_dict: a dictionary where the key is "fire_id" and the value is a matrix of pixels 
        triggered by that fire (0, 1)
        - fireline: matrix denoting what day of year that pixel was on fire (1-366; 366 occurs in leap years)
    Returns:
        - fire_data_dict: a dictionary of the following structure, with days in ascending order and only 
        days on which the fire burned at least one pixel:
            {
                "fire_id": {
                    "day_of_year": one-hot encoded 2D array of fire spread on that day,
                    "day_of_year": one-hot encoded 2D array of fire spread on that day
                }

            }
    '''
    
    fire_data_dict = {}

    for key, val in data_dict.items():
        data = {}
        burned = (val == 1)

        # only visit the days that actually occur inside this fire's footprint
        # (pixels outside the footprint are mapped to 0, which is filtered out below)
        candidate_days = np.unique(np.where(burned, fireline, 0))

        for day in candidate_days:
            # keep whole-number days of the year only; this also skips 0 and NaN
            if not (1 <= day <= 366) or day != int(day):
                continue

            day = int(day)
            mask = ((fireline == day) & burned).astype(int)
            data[str(day)] = mask
        
        fire_data_dict[key] = data
        
    return fire_data_dict

In [29]:
def create_day_pairs(fire_data_dict):
    '''
    Pair up each fire's consecutive burn matrices.

    For every fire, the per-day matrices are walked in dictionary order and each matrix is paired with the
    one that follows it. "Consecutive" means adjacent entries of the dictionary; the function does not check
    that the two days are adjacent calendar days.

    Args:
        - fire_data_dict: a dictionary of the following structure:
            {
                "fire_id": {
                    "day_of_year": one-hot encoded 2D array of fire spread on that day,
                    "day_of_year": one-hot encoded 2D array of fire spread on that day
                }

            }
    Returns:
        - train_labels: a list of (day_of_year, (day_1, day_2)) tuples, where day_1 and day_2 are the burn
        matrices of two consecutive entries and day_of_year is the key of day_2:
        [
            (key of day_2, (matrix of day_1, matrix of day_2)),
            (key of day_3, (matrix of day_2, matrix of day_3)),
        ]
        A fire with fewer than two entries contributes nothing.
    '''

    train_labels = []

    for daily_burns in fire_data_dict.values():
        days = list(daily_burns.keys())
        matrices = list(daily_burns.values())

        # zip stops at the shortest input, so the last matrix never starts a pair
        for earlier, later, later_doy in zip(matrices, matrices[1:], days[1:]):
            train_labels.append((later_doy, (earlier, later)))

    return train_labels

In [30]:
def process_fire_data_tiff(directory, file):
    '''
    Process the fire data in the supplied tiff file and return a dictionary of key day of burn and value a 
    dictionary holding the attribute matrix restricted to the pixels that burned on that day.

    Args:
        - directory: name of directory of supplemental data (must also contain BBdayofburnCA.tif)
        - file: name (without extension) of tiff file of supplemental data to add to model
    Returns:
        - fire_data_dict: a dictionary where the key is the day of burn (as a string) and the value is 
        {'Fire Direction': matrix}, the matrix being the tiff values on pixels burned that day and 0 elsewhere.
        The attribute key is always 'Fire Direction', whichever file is supplied.
    '''
    
    path = os.path.abspath(directory)

    # map "<file name without extension>" -> full path for every tif in the directory
    tiff_dict = {
        os.path.splitext(f)[0]: os.path.join(path, f)
        for f in os.listdir(path)
        if f.endswith('.tif')
    }
        
    # convert day of burn tif to np array; the context manager releases the file handle
    with Image.open(tiff_dict['BBdayofburnCA']) as img:
        fire_dob = np.array(img)
    fire_dob[fire_dob == -9999] = 0

    # convert tif of interest to np array
    with Image.open(tiff_dict[file]) as img:
        fire_data_mat = np.array(img)
    fire_data_mat[fire_data_mat == -9999] = 0
    
    fire_data_dict = {}

    for day in np.unique(fire_dob):
        # 0 does not denote a fire (and may legitimately be absent from the raster)
        if day == 0:
            continue

        day = int(day)
        
        mask = (fire_dob == day).astype(int)
        values = np.multiply(mask, fire_data_mat)
        
        fire_data_dict[str(day)] = {'Fire Direction': values}
    
    return fire_data_dict

In [31]:
def process_fire_data_png(directory, file):
    '''
    Process the fire data in the supplied png file and return a dictionary of key day of burn and value a 
    dictionary holding the attribute matrix restricted to the pixels that burned on that day.

    Args:
        - directory: name of directory of supplemental data (must also contain BBdayofburnCA.tif)
        - file: name (without extension) of png file of supplemental data to add to model
    Returns:
        - fire_data_dict: a dictionary where the key is the day of burn (as a string) and the value is 
        {'Fire Speed': matrix}, the matrix being the png values on pixels burned that day and 0 elsewhere.
        The attribute key is always 'Fire Speed', whichever file is supplied.
    '''
    
    path = os.path.abspath(directory)

    # map "<file name without extension>" -> full path, built from a single directory listing
    tiff_dict = {}
    png_dict = {}

    for f in os.listdir(path):
        if f.endswith('.tif'):
            tiff_dict[os.path.splitext(f)[0]] = os.path.join(path, f)
        elif f.endswith('.png'):
            png_dict[os.path.splitext(f)[0]] = os.path.join(path, f)
        
    # convert day of burn tif to np array; the context manager releases the file handle
    with Image.open(tiff_dict['BBdayofburnCA']) as img:
        fire_dob = np.array(img)
    fire_dob[fire_dob == -9999] = 0

    # convert png of interest to np array
    with Image.open(png_dict[file]) as img:
        fire_data_mat = np.array(img)
    fire_data_mat[fire_data_mat == -9999] = 0
    
    fire_data_dict = {}

    for day in np.unique(fire_dob):
        # 0 does not denote a fire (and may legitimately be absent from the raster)
        if day == 0:
            continue

        day = int(day)
        
        mask = (fire_dob == day).astype(int)
        values = np.multiply(mask, fire_data_mat)
        
        fire_data_dict[str(day)] = {'Fire Speed': values}
    
    return fire_data_dict

In [32]:
def combine_dicts(dict_1, dict_2):
    '''
    A helper function to merge the per-day attributes of dict_1 into dict_2.
    
    dict_2 is modified in place and returned. When both dictionaries hold the same attribute for a day, 
    the matrix from dict_1 wins. A day that only exists in dict_1 is added to dict_2.
    
    Args:
        - dict_1: a dictionary of key day of year, and value a dictionary of key fire attribute and value a matrix
        denoting where that attribute is triggered
        - dict_2: a dictionary of key day of year, and value a dictionary of key fire attribute and value a matrix
        denoting where that attribute is triggered
    Returns:
        - dict_2: combined dictionary of dict_1 and dict_2
    '''
      
    for k, v in dict_1.items():
        # setdefault keeps days that dict_2 has not seen instead of raising KeyError
        day_attributes = dict_2.setdefault(k, {})

        for att, mat in v.items():
            day_attributes[att] = mat
            
    return dict_2

### Weather Data Preprocessing Functions

In [33]:
def create_weather_dict(directory, normalized_weather, weather_vars, fire_data_dict):
    '''
    Create a dictionary of weather data from a pickled file and merge the per-day fire attributes into it.

    Args:
        - directory: path to the directory holding the weather pickle file (if several .pickle files are 
        present, the last one listed is used)
        - normalized_weather: True/False to compute per-feature max values for scaling
        - weather_vars: list of weather variables to include in model
        - fire_data_dict: dictionary of key (day of year) and value (dictionary of key (fire attribute) and 
        value (matrix)); merged into the weather of the matching day. Days without weather are skipped.
    Returns:
        - weather_data: dictionary of key (day of year, as a string without leading zeros) and value 
        (dictionary of key (weather parameter) and value (matrix of value for each pixel)). NaNs are replaced 
        by 0 and 'High T' / 'Low T' are shifted by 273.15.
        - max_values: a dictionary of feature name to max value, used to normalize data. All values are 0 
        when normalized_weather is False.
    Raises:
        - FileNotFoundError: if the directory holds no .pickle file
    '''

    path = os.path.abspath(directory)
    
    weather_file = ''
    
    for f in os.listdir(path):
        if f.endswith('.pickle'):
            weather_file = os.path.join(path, f)

    if not weather_file:
        raise FileNotFoundError('No .pickle weather file found in ' + path)
    
    # NOTE: unpickling executes arbitrary code - only load weather files from a trusted source
    weather = pd.read_pickle(weather_file)
    
    weather_data = {}
    
    for date_str, attributes in weather.items():
        day = {}
        
        for att, matrix in attributes.items():
            if att not in weather_vars:
                continue

            mat = np.nan_to_num(matrix)

            # scale to kelvin (out of place, so integer matrices are promoted instead of raising)
            if att in ['High T', 'Low T']:
                mat = mat + 273.15

            day[att] = mat

        # day of year without zero padding; tm_yday avoids the platform-specific "%-j" directive
        doy = str(dt.strptime(date_str, "%Y-%m-%d").timetuple().tm_yday)
        weather_data[doy] = day
        
    # add in fire direction and speed
    # NOTE(review): these are merged whether or not 'Fire Direction' / 'Fire Speed' are listed in 
    # weather_vars - confirm that is intended
    for doy, attributes in fire_data_dict.items():
        # a fire day without weather cannot produce a complete feature vector, so it is left out
        if doy not in weather_data:
            continue

        for att, mat in attributes.items():
            weather_data[doy][att] = mat
    
    # scale weather data: start from the attributes of the first day (empty if there is no weather at all)
    first_day = next(iter(weather_data.values()), {})
    max_values = dict.fromkeys(first_day, 0)
    
    if normalized_weather:
        
        for day in weather_data.values():

            for weather_att, matrix in day.items():
                max_val = matrix.max()

                # .get covers attributes (e.g. fire data) that the first day does not have
                if max_val > max_values.get(weather_att, 0):
                    max_values[weather_att] = max_val
                else:
                    max_values.setdefault(weather_att, 0)
    
    return weather_data, max_values

In [34]:
def fetch_weather_data(max_values, normalized_weather, day_of_year, x, y, weather=None):
    '''
    Fetch weather data for the relevant day and pixel.
    
    Args:
        - max_values: dictionary of feature name to max value for each weather feature
        - normalized_weather: whether the weather data should be normalized - true/false
        - day_of_year: day of the year, as the string key used in the weather dictionary
        - x: first (row) index into the weather matrices
        - y: second (column) index into the weather matrices
        - weather: optional dictionary of key (day of year) and value (dictionary of feature name to matrix); 
        when omitted, the module-level `weather_data` is used
    Returns:
        - weather_list: a list of relevant weather data for that pixel, one entry per feature of that day, 
        or None if there is no weather for the day or the pixel lies outside a matrix
    '''
    
    source = weather_data if weather is None else weather
    
    day_weather = source.get(day_of_year)

    if day_weather is None:
        return None

    # numpy would silently wrap negative indices around to the opposite edge
    if x < 0 or y < 0:
        return None

    weather_list = []

    for k, v in day_weather.items():
        try:
            val = v[x,y]
        except IndexError:
            return None

        if normalized_weather:
            max_val = max_values.get(k, 1)

            # a zero maximum gives nothing to scale by; treat it like a missing one
            if max_val == 0:
                max_val = 1

            value = val/max_val

            if math.isnan(value):
                weather_list.append(0)
            else:
                weather_list.append(value)
        else:
            weather_list.append(val)
    
    return weather_list

### Prep Dataset for CNN functions

In [35]:
def balance_dataset(dataset, matrix_dim, data_len, side):
    '''
    Supplement the list produced in `create_labeled_data` with data where there was no fire.
    
    Reads the module-level `max_values` and `normalized_weather` when fetching weather.
    
    Args:
        - dataset: a list of (day_of_year, (day_1, day_2)) tuples, where day_1 and day_2 are one-hot encoded 
        2D arrays of fire spread on two consecutive burn days
        - matrix_dim: a hyperparameter for the height and width of the matrices fed into the CNN
        - data_len: how many "no-fire" pixel-matrix pairs we want to return
        - side: half the length of the dimension of the output matrix
    Returns:
        - no_fire: a list of ((weather, matrix), 0) tuples, where the matrix is the day_1 window centered on 
        a pixel that did not burn on day_2. At most data_len entries are returned; fewer if not enough 
        valid samples could be built.
    '''
        
    no_fire = []

    if not dataset or data_len <= 0:
        return no_fire

    # NOTE(review): every no-fire sample is drawn from the LAST pair of the dataset only, so they all 
    # share one day of weather. This keeps the existing behavior - confirm whether sampling across all 
    # pairs was intended.
    doy, (x, y) = dataset[-1]

    x = np.pad(x, pad_width=matrix_dim, mode='constant', constant_values=0)
    y = np.pad(y, pad_width=matrix_dim, mode='constant', constant_values=0)

    vals = np.where(y == 0)
    vals = list(zip(vals[0], vals[1]))

    # draw twice as many candidates as needed (some are rejected below), 
    # but never more than the population holds
    vals = random.sample(vals, min(len(vals), 2*data_len))
    
    for (xi, yi) in vals:
        xi_r = xi + side
        xi_l = xi - side
        yi_b = yi + side
        yi_t = yi - side

        m = x[xi_l:xi_r, yi_t:yi_b]

        # candidates closer than `side` to the border of the padded matrix give a negative slice start 
        # or a clipped slice end, so the window comes out with the wrong shape - skip those
        if m.shape == (matrix_dim, matrix_dim):
            # NOTE(review): (xi, yi) index the padded matrices; if the weather matrices live on the 
            # unpadded grid this lookup is shifted by matrix_dim pixels - confirm
            weather_data = fetch_weather_data(max_values, normalized_weather, doy, xi, yi)
            if weather_data is not None:
                no_fire.append(((weather_data, m), 0))
    
    # never ask for more samples than were actually built
    num_pixels = min(len(no_fire), data_len)
    no_fire = random.sample(no_fire, num_pixels)
    
    return no_fire

In [36]:
def create_labeled_data(dataset, matrix_dim, labeled_multiplier):
    '''
    Build the labeled samples: one fire-labeled sample per pixel that burned on day_2, plus no-fire samples 
    supplied by `balance_dataset`, shuffled together.

    Reads the module-level `max_values` and `normalized_weather` when fetching weather.

    Args:
        - dataset: a list of (day_of_year, (day_1, day_2)) tuples, where day_1 and day_2 are one-hot encoded 
        2D arrays of fire spread on two consecutive burn days
        - matrix_dim: a hyperparameter for the height and width of the matrices fed into the CNN
        - labeled_multiplier: a hyperparameter for how much "no-fire" labeled data to add to the training set
    Returns:
        - data: a list of ((weather, matrix), label) tuples, where label (0, 1) represents whether fire is 
        present for a given pixel on day_2, matrix is the day_1 window centered on that pixel, and weather 
        is the list returned by `fetch_weather_data`
    '''

    half = int(matrix_dim/2)

    samples = []

    for doy, (day_1, day_2) in dataset:

        # zero-pad so that windows around border pixels keep their full size
        padded_1 = np.pad(day_1, pad_width=matrix_dim, mode='constant', constant_values=0)
        padded_2 = np.pad(day_2, pad_width=matrix_dim, mode='constant', constant_values=0)

        burning = np.where(padded_2 == 1)

        for row, col in zip(burning[0], burning[1]):
            window = padded_1[row - half:row + half, col - half:col + half]

            # NOTE(review): (row, col) index the padded matrices; if the weather matrices live on the 
            # unpadded grid this lookup is shifted by matrix_dim pixels - confirm
            features = fetch_weather_data(max_values, normalized_weather, doy, row, col)

            if features is None:
                continue

            samples.append(((features, window), 1))

    wanted_no_fire = len(samples)*labeled_multiplier

    # balance this dataset with values where there is no fire
    print('Balance dataset')
    samples.extend(balance_dataset(dataset, matrix_dim, wanted_no_fire, half))

    # fire and no-fire samples are interleaved at random
    random.shuffle(samples)

    return samples

In [37]:
def prep_dataset_for_cnn(data, matrix_dim):
    '''
    Split a list of ((weather_data, fire_data), integer) samples into three arrays: fire, weather and labels.

    Args:
        - data: a list of ((weather_data, fire_data), integer) tuples
        - matrix_dim: a hyperparameter for the height and width of the matrices fed into the CNN
    Returns:
        - fire: array of fire matrices reshaped to (number of samples, matrix_dim, matrix_dim, 1)
        - weather: array with one row of weather values per sample
        - Y: array of output labels, one per sample
    '''

    fire_windows = [np.asarray(f) for (_, f), _ in data]
    weather_rows = [np.asarray(w) for (w, _), _ in data]
    labels = [label for _, label in data]

    # trailing axis of size 1 is the single input channel
    fire = np.asarray(fire_windows).reshape(len(fire_windows), matrix_dim, matrix_dim, 1)

    return fire, np.asarray(weather_rows), np.asarray(labels)

### Save Data to S3

In [38]:
def save_array_to_s3(s3_client, array, bucket_name, key_name):
    '''
    Pickle an array in memory and upload it to S3.

    Args:
        - s3_client: boto3 s3 client
        - array: numpy array to save to s3
        - bucket_name: name of bucket on s3 to save array to
        - key_name: directory/file_name to save data to
    Returns:
        - Nothing
    
    https://stackoverflow.com/questions/48049557/how-to-write-npy-file-to-s3-directly
    '''

    # a BytesIO built from bytes starts at position 0, ready to be read by the upload
    payload = io.BytesIO(pickle.dumps(array))

    s3_client.upload_fileobj(payload, bucket_name, key_name)

### Run Data Preprocessing Pipeline

In [50]:
# Run the data preprocessing pipeline: for every year, download the raw files, build the
# fire/weather features and labels, and upload the three resulting arrays to S3.
#
# NOTE: `weather_data` and `max_values` assigned below are read as module-level globals by
# fetch_weather_data / create_labeled_data / balance_dataset, so these names must not be renamed.

# years = [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]
years = [2014]

for y in years:
    print('*****************************************')
    print('Starting data preprocessing for year: ', y)
    # start timestamp, used for the elapsed time printed at the end of the iteration
    time = dt.now()
    print (time)
    print('*****************************************')
    
    # pull data from s3
    pull_fire_data_from_s3(y, tif_directory)
    pull_weather_data_from_s3(y, weather_directory)

    # get fire speed data
    fire_speed_data_dict = process_fire_data_png(tif_directory, speed_file)

    # get fire direction data
    fire_dir_data_dict = process_fire_data_tiff(tif_directory, direction_file)

    # combine fire speed and fire direction datasets
    # (keyed by day of burn at this point)
    fire_data_dict = combine_dicts(fire_speed_data_dict, fire_dir_data_dict)

    # create weather dict and combine with fire speed and fire direction
    weather_data, max_values = create_weather_dict(weather_directory, normalized_weather, weather_vars, fire_data_dict)
    
    # return matrices of which pixels belong to which fire and which day of the year the pixel was on fire
    # (from here on `fire_data_dict` is rebound and keyed by fire id, not by day of burn)
    fire_data_dict, fireline = data_processing(tif_directory)

    # create matrices for each fire_id that show where the fire was on a given day during the year 
    fire_data_dict = create_one_hot_matrices(fire_data_dict, fireline)

    # create a list of (day of year, (day_1 matrix, day_2 matrix)) tuples, pairing each burn matrix of a fire
    # with the one that follows it
    small_dataset = create_day_pairs(fire_data_dict)

    # create a list of ((weather, matrix), label) samples, where the matrix is a window of pixels on a given
    # day and the label denotes whether there was fire in the center pixel on the following burn day
    data = create_labeled_data(small_dataset, matrix_dim, labeled_multiplier)

    # takes data pairs and returns fire data, weather data, and output labels
    fire, weather, Y = prep_dataset_for_cnn(data, matrix_dim)
    print('Fire data shape: ', fire.shape)
    print('Weather data shape: ', weather.shape)
    print('Y data shape: ', Y.shape)
    
    # save fire data to S3
    print('Save fire data to s3')
    fire_name = 'input_fire/fire_{}.pickle'.format(y)
    save_array_to_s3(s3_client, fire, bucket_name, fire_name)
    
    # save weather data to S3
    print('Save weather data to s3')
    weather_name = 'input_weather/weather_{}.pickle'.format(y)
    save_array_to_s3(s3_client, weather, bucket_name, weather_name)

    # save label data to S3
    print('Save labels to S3')
    label_name = 'labels/label_{}.pickle'.format(y)
    save_array_to_s3(s3_client, Y, bucket_name, label_name)
    
    # elapsed wall-clock time for this year
    print('Time: ', (dt.now() - time))

*****************************************
Starting data preprocessing for year:  2014
2020-04-11 18:20:51.699309
*****************************************
Balance dataset
Fire data shape:  (45475, 32, 32, 1)
Weather data shape:  (45475, 8)
Y data shape:  (45475,)
Save fire data to s3
Save weather data to s3
Save labels to S3
Time:  0:02:10.815958
