# CNN Model V3

In [587]:
# Load packages
import csv
import math
import numpy as np
import os
import pandas as pd
import random

from datetime import datetime as dt
from PIL import Image

## Variables and Hyperparameters

In [614]:
###############
## VARIABLES ##
###############

# name of directory with fire tif files
# (data_processing looks for 'fireid.tif' and 'Global_fire_atlas_firelinecrop.tif' in it)
tif_directory = "toydata"

# name of directory with weather data
# (create_weather_dict loads a file ending in '.pickle' from it)
weather_directory = 'weather_data'

#####################
## HYPERPARAMETERS ##
#####################
# scale the weather data by the per-attribute maximum - True or False
scaled_weather = False

# the desired height and width (in pixels) of the matrix to feed into the CNN
# 1 pixel side = 500 meters = 0.310686 miles
# keep this even: windows are cut as 2 * int(matrix_dim/2) pixels per side and later
# reshaped to (matrix_dim, matrix_dim)
matrix_dim = 32

# multiplier for amount of zero-labeled data we want to add to dataset
# NOTE: create_labeled_data caps the no-fire count at the number of fire samples,
# so values above 1 currently have no additional effect
labeled_multiplier = 1

# fraction of samples held out for validation (passed to fit as validation_split)
test_size = 0.2

# number of training epochs (passed to fit as epochs)
epoc = 30

## Dataset Preprocessing Pipeline

### Fire Dataset Preprocessing Functions

In [615]:
def data_processing(directory):
    '''
    Process the dataset in the supplied directory and return matrices of which pixels belong to which fire and 
    which day of the year the pixel was on fire.
    
    The directory must contain 'fireid.tif' and 'Global_fire_atlas_firelinecrop.tif'. In both rasters the
    value -9999 is replaced by 0.
    
    Args: 
        - directory: name of directory with tif files
    Returns: 
        - fire_data_dict: a dictionary where the key is "fire_id" (as a string) and the value is a matrix of 
        pixels triggered by that fire (0, 1); empty if the raster holds no non-zero fire id
        - fireline: matrix denoting what day of year that pixel was on fire (1-365), 0 where -9999 was stored
    Raises:
        - KeyError: if one of the two expected tif files is missing from the directory
    '''

    path = os.path.abspath(directory)

    # map each tif's base name (file name without the '.tif' suffix) to its full path
    tiff_dict = {}
    for file_name in os.listdir(path):
        if file_name.endswith('.tif'):
            tiff_dict[file_name.split('.tif')[0]] = os.path.join(path, file_name)

    # convert to np arrays; the context manager closes the underlying file once the pixels are copied
    with Image.open(tiff_dict['fireid']) as fire_id_image:
        fire_id = np.array(fire_id_image)
    fire_id[fire_id == -9999] = 0

    with Image.open(tiff_dict['Global_fire_atlas_firelinecrop']) as fireline_image:
        fireline = np.array(fireline_image)
    fireline[fireline == -9999] = 0

    # unique fire ids, sorted; 0 does not denote a fire and is skipped (it may legitimately be absent
    # when every pixel belongs to some fire, so it is filtered rather than removed)
    fire_ids = [fid for fid in np.unique(fire_id) if fid != 0]

    # one 0/1 matrix per fire marking the pixels that carry that fire's id
    fire_data_dict = {str(fid): np.where(fire_id == fid, 1, 0) for fid in fire_ids}

    return fire_data_dict, fireline

In [616]:
def create_one_hot_matrices(data_dict, fireline):
    '''
    Split each fire's pixel matrix into one matrix per day of year on which that fire burned.
    
    Args:
        - data_dict: a dictionary where the key is "fire_id" and the value is a matrix of pixels 
        triggered by that fire (0, 1)
        - fireline: matrix denoting what day of year that pixel was on fire (1-365)
    Returns:
        - a dictionary of the following structure (days with no burning pixel are left out, days are
        inserted in ascending order):
            {
                "fire_id": {
                    "day_of_year": one-hot encoded 2D array of fire spread on that day,
                    "day_of_year": one-hot encoded 2D array of fire spread on that day
                }

            }
    '''

    def masks_by_day(burned):
        # collect, for a single fire, the 0/1 mask of pixels that burned on each day 1..365
        per_day = {}
        for day in range(1, 366):
            burning = np.logical_and(fireline == day, burned == 1).astype(int)
            if burning.sum() > 0:
                per_day[str(day)] = burning
        return per_day

    return {fire_key: masks_by_day(burned) for fire_key, burned in data_dict.items()}

In [617]:
def create_day_pairs(fire_data_dict):
    '''
    Pair up, for every fire, each recorded burn day with the next recorded burn day of the same fire.
    
    Args:
        - fire_data_dict: a dictionary of the following structure:
            {
                "fire_id": {
                    "day_of_year": one-hot encoded 2D array of fire spread on that day,
                    "day_of_year": one-hot encoded 2D array of fire spread on that day
                }

            }
    Returns:
        - train_labels: a list of (day_of_year, (day_1, day_2)) tuples, where day_1 and day_2 are the one-hot 
        encoded 2D arrays of two successive entries of a fire and day_of_year is the key of the day_2 entry:
        [
            (key of day_2, (array of fire spread on day_1, array of fire spread on day_2)),
            (key of day_3, (array of fire spread on day_2, array of fire spread on day_3)),
        ]
        Successive entries are whatever follows in the dictionary; they are not checked to be consecutive
        calendar days. Fires with fewer than two entries contribute nothing.
    '''

    train_labels = []

    for daily_burns in fire_data_dict.values():
        days = list(daily_burns.keys())
        burns = list(daily_burns.values())

        # walk the entries pairwise: (entry i, entry i+1), labelled with the key of entry i+1
        for earlier, later, later_day in zip(burns, burns[1:], days[1:]):
            train_labels.append((later_day, (earlier, later)))

    return train_labels

### Weather Data Preprocessing Functions

In [618]:
def create_weather_dict(directory, scaled_weather):
    '''
    Create a dictionary of weather data from a pickled file, keyed by day of year.
    
    Args:
        - directory: path to the directory holding the weather pickle file (a file ending in '.pickle');
        if several are present the last one in sorted order is used
        - scaled_weather: True/False to compute the per-attribute max values used for scaling
    Returns:
        - weather_data: dictionary of key (day of year as an unpadded string, e.g. '5' or '101') and value 
        (dictionary of key (weather parameter) and value (matrix of value for each pixel, NaNs replaced 
        via np.nan_to_num))
        - max_values: dictionary of key (weather parameter) and value (largest value found on any day);
        every value is 0 when scaled_weather is False
    Raises:
        - FileNotFoundError: if the directory contains no '.pickle' file
    '''

    path = os.path.abspath(directory)

    pickle_files = sorted(f for f in os.listdir(path) if f.endswith('.pickle'))
    if not pickle_files:
        raise FileNotFoundError("no '.pickle' weather file found in " + path)
    weather_file = os.path.join(path, pickle_files[-1])

    # NOTE: unpickling executes arbitrary code - only load weather files from a trusted source
    weather = pd.read_pickle(weather_file)

    # replace NaNs in every matrix, still keyed by the original date string
    weather_by_date = {
        date: {att: np.nan_to_num(matrix) for att, matrix in attributes.items()}
        for date, attributes in weather.items()
    }

    # re-key by day of year. The key is deliberately NOT zero-padded ("%j" would give '005'): the fire
    # pipeline builds its day keys with str(day), so padded keys would never match for days 1-99
    weather_data = {}
    for date, attributes in weather_by_date.items():
        day_of_year = dt.strptime(date, "%Y-%m-%d").timetuple().tm_yday
        weather_data[str(day_of_year)] = attributes

    if not weather_data:
        return weather_data, {}

    # max values start at 0 for every attribute seen on the first day
    first_day = next(iter(weather_data.values()))
    max_values = dict.fromkeys(first_day.keys(), 0)

    if scaled_weather:
        for attributes in weather_by_date.values():
            for weather_att, matrix in attributes.items():
                max_val = matrix.max()
                if max_val > max_values.get(weather_att, 0):
                    max_values[weather_att] = max_val

    return weather_data, max_values

In [619]:
def fetch_weather_data(max_values, scaled_weather, day_of_year, x, y, weather=None):
    '''
    Fetch weather data for the relevant day and pixel.
    
    Args:
        - max_values: dictionary of max value per weather feature (a feature missing from it is scaled by 1)
        - scaled_weather: whether the weather data should be scaled - true/false
        - day_of_year: key of the day in the weather dictionary
        - x: first index into each weather matrix
        - y: second index into each weather matrix
        - weather: dictionary of key (day of year) and value (dictionary of key (weather parameter) and 
        value (matrix)); defaults to the module-level `weather_data`
    Returns:
        - weather_list: a list with one value per weather feature for that pixel, or None when the day is 
        not in the weather dictionary or the pixel lies outside a weather matrix
    '''
    if weather is None:
        weather = weather_data

    day_weather = weather.get(day_of_year)
    if day_weather is None:
        return None

    # negative indices would silently wrap around to the opposite edge instead of raising IndexError,
    # so treat them as out of range explicitly
    if x < 0 or y < 0:
        return None

    weather_list = []

    for feature, matrix in day_weather.items():
        try:
            value = matrix[x, y]
        except IndexError:
            return None

        if scaled_weather:
            # scale exactly once by the feature's maximum
            value = value / max_values.get(feature, 1)

            # a zero maximum yields nan/inf; fall back to 0 so the model never sees non-finite input
            if not math.isfinite(value):
                value = 0

        weather_list.append(value)

    return weather_list

### Prep Dataset for CNN functions

In [620]:
def balance_dataset(dataset, matrix_dim, num_pixels, side):
    '''
    Supplement the list produced in `create_labeled_data` with samples where there was no fire.
    
    Reads the module-level `max_values` and `scaled_weather` when fetching weather.
    
    Args:
        - dataset: a list of (day_of_year, (day_1, day_2)) tuples where day_1 and day_2 are one-hot encoded 
        2D arrays of fire spread on two successive days
        - matrix_dim: a hyperparameter for the height and width of the matrices fed into the CNN
        - num_pixels: how many "no-fire" pixel-matrix pairs we want to return
        - side: half the length of the dimension of the output matrix
    Returns:
        - no_fire: a random sample of ((weather, matrix), 0) tuples, where the matrix is the day_1 window 
        around a pixel that was not on fire on day_2. At most num_pixels entries; fewer (with a warning) 
        when not enough candidates exist.
    '''
    import warnings

    no_fire = []

    for (doy, (x, y)) in dataset:

        x = np.pad(x, pad_width=matrix_dim, mode='constant', constant_values=0)
        y = np.pad(y, pad_width=matrix_dim, mode='constant', constant_values=0)

        coords = np.where(y == 0)

        for (xi, yi) in zip(coords[0], coords[1]):
            m = x[xi - side:xi + side, yi - side:yi + side]

            # pixels closer than `side` to the border of the padded matrix give a wrongly sized window:
            # a negative slice start counts from the end of the axis, and a stop past the end is
            # truncated. Those candidates are skipped.
            if m.shape != (matrix_dim, matrix_dim):
                continue

            # NOTE(review): xi/yi are coordinates in the padded matrix (offset by matrix_dim) - confirm
            # this is the intended index into the weather matrices
            pixel_weather = fetch_weather_data(max_values, scaled_weather, doy, xi, yi)
            if pixel_weather is not None:
                no_fire.append(((pixel_weather, m), 0))

    # random.sample raises ValueError when asked for more items than exist; return what is available
    if num_pixels > len(no_fire):
        warnings.warn(
            "requested %d no-fire samples but only %d are available; dataset will be unbalanced"
            % (num_pixels, len(no_fire))
        )
        num_pixels = len(no_fire)

    return random.sample(no_fire, num_pixels)

In [621]:
def create_labeled_data(dataset, matrix_dim):
    '''
    Build the labeled samples for the CNN: one sample per pixel that was on fire on day_2, plus a batch of
    no-fire samples from `balance_dataset`, shuffled together.
    
    Reads the module-level `max_values`, `scaled_weather` and `labeled_multiplier`.
    
    Args:
        - dataset: a list of (day_of_year, (day_1, day_2)) tuples where day_1 and day_2 are one-hot encoded 
        2D arrays of fire spread on two successive days
        - matrix_dim: a hyperparameter for the height and width of the matrices fed into the CNN
    Returns:
        - data: a shuffled list of ((weather, matrix), label) tuples, where label (0, 1) represents whether 
        fire is present for a given pixel on day_2, matrix is the day_1 window around that pixel, and 
        weather is the list returned by `fetch_weather_data` for that pixel
    '''

    half = int(matrix_dim/2)

    fire_samples = []

    for (doy, (day_1, day_2)) in dataset:

        # pad both days by matrix_dim so a full window can be cut around every burning pixel
        day_1_padded = np.pad(day_1, pad_width=matrix_dim, mode='constant', constant_values=0)
        day_2_padded = np.pad(day_2, pad_width=matrix_dim, mode='constant', constant_values=0)

        burning = np.where(day_2_padded == 1)

        for (row, col) in zip(burning[0], burning[1]):
            window = day_1_padded[row - half:row + half, col - half:col + half]

            # NOTE(review): row/col are coordinates in the padded matrix (offset by matrix_dim) - confirm
            # this is the intended index into the weather matrices
            pixel_weather = fetch_weather_data(max_values, scaled_weather, doy, row, col)

            if pixel_weather is None:
                continue

            fire_samples.append(((pixel_weather, window), 1))

    # number of no-fire samples to add; the min() caps it at the number of fire samples
    fire_count = len(fire_samples)
    no_fire_count = min(int(fire_count*labeled_multiplier), fire_count)

    # balance this dataset with values where there is no fire
    no_fire_samples = balance_dataset(dataset, matrix_dim, no_fire_count, half)

    # combine and shuffle
    data = fire_samples + no_fire_samples
    random.shuffle(data)

    return data

In [622]:
def prep_dataset_for_cnn(data, dim=None):
    '''
    Takes a list of ((list, matrix), integer) samples and returns fire data, weather data, and output labels 
    as three aligned arrays. No train/test split is done here.
    
    Args:
        - data: a list of ((weather list, fire matrix), label) tuples
        - dim: height and width of each fire matrix; defaults to the module-level `matrix_dim`
    Returns:
        - fire: array of input data in shape (number of samples, dim, dim, 1)
        - weather: array of weather values, one row per sample
        - Y: array of output labels (0 or 1)
    Raises:
        - ValueError: if the fire matrices cannot be reshaped to (number of samples, dim, dim, 1)
    '''
    if dim is None:
        dim = matrix_dim

    fire = np.asarray([np.asarray(f) for ((w, f), y) in data])
    weather = np.asarray([np.asarray(w) for ((w, f), y) in data])
    Y = np.asarray([y for ((w, f), y) in data])

    # add the trailing single-channel axis expected by the Conv2D layers
    fire = fire.reshape(len(fire), dim, dim, 1)

    return fire, weather, Y

### Run Data Preprocessing Pipeline

In [623]:
# Load the weather pickle. `weather_data` and `max_values` must keep these exact names: the functions
# below (fetch_weather_data, balance_dataset, create_labeled_data) read them as module-level globals.
weather_data, max_values = create_weather_dict(weather_directory, scaled_weather)
# Read the fire tifs: per-fire 0/1 pixel matrices plus the day-of-year raster.
fire_data_dict, fireline = data_processing(tif_directory)
# Re-bind fire_data_dict: from here on it maps fire id -> {day of year -> 0/1 matrix for that day}.
fire_data_dict = create_one_hot_matrices(fire_data_dict, fireline)
# Pair each recorded day of a fire with the next recorded day.
small_dataset = create_day_pairs(fire_data_dict)
# Build the shuffled fire / no-fire samples (uses the `random` module; no seed is set in this notebook).
data = create_labeled_data(small_dataset, matrix_dim)
# Split the samples into the three arrays fed to the models.
fire, weather, Y = prep_dataset_for_cnn(data)

## Build CNN

In [624]:
# import packages

from __future__ import print_function

import tensorflow as tf

import keras
import keras.backend as K

from keras.models import Sequential, Model
from keras.layers import AveragePooling2D, Conv1D, Conv2D, MaxPooling2D, Dropout, Flatten, Dense, Input, concatenate

In [625]:
# compute f1 score manually

def recall_m(y_true, y_pred):
    '''
    Recall computed with Keras backend ops: rounded true positives divided by rounded actual positives.
    K.epsilon() is added to the denominator so the ratio is defined when there are no positives.
    '''
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))

    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    '''
    Precision computed with Keras backend ops: rounded true positives divided by rounded predicted 
    positives. K.epsilon() is added to the denominator so the ratio is defined when nothing is predicted
    positive.
    '''
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_count = K.sum(K.round(K.clip(y_pred, 0, 1)))

    return hit_count / (flagged_count + K.epsilon())

def f1_score(y_true, y_pred):
    '''
    F1 score: twice the product of precision_m and recall_m divided by their sum (plus K.epsilon() to 
    avoid dividing by zero).
    '''
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p*r)/(p+r+K.epsilon())

    return 2*ratio

### Model 1 - Fire Image Data

In [626]:
# Create model_1: fire image data only, built with the Sequential API.
# The layers are passed as a list; they are stacked in the order given.
model_1 = Sequential([
    # downsample the input window, then two sigmoid convolutions
    AveragePooling2D(pool_size=(2, 2), strides=None, padding='valid'),
    Conv2D(64, kernel_size=(3, 3), activation='sigmoid'),
    Conv2D(32, kernel_size=(3, 3), activation='sigmoid'),
    MaxPooling2D(pool_size=(2, 2), strides=None, padding='valid'),
    Dropout(0.2),
    Flatten(),
    # final dense layer: a single sigmoid unit
    Dense(1, activation='sigmoid'),
])

In [627]:
# compile model_1: binary cross-entropy loss, Adam optimizer;
# track accuracy, the hand-written f1_score and AUC
model_1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', f1_score, tf.keras.metrics.AUC()],
)

In [628]:
# train model_1 on the fire windows only; `test_size` is used as the validation fraction
model_1.fit(
    x=fire,
    y=Y,
    epochs=epoc,
    validation_split=test_size,
)

Train on 5409 samples, validate on 1353 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x7f89b7996b50>

In [629]:
# model_1 output (single sigmoid unit, one value per sample) for the first ten samples
model_1.predict(fire[:10])

array([[0.00762875],
       [1.        ],
       [0.00762875],
       [0.99789375],
       [0.00762875],
       [0.00762875],
       [0.00762875],
       [0.9999999 ],
       [0.00762875],
       [0.99999976]], dtype=float32)

In [630]:
# labels of the same ten samples, to compare against the model_1 predictions
Y[:10]

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 1])

### Model 2 - Fire Image Data and Weather Data

In [645]:
# Create model_2: image data and weather data with functional API

# Define image inputs shape (shape of one sample of `fire`, i.e. without the leading sample axis)
image_shape = fire[0].shape
image_inputs = Input(shape = image_shape)

# Define weather inputs shape (shape of one sample of `weather`)
weather_shape = weather[0].shape
weather_inputs = Input(shape = weather_shape)

# Add layers for fire image interpretation - same stack as model_1 up to Flatten,
# followed by an 8-unit dense layer that summarises the image branch
fire_1 = AveragePooling2D(pool_size=(2, 2), strides=None, padding='valid')(image_inputs)
fire_2 = Conv2D(64, kernel_size=(3, 3), activation='sigmoid')(fire_1)
fire_3 = Conv2D(32, kernel_size=(3, 3), activation='sigmoid')(fire_2)
fire_4 = MaxPooling2D(pool_size=(2,2), strides=None, padding='valid')(fire_3)
fire_5 = Dropout(0.2)(fire_4)
fire_6 = Flatten()(fire_5)
fire_7 = Dense(8, activation='sigmoid')(fire_6)

# Combine the image branch with the weather values. The weather input is concatenated as-is, with no
# layers of its own; with scaled_weather = False these are the unscaled values from the weather matrices.
concat = concatenate([fire_7, weather_inputs])

# Final dense layer: a single sigmoid unit
predictions = Dense(1, activation='sigmoid')(concat)

# Define the model with two inputs (image, weather) and one output
model_2 = Model(inputs=[image_inputs, weather_inputs], outputs=predictions)

In [646]:
# compile model_2 with the same loss, optimizer and metrics as model_1
model_2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', f1_score, tf.keras.metrics.AUC()],
)

In [647]:
# train model_2 on both inputs (fire windows and weather values, in the order the model's inputs were
# declared); `test_size` is used as the validation fraction
model_2.fit(
    x=[fire, weather],
    y=Y,
    epochs=epoc,
    validation_split=test_size,
)

Train on 5409 samples, validate on 1353 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x7f89fe07ed90>

In [648]:
# model_2 output (single sigmoid unit, one value per sample) for the first ten samples
model_2.predict([fire[:10], weather[:10]])

array([[0.33487126],
       [0.8180104 ],
       [0.13982786],
       [0.63246477],
       [0.4823586 ],
       [0.4553281 ],
       [0.5802383 ],
       [0.73576325],
       [0.3785281 ],
       [0.41456392]], dtype=float32)

In [649]:
# labels of the same ten samples, to compare against the model_2 predictions
Y[:10]

array([0, 1, 0, 1, 0, 0, 0, 1, 0, 1])