# 📚 Import Libraries

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import random
InteractiveShell.ast_node_interactivity = "all"
import os
from datetime import datetime
import matplotlib.pyplot as plt
import plotly.express as px
import glob
from tqdm import tqdm

import plotly.figure_factory as ff
import plotly.express as px
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score
from scipy.stats import mode
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit

  shapely_geos_version, geos_capi_version_string


In [2]:
# use your own mapbox token
MAPBOX_TOKEN = 'pk.eyJ1IjoiaWdsYXdlYiIsImEiOiJja3picmk5NmsyaDZxMndtenYyOWhvNmtnIn0.Dxi29pChSrUbePq_oZ1rTw'
px.set_mapbox_access_token(MAPBOX_TOKEN)

In [3]:
PRECISION = 1
MIN_FIRE_RECORDS = 2

# Trianing period 2013-2020

We explored Australia and used aggregation for a baseline prediction model.
* Temporal resolution: Monthly
* Spatial resolution: 1 Decimal degree ~ 10 km grid
* Binary Target: At least two fire readings

## Read australia dataset

In [4]:
# data description
# https://earthdata.nasa.gov/earth-observation-data/near-real-time/firms/viirs-i-band-active-fire-data
WORK_DIR = './'
aus_fires = pd.read_csv(WORK_DIR + '/wildfiredataset/australia_fire_daily.csv', parse_dates=['acq_date'])
aus_fires.shape
aus_fires.head()

(5801066, 6)

Unnamed: 0,latitude,longitude,acq_date,satellite,instrument,confidence
0,-39.98,144.06,2016-11-16,N,VIIRS,50
1,-39.98,147.96,2016-03-03,N,VIIRS,50
2,-39.97,147.96,2016-12-07,N,VIIRS,50
3,-39.97,147.97,2016-12-07,N,VIIRS,50
4,-39.96,143.88,2016-10-16,N,VIIRS,50


In [5]:
# Selecting duplicate rows except first 
# occurrence based on all columns
duplicate = aus_fires.drop(columns=['instrument','confidence','satellite'])
duplicate = duplicate[duplicate.duplicated()]

print("Duplicate Rows :")
  
# Print the resultant Dataframe
duplicate.shape

Duplicate Rows :


(836879, 3)

In [6]:
aus_fires['year'] = aus_fires.acq_date.dt.year
aus_fires['month'] = aus_fires.acq_date.dt.month
aus_fires.latitude = aus_fires.latitude.round(PRECISION)
aus_fires.longitude = aus_fires.longitude.round(PRECISION)
fires = aus_fires.groupby(['latitude', 'longitude', 'year', 'month']).size().reset_index()
fires.columns = ['latitude', 'longitude', 'year', 'month', 'fire_cnt']

fires.shape
fires.head()
fires.nunique()

(469010, 5)

Unnamed: 0,latitude,longitude,year,month,fire_cnt
0,-40.0,143.9,2013,12,4
1,-40.0,143.9,2014,5,2
2,-40.0,143.9,2015,6,2
3,-40.0,143.9,2016,10,3
4,-40.0,143.9,2018,10,1


latitude     308
longitude    412
year           9
month         12
fire_cnt     422
dtype: int64

# Feature engineering

In [7]:
coords = fires[['latitude', 'longitude']].drop_duplicates()
times = fires[['year', 'month']].drop_duplicates()
coords['one'] = 1
times['one'] = 1
base = pd.merge(coords, times, how='outer', on='one')
history = base.merge(fires, how='left', on= ['latitude', 'longitude', 'year', 'month'])
history.head()

Unnamed: 0,latitude,longitude,one,year,month,fire_cnt
0,-40.0,143.9,1,2013,12,4.0
1,-40.0,143.9,1,2014,5,2.0
2,-40.0,143.9,1,2015,6,2.0
3,-40.0,143.9,1,2016,10,3.0
4,-40.0,143.9,1,2018,10,1.0


In [8]:
history = history.fillna(0)
history.fire_cnt.value_counts().head()

0.0    4691176
1.0     123472
2.0      62844
3.0      39206
4.0      28371
Name: fire_cnt, dtype: int64

#### Target column

In [9]:
history['fire'] = 1 * (history['fire_cnt'] >= MIN_FIRE_RECORDS)

In [10]:
history.shape
history.head()
history.mean()

(5160186, 7)

Unnamed: 0,latitude,longitude,one,year,month,fire_cnt,fire
0,-40.0,143.9,1,2013,12,4.0,1
1,-40.0,143.9,1,2014,5,2.0,1
2,-40.0,143.9,1,2015,6,2.0,1
3,-40.0,143.9,1,2016,10,3.0,1
4,-40.0,143.9,1,2018,10,1.0,0


latitude      -24.313153
longitude     134.851254
one             1.000000
year         2016.924528
month           6.405660
fire_cnt        1.124197
fire            0.066962
dtype: float64

#### Historical fire frequency features

In [11]:
yearly = history.groupby(
    ['latitude', 'longitude', 'year'])[['fire_cnt', 'fire']].mean().reset_index()
monthly = history.groupby(
    ['latitude', 'longitude', 'year', 'month'])[['fire_cnt', 'fire']].mean().reset_index()

In [12]:
last_year = yearly.copy()
last_year.year += 1
last_year.columns = ['latitude', 'longitude', 'year', 'fire_cnt_last_year', 'fire_last_year']
last_year.head(3)

Unnamed: 0,latitude,longitude,year,fire_cnt_last_year,fire_last_year
0,-40.0,143.9,2014,0.333333,0.083333
1,-40.0,143.9,2015,0.166667,0.083333
2,-40.0,143.9,2016,0.166667,0.083333


In [13]:
last_year_month = monthly.copy()
last_year_month.year += 1
last_year_month.columns = ['latitude', 'longitude', 'year', 'month', 'fire_cnt_last_year_same_month', 'fire_last_year_same_month']
last_year_month.head(3)

Unnamed: 0,latitude,longitude,year,month,fire_cnt_last_year_same_month,fire_last_year_same_month
0,-40.0,143.9,2014,1,0.0,0.0
1,-40.0,143.9,2014,2,0.0,0.0
2,-40.0,143.9,2014,3,0.0,0.0


In [14]:
past = yearly.copy()
past['one'] = 1
past = history[['latitude', 'longitude', 'year', 'one']].drop_duplicates().merge(
    past, on=['latitude', 'longitude', 'one'])
past = past[past.year_x < past.year_y]
past = past.groupby(['latitude', 'longitude', 'year_y'])[['fire_cnt', 'fire']].mean().reset_index()
past.columns = ['latitude', 'longitude', 'year', 'fire_cnt_before', 'fire_before']
past.head(3)

Unnamed: 0,latitude,longitude,year,fire_cnt_before,fire_before
0,-40.0,143.9,2014,0.166667,0.083333
1,-40.0,143.9,2015,0.166667,0.083333
2,-40.0,143.9,2016,0.25,0.083333


## Collect all features

Drop the first year (2013) as we don't have history.

In [15]:
X = history.merge(past, how='left', on=['latitude', 'longitude', 'year'])
X = X.merge(last_year, how='left', on=['latitude', 'longitude', 'year'])
X = X.merge(last_year_month, how='left', on=['latitude', 'longitude', 'year', 'month'])
X = X.drop(columns='one')

X.head()
X.shape
X.groupby('year').size()

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
0,-40.0,143.9,2013,12,4.0,1,,,,,,
1,-40.0,143.9,2014,5,2.0,1,0.166667,0.083333,0.333333,0.083333,0.0,0.0
2,-40.0,143.9,2015,6,2.0,1,0.166667,0.083333,0.166667,0.083333,0.0,0.0
3,-40.0,143.9,2016,10,3.0,1,0.25,0.083333,0.166667,0.083333,0.0,0.0
4,-40.0,143.9,2018,10,1.0,0,0.083333,0.0,0.0,0.0,0.0,0.0


(5160186, 12)

year
2013    584172
2014    584172
2015    584172
2016    584172
2017    584172
2018    584172
2019    584172
2020    584172
2021    486810
dtype: int64

### Remove NA

In [16]:
X = X.dropna()

## Reduce memory usage for dataframe

In [17]:
def reduce_memory_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
 
    return df

X = reduce_memory_usage(X)

Mem. usage decreased to 209.47 Mb (53.8% reduction)


In [18]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

X, NAlist = reduce_mem_usage(X)

Memory usage of properties dataframe is : 209.47329711914062  MB
******************************
Column:  latitude
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:  longitude
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:  year
dtype before:  int16
dtype after:  uint16
******************************
******************************
Column:  month
dtype before:  int8
dtype after:  uint8
******************************
******************************
Column:  fire_cnt
dtype before:  float32
dtype after:  uint16
******************************
******************************
Column:  fire
dtype before:  int8
dtype after:  uint8
******************************
******************************
Column:  fire_cnt_before
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:  fire_before
dtype before:  float32
dt

In [19]:
# X = X[0]
# X.head()

In [20]:
X['fire_cnt'].mode()

0    0
dtype: uint16

In [21]:
X.dtypes

latitude                         float32
longitude                        float32
year                              uint16
month                              uint8
fire_cnt                          uint16
fire                               uint8
fire_cnt_before                  float32
fire_before                      float32
fire_cnt_last_year               float32
fire_last_year                   float32
fire_cnt_last_year_same_month     uint16
fire_last_year_same_month          uint8
dtype: object

In [22]:
X.to_csv("australia_fire_total_ready.csv", index=False)
print('Total fire saved')

Total fire saved


#### Split the dataset based on time to avoid leakage

In [23]:
train = X[X.year < 2019].dropna()
valid = X[(X.year >= 2019) & (X.year < 2021)]
test = X[X.year == 2021]

train.to_csv('australia_fire_train.csv', index=False)
valid.to_csv('australia_fire_valid.csv', index=False)
test.to_csv('australia_fire_test.csv', index=False)

In [24]:
train.groupby('year').size()
valid.groupby('year').size()
test.groupby('year').size()

year
2014    584172
2015    584172
2016    584172
2017    584172
2018    584172
dtype: int64

year
2019    584172
2020    584172
dtype: int64

year
2021    486810
dtype: int64

In [25]:
train.shape, valid.shape, test.shape
train.head()

((2920860, 12), (1168344, 12), (486810, 12))

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
1,-40.0,143.899994,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0
2,-40.0,143.899994,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0
3,-40.0,143.899994,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0
4,-40.0,143.899994,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0
7,-40.0,143.899994,2015,4,0,0,0.166667,0.083333,0.166667,0.083333,0,0
