# 📚 Import Libraries

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import random
InteractiveShell.ast_node_interactivity = "all"
import os
from datetime import datetime
import matplotlib.pyplot as plt
import plotly.express as px
import glob
from tqdm import tqdm

import plotly.figure_factory as ff
import plotly.express as px
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score
from scipy.stats import mode
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit

import lightgbm as lgb
from sklearn import metrics

  shapely_geos_version, geos_capi_version_string


### Enable GPU if possible

In [2]:
from numba import cuda

gpu_enabled = False
try:
    cuda.select_device(0)
    gpu_enabled = True
except cuda.cudadrv.error.CudaSupportError as error:
    print("CUDA not found")
    gpu_enabled = False

<weakproxy at 0x7fe719cf3110 to Device at 0x7fe719e25810>

### Make results repeatable

In [3]:
def seed_everything(seed=42):
    import random
    import os
    import tensorflow as tf
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    print('Seed done!')
seed_everything()

Seed done!


In [4]:
# use your own mapbox token https://docs.mapbox.com/help/getting-started/access-tokens/
MAPBOX_TOKEN = 'pk.eyJ1IjoiaWdsYXdlYiIsImEiOiJja3picmk5NmsyaDZxMndtenYyOWhvNmtnIn0.Dxi29pChSrUbePq_oZ1rTw'
px.set_mapbox_access_token(MAPBOX_TOKEN)

# Baseline Model

### Trianing period 2013-2020

We explored Australia and used aggregation for a baseline prediction model.
* Temporal resolution: Monthly
* Spatial resolution: 1 Decimal degree ~ 10 km grid
* Binary Target: At least two fire readings

## Read prepared dataset

In [5]:
# data description
# https://earthdata.nasa.gov/earth-observation-data/near-real-time/firms/viirs-i-band-active-fire-data
WORK_DIR = './
aus_fires = pd.read_csv(WORK_DIR + '/wildfiredataset/aus_fire_final_temp.csv')
aus_fires.shape
aus_fires.head()

(4576014, 20)

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,st_bin,near_st_lat,near_st_lng,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,MXSPD_MAX
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0,2346,-39.88,143.88,66.9,61.483871,56.051613,51.474194,21.0
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88,61.0,55.393333,51.416667,47.066667,39.0
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88,73.4,59.26129,53.435484,45.012903,35.0
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88,76.5,62.790323,55.5,48.003226,24.1
4,-40.0,143.9,2020,6,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88,60.8,56.66,50.92,46.506667,31.1


In [6]:
X = aus_fires

X.head()
X.shape
X.groupby('year').size()

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,st_bin,near_st_lat,near_st_lng,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,MXSPD_MAX
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0,2346,-39.88,143.88,66.9,61.483871,56.051613,51.474194,21.0
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88,61.0,55.393333,51.416667,47.066667,39.0
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88,73.4,59.26129,53.435484,45.012903,35.0
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88,76.5,62.790323,55.5,48.003226,24.1
4,-40.0,143.9,2020,6,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88,60.8,56.66,50.92,46.506667,31.1


(4576014, 20)

year
2014    584172
2015    584172
2016    584172
2017    584172
2018    584172
2019    584172
2020    584172
2021    486810
dtype: int64

#### Split the dataset based on time to avoid leakage

In [7]:
train = X[X.year < 2019].dropna()
valid = X[(X.year >= 2019) & (X.year < 2021)]
test = X[X.year == 2021]

train.to_csv('australia_fire_train.csv', index=False)
valid.to_csv('australia_fire_valid.csv', index=False)
test.to_csv('australia_fire_test.csv', index=False)

In [8]:
train.groupby('year').size()
valid.groupby('year').size()
test.groupby('year').size()

year
2014    584172
2015    584172
2016    584172
2017    584172
2018    584172
dtype: int64

year
2019    584172
2020    584172
dtype: int64

year
2021    486810
dtype: int64

In [9]:
train.shape, valid.shape, test.shape
train.head()

((2920860, 20), (1168344, 20), (486810, 20))

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,st_bin,near_st_lat,near_st_lng,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,MXSPD_MAX
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0,2346,-39.88,143.88,66.9,61.483871,56.051613,51.474194,21.0
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88,61.0,55.393333,51.416667,47.066667,39.0
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88,73.4,59.26129,53.435484,45.012903,35.0
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88,76.5,62.790323,55.5,48.003226,24.1
5,-40.0,143.9,2015,4,0,0,0.166667,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88,76.5,60.836667,55.08,48.366667,25.1


In [10]:
train.fire.unique()

array([1, 0])

# Baseline Model

### Define features for a model

In [11]:
features = [
    'latitude', 'longitude', 'month',
    'fire_cnt_before', 'fire_before',
    'fire_cnt_last_year', 'fire_last_year',
    'fire_cnt_last_year_same_month', 'fire_last_year_same_month',
    'T_MAX', 'T_MAX_MEAN', 'T_MEAN', 'DEWP_MEAN', 'MXSPD_MAX'
]

### Check data types

In [12]:
train.dtypes
train.columns

latitude                         float64
longitude                        float64
year                               int64
month                              int64
fire_cnt                           int64
fire                               int64
fire_cnt_before                  float64
fire_before                      float64
fire_cnt_last_year               float64
fire_last_year                   float64
fire_cnt_last_year_same_month      int64
fire_last_year_same_month          int64
st_bin                            object
near_st_lat                      float64
near_st_lng                      float64
T_MAX                            float64
T_MAX_MEAN                       float64
T_MEAN                           float64
DEWP_MEAN                        float64
MXSPD_MAX                        float64
dtype: object

Index(['latitude', 'longitude', 'year', 'month', 'fire_cnt', 'fire',
       'fire_cnt_before', 'fire_before', 'fire_cnt_last_year',
       'fire_last_year', 'fire_cnt_last_year_same_month',
       'fire_last_year_same_month', 'st_bin', 'near_st_lat', 'near_st_lng',
       'T_MAX', 'T_MAX_MEAN', 'T_MEAN', 'DEWP_MEAN', 'MXSPD_MAX'],
      dtype='object')

### Initialize datasets

In [13]:
train_data = lgb.Dataset(train[features], label=train.fire)
valid_data = lgb.Dataset(valid[features], label=valid.fire)

In [14]:
parameters = {'num_leaves': 10, 'max_depth': 8, 'objective': 'binary', 'metric': 'auc'}
num_round = 500

### Train model

In [15]:
%%time

model = lgb.train(parameters, train_data, num_round, valid_sets=[valid_data],
                  early_stopping_rounds=5, verbose_eval=50)



[LightGBM] [Info] Number of positive: 212014, number of negative: 2708846
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2311
[LightGBM] [Info] Number of data points in the train set: 2920860, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.072586 -> initscore=-2.547626
[LightGBM] [Info] Start training from score -2.547626
Training until validation scores don't improve for 5 rounds
[50]	valid_0's auc: 0.926226
[100]	valid_0's auc: 0.928984
[150]	valid_0's auc: 0.930751
[200]	valid_0's auc: 0.932019
[250]	valid_0's auc: 0.932906
[300]	valid_0's auc: 0.933957
[350]	valid_0's auc: 0.934905
Early stopping, best iteration is:
[377]	valid_0's auc: 0.935241
CPU times: user 3min 15s, sys: 606 ms, total: 3min 16s
Wall time: 1min 42s


In [16]:
test_predictions = model.predict(test[features])

test_auc = metrics.roc_auc_score(test.fire, test_predictions)
print('Test auc', test_auc)

Test auc 0.959480700698683


# Save model to disk

In [17]:
model.save_model('lightgbm.txt')
print(model.params) # Check params

params_str = '\n'.join("{}:{}".format(k, v) for k, v in model.params.items())
with open("lightgbm.params", "w") as text_file:
    print(params_str, file=text_file)

<lightgbm.basic.Booster at 0x7fe6a427cfd0>

{'num_leaves': 10, 'max_depth': 8, 'objective': 'binary', 'metric': 'auc', 'num_iterations': 500, 'early_stopping_round': 5}


In [18]:
fpr, tpr, thr = metrics.roc_curve(test.fire, test_predictions)
px.line(pd.DataFrame(dict(FPR=fpr, TPR=tpr)),
        x='FPR', y='TPR', title='Fire/hotspot model performance for 2021')

## Build a Catboost model

In [19]:
%%time

print('Create Catboost model')
catb_params = {
    #"objective": "MultiClass",
    "task_type": "GPU" if gpu_enabled else "CPU",
    "custom_loss": ['AUC', 'Accuracy']
}

model = CatBoostClassifier(**catb_params)
model.fit(train[features], train.fire,
      early_stopping_rounds=20,
      eval_set=[(valid[features], valid.fire)],
      verbose=100
)
preds_valid = model.predict(test[features])
acc = accuracy_score(test.fire, preds_valid)

# Getting score for a fold model
fold_auc = roc_auc_score(test.fire, preds_valid)
print(f"Acc: {acc:.8f}, ROC AUC: {fold_auc}")

Create Catboost model
Learning rate set to 0.036137
0:	learn: 0.5833422	test: 0.5702496	best: 0.5702496 (0)	total: 117ms	remaining: 1m 57s
100:	learn: 0.1563710	test: 0.1327020	best: 0.1327020 (100)	total: 3.61s	remaining: 32.1s
200:	learn: 0.1509261	test: 0.1300983	best: 0.1300983 (200)	total: 6.89s	remaining: 27.4s
300:	learn: 0.1480485	test: 0.1288111	best: 0.1288111 (300)	total: 10.2s	remaining: 23.7s
400:	learn: 0.1460668	test: 0.1279750	best: 0.1279746 (399)	total: 13.9s	remaining: 20.7s
500:	learn: 0.1446188	test: 0.1271805	best: 0.1271805 (500)	total: 17.2s	remaining: 17.1s
600:	learn: 0.1434251	test: 0.1266794	best: 0.1266727 (598)	total: 20.5s	remaining: 13.6s
700:	learn: 0.1424926	test: 0.1262297	best: 0.1262228 (699)	total: 24.8s	remaining: 10.6s
800:	learn: 0.1416625	test: 0.1259657	best: 0.1259657 (800)	total: 28.1s	remaining: 6.98s
900:	learn: 0.1408298	test: 0.1257790	best: 0.1257712 (894)	total: 31.4s	remaining: 3.45s
999:	learn: 0.1401980	test: 0.1255789	best: 0.12557

## Build XGBClassifier classifier using TimeSeries

In [20]:
xgb_method = "gpu_hist" if gpu_enabled else "hist"
model = XGBClassifier(n_estimators=1000, random_state=42, verbosity=0, tree_method=xgb_method)

model.fit(train[features], train.fire,
         verbose = False,
         eval_set = [(valid[features], valid.fire)],
         eval_metric = "auc",
         early_stopping_rounds = 200)
preds_valid = model.predict(test[features])
print('ROC AUC score', roc_auc_score(test.fire, preds_valid))





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=2,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbosity=0)

ROC AUC score 0.6851363847154872


In [21]:
%%time

train_ds = X.dropna() #X[X.year <= 2019].dropna()
#test_ds = X[X.year >= 2020]

X_d = train_ds[features]
y_d = train_ds.fire
#X_test = test_ds[features]

lgb_params = {
    'num_iterations': 500,
    'device' : 'gpu',
    'num_leaves': 10,
    'max_depth': 8,
    'objective': 'binary',
    'metric': 'auc'
}

lgb_predictions = []
lgb_scores = []
lgb_auc = []

# 1. Our data is chronological. 1. No shuffling (lead to leakage) 
# 2. Set the old data as training set and the new data as validation set. 
# we can use TimeSeriesSplit
folds = TimeSeriesSplit(n_splits=5)

for fold, (train_idx, valid_idx) in enumerate(tqdm(folds.split(X_d))):

    print(10*"=", f"Fold={fold+1}", 10*"=")
    x_train = X_d.iloc[train_idx, :]
    x_valid = X_d.iloc[valid_idx, :]
    y_train = y_d.iloc[train_idx]
    y_valid = y_d.iloc[valid_idx]
    
    print('Create LGBMClassifier')
    model = LGBMClassifier(**lgb_params)
    model.fit(x_train, y_train,
          early_stopping_rounds=10,
          eval_set=[(x_valid, y_valid)],
          verbose=0
    )
    
    preds_valid = model.predict(x_valid)
    acc = accuracy_score(y_valid, preds_valid)
    lgb_scores.append(acc)
    
    print(f'Getting score for a fold model {fold}')
    # Getting score for a fold model
    fold_auc = roc_auc_score(y_valid, preds_valid)
    lgb_auc.append(fold_auc)
    
    print(f"Fold={fold+1}, acc: {acc:.8f}, ROC AUC: {fold_auc}")
#     test_preds = model.predict(X_test)
#     lgb_predictions.append(test_preds)
    
print(f"Mean Accuracy {np.mean(lgb_scores)}, mean auc {np.mean(lgb_auc)}")


'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.


Found `num_iterations` in params. Will use it instead of argument



Create LGBMClassifier


1it [00:22, 22.20s/it]

Getting score for a fold model 0
Fold=1, acc: 0.94660069, ROC AUC: 0.643801041868575
Create LGBMClassifier



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.


Found `num_iterations` in params. Will use it instead of argument

2it [00:26, 11.44s/it]

Getting score for a fold model 1
Fold=2, acc: 0.95578816, ROC AUC: 0.5
Create LGBMClassifier



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.


Found `num_iterations` in params. Will use it instead of argument

3it [00:50, 17.45s/it]

Getting score for a fold model 2
Fold=3, acc: 0.96343499, ROC AUC: 0.5344251003767001
Create LGBMClassifier



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.


Found `num_iterations` in params. Will use it instead of argument

4it [01:04, 16.00s/it]

Getting score for a fold model 3
Fold=4, acc: 0.95391579, ROC AUC: 0.5138101140415425
Create LGBMClassifier



'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.


Found `num_iterations` in params. Will use it instead of argument

5it [01:13, 14.66s/it]

Getting score for a fold model 4
Fold=5, acc: 0.84700965, ROC AUC: 0.5
Mean Accuracy 0.9333498542618095, mean auc 0.5384072512573634
CPU times: user 2min 9s, sys: 3.05 s, total: 2min 12s
Wall time: 1min 14s



