In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import random
InteractiveShell.ast_node_interactivity = "all"
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import glob
from tqdm import tqdm

import plotly.figure_factory as ff
import plotly.express as px
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb

from sklearn.metrics import accuracy_score
from scipy.stats import mode
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Read the Mapbox access token from the environment so the credential is not
# committed with the notebook. Export MAPBOX_TOKEN before starting the kernel.
MAPBOX_TOKEN = os.environ.get('MAPBOX_TOKEN', '').strip()
if not MAPBOX_TOKEN:
    # Fail early with an actionable message instead of a broken map later on.
    raise RuntimeError(
        "MAPBOX_TOKEN environment variable is not set; "
        "it is required for the Mapbox-based maps in this notebook."
    )
px.set_mapbox_access_token(MAPBOX_TOKEN)

### Training period 2013-2020

We zoomed in on Australia and used aggressive aggregation to build a simple baseline prediction model.
* Temporal resolution: Monthly
* Spatial resolution: coordinates binned to 1 decimal place of a degree (0.1° ≈ 10 km grid)
* Binary Target: At least two fire readings

In [None]:
# Load the binned fire records from the Kaggle input dataset, then show the
# frame's dimensions and its first rows.
fires_csv_path = '/kaggle/input/wildfiredataset/aus_fires_binned_geometry_new.csv'
aus_fires = pd.read_csv(fires_csv_path)
aus_fires.shape
aus_fires.head()

In [None]:
# Load the binned weather records from the Kaggle input dataset, then show the
# frame's dimensions and its first rows.
weather_csv_path = '/kaggle/input/wildfiredataset/aus_weather_binned_new.csv'
aus_weather = pd.read_csv(weather_csv_path)
aus_weather.shape
aus_weather.head()

#### Split the dataset based on time to avoid leakage

# Here is the weather data:

* temp: Mean temperature for the day in degrees Fahrenheit to tenths.
* max: Maximum temperature reported during the day in Fahrenheit to tenths--time of max temp report varies by country and region, so this will sometimes not be the max for the calendar day.
* min: Minimum temperature reported during the day in Fahrenheit to tenths--time of min temp report varies by country and region, so this will sometimes not be the min for the calendar day.
* stp: Mean station pressure for the day in millibars to tenths.
* slp: Mean sea level pressure for the day in millibars to tenths.
* dewp: Mean dew point for the day in degrees Fahrenheit to tenths.
* wdsp: Mean wind speed for the day in knots to tenths.
* prcp: Total precipitation (rain and/or melted snow) reported during the day in inches and hundredths; will usually not end with the midnight observation--i.e., may include latter part of previous day. .00 indicates no measurable precipitation (includes a trace).
* fog: Indicator (1 = yes, 0 = no/not reported) for the occurrence of fog during the day.

In [None]:
# Time-based split on the `year` column:
#   train -> years before 2019 (rows with any missing value are dropped)
#   valid -> 2019 and 2020 (missing values are kept)
#   test  -> 2021 (missing values are kept)
# NOTE(review): `X` is not defined in any cell visible above this one --
# confirm it is built before this cell runs on a fresh kernel.
train = X.loc[X['year'] < 2019].dropna()
valid = X.loc[(X['year'] >= 2019) & (X['year'] < 2021)]
test = X.loc[X['year'] == 2021]

### Total fire records

In [None]:
# Sum every column of the pre-2021 fire records per (year, month) pair.
# Despite its name, `yearly_fires` holds one row per year-month combination.
yearly_fires = (
    aus_fires
    .loc[lambda fires: fires['year'] < 2021]
    .groupby(['year', 'month'])
    .sum()
    .reset_index()
)
yearly_fires.head()

In [None]:
# # how many wildfires per day
# px.bar(yearly_fires, x='day', y='fire_cnt', color='year',
#        title='Hotspot detections by Day in Australia')

In [None]:
# Bar chart of fire counts per calendar month, coloured by year.
px.bar(
    data_frame=yearly_fires,
    x='month',
    y='fire_cnt',
    color='year',
    title='Hotspot detections by Month in Australia',
)

In [None]:
# Bar chart of fire counts per year, coloured by calendar month.
px.bar(
    data_frame=yearly_fires,
    x='year',
    y='fire_cnt',
    color='month',
    title='Hotspot detections by Year in Australia',
)

In [None]:
# Collapse the fire records to one row per (latitude, longitude) cell by
# summing every other column, then show the result's dimensions and first rows.
geo = (
    aus_fires
    .groupby(['latitude', 'longitude'])
    .sum()
    .reset_index()
)
geo.shape
geo.head()

In [None]:
# Hexagonal-bin map of the per-cell fire counts in `geo`; each hexagon is
# coloured by the sum of `fire_cnt` over the points that fall inside it.
fig = ff.create_hexbin_mapbox(
    data_frame=geo,
    lat="latitude",
    lon="longitude",
    nx_hexagon=50,
    opacity=0.9,
    labels={"color": "Hotspot records"},
    color="fire_cnt",
    agg_func=np.sum,
    color_continuous_scale="Reds",
)
fig.show()

# Baseline Model

In [None]:
# Column names used as inputs to the baseline model, grouped by kind.
features = (
    # grid-cell position and calendar month
    ['latitude', 'longitude', 'month']
    # "before" fire columns
    + ['fire_cnt_before', 'fire_before']
    # "last year" fire columns
    + ['fire_cnt_last_year', 'fire_last_year']
    # "last year, same month" fire columns
    + ['fire_cnt_last_year_same_month', 'fire_last_year_same_month']
)

In [None]:
# Preview the first five rows of the training split.
train.head(n=5)

In [None]:
# Correlation heatmap of the baseline features on the training split.
# `plt` and `sns` come from the notebook's top import cell rather than being
# re-imported here. The figure/axes and the heatmap's return value are bound
# to names so that, with `ast_node_interactivity = "all"`, their reprs are not
# echoed as extra cell output.
corr_fig, corr_ax = plt.subplots(figsize=(20, 10))
corr_ax = sns.heatmap(
    train[features].corr(),  # pairwise correlation (pandas default method)
    annot=True,
    square=True,
    ax=corr_ax,
)
corr_ax.set_title('Feature correlation (training split)')
plt.show()