# 📚 Import Libraries

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

%matplotlib inline
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import random
InteractiveShell.ast_node_interactivity = "all"
import os
from datetime import datetime
import matplotlib.pyplot as plt
import plotly.express as px
import glob
from tqdm import tqdm

import plotly.figure_factory as ff
import plotly.express as px
import numpy as np

import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points

  shapely_geos_version, geos_capi_version_string


### Use mapbox for visualization

In [2]:
# use your own mapbox token here to visualize
MAPBOX_TOKEN = 'pk.eyJ1IjoiaWdsYXdlYiIsImEiOiJja3picmk5NmsyaDZxMndtenYyOWhvNmtnIn0.Dxi29pChSrUbePq_oZ1rTw'
px.set_mapbox_access_token(MAPBOX_TOKEN)

In [3]:
# data description https://www.ncei.noaa.gov/data/global-summary-of-the-day/doc/readme.txt
WORK_DIR = './'
aus_fire = pd.read_csv(WORK_DIR + '/wildfiredataset/australia_fire_total_ready.csv')
aus_fire.shape
aus_fire.head()

(4576014, 12)

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0
4,-40.0,143.9,2020,6,1,0,0.083333,0.0,0.0,0.0,0,0


In [4]:
aus_fire.latitude.min(), aus_fire.latitude.max()
aus_fire.longitude.min(), aus_fire.longitude.max()

(-40.0, -9.0)

(112.0, 154.7)

In [5]:
# data description https://www.ncei.noaa.gov/data/global-summary-of-the-day/doc/readme.txt
aus_weather = pd.read_csv(WORK_DIR + '/wildfiredataset/australia_weather_full2.csv', parse_dates=['DATE'])
aus_weather.shape
aus_weather.head()

(1479377, 19)

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,95658099999,2013-01-01,-30.483333,136.883333,100.0,"OLYMPIC DAM AERODROME, AS",91.3,17.0,1012.0,0.6,999.9,8.4,13.0,999.9,108.5,61.5,0.0,999.9,0
1,95658099999,2013-01-02,-30.483333,136.883333,100.0,"OLYMPIC DAM AERODROME, AS",91.9,18.2,1010.9,999.5,999.9,7.9,14.0,999.9,107.2,70.3,0.0,999.9,0
2,95658099999,2013-01-03,-30.483333,136.883333,100.0,"OLYMPIC DAM AERODROME, AS",95.2,8.3,1006.0,994.7,999.9,7.6,12.0,999.9,109.6,68.2,0.0,999.9,0
3,95658099999,2013-01-04,-30.483333,136.883333,100.0,"OLYMPIC DAM AERODROME, AS",94.6,25.3,1004.2,992.8,999.9,7.0,14.0,999.9,115.9,69.6,0.0,999.9,0
4,95658099999,2013-01-05,-30.483333,136.883333,100.0,"OLYMPIC DAM AERODROME, AS",91.1,43.6,1011.3,999.9,999.9,14.3,16.9,999.9,111.4,73.4,0.0,999.9,0


# Set precision for geo coordinates

In [6]:
# http://wiki.gis.com/wiki/index.php/Decimal_degrees
PRECISION = 2 # 2 places - 1 km
aus_weather.LATITUDE = aus_weather.LATITUDE.astype(float).round(PRECISION)
aus_weather.LONGITUDE = aus_weather.LONGITUDE.astype(float).round(PRECISION)
aus_weather.head()

Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,95658099999,2013-01-01,-30.48,136.88,100.0,"OLYMPIC DAM AERODROME, AS",91.3,17.0,1012.0,0.6,999.9,8.4,13.0,999.9,108.5,61.5,0.0,999.9,0
1,95658099999,2013-01-02,-30.48,136.88,100.0,"OLYMPIC DAM AERODROME, AS",91.9,18.2,1010.9,999.5,999.9,7.9,14.0,999.9,107.2,70.3,0.0,999.9,0
2,95658099999,2013-01-03,-30.48,136.88,100.0,"OLYMPIC DAM AERODROME, AS",95.2,8.3,1006.0,994.7,999.9,7.6,12.0,999.9,109.6,68.2,0.0,999.9,0
3,95658099999,2013-01-04,-30.48,136.88,100.0,"OLYMPIC DAM AERODROME, AS",94.6,25.3,1004.2,992.8,999.9,7.0,14.0,999.9,115.9,69.6,0.0,999.9,0
4,95658099999,2013-01-05,-30.48,136.88,100.0,"OLYMPIC DAM AERODROME, AS",91.1,43.6,1011.3,999.9,999.9,14.3,16.9,999.9,111.4,73.4,0.0,999.9,0


# Analyze duplicates

In [7]:
# Selecting duplicate rows based
# on list of column names
aus_weather_d = aus_weather[aus_weather.duplicated(['DATE', 'LATITUDE', 'LONGITUDE'])]
# df.drop_duplicates(subset=['DATE', 'LATITUDE', 'LONGITUDE'], keep=False, inplace=True)
print(aus_weather_d.shape)
aus_weather_d.head()

(12993, 19)


Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,DEWP,SLP,STP,VISIB,WDSP,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
69783,95719099999,2013-01-01,-32.22,148.57,285.0,"DUBBO AIRPORT AWS, AS",82.6,38.1,1009.6,977.4,999.9,7.2,15.9,999.9,103.3,56.8,0.0,999.9,0
69784,95719099999,2013-01-02,-32.22,148.57,285.0,"DUBBO AIRPORT AWS, AS",80.9,42.8,1011.1,978.9,999.9,11.4,15.9,999.9,100.9,56.3,0.0,999.9,0
69785,95719099999,2013-01-03,-32.22,148.57,285.0,"DUBBO AIRPORT AWS, AS",79.4,55.8,1011.7,979.5,999.9,11.9,18.1,999.9,97.3,64.8,0.0,999.9,0
69786,95719099999,2013-01-04,-32.22,148.57,285.0,"DUBBO AIRPORT AWS, AS",88.0,55.3,1010.7,978.5,999.9,8.9,14.0,999.9,101.7,64.8,0.0,999.9,0
69787,95719099999,2013-01-05,-32.22,148.57,285.0,"DUBBO AIRPORT AWS, AS",89.8,53.7,1013.6,981.3,999.9,7.4,13.0,999.9,102.9,71.2,0.0,999.9,0


In [8]:
print('Number of unique stations', len(aus_weather['STATION'].unique()))

Number of unique stations 595


In [9]:
print('Lat range', aus_weather['LATITUDE'].min(), aus_weather['LATITUDE'].max())
print('Lng range', aus_weather['LONGITUDE'].min(), aus_weather['LONGITUDE'].max())

Lat range -39.88 -10.05
Lng range 113.52 154.4


# Remove stations that have less than 108 measurements

In [10]:
aus_weather['year'] = aus_weather.DATE.dt.year
aus_weather['month'] = aus_weather.DATE.dt.month
#aus_weather.latitude = aus_fires.latitude.round(PRECISION)
#aus_weather.longitude = aus_fires.longitude.round(PRECISION)
#fires = aus_weather.groupby(['STATION', 'LATITUDE', 'LONGITUDE', 'year', 'month']).size().reset_index()

#aus_wth_agg = aus_weather.groupby(['STATION', 'LATITUDE', 'LONGITUDE', 'year', 'month']) \
 #   .agg({'TEMP':'mean','DEWP':'sum', 'WDSP':'sum', 'MAX':'max', 'MIN': 'min'}) \
  #  .reset_index()

aus_wth_agg = aus_weather.groupby(['STATION', 'LATITUDE', 'LONGITUDE', 'year', 'month']) \
    .agg(T_MAX=('MAX', 'max'), T_MAX_MEAN=('MAX', 'mean'), T_MEAN=('TEMP', 'mean'), \
         DEWP_MEAN=('DEWP', 'mean'), WDSP_MEAN=('WDSP', 'mean'), \
         MXSPD_MAX=('MXSPD', 'max')) \
    .reset_index()

#fires.columns = ['latitude', 'longitude', 'year', 'month', 'temp_avg']

aus_wth_agg.shape
aus_wth_agg.head()
aus_wth_agg.nunique()

(49924, 11)

Unnamed: 0,STATION,LATITUDE,LONGITUDE,year,month,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,WDSP_MEAN,MXSPD_MAX
0,94100099999,-14.3,126.63,2013,1,98.2,92.912903,83.251613,73.677419,4.151613,15.9
1,94100099999,-14.3,126.63,2013,2,98.8,92.392857,81.314286,72.925,3.271429,18.1
2,94100099999,-14.3,126.63,2013,3,98.6,94.16129,81.209677,72.329032,3.022581,11.1
3,94100099999,-14.3,126.63,2013,4,100.2,94.876667,79.353333,64.23,3.613333,11.1
4,94100099999,-14.3,126.63,2013,5,98.4,91.987097,78.706452,64.177419,3.748387,12.0


STATION         595
LATITUDE        481
LONGITUDE       506
year              9
month            12
T_MAX           470
T_MAX_MEAN    28563
T_MEAN        27248
DEWP_MEAN     25790
WDSP_MEAN     13494
MXSPD_MAX       116
dtype: int64

# Detect stations and its number of observations

In [11]:
st_cnt_sample = aus_wth_agg.groupby(['STATION','LATITUDE','LONGITUDE']).size().reset_index().rename(columns={0:'count'})

st_cnt_sample = st_cnt_sample.reset_index()
st_cnt_sample = st_cnt_sample.rename(columns={"index":"STAT_ID"})
st_cnt_sample['STAT_ID'] = st_cnt_sample.index + 1

st_cnt_sample.shape
print('Max observations', st_cnt_sample['count'].max())
print('Min observations', st_cnt_sample['count'].min())
#st_cnt_sample.head()
# how many stations have only one observation
st_cnt_sample[st_cnt_sample['count'] == 1].shape
st_cnt_sample.head()

(595, 5)

Max observations 108
Min observations 1


(78, 5)

Unnamed: 0,STAT_ID,STATION,LATITUDE,LONGITUDE,count
0,1,94100099999,-14.3,126.63,108
1,2,94102099999,-13.75,126.15,108
2,3,94103099999,-14.12,123.53,91
3,4,94105099999,-12.62,131.05,92
4,5,94106099999,-14.72,134.75,92


In [12]:
# remote stations that do not have 12 * 9 = 108 month data (up to december 2021)
st_cnt_sample_108 = st_cnt_sample[st_cnt_sample['count'] >= 108]
st_cnt_sample_108.shape

(393, 5)

In [13]:
# drop non relevant stations
st_cnt_sample_del = st_cnt_sample[st_cnt_sample['count'] < 108]
st_cnt_sample = st_cnt_sample.drop(st_cnt_sample_del.index)

st_cnt_sample.shape
st_cnt_sample_del.head()

(393, 5)

Unnamed: 0,STAT_ID,STATION,LATITUDE,LONGITUDE,count
2,3,94103099999,-14.12,123.53,91
3,4,94105099999,-12.62,131.05,92
4,5,94106099999,-14.72,134.75,92
5,6,94108099999,-11.17,132.48,90
6,7,94109099999,-11.55,132.93,92


In [14]:
# remove stations that have less count observations
print('Remove stations', len(st_cnt_sample_del['STATION'].unique()))

# remove stations that are in st_cnt_sample_del
cond = aus_wth_agg['STATION'].isin(st_cnt_sample_del['STATION'])
aus_wth_agg.drop(aus_wth_agg[cond].index, inplace = True)

aus_wth_agg.shape
aus_wth_agg.head()

Remove stations 202


(42444, 11)

Unnamed: 0,STATION,LATITUDE,LONGITUDE,year,month,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,WDSP_MEAN,MXSPD_MAX
0,94100099999,-14.3,126.63,2013,1,98.2,92.912903,83.251613,73.677419,4.151613,15.9
1,94100099999,-14.3,126.63,2013,2,98.8,92.392857,81.314286,72.925,3.271429,18.1
2,94100099999,-14.3,126.63,2013,3,98.6,94.16129,81.209677,72.329032,3.022581,11.1
3,94100099999,-14.3,126.63,2013,4,100.2,94.876667,79.353333,64.23,3.613333,11.1
4,94100099999,-14.3,126.63,2013,5,98.4,91.987097,78.706452,64.177419,3.748387,12.0


In [15]:
st_cnt_sample.shape
aus_wth_agg[aus_wth_agg['STATION'] == 94100099999].head(2)

(393, 5)

Unnamed: 0,STATION,LATITUDE,LONGITUDE,year,month,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,WDSP_MEAN,MXSPD_MAX
0,94100099999,-14.3,126.63,2013,1,98.2,92.912903,83.251613,73.677419,4.151613,15.9
1,94100099999,-14.3,126.63,2013,2,98.8,92.392857,81.314286,72.925,3.271429,18.1


# Check duplicates again

In [16]:
# Selecting duplicate rows based
# on list of column names
# sort it first to remove rows relevant to same stations
aus_wth_agg.sort_values(by=['STATION'], ascending=True, inplace=True)
aus_wth_agg_d = aus_wth_agg[aus_wth_agg.duplicated(['year', 'month', 'LATITUDE', 'LONGITUDE'], keep=False)]

print('Before delete', aus_wth_agg.shape)
aus_wth_agg.drop_duplicates(subset=['year', 'month', 'LATITUDE', 'LONGITUDE'], keep='first', inplace=True)
print('After delete', aus_wth_agg.shape)

#aus_weather_d.head()

test = aus_wth_agg_d[(aus_wth_agg_d['year'] == 2013) & (aus_wth_agg_d['month'] == 1) & \
                     (aus_wth_agg_d['LATITUDE'] == -32.22)]
test.head()

Before delete (42444, 11)
After delete (42012, 11)


Unnamed: 0,STATION,LATITUDE,LONGITUDE,year,month,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,WDSP_MEAN,MXSPD_MAX
22100,94719099999,-32.22,148.57,2013,1,111.2,96.916129,82.054839,54.129032,9.912903,35.0
43790,95719099999,-32.22,148.57,2013,1,113.0,98.487097,81.877419,54.083871,9.825806,22.9


In [17]:
# remove stations that have less count observations
print('Stations', len(st_cnt_sample['STATION'].unique()))

# remove stations that are in st_cnt_sample_del
cond = ~st_cnt_sample['STATION'].isin(aus_wth_agg['STATION'])
st_cnt_sample.drop(st_cnt_sample[cond].index, inplace = True)

st_cnt_sample.shape
st_cnt_sample.head()

Stations 393


(389, 5)

Unnamed: 0,STAT_ID,STATION,LATITUDE,LONGITUDE,count
0,1,94100099999,-14.3,126.63,108
1,2,94102099999,-13.75,126.15,108
10,11,94119099999,-11.4,130.42,108
11,12,94120099999,-12.41,130.88,108
12,13,94122099999,-11.78,130.02,108


In [18]:
import plotly.express as px
import geopandas as gpd

fig = px.scatter_geo(st_cnt_sample,
                    lat=st_cnt_sample.LATITUDE,
                    lon=st_cnt_sample.LONGITUDE,
                    hover_name="STAT_ID")
fig.show()

In [19]:
st_cnt_sample_np = st_cnt_sample[['LATITUDE', 'LONGITUDE']].to_numpy()
st_cnt_sample_np[:10]

array([[-14.3 , 126.63],
       [-13.75, 126.15],
       [-11.4 , 130.42],
       [-12.41, 130.88],
       [-11.78, 130.02],
       [-13.05, 131.02],
       [-13.83, 131.18],
       [-14.95, 130.8 ],
       [-14.52, 132.38],
       [-11.05, 132.98]])

In [20]:
coords = st_cnt_sample_np
kms_per_radian = 6371.0088
epsilon = 50 / kms_per_radian
db = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))

Number of clusters: 226


In [21]:
kms_per_radian = 6371.0088

AUS_LAT_RANGE = (-40, -9)
AUS_LON_RANGE = (112, 154.7)

AUS_LAT_RANGE_R = (9, 40)
AUS_LON_RANGE_R = (112, 154.7)

bottomLeft = (AUS_LAT_RANGE_R[1], AUS_LON_RANGE[0])
bottomRight = (AUS_LAT_RANGE_R[1], AUS_LON_RANGE[1])
topLeft = (AUS_LAT_RANGE_R[0], AUS_LON_RANGE[0])
topRight = (AUS_LAT_RANGE_R[0], AUS_LON_RANGE[1])

In [22]:
# remove non relevent weather data
print('Before', aus_weather.shape)

aus_weather = aus_weather[
  (aus_weather.LATITUDE <= AUS_LAT_RANGE[1]) & (aus_weather.LATITUDE >= AUS_LAT_RANGE[0])]
aus_weather = aus_weather[
    (aus_weather.LONGITUDE <= AUS_LON_RANGE[1]) & (aus_weather.LONGITUDE >= AUS_LON_RANGE[0])]

print('After', aus_weather.shape)

Before (1479377, 21)
After (1479377, 21)


In [23]:
from geopy.distance import geodesic

coords_1 = (AUS_LAT_RANGE[1], AUS_LON_RANGE[0])
coords_2 = (AUS_LAT_RANGE[0], AUS_LON_RANGE[0])
lat_dist = geodesic(coords_1, coords_2).km
print('Lat dist', lat_dist)

coords_1 = (AUS_LAT_RANGE[1], AUS_LON_RANGE[0])
coords_2 = (AUS_LAT_RANGE[1], AUS_LON_RANGE[1])
lng_dist = geodesic(coords_1, coords_2).km
print('Lng dist', lng_dist)

#coords_1 = (-9, 40)
#coords_2 = (-9, 41)
#lng_dist = geodesic(coords_1, coords_2).km
#print('Lng dist', lng_dist)

Lat dist 3434.2787529548777
Lng dist 4692.393446544515


## Bins and weather stations

We cannot process entire dataset and match weather stations due to platform memory limits. The datasets of fire and weather records is relatively big. The idea that I came up with is to divide the Australia mainland into grid cells (see bins here) and assign a weather station located in this bin to every fire record that is inside this bin. This approach helps us not to iterate through all combinations of fire records and weather stations, and save memory and computations resources.

In [24]:
cnt_rows = lat_dist / 100
cnt_cols = lng_dist / 100

rows = np.linspace(bottomLeft[1], bottomRight[1], num=int(cnt_rows))
cols = np.linspace(topLeft[0], bottomLeft[0], num=int(cnt_cols))

cols_gap = abs(cols[1] - cols[0])
rows_gap = abs(rows[1] - rows[0])
print('Cols cnt', len(cols))
print('Rows cnt', len(rows))
print('Cols gap', cols_gap, 'Rows gap', rows_gap)
print(rows)
print(cols)

print(np.linspace(1, 10, num=3))

def detect_bin(lat, lng):
    col_lng = (int)((lng - AUS_LON_RANGE_R[0]) / cols_gap)
    col_lat = (int)((abs(lat) - AUS_LAT_RANGE_R[0]) / rows_gap)
    return f'{col_lat},{col_lng}'

Cols cnt 46
Rows cnt 34
Cols gap 0.6888888888888882 Rows gap 1.2939393939393966
[112.         113.29393939 114.58787879 115.88181818 117.17575758
 118.46969697 119.76363636 121.05757576 122.35151515 123.64545455
 124.93939394 126.23333333 127.52727273 128.82121212 130.11515152
 131.40909091 132.7030303  133.9969697  135.29090909 136.58484848
 137.87878788 139.17272727 140.46666667 141.76060606 143.05454545
 144.34848485 145.64242424 146.93636364 148.23030303 149.52424242
 150.81818182 152.11212121 153.40606061 154.7       ]
[ 9.          9.68888889 10.37777778 11.06666667 11.75555556 12.44444444
 13.13333333 13.82222222 14.51111111 15.2        15.88888889 16.57777778
 17.26666667 17.95555556 18.64444444 19.33333333 20.02222222 20.71111111
 21.4        22.08888889 22.77777778 23.46666667 24.15555556 24.84444444
 25.53333333 26.22222222 26.91111111 27.6        28.28888889 28.97777778
 29.66666667 30.35555556 31.04444444 31.73333333 32.42222222 33.11111111
 33.8        34.48888889 35.1777

In [25]:
# check stations inside bin
non_null_first = 0
dict_w = {}
for idx1, r in enumerate(tqdm(rows)):
    for idx2, c in enumerate(tqdm(cols)):
        if idx1 == 0 or idx2 == 0: continue
        #print(r, c)
        col_lat1 = cols[idx2 - 1]
        col_lat2 = cols[idx2]
        rows1_lng = rows[idx1 - 1]
        rows2_lng = rows[idx1]
        
        #if idx1 < 3 and idx2 < 3:
         #   print(-col_lat1, -col_lat2)
         #   print(rows1_lng, rows2_lng)
        
        aus_1 = st_cnt_sample[
          (st_cnt_sample.LATITUDE <= -col_lat1) & (st_cnt_sample.LATITUDE >= -col_lat2)]
        aus_1 = aus_1[
            (aus_1.LONGITUDE <= rows2_lng) & (aus_1.LONGITUDE >= rows1_lng)]
        
        if non_null_first == 0 and len(aus_1.index) > 0:
            print('First ', (idx1, idx2), (r, c), len(aus_1.index))
            non_null_first = len(aus_1.index)
        dict_w[(idx1, idx2)] = len(aus_1.index)

print(max(dict_w, key=dict_w.get))        
print(dict_w)

  0%|          | 0/34 [00:00<?, ?it/s]
100%|██████████| 46/46 [00:00<00:00, 188232.18it/s]

100%|██████████| 46/46 [00:00<00:00, 845.37it/s]

100%|██████████| 46/46 [00:00<00:00, 856.58it/s]
  9%|▉         | 3/34 [00:00<00:01, 24.76it/s]

First  (2, 20) (114.5878787878788, 22.77777777777778) 1



100%|██████████| 46/46 [00:00<00:00, 863.90it/s]

100%|██████████| 46/46 [00:00<00:00, 826.14it/s]

100%|██████████| 46/46 [00:00<00:00, 842.17it/s]
 18%|█▊        | 6/34 [00:00<00:01, 19.43it/s]
100%|██████████| 46/46 [00:00<00:00, 672.49it/s]

100%|██████████| 46/46 [00:00<00:00, 611.50it/s]

100%|██████████| 46/46 [00:00<00:00, 723.85it/s]
 26%|██▋       | 9/34 [00:00<00:01, 16.07it/s]
100%|██████████| 46/46 [00:00<00:00, 624.12it/s]

100%|██████████| 46/46 [00:00<00:00, 618.80it/s]
 32%|███▏      | 11/34 [00:00<00:01, 14.75it/s]
100%|██████████| 46/46 [00:00<00:00, 644.79it/s]

100%|██████████| 46/46 [00:00<00:00, 816.91it/s]
 38%|███▊      | 13/34 [00:00<00:01, 14.56it/s]
100%|██████████| 46/46 [00:00<00:00, 784.40it/s]

100%|██████████| 46/46 [00:00<00:00, 676.78it/s]
 44%|████▍     | 15/34 [00:00<00:01, 14.60it/s]
100%|██████████| 46/46 [00:00<00:00, 820.26it/s]

100%|██████████| 46/46 [00:00<00:00, 821.00it/s]
 50%|█████     | 17/34 [00:01<00:01, 14.99it/s]
100%|██████████| 46

(21, 38)
{(1, 1): 0, (1, 2): 0, (1, 3): 0, (1, 4): 0, (1, 5): 0, (1, 6): 0, (1, 7): 0, (1, 8): 0, (1, 9): 0, (1, 10): 0, (1, 11): 0, (1, 12): 0, (1, 13): 0, (1, 14): 0, (1, 15): 0, (1, 16): 0, (1, 17): 0, (1, 18): 0, (1, 19): 0, (1, 20): 0, (1, 21): 0, (1, 22): 0, (1, 23): 0, (1, 24): 0, (1, 25): 0, (1, 26): 0, (1, 27): 0, (1, 28): 0, (1, 29): 0, (1, 30): 0, (1, 31): 0, (1, 32): 0, (1, 33): 0, (1, 34): 0, (1, 35): 0, (1, 36): 0, (1, 37): 0, (1, 38): 0, (1, 39): 0, (1, 40): 0, (1, 41): 0, (1, 42): 0, (1, 43): 0, (1, 44): 0, (1, 45): 0, (2, 1): 0, (2, 2): 0, (2, 3): 0, (2, 4): 0, (2, 5): 0, (2, 6): 0, (2, 7): 0, (2, 8): 0, (2, 9): 0, (2, 10): 0, (2, 11): 0, (2, 12): 0, (2, 13): 0, (2, 14): 0, (2, 15): 0, (2, 16): 0, (2, 17): 0, (2, 18): 0, (2, 19): 0, (2, 20): 1, (2, 21): 0, (2, 22): 0, (2, 23): 0, (2, 24): 1, (2, 25): 1, (2, 26): 0, (2, 27): 0, (2, 28): 0, (2, 29): 0, (2, 30): 0, (2, 31): 0, (2, 32): 0, (2, 33): 0, (2, 34): 0, (2, 35): 0, (2, 36): 0, (2, 37): 0, (2, 38): 0, (2, 39): 0, 




In [26]:
st_cnt_sample['st_bin'] = st_cnt_sample.apply(lambda x: detect_bin(x.LATITUDE, x.LONGITUDE), axis=1)
st_cnt_sample.shape
st_cnt_sample.head()

(389, 6)

Unnamed: 0,STAT_ID,STATION,LATITUDE,LONGITUDE,count,st_bin
0,1,94100099999,-14.3,126.63,108,421
1,2,94102099999,-13.75,126.15,108,320
10,11,94119099999,-11.4,130.42,108,126
11,12,94120099999,-12.41,130.88,108,227
12,13,94122099999,-11.78,130.02,108,226


In [27]:
aus_wth_agg['st_bin'] = aus_wth_agg.apply(lambda x: detect_bin(x.LATITUDE, x.LONGITUDE), axis=1)
aus_wth_agg.head()

Unnamed: 0,STATION,LATITUDE,LONGITUDE,year,month,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,WDSP_MEAN,MXSPD_MAX,st_bin
0,94100099999,-14.3,126.63,2013,1,98.2,92.912903,83.251613,73.677419,4.151613,15.9,421
78,94100099999,-14.3,126.63,2019,7,94.1,90.822581,73.174194,47.225806,4.209677,14.0,421
77,94100099999,-14.3,126.63,2019,6,95.0,88.55,73.2,45.39,4.24,12.0,421
76,94100099999,-14.3,126.63,2019,5,98.8,92.212903,76.674194,55.309677,4.029032,13.0,421
75,94100099999,-14.3,126.63,2019,4,97.5,93.663333,81.046667,67.276667,3.376667,15.0,421


In [28]:
el = aus_wth_agg['st_bin'].unique()
len(el)
aus_wth_agg['st_bin'].values
aus_wth_agg.shape

239

array(['4,21', '4,21', '4,21', ..., '20,56', '20,56', '20,56'],
      dtype=object)

(42012, 12)

### Save binned weather data

In [29]:
aus_wth_agg.to_csv("aus_weather_binned_new.csv", index=False)
print('Binned data saved')

Binned data saved


In [30]:
aus_fires = pd.read_csv(WORK_DIR + '/wildfiredataset/australia_fire_total_ready.csv')
aus_fires.shape
aus_fires.head()

(4576014, 12)

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0
4,-40.0,143.9,2020,6,1,0,0.083333,0.0,0.0,0.0,0,0


In [31]:
aus_fires['st_bin'] = aus_fires.apply(lambda x: detect_bin(x.latitude, x.longitude), axis=1)
aus_fires.head()

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,st_bin
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0,2346
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0,2346
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0,2346
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0,2346
4,-40.0,143.9,2020,6,1,0,0.083333,0.0,0.0,0.0,0,0,2346


In [32]:
fire_st_list = aus_fires['st_bin'].unique()
len(fire_st_list)
print(fire_st_list[:10])
print(dict_w.get((31,33)))

no_st = []
ret = []
for idx1, r in enumerate (rows):
    for idx2, c in enumerate(cols):
        dict_val = dict_w.get((idx1, idx2))
        if dict_val == 0: # no station in this zone
            no_st.append((idx1, idx2))

        if f'{idx1},{idx2}' not in fire_st_list:
            ret.append(f'{idx1},{idx2}')
print('Missing zones', len(ret))

need_st = []
for st_b in tqdm(fire_st_list):
    idx1, idx2 = st_b.split(',')
    idx1 = int(idx1)
    idx2 = int(idx2)
    
    dict_val = dict_w.get((idx1, idx2))
    if dict_val == 0: # no station in this zone
        need_st.append((idx1, idx2))


print('No weather stations', len(need_st))

971

['23,46' '23,52' '23,51' '23,49' '23,50' '23,45' '22,45' '22,46' '22,49'
 '22,50']
0
Missing zones 827


100%|██████████| 971/971 [00:00<00:00, 289993.53it/s]

No weather stations 637





### We are ready to save binned fire dataset

In [33]:
aus_fires.to_csv("aus_fires_binned.csv", index=False)
print('Binned fire dataset saved')

Binned fire dataset saved


### Find nearest weather stations for every fire record

Use GeoPandas package for this task https://geopandas.org/en/stable/docs/reference/api/geopandas.sindex.SpatialIndex.nearest.html

In [34]:
aus_wth_agg.shape
aus_wth_agg.head()

(42012, 12)

Unnamed: 0,STATION,LATITUDE,LONGITUDE,year,month,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,WDSP_MEAN,MXSPD_MAX,st_bin
0,94100099999,-14.3,126.63,2013,1,98.2,92.912903,83.251613,73.677419,4.151613,15.9,421
78,94100099999,-14.3,126.63,2019,7,94.1,90.822581,73.174194,47.225806,4.209677,14.0,421
77,94100099999,-14.3,126.63,2019,6,95.0,88.55,73.2,45.39,4.24,12.0,421
76,94100099999,-14.3,126.63,2019,5,98.8,92.212903,76.674194,55.309677,4.029032,13.0,421
75,94100099999,-14.3,126.63,2019,4,97.5,93.663333,81.046667,67.276667,3.376667,15.0,421


In [35]:
# weather stations
st_cnt_sample.shape
st_cnt_sample.head(3)

(389, 6)

Unnamed: 0,STAT_ID,STATION,LATITUDE,LONGITUDE,count,st_bin
0,1,94100099999,-14.3,126.63,108,421
1,2,94102099999,-13.75,126.15,108,320
10,11,94119099999,-11.4,130.42,108,126


In [36]:
%%time

import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points

geometry = [Point(xy) for xy in zip(aus_wth_agg.LONGITUDE, aus_wth_agg.LATITUDE)]
gdf = gpd.GeoDataFrame(aus_wth_agg, crs="EPSG:4326", geometry=geometry)
gdf.head(3)

CPU times: user 2.43 s, sys: 1.89 ms, total: 2.43 s
Wall time: 2.49 s


Unnamed: 0,STATION,LATITUDE,LONGITUDE,year,month,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,WDSP_MEAN,MXSPD_MAX,st_bin,geometry
0,94100099999,-14.3,126.63,2013,1,98.2,92.912903,83.251613,73.677419,4.151613,15.9,421,POINT (126.63000 -14.30000)
78,94100099999,-14.3,126.63,2019,7,94.1,90.822581,73.174194,47.225806,4.209677,14.0,421,POINT (126.63000 -14.30000)
77,94100099999,-14.3,126.63,2019,6,95.0,88.55,73.2,45.39,4.24,12.0,421,POINT (126.63000 -14.30000)


In [37]:
# calculate nearest stations for each station
aus_fires['nearest_geometry'] = None

multipoint = gdf.geometry.unary_union

for index, row in tqdm(aus_fires.iterrows()):
    point = Point(row.longitude, row.latitude)
    queried_geom, nearest_geom = nearest_points(point, multipoint)
    aus_fires.at[index, 'nearest_geometry'] = nearest_geom

aus_fires.head()

4576014it [17:08, 4450.85it/s]


Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,st_bin,nearest_geometry
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0,2346,POINT (143.88 -39.88)
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0,2346,POINT (143.88 -39.88)
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0,2346,POINT (143.88 -39.88)
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0,2346,POINT (143.88 -39.88)
4,-40.0,143.9,2020,6,1,0,0.083333,0.0,0.0,0.0,0,0,2346,POINT (143.88 -39.88)


### Save a dataset of fire records with information about weather stations

In [38]:
aus_fires.to_csv("aus_fires_binned_geometry.csv", index=False)
print('Fire binned geometry data saved')

Fire binned geometry data saved
