# Merge fire data with weather statistics

# 📚 Import Libraries

In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
import os
from datetime import datetime
import glob
from tqdm import tqdm
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points

  shapely_geos_version, geos_capi_version_string


In [2]:
def reduce_memory_usage(df, verbose=True):
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
 
    return df

In [3]:
# data description https://www.ncei.noaa.gov/data/global-summary-of-the-day/doc/readme.txt
WORK_DIR = './'
aus_weather = pd.read_csv(WORK_DIR + '/wildfiredataset/aus_weather_binned_new.csv')
print(aus_weather.shape)
aus_weather.head()

(42012, 12)


Unnamed: 0,STATION,LATITUDE,LONGITUDE,year,month,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,WDSP_MEAN,MXSPD_MAX,st_bin
0,94100099999,-14.3,126.63,2013,1,98.2,92.912903,83.251613,73.677419,4.151613,15.9,421
1,94100099999,-14.3,126.63,2019,7,94.1,90.822581,73.174194,47.225806,4.209677,14.0,421
2,94100099999,-14.3,126.63,2019,6,95.0,88.55,73.2,45.39,4.24,12.0,421
3,94100099999,-14.3,126.63,2019,5,98.8,92.212903,76.674194,55.309677,4.029032,13.0,421
4,94100099999,-14.3,126.63,2019,4,97.5,93.663333,81.046667,67.276667,3.376667,15.0,421


In [4]:
# do not reduce memory usage, as we corrupt lat/lng
# %%time
# aus_weather = reduce_memory_usage(aus_weather)
# aus_weather.head()

In [5]:
aus_fire = pd.read_csv(WORK_DIR + '/wildfiredataset/aus_fires_binned_geometry_new.csv')
print(aus_fire.shape)
aus_fire.head()

(4576014, 15)


Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,st_bin,near_st_lat,near_st_lng
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0,2346,-39.88,143.88
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88
4,-40.0,143.9,2020,6,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88


In [6]:
# reduce memory usage breaks float type, do not use it
#%%time
#aus_fire = reduce_memory_usage(aus_fire)

In [7]:
aus_fire.dtypes
aus_fire.head()

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,st_bin,near_st_lat,near_st_lng
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0,2346,-39.88,143.88
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88
4,-40.0,143.9,2020,6,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88


In [8]:
import gc
gc.collect()

63

In [9]:
aus_weather.shape
aus_weather.head()

Unnamed: 0,STATION,LATITUDE,LONGITUDE,year,month,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,WDSP_MEAN,MXSPD_MAX,st_bin
0,94100099999,-14.3,126.63,2013,1,98.2,92.912903,83.251613,73.677419,4.151613,15.9,421
1,94100099999,-14.3,126.63,2019,7,94.1,90.822581,73.174194,47.225806,4.209677,14.0,421
2,94100099999,-14.3,126.63,2019,6,95.0,88.55,73.2,45.39,4.24,12.0,421
3,94100099999,-14.3,126.63,2019,5,98.8,92.212903,76.674194,55.309677,4.029032,13.0,421
4,94100099999,-14.3,126.63,2019,4,97.5,93.663333,81.046667,67.276667,3.376667,15.0,421


# Assign weather parameters to fire dataset

In [10]:
# We can not use pd.merge/concat as it uses a lof of RAM and OOM happends. 
# So it is proposed to use auxilary dict structure 
# and keep lat/lng point as a key, and a list of weather data as a value

dict_st = {}
dict_wth = {}

for index, row in tqdm(aus_weather.iterrows()):
   lat = row.LATITUDE
   lng = row.LONGITUDE
   tpl = (lat, lng)

   if tpl not in dict_st:
       dict_st[tpl] = []
   dict_st[tpl].append({'year': row['year'], 'month': row['month'], \
                        'T_MAX': row['T_MAX'], 'T_MAX_MEAN': row['T_MAX_MEAN'], \
                        'T_MEAN': row['T_MEAN'], 'DEWP_MEAN': row['DEWP_MEAN'], \
                         'MXSPD_MAX': row['MXSPD_MAX']})


aus_fire['T_MAX'] = None
aus_fire['T_MAX_MEAN'] = None
aus_fire['T_MEAN'] = None
aus_fire['DEWP_MEAN'] = None
aus_fire['MXSPD_MAX'] = None

for index, row in tqdm(aus_fire.iterrows()):
    lat = row['near_st_lat']
    lng = row['near_st_lng']
    
    #print('Coord', lat, lng)
    
    tpl = (lat, lng)
    observations = dict_st.get(tpl)
    if observations is None:
        print('Something bad', tpl)
        continue
        
    #print(observations)
    
    # find temperature that matches our fire row
    for obs in observations:
        if obs['year'] == row['year'] and obs['month'] == row['month']:
            aus_fire.at[index, 'T_MAX'] = obs['T_MAX']
            aus_fire.at[index, 'T_MAX_MEAN'] = obs['T_MAX_MEAN']
            aus_fire.at[index, 'T_MEAN'] = obs['T_MEAN']
            aus_fire.at[index, 'DEWP_MEAN'] = obs['DEWP_MEAN']
            aus_fire.at[index, 'MXSPD_MAX'] = obs['MXSPD_MAX']
            # assigned weather values, exit loop
            break

42012it [00:04, 8648.19it/s]
4576014it [40:09, 1899.19it/s]


In [11]:
aus_fire.head()

Unnamed: 0,latitude,longitude,year,month,fire_cnt,fire,fire_cnt_before,fire_before,fire_cnt_last_year,fire_last_year,fire_cnt_last_year_same_month,fire_last_year_same_month,st_bin,near_st_lat,near_st_lng,T_MAX,T_MAX_MEAN,T_MEAN,DEWP_MEAN,MXSPD_MAX
0,-40.0,143.9,2014,5,2,1,0.166667,0.083333,0.333333,0.083333,0,0,2346,-39.88,143.88,66.9,61.483871,56.051613,51.474194,21.0
1,-40.0,143.9,2015,6,2,1,0.166667,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88,61.0,55.393333,51.416667,47.066667,39.0
2,-40.0,143.9,2016,10,3,1,0.25,0.083333,0.166667,0.083333,0,0,2346,-39.88,143.88,73.4,59.26129,53.435484,45.012903,35.0
3,-40.0,143.9,2018,10,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88,76.5,62.790323,55.5,48.003226,24.1
4,-40.0,143.9,2020,6,1,0,0.083333,0.0,0.0,0.0,0,0,2346,-39.88,143.88,60.8,56.66,50.92,46.506667,31.1


### Dataset with information about fire and weather is ready!

In [12]:
aus_fire.to_csv("aus_fire_final_temp.csv", index=False)
print('Fire recordings along with weather data')

Fire recordings along with weather data


In [13]:
#print(dict_st.get((-39.88, 143.88)))