# Merge fire data with weather statistics

# 📚 Import Libraries

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

import os
from datetime import datetime
import glob
from tqdm import tqdm

import numpy as np

import geopandas as gpd
from shapely.geometry import Point
from shapely.ops import nearest_points

In [None]:
def reduce_memory_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest lossless dtype.

    Signed-integer columns are narrowed to the smallest of int8/16/32/64 whose
    (inclusive) range contains the column's minimum and maximum, so a column
    is never widened. float64 columns are narrowed to float32 only when every
    value survives the float64 -> float32 -> float64 round trip unchanged;
    this keeps values such as latitude/longitude from being silently altered.
    float16/float32 columns and all non-numeric columns are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting size in Mb and the percentage
            saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        if str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # For an empty column min/max are NaN, no candidate matches and
            # the column keeps its dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            values = df[col].to_numpy()
            # Out-of-range values become inf in float32 and then fail the
            # round-trip comparison, so the overflow warning is just noise.
            with np.errstate(over='ignore'):
                narrowed = values.astype(np.float32)
            if np.array_equal(narrowed.astype(np.float64), values, equal_nan=True):
                df[col] = narrowed
        # float16 / float32 columns are already compact; widening them would
        # only increase memory usage.

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard against a zero-sized frame to avoid a division by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [None]:
# Weather table; column meanings are described in the NOAA GSOD readme:
# https://www.ncei.noaa.gov/data/global-summary-of-the-day/doc/readme.txt
# NOTE(review): this path is relative while the fire table is read from
# /kaggle/input/... — confirm both locations are intended.
weather_csv_path = './wildfiredataset/aus_weather_binned_new.csv'
aus_weather = pd.read_csv(weather_csv_path)
print(aus_weather.shape)
aus_weather.head()

In [None]:
# Memory reduction is deliberately skipped here: downcasting floats to float32 corrupts the lat/lng values.
# %%time
# aus_weather = reduce_memory_usage(aus_weather)
# aus_weather.head()

In [None]:
# Fire table: read the CSV, report its (rows, columns) shape and preview it.
fire_csv_path = '/kaggle/input/wildfiredataset/aus_fires_binned_geometry_new.csv'
aus_fire = pd.read_csv(fire_csv_path)
print(aus_fire.shape)
aus_fire.head()

In [None]:
# Inspect the dtype pandas inferred for each column of the fire table.
aus_fire.dtypes

In [None]:
# reduce_memory_usage() breaks the float columns (precision is lost in the downcast), so it is not applied here.
#%%time
#aus_fire = reduce_memory_usage(aus_fire)

In [None]:
# Re-check the fire table's column dtypes and preview its first rows.
# NOTE(review): in a notebook cell only the last bare expression is rendered,
# so the dtypes line presumably shows nothing here — confirm and drop if so.
aus_fire.dtypes
aus_fire.head()

In [None]:
# Force a garbage-collection pass before the join below builds its lookup
# structures.
import gc
gc.collect()

In [None]:
# (rows, columns) of the weather table.
aus_weather.shape

In [None]:
# Preview the first rows of the weather table.
aus_weather.head()

# Assign weather parameters to fire dataset

In [None]:
# We cannot use pd.merge/concat here: it uses a lot of RAM and an OOM happens.
# Instead we build an auxiliary dict keyed by the station's (lat, lng) point.
# Each value is itself a dict mapping (year, month) to that period's weather
# readings, so attaching weather to a fire row is two O(1) lookups instead of
# a linear scan over every observation of the station.

weather_cols = ['T_MAX', 'T_MAX_MEAN', 'T_MEAN', 'DEWP_MEAN', 'MXSPD_MAX']

dict_st = {}
dict_wth = {}  # not populated in this cell; kept in case other cells reference it

# Iterate plain Python values column-wise; this avoids the per-row Series
# construction (and dtype coercion) that DataFrame.iterrows() performs.
weather_rows = zip(
    aus_weather['LATITUDE'].tolist(),
    aus_weather['LONGITUDE'].tolist(),
    aus_weather['year'].tolist(),
    aus_weather['month'].tolist(),
    *(aus_weather[col].tolist() for col in weather_cols),
)
for lat, lng, year, month, *readings in tqdm(weather_rows, total=len(aus_weather)):
    by_period = dict_st.setdefault((lat, lng), {})
    # If a station has several rows for the same (year, month), the first one
    # wins, matching the previous first-match-then-break behaviour.
    by_period.setdefault((year, month), readings)

# Readings for fire rows without a matching station/period stay missing (NaN).
no_match = [np.nan] * len(weather_cols)
matched = {col: [] for col in weather_cols}

fire_rows = zip(
    aus_fire['near_st_lat'].tolist(),
    aus_fire['near_st_lng'].tolist(),
    aus_fire['year'].tolist(),
    aus_fire['month'].tolist(),
)
for lat, lng, year, month in tqdm(fire_rows, total=len(aus_fire)):
    tpl = (lat, lng)
    observations = dict_st.get(tpl)
    if observations is None:
        # The fire row points at a station that is absent from the weather table.
        print('Something bad', tpl)
        readings = no_match
    else:
        # Find the weather readings that match this fire row's year and month.
        readings = observations.get((year, month), no_match)

    for col, value in zip(weather_cols, readings):
        matched[col].append(value)

# Assign whole columns at once (positional, one value per fire row) instead of
# writing cell by cell; unmatched rows hold NaN rather than None.
for col in weather_cols:
    aus_fire[col] = matched[col]


In [None]:
# Preview the fire table now that the weather columns have been attached.
aus_fire.head()

### Australia dataset with information about fire and weather is ready!

In [None]:
# Persist the combined fire + weather table without the index column.
# (Debug aid: aus_fire.T_MAX.unique() lists the distinct T_MAX values.)
output_csv_path = "aus_fire_final_temp.csv"
aus_fire.to_csv(output_csv_path, index=False)
print('Submission saved')

In [None]:
#print(dict_st.get((-39.88, 143.88)))