In [1]:
import os
import string
import sys

import numpy as np
import pandas as pd

import calendar
from datetime import datetime

In [2]:
%store -r fire_zones
%store -r years
%store -r months

In [3]:
# Load the raw weather and fire tables from disk.
weather_data = pd.read_csv("data/weather_data.csv")
fire_data = pd.read_csv("data/fire_data.csv")

# Empty frame declaring the columns of the combined dataset.
combined_data = pd.DataFrame(
    columns=[
        'zone',
        'date',
        'month',
        'year',
        'area_burned (HA)',
        'total_precip (mm)',
        'mean_temp (C)',
    ]
)

In [4]:
# Build one row per (zone, year, month) by pairing the first matching fire
# record with the first matching weather record.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end. Growing a frame with pd.concat inside the loop copies every earlier row
# on each iteration and concatenates against an empty frame, which pandas
# treats specially when inferring dtypes (a deprecated behaviour).
records = []
for zone in fire_zones:
    # Filter once per zone (and once per year below) rather than re-filtering
    # the full tables for every single month.
    zone_weather = weather_data[weather_data['Zone'] == zone]
    zone_fire = fire_data[fire_data['FIRE_ZONE'] == zone]
    for year in years:
        year_weather = zone_weather[zone_weather['Year'] == year]
        year_fire = zone_fire[zone_fire['FIRE_YEAR'] == year]
        for month in months:
            weather = year_weather[year_weather['Month'] == month]
            fire = year_fire[year_fire['FIRE_MONTH'] == month]

            if fire.empty or weather.empty:
                # Fail with the offending combination named instead of an
                # opaque out-of-bounds error from the positional lookup.
                missing = "fire" if fire.empty else "weather"
                raise IndexError(
                    "no {} record for zone={!r}, year={}, month={}".format(
                        missing, zone, year, month
                    )
                )

            records.append({
                'zone': zone,
                # Each row is dated on the last calendar day of its month.
                'date': datetime(year, month, calendar.monthrange(year, month)[1]),
                'month': month,
                'year': year,
                'mean_temp (C)': weather["Mean Temp (°C)"].iloc[0],
                'total_precip (mm)': weather["Total Precip (mm)"].iloc[0],
                'area_burned (HA)': fire["SIZE_HA"].iloc[0],
            })

# Rows are stored in reverse iteration order (last zone/year/month first).
# Later steps that shift values within a zone may depend on this ordering,
# so it is preserved here.
combined_data = pd.DataFrame(
    records[::-1],
    columns=[
        'zone',
        'date',
        'month',
        'year',
        'mean_temp (C)',
        'total_precip (mm)',
        'area_burned (HA)',
    ],
)

In [5]:
# Renumber the rows 0..n-1. drop=True discards the old index directly instead
# of materialising it as an "index" column that then has to be removed again.
combined_data = combined_data.reset_index(drop=True)

In [6]:
# Add lagged weather features: for every zone, the mean temperature and total
# precipitation from 1 to 11 rows earlier in time (i.e. 1 to 11 months earlier,
# assuming one row per consecutive month within each zone).
#
# The lags are computed on a view sorted oldest-first and written back by
# index label, so the result does not depend on the physical row order of
# combined_data. This relies on combined_data having a unique index.
chronological = combined_data.sort_values('date')
zone_groups = chronological.groupby(['zone'])  # built once, reused for every lag
for lag in range(1, 12):
    mean_temp_col = "mean_temp_lag" + str(lag)
    total_precip_col = "total_precip_lag" + str(lag)
    # A positive shift on chronologically ordered rows pulls the value from
    # `lag` rows earlier; the first `lag` rows of each zone get NaN.
    combined_data[mean_temp_col] = zone_groups['mean_temp (C)'].shift(lag)
    combined_data[total_precip_col] = zone_groups['total_precip (mm)'].shift(lag)

The fire data in the format that I require only exists from 2009 onward; the 2008 weather data was included solely to create lagged features and is therefore no longer required.

In [7]:
# Remove the rows dated before 2009, then discard the helper 'year' column.
is_pre_2009 = combined_data['year'] < 2009
combined_data = combined_data.loc[~is_pre_2009].drop(columns=['year'])

In [8]:
# Number of rows remaining in the dataset.
print(len(combined_data))

5376


In [9]:
# Number of rows that have no missing value in any column.
complete_rows = combined_data.dropna()
print(len(complete_rows))

4614


Unfortunately, there are a lot of missing values in the weather data I was able to find and download. About 14.2% of my rows (762 of 5,376) contain at least one missing value; my understanding is that this is simply something one has to deal with when working with weather data. If I had more time, I would do more research to find a way to fill in the missing values, or acquire my data from a source with more consistent coverage.

One very notable thing about the missing values is that they tend to be clustered together (e.g., the first 6 months of 2013 in fire zone 'x' are missing, as opposed to missing values being scattered randomly). Because of this, I am choosing to throw away all rows with missing values rather than impute them. My reasoning is that most of the columns are lagged temperature and precipitation amounts and the missing values are temporally close, so a row affected by missing values would have most of its columns missing; imputing such rows would be either ineffectual or detrimental.

In [10]:
# Discard every row that has at least one missing value in any column.
combined_data = combined_data.dropna(axis=0, how="any")

In [11]:
# Persist the cleaned dataset. The row index is written as the first
# (unnamed) CSV column.
combined_data.to_csv("data/combined_data.csv", index=True)