In [1]:
import datetime
import itertools
import os

from collections import defaultdict
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns

from matplotlib import pyplot as plt

In [2]:
import utils

In [3]:
# turn off interactive plotting
plt.ioff()

<matplotlib.pyplot._IoffContext at 0x7f06245ac1c0>

# Load Data

In [4]:
csv_location = Path('forest_fire_with_climate.csv')

In [5]:
figure_dir = Path('figures')

In [6]:
df = pd.read_csv(csv_location, index_col='Unnamed: 0')

Convert `Fire_Date` to datetime object

In [7]:
df.Fire_Date = pd.to_datetime(df.Fire_Date)

Convert `DISCOVERY_TIME` to time object.

In [8]:
def convert_time_as_int_to_str(time_as_int):
    if not np.isfinite(time_as_int):
        return np.nan

    hour = int(time_as_int//100)
    miniute = int(time_as_int%100)
    return f'{hour}:{miniute}'

df.DISCOVERY_TIME = pd.to_datetime(
    df.DISCOVERY_TIME.apply(convert_time_as_int_to_str),
    format='%H:%M'
)

df.CONT_TIME = pd.to_datetime(
    df.CONT_TIME.apply(convert_time_as_int_to_str),
    format='%H:%M'
)

# use use the datetime versions of the object to calculate things as
# time delta objects
time_diff = df.DISCOVERY_TIME - df.CONT_TIME
time_diff[time_diff < np.timedelta64(0)] += np.timedelta64(1,'D')
frac_num_days = time_diff /  np.timedelta64(1,'D')

df.Days_to_extinguish_fire += frac_num_days

# remove the date part and only have time
df.DISCOVERY_TIME = df.DISCOVERY_TIME.dt.time
df.CONT_TIME = df.CONT_TIME.dt.time


df_days = df.drop(columns=['DISCOVERY_TIME','CONT_TIME'])
df_days.Days_to_extinguish_fire = df.Days_to_extinguish_fire

We remove the index column

In [9]:
df = df[[cname for cname in df.columns if cname != 'index']]

In [10]:
df

Unnamed: 0,FIRE_YEAR,Fire_Date,DISCOVERY_TIME,STAT_CAUSE_DESCR,Days_to_extinguish_fire,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,...,tmax,tmax-7,tmin,tmin-7,tdmean,tdmean-7,vpdmin,vpdmin-7,vpdmax,vpdmax-7
0,2004,2004-05-12,08:45:00,Lightning,0.718750,15:30:00,0.25,A,38.933056,-120.404444,...,19.882000,15.443715,8.381,3.559000,-2.044,-1.374000,5.451,2.063715,17.632999,12.678000
1,2004,2004-06-28,16:00:00,Lightning,5.083333,14:00:00,0.10,A,38.559167,-119.913333,...,22.945999,22.963715,5.431,6.212429,-2.837,-2.422428,4.824,5.121143,23.013000,22.866573
2,2004,2004-06-28,16:00:00,Lightning,5.166667,12:00:00,0.10,A,38.559167,-119.933056,...,22.945999,22.963715,5.431,6.212429,-2.837,-2.422428,4.824,5.121143,23.013000,22.866573
3,2004,2004-06-30,18:00:00,Lightning,1.083333,16:00:00,0.10,A,38.635278,-120.103611,...,20.254999,18.632429,7.985,6.914857,0.290,-0.571857,5.021,4.411714,17.228001,15.297143
4,2004,2004-07-01,18:00:00,Lightning,1.166667,14:00:00,0.10,A,38.688333,-120.153333,...,24.375999,23.623569,8.078,7.132000,1.546,1.577143,4.900,3.987714,22.320999,21.595142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278463,2010,2010-07-26,14:44:00,Lightning,1.036806,13:51:00,81.00,C,41.508867,-120.819167,...,31.292999,32.303143,11.516,13.050858,-0.111,2.045000,6.537,7.734286,39.761002,42.106289
278464,2010,2010-07-27,18:17:00,Lightning,,NaT,1.00,B,41.684917,-122.716183,...,32.523998,33.561287,15.326,16.937572,4.183,7.629286,9.869,9.395000,41.014000,41.998859
278465,2015,2015-08-06,12:04:00,Lightning,0.686806,19:35:00,0.50,B,39.655817,-121.234567,...,36.076000,31.105860,21.862,18.520285,4.184,7.996857,18.590,9.995000,51.481998,35.001999
278466,2015,2015-06-26,17:52:00,Lightning,0.901389,20:14:00,0.05,A,40.443000,-120.664433,...,27.584000,29.078714,9.428,10.377000,-2.418,-0.871571,6.119,6.983714,32.063999,35.629711


Create the month of the fire as a feature and code the class code A-G to a number 1-7

In [11]:
df['FIRE_MONTH'] = df.Fire_Date.dt.month

df.FIRE_SIZE_CLASS = pd.Categorical(df.FIRE_SIZE_CLASS)
# df['FIRE_SIZE_CLASS_CODES'] = df.FIRE_SIZE_CLASS.cat.codes

We grab the numeric columns and remove rows containing NaNs

In [12]:
numeric_cols = [
    colname
    for colname in df.describe().columns
    if colname not in ['DISCOVERY_TIME', 'CONT_TIME']
]
nonnumeric_interested_cols = [
    'FIRE_SIZE_CLASS',
]

interested_cols = [
    *numeric_cols,
    *nonnumeric_interested_cols,
]

df_cleaned = df[interested_cols].dropna()
no_nan_entries = df[interested_cols].notna()
no_nan_rows = no_nan_entries.all(axis=1)
classes_cleaned = df[interested_cols][no_nan_rows]

We normalize the dataset

In [13]:
means = df_cleaned[numeric_cols].mean()
std_dev = df_cleaned[numeric_cols].std()

df_norm = (df_cleaned[numeric_cols] - means) / std_dev

df_norm[nonnumeric_interested_cols] = df_cleaned[nonnumeric_interested_cols]

In [14]:
cov_matrix = df_norm.cov()

In [15]:
save_formats = [
    'png',
    'pdf',
    'svg',
    'jpg',
]

In [16]:
fig1, ax1 = plt.subplots(dpi=1000)
# fig1, ax1 = plt.subplots()

sns.heatmap(
    cov_matrix,
    annot = False,
    ax=ax1,
    cbar=True,
    cmap='hot',
)
ax1.set_title('Covariance Matrix of Standarized Numeric Data Columns')

for ext in save_formats:
    fig1.savefig(figure_dir / f'covar_matrix.{ext}')

Scatter Plots of the data

In [17]:
pairplot_start_time = datetime.datetime.now()
print(pairplot_start_time)

sns.set_theme(style="ticks")
sns_plot = sns.pairplot(
    df_cleaned,
    hue="FIRE_SIZE_CLASS",
    height=10,
    aspect=1,
)

plt.tight_layout()

for ext in save_formats:
    if ext in ['svg', 'pdf']:
        # skip because file will get too large
        continue
    sns_plot.savefig(figure_dir / f'pairplot.{ext}')


pairplot_end_time = datetime.datetime.now()


print(pairplot_end_time)
pair_plot_duration = pairplot_end_time - pairplot_start_time
print(pair_plot_duration)
print(pair_plot_duration.total_seconds())

2021-12-06 04:22:47.585220
2021-12-06 04:53:16.894946
0:30:29.309726
1829.309726


In [18]:
def factor_scatter_matrix(df, factor,
        palette=None,
        scatter_matrix_kwargs = dict(),
    ):
    '''Create a scatter matrix of the variables in df, with differently colored
    points depending on the value of df[factor].
    inputs:
        df: pandas.DataFrame containing the columns to be plotted, as well 
            as factor.
        factor: string or pandas.Series. The column indicating which group 
            each row belongs to.
        palette: A list of hex codes, at least as long as the number of groups.
            If omitted, a predefined palette will be used, but it only includes
            9 groups.
        scatter_matrix_kwargs: kwargs for pandas scatter_matrix plot function
            Note that 'c' and 'diagonal=None' are being set by this function.
            docs here:
                https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.plotting.scatter_matrix.html
    
    https://stackoverflow.com/a/22976730
    '''
    import matplotlib.colors
    import numpy as np
    from pandas.plotting import scatter_matrix
    from scipy.stats import gaussian_kde

    if isinstance(factor, str):
        factor_name = factor #save off the name
        factor = df[factor] #extract column
        
        # remove from df, so it 
        # doesn't get a row and col in the plot.
        df = df.drop(factor_name,axis=1)

    classes = list(set(factor))

    if palette is None:
        palette = ['#e41a1c', '#377eb8', '#4eae4b', 
                   '#994fa1', '#ff8101', '#fdfc33', 
                   '#a8572c', '#f482be', '#999999']

    color_map = dict(zip(classes,palette))

    if len(classes) > len(palette):
        raise ValueError(
            ' '.join([
                'Too many groups for the number of colors provided',
                f'We only have {len(palette)} colors in the palette, but you have {len(classes)} groups.'
            ]),
        )

    colors = factor.apply(lambda group: color_map[group])
    
    # merge the kwargs but overwrite anything passed in.
    scatter_matrix_kwargs_ = dict(dict(figsize=(10,10), marker='o'), **scatter_matrix_kwargs)
    
    axarr = scatter_matrix(df,c=colors,diagonal=None, **scatter_matrix_kwargs_)

    fig = axarr[0][0].get_figure()
    
    # plot diagonal densityies
    for rc,colname in enumerate(df.columns):
        for group in classes:
            y = df.loc[factor == group, colname].values
            gkde = gaussian_kde(y)
            ind = np.linspace(y.min(), y.max(), 1000)
            axarr[rc][rc].plot(ind, gkde.evaluate(ind),c=color_map[group])

    # rotate labels for easier reading
    for ax in axarr.flatten():
        ax.xaxis.label.set_rotation(90)
        ax.yaxis.label.set_rotation(0)
        ax.yaxis.label.set_ha('right')

    fig.colorbar(
        ax=axarr,
        location='right',
        # shrink=0.6
    )

    return axarr, color_map

In [19]:
# scatter_matrix_start_time = datetime.datetime.now()
# print(scatter_matrix_start_time)

# axarr, cmap_dict = factor_scatter_matrix(
#     df=df_cleaned,
#     factor="FIRE_SIZE_CLASS",
#     scatter_matrix_kwargs=dict(figsize=(30,30)),
# )

# scatter_matrix_end_time = datetime.datetime.now()
# print(scatter_matrix_end_time)
# scatter_matrix_duration = scatter_matrix_end_time - scatter_matrix_start_time
# print(scatter_matrix_duration)
# print(scatter_matrix_duration.total_seconds())

# plt.tight_layout()
# plt.show()