### Part 4: Advanced data analysis and EDA

#### Imports section:

In [4]:
# Note: when running in a clean environment, install any missing modules before running this cell.
from skimage import io
from PIL import Image
import copy
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KernelDensity, LocalOutlierFactor
from sklearn import preprocessing
from sklearn import svm
import sklearn
from IPython.display import display
from random import randint
import datetime as dt
import pprint
import json
import requests
import cv2
import plotly.express as px
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#### Global variables:

In [5]:
# CSV file names referenced by this part of the notebook.
# NOTE(review): the role of each file is inferred from its constant name only
# (pre-outlier data, weather-enriched data, final data, additional/outlier data)
# - confirm against the cells that read/write them.
CSV_BEFORE_OUTLIERS = "fire_history_prep.csv"
WEATHER_CSV = "fire_history_with_weather.csv"
FINAL_CSV = "fire_history_final.csv"
OUTLIERS_CSV = "fire_history_additional.csv"

# PNG image file names.
OUTLIERS_MAP = "geo_outliers.png"
USA_MAP_PNG = "USAMap.png"

In [6]:
def boxplot_outliers(df, df2, cols):
    """Draw 'Before'/'After' boxplots, one subplot per column in ``cols``.

    For every column the values of ``df[col]`` ('Before') and ``df2[col]``
    ('After') are drawn as two boxes in the same subplot, then the figure is
    shown with ``plt.show()``.

    Args:
        df: DataFrame supplying the 'Before' values.
        df2: DataFrame supplying the 'After' values.
        cols: Sequence of column names present in both frames. The descriptive
            y-axis labels below are matched to ``cols`` by position, so they
            are only meaningful when ``cols`` lists the six weather columns in
            that same order; any column beyond the sixth is labelled with its
            own name.

    Returns:
        None. Nothing is drawn when ``cols`` is empty.
    """
    cols = list(cols)
    if not cols:
        # plt.subplots cannot build a grid with zero columns.
        return

    ylabels = ['Maximum daily air temperature at 2m above ground in °C',
               'Minimum daily air temperature at 2m above ground in °C',
               'Maximum wind speed on a day in km/h',
               'Dominant wind direction °',
               'The sum of solar radiation on a given day in MJ/m²',
               'Sum of daily precipitation in mm']

    # Size the grid to the number of requested columns (2.5in per panel keeps
    # the original 15in width for six columns). squeeze=False always returns a
    # 2-D array, so a single column works the same way as several.
    fig, axes = plt.subplots(1, len(cols),
                             figsize=(max(2.5 * len(cols), 5), 7),
                             squeeze=False)

    # Create boxplots for each column
    for i, (ax, col) in enumerate(zip(axes[0], cols)):
        # Matplotlib's boxplot does not skip NaN: a column containing NaN
        # yields an empty box, so missing values are dropped first.
        ax.boxplot([df[col].dropna(), df2[col].dropna()])
        # Tick labels are set separately because the boxplot `labels=` keyword
        # is deprecated in Matplotlib 3.9 (renamed to `tick_labels`).
        ax.set_xticklabels(['Before', 'After'])
        ax.set_title(col + ' Boxplot')
        ax.set_ylabel(ylabels[i] if i < len(ylabels) else col)
        ax.grid(True)

    plt.suptitle('Weather outliers:')
    plt.tight_layout()
    plt.show()

In [7]:
def geo_scatter(df, projection_type):
    """Show a Plotly geographic scatter plot of fires, coloured by cause.

    Each row is placed at (``InitialLatitude``, ``InitialLongitude``), sized
    by ``FireDuration`` and coloured by the ``FireCause`` code translated to a
    name (1 Human, 2 Natural, 3 Unknown, 4 Undetermined). Codes outside that
    mapping become NaN in the colour series.

    Args:
        df: DataFrame with ``FireCause``, ``InitialLatitude``,
            ``InitialLongitude`` and ``FireDuration`` columns.
        projection_type: Map projection name forwarded to
            ``px.scatter_geo(projection=...)``. A falsy value (``None`` or
            ``''``) selects ``'natural earth'``, which is the projection that
            was previously always used regardless of this argument.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    cause_names = {1: 'Human', 2: 'Natural', 3: 'Unknown', 4: 'Undetermined'}
    fire_cause = df['FireCause'].map(cause_names)
    fig = px.scatter_geo(df,
                         lat=df['InitialLatitude'],
                         lon=df['InitialLongitude'],
                         color=fire_cause,  # TODO: add temperature
                         projection=projection_type or 'natural earth',
                         opacity=0.5,
                         size=df['FireDuration'])
    # The colour is categorical, so Plotly draws a legend rather than a colour
    # bar; the title therefore has to be set on the legend.
    fig.update_layout(legend_title_text='Fire Cause')
    fig.show()

In [8]:
def pie_fire_cause(df):
    """Show a pie chart of how many fires fall under each ``FireCause`` code.

    Slices are ordered by cause code. Labels are derived from the codes that
    actually occur in ``df`` (1 Human, 2 Natural, 3 Unknown, 4 Undetermined;
    any other code is labelled with its own value), so the chart stays
    correctly labelled when a cause is absent from the data. The 'Natural'
    slice is pulled out slightly for emphasis.

    Args:
        df: DataFrame with a ``FireCause`` column.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    cause_names = {1: 'Human', 2: 'Natural', 3: 'Unknown', 4: 'Undetermined'}
    counts = df['FireCause'].value_counts().sort_index()

    # Build labels/explode from the index instead of fixed four-item lists:
    # plt.pie requires them to match the number of slices.
    labels = [cause_names.get(code, str(code)) for code in counts.index]
    explode = [0.1 if label == 'Natural' else 0 for label in labels]

    plt.pie(counts,
            explode=explode,
            labels=labels,
            autopct='%1.1f%%',
            shadow=True,
            startangle=90)
    # plt.pie returns a tuple of artists, not a figure, so the chart is shown
    # through pyplot.
    plt.show()

In [9]:
def bar_fire_cause_duration(df):
    """Show a bar chart of the mean ``FireDuration`` for each fire cause.

    ``FireCause`` codes are translated to names (1 Human, 2 Natural,
    3 Unknown, 4 Undetermined) on a copy of ``df``; the input frame is left
    unmodified. Rows whose code is not in that mapping get a NaN cause and are
    therefore left out of the grouped means.

    Args:
        df: DataFrame with ``FireCause`` and ``FireDuration`` columns.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    cause_names = {1: 'Human', 2: 'Natural', 3: 'Unknown', 4: 'Undetermined'}
    named_causes = df['FireCause'].map(cause_names)

    # assign() returns a new frame, so the caller's data is never modified.
    mean_duration = (
        df.assign(FireCause=named_causes)
          .groupby(['FireCause'])['FireDuration']
          .mean()
    )

    ax = mean_duration.plot(kind='bar',
                            title='Fire duration by cause',
                            ylabel='Fire duration in days',
                            xlabel='Fire cause')
    # Keep the cause names horizontal instead of the default rotated ticks.
    tick_labels = ax.get_xticklabels()
    ax.set_xticklabels(tick_labels, rotation=0)
    plt.show()

In [10]:
def scatter_fire(df, x, y):
    """Show a scatter plot of ``df[x]`` against ``df[y]``, split by cause.

    One series is drawn per distinct ``CausedByWeather`` value, with marker
    size taken from ``FireDuration``. Each series carries its own legend
    label, so the legend no longer depends on the order in which the values
    happen to appear in ``df``. Series are drawn in ascending value order, so
    'Other' points are plotted first and 'Natural' points on top.

    Args:
        df: DataFrame with ``CausedByWeather`` and ``FireDuration`` columns
            plus the columns named by ``x`` and ``y``.
        x: Name of the column plotted on the x-axis (also the x-axis label).
        y: Name of the column plotted on the y-axis (also the y-axis label).

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    # NOTE(review): assumes CausedByWeather is a 0/1 (or boolean) flag where a
    # truthy value means a natural cause - this matches the original
    # 'Other'/'Natural' legend but should be confirmed against the data prep.
    for value in sorted(df['CausedByWeather'].dropna().unique()):
        data = df[df['CausedByWeather'] == value]
        plt.scatter(data[x],
                    data[y],
                    label='Natural' if value else 'Other',
                    s=data['FireDuration'],
                    alpha=0.5)
    # Labels come from the scatter calls above; only the title is set here.
    plt.legend(title='Cause')
    plt.xlabel(x)
    plt.ylabel(y)
    plt.show()