<a href="https://colab.research.google.com/github/knattarina/wildfire_challenge/blob/main/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data Preparation**
*This notebook prepares the data provided by the [IBM Call for Code Wildfire Challenge](https://community.ibm.com/community/user/datascience/blogs/susan-malaika/2020/11/10/call-for-code-spot-challenge-for-wildfires). The content is available [on GitHub](https://github.com/knattarina/wildfire_challenge).*


*   loading csv data (wildfires, weather, forecasts, landclass and vegetation index)
*   reformatting weather data to one line per date and region
*   adding fixed region data (area) to landclass
*   handling missing values in time series per region
*   handling outliers
*   merging data
*   generating new indicators






# IBM Watson related Code

In [135]:
#from project_lib import Project
#from pyspark.sql import SparkSession

#project = Project(sc,"ff75f9c0-4f85-495d-bbc1-6305f6b1dbb8", "p-12f1b4bbd0ab3b00eacd4ce1201f20ce039b72ec")
#spark = SparkSession.builder.getOrCreate()
def get_df_watson(file_name):
    """Load a CSV asset from the IBM Watson project into a DataFrame.

    Relies on a module-level ``project`` object, which only exists once the
    commented-out ``Project(...)`` setup lines above are enabled.

    Parameters
    ----------
    file_name : str
        Name of the project file to fetch via ``project.get_file``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content.
    """
    handle = project.get_file(file_name)
    # Rewind before parsing so the whole file is read from the start.
    handle.seek(0)
    frame = pd.read_csv(handle)
    return frame

def save_files_watson():
  """Store the prepared frames as CSV assets in the IBM Watson project.

  Serialises the module-level ``aggr_df``, ``landclass_df`` and
  ``forecasts_df`` (without their index) and writes each one through
  ``project.save_data``, overwriting any existing asset of the same name.
  """
  # NOTE(review): the landclass frame is stored as "LandClass.csv", the same
  # name the notebook reads as raw input elsewhere - confirm this is intended.
  cleansed_csv = aggr_df.to_csv(index=False)
  project.save_data("Cleansed_Data.csv", cleansed_csv, overwrite=True)

  landclass_csv = landclass_df.to_csv(index=False)
  project.save_data("LandClass.csv", landclass_csv, overwrite=True)

  forecasts_csv = forecasts_df.to_csv(index=False)
  project.save_data("Cleansed_Forecasts.csv", forecasts_csv, overwrite=True)

# Google Colab

In [136]:
import sys
import os
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats

from google.colab import drive

# Mount Google Drive in the Colab runtime and make the challenge data folder
# the working directory, so the relative CSV paths used below resolve.
DRIVE_MOUNT_POINT = '/content/drive'
DATA_DIR = DRIVE_MOUNT_POINT + '/My Drive/Wildfire_Challenge/Data'

drive.mount(DRIVE_MOUNT_POINT)
os.chdir(DATA_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [137]:
# Read the raw challenge files from the current working directory.
wildfires_df = pd.read_csv('Historical_Wildfires.csv')
weather_df = pd.read_csv('HistoricalWeather.csv')
forecasts_df = pd.read_csv('HistoricalWeatherForecasts.csv')
landclass_df = pd.read_csv('LandClass.csv')
vegetation_df = pd.read_csv('VegetationIndex.csv')

# Convert the 'Date' column of every time-indexed frame to timestamps so that
# the frames can later be resampled and merged by date.
for dated_frame in (forecasts_df, wildfires_df, weather_df, vegetation_df):
    dated_frame['Date'] = pd.to_datetime(dated_frame['Date'])

In [138]:
##### ADD FIXED REGION DATA TO LANDCLASS #####
# Distinct (Region, area) pairs taken from the weather data; merging them
# (on the columns both frames share) appends the area column to landclass.
temp_df = weather_df[['Region', 'count()[unit: km^2]']].drop_duplicates()
landclass_df = landclass_df.merge(temp_df)

##### DROP UNUSED WILDFIRE COLUMNS #####
# Replacing zeroes with NaN is currently disabled:
#wildfires_df = wildfires_df.mask(wildfires_df==0)
wildfires_df = wildfires_df.drop(columns=['Replaced', 'Count'])

*    Reformatting data to one line per date and region

In [139]:
def reformat_weather_data(df):
    """Pivot long-format weather data to one row per date and region.

    The input has one row per (Date, Region, Parameter) with the statistics
    in the columns ``min()``, ``max()``, ``mean()`` and ``variance()``. The
    result has one row per (Date, Region) and one column per
    parameter/statistic combination, named ``<Parameter>_<Statistic>``
    (e.g. ``Temperature_Max``). Duplicate (Date, Region, Parameter) rows are
    averaged, which is the ``pivot_table`` default.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with the columns 'Date', 'Region', 'Parameter',
        'min()', 'max()', 'mean()' and 'variance()'.

    Returns
    -------
    pandas.DataFrame
        Wide-format copy with 'Date' and 'Region' first, followed by all
        value columns in alphabetical order.
    """
    key_columns = ['Date', 'Region']

    df = df.rename(columns={"count()[unit: km^2]": "Area", "min()": "Min", "max()": "Max", "mean()": "Mean", "variance()": "Variance"})

    # One row per (Date, Region); columns become (statistic, parameter) pairs.
    wide = df.pivot_table(values=['Min', 'Max', 'Mean', 'Variance'], index=key_columns, columns=['Parameter'])
    wide = wide.reset_index()

    # Flatten the two-level header: key columns have an empty second level
    # and keep their name, value columns become '<Parameter>_<Statistic>'.
    wide.columns = [stat if not param else '{}_{}'.format(param, stat) for stat, param in wide.columns]

    # Only 'Date' and 'Region' are key columns. The previous slice [:3] also
    # pinned the first value column in place instead of sorting it with the rest.
    value_columns = sorted(name for name in wide.columns if name not in key_columns)

    return wide[key_columns + value_columns].copy()

# Replace the long-format weather and forecast frames with their wide
# (one row per date and region) versions.
weather_df = reformat_weather_data(weather_df)
forecasts_df = reformat_weather_data(forecasts_df)

*     Handling missing values with interpolation of time series

In [140]:
def fill_frames(df, columns, regions=None):
    """Build a gap-free daily time series per region and fill missing values.

    For every region the matching rows of ``df`` are resampled to one row per
    calendar day (first non-null value of each day). ``columns`` are then
    interpolated linearly, and anything still missing in any column is
    forward-filled and finally backward-filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime 'Date' column and a 'Region' column.
    columns : list of str
        Numeric columns to interpolate linearly.
    regions : iterable of str, optional
        Regions to process, in output order. Defaults to the regions listed
        in the module-level ``landclass_df``.

    Returns
    -------
    pandas.DataFrame
        The per-region frames concatenated in region order. Each region keeps
        its own 0..n-1 row index, so index labels repeat across regions.

    Raises
    ------
    AssertionError
        If a region still contains NaN values after filling (for example a
        column that is empty for the whole region).
    """
    if regions is None:
        regions = landclass_df.Region

    filled_frames = []
    for region in regions:
        df_temp = df[df['Region'] == region]
        df_temp = df_temp.resample('1D', on='Date').first()
        # Depending on the pandas version the resampled frame may still carry
        # 'Date' as a regular column next to the index; drop it if present.
        df_temp = df_temp.drop(columns='Date', errors='ignore').reset_index()
        # Days inserted by the resampling have no region label yet.
        df_temp['Region'] = df_temp['Region'].fillna(region)
        # The former `direction='backward'` keyword was not an interpolate()
        # option and was ignored; leading gaps are closed by the bfill below.
        df_temp[columns] = df_temp[columns].interpolate(method='linear')
        df_temp = df_temp.ffill().bfill()
        # The old check `isna().sum().all() == 0` passed as soon as a single
        # column was complete, so remaining NaNs went unnoticed.
        assert not df_temp.isna().any().any(), "Still Nan Values remain!"
        filled_frames.append(df_temp)

    if not filled_frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(filled_frames)

# Remove exact duplicate rows before building the daily forecast series.
forecasts_df = forecasts_df.drop_duplicates()
forecasts_columns = [
    'Precipitation_Max', 'Precipitation_Mean', 'Precipitation_Min', 'Precipitation_Variance',
    'RelativeHumidity_Max', 'RelativeHumidity_Mean', 'RelativeHumidity_Min', 'RelativeHumidity_Variance',
    'SolarRadiation_Max', 'SolarRadiation_Mean', 'SolarRadiation_Min', 'SolarRadiation_Variance',
    'Temperature_Max', 'Temperature_Mean', 'Temperature_Min', 'Temperature_Variance',
    'WindSpeed_Max', 'WindSpeed_Mean', 'WindSpeed_Min', 'WindSpeed_Variance',
]
forecasts_df = fill_frames(forecasts_df, forecasts_columns)

# Same treatment for the vegetation index statistics.
veg_columns = [
    'Vegetation_index_mean',
    'Vegetation_index_max',
    'Vegetation_index_min',
    'Vegetation_index_std',
    'Vegetation_index_variance',
]
vegetation_df = fill_frames(vegetation_df, veg_columns)

*     Merging data

In [141]:
# Attach weather, vegetation and landclass information to every wildfire row.
aggr_df = (
    wildfires_df
    .merge(weather_df, how='left', on=['Date', 'Region'])
    .merge(vegetation_df, how='left', on=['Date', 'Region'])
    .merge(landclass_df, how='left', on=['Region'])
)

# Variant of the merged data whose gaps are closed only by plain backward and
# then forward filling (no forecast values, no interpolation).
aggr_df_unfilled = aggr_df.bfill().ffill()

# Fill missing weather data with the forecast value for the same date and region.
merge_keys = ['Date', 'Region']
forecast_lookup = forecasts_df.set_index(merge_keys)
aggr_df = aggr_df.set_index(merge_keys).fillna(forecast_lookup).reset_index()

aggr_columns = [
    'Estimated_fire_area', 'Mean_estimated_fire_brightness',
    'Mean_estimated_fire_radiative_power', 'Mean_confidence',
    'Precipitation_Max', 'Precipitation_Mean', 'Precipitation_Min', 'Precipitation_Variance',
    'RelativeHumidity_Max', 'RelativeHumidity_Mean', 'RelativeHumidity_Min', 'RelativeHumidity_Variance',
    'SoilWaterContent_Max', 'SoilWaterContent_Mean', 'SoilWaterContent_Min', 'SoilWaterContent_Variance',
    'SolarRadiation_Max', 'SolarRadiation_Mean', 'SolarRadiation_Min', 'SolarRadiation_Variance',
    'Temperature_Max', 'Temperature_Mean', 'Temperature_Min', 'Temperature_Variance',
    'WindSpeed_Max', 'WindSpeed_Mean', 'WindSpeed_Min', 'WindSpeed_Variance',
    'Vegetation_index_mean', 'Vegetation_index_max', 'Vegetation_index_min',
    'Vegetation_index_std', 'Vegetation_index_variance',
]
# Interpolate whatever is still missing on a daily grid per region.
aggr_df = fill_frames(aggr_df, aggr_columns)

*     handling outliers

In [142]:
# Columns whose extreme values are clipped (winsorized). The names must match
# the columns of aggr_df exactly: the previous spellings
# 'Mean_estimated_fire_radioative_power', 'Preciptation_*' and
# 'Relative_Humidity_Variance' matched nothing, so those columns were skipped.
winsorized_columns = ['Estimated_fire_area','Mean_estimated_fire_radiative_power','Precipitation_Min','Precipitation_Variance','RelativeHumidity_Variance','SoilWaterContent_Min','SolarRadiation_Variance','Temperature_Variance','WindSpeed_Max','WindSpeed_Mean','WindSpeed_Variance']
# Columns clipped at 10% per tail instead of 5%. 'Precipitation_Max' and
# 'Precipitation_Mean' are kept from the original list but have no effect,
# because they are not part of winsorized_columns.
strongly_clipped_columns = ['SoilWaterContent_Min', 'Precipitation_Max', 'Precipitation_Mean', 'Precipitation_Min', 'Precipitation_Variance']

for col in aggr_df.columns:
  if col not in winsorized_columns:
    continue
  # A single float limit is applied to both the lower and the upper tail.
  tail_fraction = 0.10 if col in strongly_clipped_columns else 0.05
  clipped = scipy.stats.mstats.winsorize(aggr_df[col].to_numpy(), limits = tail_fraction)
  # Assign the result explicitly rather than relying on inplace=True writing
  # through the Series into the DataFrame.
  aggr_df[col] = np.asarray(clipped)
  #g = sns.displot(data=aggr_df.iloc[:, :-16], x=col, hue="Region", kind="kde")

*     adding new ratios

In [143]:
# The lower the vegetation index, the drier the vegetation; each vegetation
# landclass share is therefore weighted with (1 - mean vegetation index).
vegetation_influenced_columns = [
    'Shrubs',
    'Herbaceous vegetation',
    'Cultivated and managed vegetation/agriculture (cropland)',
    'Herbaceous wetland',
    'Closed forest, deciduous broad leaf',
    'Closed forest, unknown',
    'Open forest, deciduous broad leaf',
    'Open forest, unknown definitions',
]
other_landclass_columns = [
    'Urban / built up',
    'Bare / sparse vegetation',
    'Permanent water bodies',
    'Closed forest, evergreen, broad leaf',
    'Open forest, evergreen broad leaf',
    'Open sea',
    'count()[unit: km^2]',
]

dryness = 1 - aggr_df.Vegetation_index_mean
for landclass_name in vegetation_influenced_columns:
  aggr_df[landclass_name + '_inflame_risk'] = aggr_df[landclass_name] * dryness

# The raw landclass columns are not kept in the prepared data set.
aggr_df = aggr_df.drop(columns = vegetation_influenced_columns + other_landclass_columns)

In [144]:
def segregate_date(df):
    """Add 'Month' and 'Year' columns derived from the 'Date' column.

    The frame is modified in place and also returned. The day of month is
    intentionally not extracted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime 'Date' column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two additional columns.
    """
    date_parts = df["Date"].dt
    for label, values in (("Month", date_parts.month), ("Year", date_parts.year)):
        df[label] = values
    return df

# Add 'Month' and 'Year' columns to both the aggregated data and the forecasts.
aggr_df = segregate_date(aggr_df)
forecasts_df = segregate_date(forecasts_df)

In [145]:
# Preview the first rows of the prepared data set.
aggr_df.head()

Unnamed: 0,Date,Region,Estimated_fire_area,Mean_estimated_fire_brightness,Mean_estimated_fire_radiative_power,Mean_confidence,Std_confidence,Var_confidence,Precipitation_Max,Precipitation_Mean,Precipitation_Min,Precipitation_Variance,RelativeHumidity_Max,RelativeHumidity_Mean,RelativeHumidity_Min,RelativeHumidity_Variance,SoilWaterContent_Max,SoilWaterContent_Mean,SoilWaterContent_Min,SoilWaterContent_Variance,SolarRadiation_Max,SolarRadiation_Mean,SolarRadiation_Min,SolarRadiation_Variance,Temperature_Max,Temperature_Mean,Temperature_Min,Temperature_Variance,WindSpeed_Max,WindSpeed_Mean,WindSpeed_Min,WindSpeed_Variance,Vegetation_index_mean,Vegetation_index_max,Vegetation_index_min,Vegetation_index_std,Vegetation_index_variance,Shrubs_inflame_risk,Herbaceous vegetation_inflame_risk,Cultivated and managed vegetation/agriculture (cropland)_inflame_risk,Herbaceous wetland_inflame_risk,"Closed forest, deciduous broad leaf_inflame_risk","Closed forest, unknown_inflame_risk","Open forest, deciduous broad leaf_inflame_risk","Open forest, unknown definitions_inflame_risk",Month,Year
0,2005-01-04,NSW,8.68,312.266667,42.4,78.666667,2.886751,8.333333,22.842566,2.80862,0.0,17.383363,90.332771,57.095628,31.27993,267.158378,0.444927,0.214293,0.003026,0.009965,33.214062,22.617291,7.576938,30.447041,28.945488,23.055527,12.495799,9.455474,7.364222,4.841764,1.401951,1.494301,0.349996,0.995264,-0.2,0.20523,0.042121,4.030027,28.340191,8.450057,0.065,4.42003,0.195001,2.405016,6.695045,1,2005
1,2005-01-05,NSW,16.61125,322.475,62.3625,85.5,8.088793,65.428571,7.657155,0.157935,0.0,0.273471,88.623436,47.170735,24.466665,195.639724,0.442955,0.203951,0.003026,0.009483,33.554867,28.076835,14.087289,29.763055,29.51012,22.425765,9.6118,13.35238,7.091141,4.01408,1.011328,1.043316,0.35026,0.994619,-0.2,0.205353,0.042172,4.028387,28.328657,8.446618,0.064974,4.418231,0.194922,2.404037,6.69232,1,2005
2,2005-01-06,NSW,5.52,325.266667,38.4,78.333333,3.21455,10.333333,27.616505,0.434833,0.0,4.634465,92.85096,39.874638,21.705952,213.300558,0.431879,0.193668,0.003026,0.00875,34.075787,30.561703,12.310518,22.667707,26.982698,20.621405,9.024039,11.764178,9.905821,4.477879,0.930842,1.953685,0.350525,0.993974,-0.2,0.205476,0.042222,4.026747,28.317123,8.443179,0.064948,4.416432,0.194843,2.403059,6.689596,1,2005
3,2005-01-07,NSW,6.264,313.87,33.8,92.2,7.52994,56.7,3.839235,0.064224,0.0,0.038927,79.103134,39.27941,16.215517,245.624576,0.418513,0.183778,0.003026,0.007799,34.019218,32.143718,16.044561,6.949267,24.752069,19.632722,8.691216,7.940373,10.446799,4.017578,1.601724,1.102751,0.350789,0.993329,-0.2,0.205599,0.042273,4.025107,28.305589,8.43974,0.064921,4.414633,0.194763,2.40208,6.686871,1,2005
4,2005-01-08,NSW,5.4,337.383333,122.533333,91.0,7.937254,63.0,2.866673,0.203352,0.0,0.236269,83.001541,42.400824,14.008522,342.996833,0.41337,0.175935,0.003026,0.007482,33.265091,29.347715,15.20988,27.82283,26.472469,21.477315,10.821342,12.063971,6.671862,3.88455,1.167861,0.774612,0.351054,0.992684,-0.2,0.205722,0.042324,4.023467,28.294055,8.436301,0.064895,4.412834,0.194684,2.401101,6.684146,1,2005


*     Storing preprocessed data to .csv files

In [146]:
# Write every prepared frame to the current working directory, without the
# row index. The "unfilled" variant was taken right after merging, before
# outlier handling and the derived indicator columns were added.
csv_outputs = (
    ('Cleansed_Data_Unfilled.csv', aggr_df_unfilled),
    ('Cleansed_Data.csv', aggr_df),
    ('Cleansed_LandClass.csv', landclass_df),
    ('Cleansed_Forecasts.csv', forecasts_df),
)
for csv_name, frame in csv_outputs:
    frame.to_csv(csv_name, index = False)