<a href="https://colab.research.google.com/github/katharina-knappmann/wildfire_challenge/blob/main/RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import pandas as pd
import time
import random

from google.colab import drive
from matplotlib import pyplot
from collections import deque
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

# Mount Google Drive at /content/drive (uses the google.colab helper, so Colab only).
drive.mount('/content/drive')
# Work from the project's data folder so later cells can read the CSVs by bare file name.
os.chdir('/content/drive/My Drive/Wildfire_Challenge/Data')

Mounted at /content/drive


In [2]:
# Load the cleansed training table and the historical weather forecasts
# (paths are relative to the current working directory).
df_all = pd.read_csv('Cleansed_Data.csv')
forecasts_df = pd.read_csv('HistoricalWeatherForecasts.csv')

# Convert the 'Date' column of both tables to datetimes.
for frame in (df_all, forecasts_df):
  frame['Date'] = pd.to_datetime(frame['Date'])

# Remove the two fire-intensity columns from the training table.
dropped_columns = ['Mean_estimated_fire_brightness', 'Mean_estimated_fire_radiative_power']
df_all = df_all.drop(columns=dropped_columns)

In [4]:
# Encode the region abbreviations as the integers 1-7 that the following cells filter on.
# The replacement is restricted to the 'Region' column: a frame-wide replace would also
# rewrite any other cell that happens to contain one of these strings.
# Values that are not in the mapping are left unchanged, so re-running the cell is a no-op.
region_codes = {'NSW': 1, 'NT': 2, 'QL': 3, 'VI': 4, 'SA': 5, 'TA': 6, 'WA': 7}
df_all['Region'] = df_all['Region'].replace(region_codes)

# Move the target to column 0; the training cell slices column 0 as y and the rest as X.
first_col = df_all.pop('Estimated_fire_area')
df_all.insert(0, 'Estimated_fire_area', first_col)

In [6]:
# Cut the data so that only the period for which weather forecasts are available
# is taken into account. The window is applied per region on a Date index.
forecast_window_start = '2014-01-01'
forecast_window_end = '2021-01-31'

region_frames = []
for region in [1,2,3,4,5,6,7]:
  # set_index() without inplace returns a new frame, so the filtered slice of
  # df_all is never mutated (no SettingWithCopyWarning).
  df_temp = df_all[df_all['Region'] == region].set_index('Date')
  df_temp = df_temp.loc[forecast_window_start:forecast_window_end]
  region_frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0; concatenate the pieces once instead.
df = pd.concat(region_frames)

# 'Date' is the index at this point, not a column, so df[['Date']] / drop(columns=['Date'])
# raised a KeyError. Take the dates from the index (one-column frame named 'Date',
# row-aligned with df after the reset below).
dates = df.index.to_frame(index=False)

# drop=True discards the Date index instead of re-inserting it as column 0, which would
# push the target 'Estimated_fire_area' out of the first position that the training
# cell relies on.
df = df.reset_index(drop=True)

In [9]:
#SEQ_LEN = 60  # preceding sequence
#FUTURE_PERIOD_PREDICT = 14
#RATIO_TO_PREDICT = "QL"

In [None]:
## Split off the last 5% of rows (by index) as a validation slice, then scale.

# `main_df` was never assigned anywhere in the notebook, so this cell only worked with
# leftover kernel state. Derive it from the prepared frame `df`.
df = df.dropna()
main_df = df
if main_df.empty:
  raise ValueError('No rows left after dropna(); cannot build the training set.')

# NOTE(review): the index is positional after the earlier reset_index(), so this holds
# out the tail of the per-region concatenation, not necessarily the most recent dates.
times = sorted(main_df.index.values)
# With fewer than 20 rows int(0.05*len) is 0 and times[-0] is the FIRST index, which
# would send every row to validation; hold out at least one row instead.
last_5pct = times[-max(1, int(0.05*len(times)))]

# Shuffle the rows; seeded so that a re-run reproduces the same row order.
main_df = main_df.sample(frac=1, random_state=30)

validation_main_df = main_df[(main_df.index >= last_5pct)]
main_df = main_df[(main_df.index < last_5pct)]

# Fit the scaler on the training rows only and apply the SAME transform to the
# validation rows. Re-fitting on the validation slice would map both sets to [0, 1]
# independently and make them incomparable. All columns, including the target in
# column 0, are scaled, so the model's predictions are in scaled units.
scaler = MinMaxScaler()
main_df = pd.DataFrame(scaler.fit_transform(main_df), columns=main_df.columns, index=main_df.index)
validation_main_df = pd.DataFrame(scaler.transform(validation_main_df), columns=validation_main_df.columns, index=validation_main_df.index)

# Build the training matrix from main_df (not df) so the held-out rows are not trained on.
npmain = np.asarray(main_df)
npvalmain = np.asarray(validation_main_df)

# Column 0 is the target, the remaining columns are the features.
train_x = npmain[:,1:]
train_y = npmain[:,0]

validation_x = npvalmain[:,1:]
validation_y = npvalmain[:,0]

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Random forest of 500 trees with a fixed seed
rf = RandomForestRegressor(n_estimators=500, random_state=30)

# Train on the training data; the fitted estimator is the cell's displayed result
rf.fit(train_x, train_y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=500, n_jobs=None, oob_score=False,
                      random_state=30, verbose=0, warm_start=False)

Preparation of "to-predict" data

In [None]:
# Vegetation-index statistics that are forward-filled after the merge.
vegetation_columns = [
  'Vegetation_index_mean',
  'Vegetation_index_max',
  'Vegetation_index_min',
  'Vegetation_index_std',
  'Vegetation_index_variance',
]

# Land classes that are combined with the vegetation index into "<class>_inflame_risk" features.
vegetation_influenced_columns = [
  'Shrubs',
  'Herbaceous vegetation',
  'Cultivated and managed vegetation/agriculture (cropland)',
  'Herbaceous wetland',
  'Closed forest, deciduous broad leaf',
  'Closed forest, unknown',
  'Open forest, deciduous broad leaf',
  'Open forest, unknown definitions',
]

# Remaining land classes; these columns are dropped from the feature frame.
other_landclass_columns = [
  'Urban / built up',
  'Bare / sparse vegetation',
  'Permanent water bodies',
  'Closed forest, evergreen, broad leaf',
  'Open forest, evergreen broad leaf',
  'Open sea',
]

# Read the input tables for the "to-predict" period (paths relative to the working directory).
forecasts_df = pd.read_csv('Cleansed_Forecasts.csv')
landclass_df = pd.read_csv('LandClass.csv')
vegetation_df = pd.read_csv('VegetationIndex.csv')
soilwater_df = pd.read_csv('SoilwaterContent_Estimation.csv')
# The February forecast file is semicolon-separated.
february_forecasts = pd.read_csv('Februar_Forecast.csv', sep=';')

# Convert the 'Date' column of every dated table to datetimes.
for dated_frame in (february_forecasts, soilwater_df, vegetation_df, forecasts_df):
  dated_frame['Date'] = pd.to_datetime(dated_frame['Date'])

# Index the February forecasts by date.
february_forecasts = february_forecasts.set_index('Date')

# Build the February feature rows region by region: forecast rows joined with the
# vegetation index (by date and region) and the land-class shares (by region).
region_frames = []
for region in ['NSW', 'NT', 'QL', 'VI', 'SA', 'TA', 'WA']:
  # reset_index() returns a new frame with 'Date' back as a column, so the filtered
  # slice of february_forecasts is never modified in place.
  df_temp = february_forecasts[february_forecasts['Region'] == region].reset_index()

  df_temp = df_temp.merge(vegetation_df, how='left', on=['Date', 'Region'])
  df_temp = df_temp.merge(landclass_df, how='left', on=['Region'])

  # Carry the last known vegetation values forward within the region
  # (fillna(method='ffill') is deprecated; .ffill() is the direct equivalent).
  df_temp[vegetation_columns] = df_temp[vegetation_columns].ffill()

  region_frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0; concatenate once. ignore_index gives the
# combined frame a unique row index instead of repeating 0..n for every region.
df_all = pd.concat(region_frames, ignore_index=True)

# Turn each vegetation-influenced land-class share into an "inflame risk" feature:
# share * (1 - mean vegetation index). The raw share columns are dropped afterwards.
for col in vegetation_influenced_columns:
  df_all[col+'_inflame_risk'] = df_all[col] * (1-df_all.Vegetation_index_mean)
df_all = df_all.drop(columns = vegetation_influenced_columns)

#df_all = df_all.merge(soilwater_df, how='inner', on=['Date', 'Region'])

df_all = df_all.drop(columns = other_landclass_columns)

# Encode the region abbreviations as 1-7. Limited to the 'Region' column so no other
# column can be rewritten by accident; unknown values are left unchanged.
region_codes = {'NSW': 1, 'NT': 2, 'QL': 3, 'VI': 4, 'SA': 5, 'TA': 6, 'WA': 7}
df_all['Region'] = df_all['Region'].replace(region_codes)

# Feature columns in the order handed to the model.
# NOTE(review): presumably this must equal the training feature order — verify against Cleansed_Data.csv.
columns_titles = [
  'Region',
  'Precipitation_Max','Precipitation_Mean','Precipitation_Min','Precipitation_Variance',
  'RelativeHumidity_Max','RelativeHumidity_Mean','RelativeHumidity_Min','RelativeHumidity_Variance',
  'SoilWaterContent_Max','SoilWaterContent_Mean','SoilWaterContent_Min','SoilWaterContent_Variance',
  'SolarRadiation_Max','SolarRadiation_Mean','SolarRadiation_Min','SolarRadiation_Variance',
  'Temperature_Max','Temperature_Mean','Temperature_Min','Temperature_Variance',
  'WindSpeed_Max','WindSpeed_Mean','WindSpeed_Min','WindSpeed_Variance',
  'Vegetation_index_mean','Vegetation_index_max','Vegetation_index_min','Vegetation_index_std','Vegetation_index_variance',
  'Shrubs_inflame_risk','Herbaceous vegetation_inflame_risk','Cultivated and managed vegetation/agriculture (cropland)_inflame_risk','Herbaceous wetland_inflame_risk','Closed forest, deciduous broad leaf_inflame_risk','Closed forest, unknown_inflame_risk','Open forest, deciduous broad leaf_inflame_risk','Open forest, unknown definitions_inflame_risk',
  'Month','Year',
]

# reindex() silently creates an all-NaN column for every name that is missing, and the
# dropna() below would then discard every row. Fail early with the offending names instead.
missing_columns = [name for name in columns_titles if name not in df_all.columns]
if missing_columns:
  raise ValueError('Prediction frame is missing feature columns: {}'.format(missing_columns))

df_all = df_all.reindex(columns=columns_titles).dropna()

# Min-max scale every column of the prediction frame, one column at a time.
# NOTE(review): the scaler is re-fitted on the prediction rows here rather than reusing
# the ranges seen during training — confirm this is intended.
scaler = MinMaxScaler()
for column_name in df_all.columns:
  column_frame = df_all[[column_name]]
  df_all[[column_name]] = scaler.fit_transform(column_frame)

# Month and Year are overwritten with fixed values after scaling.
# NOTE(review): these look like pre-computed scaled values (about 1/11 and 6/7) — confirm
# they correspond to the month/year being predicted under the training scaling.
df_all["Month"] = 0.090909
df_all["Year"] = 0.857143

# Feature matrix handed to the model.
x_data = np.asarray(df_all)

In [None]:
# Use the forest's predict method on the prepared "to-predict" matrix x_data.
# The training target was min-max scaled, so the predictions are in scaled units.
predictions = rf.predict(x_data)
# Disabled error calculation (mean absolute error against validation_y).
# NOTE(review): predictions come from x_data, not validation_x, so this comparison
# would need validation predictions to be meaningful — confirm before re-enabling.
#errors = abs(predictions - validation_y)
#print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

In [None]:
# Save the predictions to a text file.
# str(ndarray) abbreviates arrays with more than 1000 elements to "[a b c ... x y z]",
# which would silently lose values. Raising the summarisation threshold to the array
# size keeps the same textual format but always writes every element.
prediction_array = np.asarray(predictions)
content = np.array2string(prediction_array, threshold=prediction_array.size)

# The context manager closes the file even if the write fails.
with open("file1.txt", "w") as file:
  file.write(content)

In [None]:
from matplotlib import pyplot

# `predictions` was computed for the "to-predict" rows (x_data), which have no ground
# truth; plotting it against validation_y compares unrelated rows. Predict on the
# validation features so both curves describe the same samples.
validation_predictions = rf.predict(validation_x)

#pyplot.plot(errors, label = 'Expected')
pyplot.plot(validation_y[:100], label = 'Expected')
pyplot.plot(validation_predictions[:100], label = 'Predicted')
pyplot.title('Validation slice: first 100 rows')
pyplot.xlabel('Validation row')
pyplot.ylabel('Estimated fire area (scaled)')
pyplot.legend()
pyplot.show()

In [None]:
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest (disabled)
#tree = rf.estimators_[5]
# Import tools needed for visualization (repeats the two imports above)
from sklearn.tree import export_graphviz
import pydot

# Disabled export of a single tree:
# pull out one tree from the forest, export it to a dot file,
# use the dot file to create a graph, then write the graph to a png file.
#tree = rf.estimators_[5]
#export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_names, rounded = True, precision = 1)
#(graph, ) = pydot.graph_from_dot_file('tree.dot')
#graph.write_png('tree.png')

In [None]:
# Feature names are the columns of the training frame without the target.
# drop(..., inplace=True) returns None (so `features` was None) and strips the column
# from main_df, which made a second run of this cell fail with a KeyError. The
# non-mutating form returns the feature frame and leaves main_df intact.
features = main_df.drop(columns=['Estimated_fire_area'])
feature_names = features.columns

# Get numerical feature importances
importances = list(rf.feature_importances_)

# zip() would silently truncate on a mismatch and attach importances to the wrong names.
if len(feature_names) != len(importances):
  raise ValueError('main_df has {} feature columns but the model reports {} importances'.format(len(feature_names), len(importances)))

# List of (variable, importance rounded to 2 decimals) tuples
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_names, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the features and importances
for pair in feature_importances:
  print('Variable: {:20} Importance: {}'.format(*pair))

Variable: Open forest, deciduous broad leaf_inflame_risk Importance: 0.28
Variable: SoilWaterContent_Max Importance: 0.13
Variable: SolarRadiation_Mean  Importance: 0.06
Variable: Year                 Importance: 0.05
Variable: RelativeHumidity_Mean Importance: 0.04
Variable: SoilWaterContent_Variance Importance: 0.04
Variable: Vegetation_index_mean Importance: 0.03
Variable: Vegetation_index_std Importance: 0.03
Variable: Vegetation_index_variance Importance: 0.03
Variable: Month                Importance: 0.03
Variable: SoilWaterContent_Mean Importance: 0.02
Variable: SolarRadiation_Max   Importance: 0.02
Variable: Vegetation_index_max Importance: 0.02
Variable: Precipitation_Max    Importance: 0.01
Variable: Precipitation_Mean   Importance: 0.01
Variable: RelativeHumidity_Max Importance: 0.01
Variable: RelativeHumidity_Min Importance: 0.01
Variable: RelativeHumidity_Variance Importance: 0.01
Variable: SolarRadiation_Min   Importance: 0.01
Variable: SolarRadiation_Variance Importance