In [22]:
#importing the CSV file, original file can be found on GitHub page or at https://www.kaggle.com/datasets/sumitm004/forest-fire-area
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv("forestfires.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


In [23]:
from calendar import month_abbr
#addidng a new column where month is displayed as the Int8 monthNum
lower_ma = [m.lower() for m in month_abbr]

df['monthNum'] = df['month'].str.lower().map(lambda m: lower_ma.index(m)).astype('Int8')
df.head(10)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,monthNum
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,3
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,10
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,10
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,3
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,3
5,8,6,aug,sun,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,0.0,8
6,8,6,aug,mon,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,0.0,8
7,8,6,aug,mon,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,0.0,8
8,8,6,sep,tue,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,0.0,9
9,7,5,sep,sat,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,0.0,9


In [24]:
#dropping the string values from the dataset as well as X and Y. X and Y were very specific to the Montesinho Natural Park and
#we want the model to be usable in any location
df = df.drop(columns=['X', 'Y','month','day'])
df.head(10)

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,monthNum
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,3
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,10
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,10
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,3
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,3
5,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,0.0,8
6,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,0.0,8
7,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,0.0,8
8,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,0.0,9
9,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,0.0,9


In [25]:
#area is currently a float value however converting to integers makes many models easier to work with
#as shown below, area has a wide range of values and the floats do not add much value to the model 
df.agg({"area": ["min", "max", "median", "skew"]})

Unnamed: 0,area
min,0.0
max,1090.84
median,0.52
skew,12.846934


In [26]:
#converting area to area_int as this is both simple and will work for the model

df['area_int'] = df['area'].astype({"area": int})
#df = df.drop(columns=['area'])
df.head(10)

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,monthNum,area_int
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,3,0
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,10,0
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,10,0
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,3,0
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,3,0
5,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,0.0,8,0
6,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,0.0,8,0
7,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,0.0,8,0
8,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,0.0,9,0
9,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,0.0,9,0


In [27]:
#train test split

#split the dataset into train and test sets
from sklearn.model_selection import train_test_split

X, y = df[['FFMC','DMC','DC','ISI','temp','RH','wind','rain','monthNum']], df.area_int
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42) 

DecisionTreeClassifier

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
  

X = X_train[['FFMC','DMC','DC','ISI']]
y = y_train

X_test = X_test[['FFMC','DMC','DC','ISI']]
y_test = y_test

tree_model = DecisionTreeClassifier()
tree_model.fit(X,y)

y_pred = tree_model.predict(X)
print('Results for decision tree on training data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y, y_pred))
print('Accuracy is  ', accuracy_score(y, y_pred))
print('Precision is ', precision_score(y, y_pred, average='weighted'))
print('Recall is    ', recall_score(y,y_pred, average='weighted'))
print('F1 is        ', f1_score(y, y_pred, average='weighted'))
print()

y_test_pred = tree_model.predict(X_test)
print('Results for decision tree on test data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y_test, y_test_pred))
print('Accuracy is  ', accuracy_score(y_test, y_test_pred))
print('Precision is ', precision_score(y_test, y_test_pred, average='weighted'))
print('Recall is    ', recall_score(y_test,y_test_pred, average='weighted'))
print('F1 is        ', f1_score(y_test, y_test_pred, average='weighted'))

Results for decision tree on training data
  Default settings
Confusion Matrix
[[136   0   0 ...   0   0   0]
 [  7  11   0 ...   0   0   0]
 [  4   1   7 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   1   0]
 [  1   0   0 ...   0   0   0]]
Accuracy is   0.7364341085271318
Precision is  0.7128872419570095
Recall is     0.7364341085271318
F1 is         0.6815300196674532

Results for decision tree on test data
  Default settings
Confusion Matrix
[[89  9  8 ...  0  0  0]
 [13  0  1 ...  0  0  0]
 [10  1  0 ...  0  0  0]
 ...
 [ 1  0  0 ...  0  0  0]
 [ 0  0  1 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
Accuracy is   0.3436293436293436
Precision is  0.267484204984205
Recall is     0.3436293436293436
F1 is         0.3008129078416618


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


RandomForestClassifier

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score


X = X_train[['FFMC','DMC','DC','ISI']]
y = y_train

X_test = X_test[['FFMC','DMC','DC','ISI']]

rf_model = RandomForestClassifier(n_estimators=100)

rf_model.fit(X,y)

y_pred = rf_model.predict(X)


print('Results for Random Forest on train data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y, y_pred))
print('Accuracy is  ', accuracy_score(y, y_pred))
print('Precision is ', precision_score(y, y_pred, average='weighted'))
print('Recall is    ', recall_score(y,y_pred, average='weighted'))
print('F1 is        ', f1_score(y, y_pred, average='weighted'))
print()

y_test_pred = rf_model.predict(X_test)
print('Results for Random Forest on test data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y_test, y_test_pred))
print('Accuracy is  ', accuracy_score(y_test, y_test_pred))
print('Precision is ', precision_score(y_test, y_test_pred, average='weighted'))
print('Recall is    ', recall_score(y_test,y_test_pred, average='weighted'))
print('F1 is        ', f1_score(y_test, y_test_pred, average='weighted'))

Results for Random Forest on train data
  Default settings
Confusion Matrix
[[127   1   0 ...   0   0   0]
 [  5  12   0 ...   0   0   0]
 [  3   1   5 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   1   0]
 [  1   0   0 ...   0   0   0]]
Accuracy is   0.7364341085271318
Precision is  0.7385325393587329
Recall is     0.7364341085271318
F1 is         0.7046800035856123

Results for Random Forest on test data
  Default settings
Confusion Matrix
[[102   7   2 ...   0   0   0]
 [ 15   0   0 ...   0   0   0]
 [ 11   0   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
Accuracy is   0.4015444015444015
Precision is  0.30031370656370654
Recall is     0.4015444015444015
F1 is         0.3390172630294211


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVC

In [30]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
  

X = X_train[['FFMC','DMC','DC','ISI']]
y = y_train

X_test = X_test[['FFMC','DMC','DC','ISI']]

svc_model = SVC()
svc_model.fit(X,y)

y_pred = svc_model.predict(X)
print('Results for svc on training data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y, y_pred))
print('Accuracy is  ', accuracy_score(y, y_pred))
print('Precision is ', precision_score(y, y_pred, average='weighted'))
print('Recall is    ', recall_score(y,y_pred, average='weighted'))
print('F1 is        ', f1_score(y, y_pred, average='weighted'))
print()

y_test_pred = svc_model.predict(X_test)
print('Results for svc on test data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y_test, y_test_pred))
print('Accuracy is  ', accuracy_score(y_test, y_test_pred))
print('Precision is ', precision_score(y_test, y_test_pred, average='weighted'))
print('Recall is    ', recall_score(y_test,y_test_pred, average='weighted'))
print('F1 is        ', f1_score(y_test, y_test_pred, average='weighted'))



Results for svc on training data
  Default settings
Confusion Matrix
[[137   0   0 ...   0   0   0]
 [ 18   0   0 ...   0   0   0]
 [ 12   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]]
Accuracy is   0.5310077519379846
Precision is  0.2819692326182321


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Recall is     0.5310077519379846
F1 is         0.36834461780001965

Results for svc on test data
  Default settings
Confusion Matrix
[[137   0   0 ...   0   0   0]
 [ 17   0   0 ...   0   0   0]
 [ 14   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]]
Accuracy is   0.528957528957529
Precision is  0.2797960674408551
Recall is     0.528957528957529
F1 is         0.36599586599586603


Neural Network - MLP Classifier

In [31]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
  

X = X_train[['FFMC','DMC','DC','ISI']]
y = y_train

X_test = X_test[['FFMC','DMC','DC','ISI']]

nn_model = MLPClassifier(hidden_layer_sizes=(50, 25, 10),
                         solver='lbfgs')
nn_model.fit(X,y)

y_pred = nn_model.predict(X)


print('Results for NN on train data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y, y_pred))
print('Accuracy is  ', accuracy_score(y, y_pred))
print('Precision is ', precision_score(y, y_pred, average='weighted'))
print('Recall is    ', recall_score(y,y_pred, average='weighted'))
print('F1 is        ', f1_score(y, y_pred, average='weighted'))
print()

y_test_pred = nn_model.predict(X_test)
print('Results for NN on test data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y_test, y_test_pred))
print('Accuracy is  ', accuracy_score(y_test, y_test_pred))
print('Precision is ', precision_score(y_test, y_test_pred, average='weighted'))
print('Recall is    ', recall_score(y_test,y_test_pred, average='weighted'))
print('F1 is        ', f1_score(y_test, y_test_pred, average='weighted'))

Results for NN on train data
  Default settings
Confusion Matrix
[[137   0   0 ...   0   0   0]
 [ 18   0   0 ...   0   0   0]
 [ 12   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]]
Accuracy is   0.5310077519379846
Precision is  0.2819692326182321
Recall is     0.5310077519379846
F1 is         0.36834461780001965

Results for NN on test data
  Default settings
Confusion Matrix
[[136   0   1 ...   0   0   0]
 [ 17   0   0 ...   0   0   0]
 [ 14   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]]
Accuracy is   0.525096525096525
Precision is  0.2788303253419533
Recall is     0.525096525096525
F1 is         0.36424417183910857


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AdaBoost Classifier

In [32]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score
  

X = X_train[['FFMC','DMC','DC','ISI']]
y = y_train

X_test = X_test[['FFMC','DMC','DC','ISI']]

ada_model = AdaBoostClassifier(n_estimators=100)

ada_model.fit(X,y)

y_pred = ada_model.predict(X)


print('Results for ADA on train data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y, y_pred))
print('Accuracy is  ', accuracy_score(y, y_pred))
print('Precision is ', precision_score(y, y_pred, average='weighted'))
print('Recall is    ', recall_score(y,y_pred, average='weighted'))
print('F1 is        ', f1_score(y, y_pred, average='weighted'))
print()

y_test_pred = ada_model.predict(X_test)
print('Results for ADA on test data')
print('  Default settings')
print("Confusion Matrix")
print(confusion_matrix(y_test, y_test_pred))
print('Accuracy is  ', accuracy_score(y_test, y_test_pred))
print('Precision is ', precision_score(y_test, y_test_pred, average='weighted'))
print('Recall is    ', recall_score(y_test,y_test_pred, average='weighted'))
print('F1 is        ', f1_score(y_test, y_test_pred, average='weighted'))


Results for ADA on train data
  Default settings
Confusion Matrix
[[136   0   0 ...   0   0   0]
 [ 18   0   0 ...   0   0   0]
 [ 12   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]]
Accuracy is   0.5310077519379846
Precision is  0.29400903375450166
Recall is     0.5310077519379846
F1 is         0.37493379556069145

Results for ADA on test data
  Default settings
Confusion Matrix
[[137   0   0 ...   0   0   0]
 [ 17   0   0 ...   0   0   0]
 [ 14   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]]
Accuracy is   0.528957528957529
Precision is  0.2808805483224088
Recall is     0.528957528957529
F1 is         0.3669224378085138


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
