In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [160]:
dataLog = pd.read_csv('DataWithLog.csv')
dataNoLog = pd.read_csv('DataNoLog.csv')

# Since the range of our target variable "area" is so big and it is imbalanced we will try different approaches.
# We will in both cases use classification as it is our goal to predict whether a fire will be 'big' or not.
# Big or not consists of: bigger/smaller than mean fire size, but also bigger/smaller than mean*1.5IQR fire size.

# Our first try will consist of performing a logarithmic transformation on the target variable 'area'.
# As you will see this will take care of the 'big range' of the variable and also solve the imbalance problem!
# We do have to keep in mind that when checking for results we will have to 'undo' this operation on our target variable.

# Our second try will consist of performing SMOTE (oversampling) on our target variable in order to deal with the imbalance.

# When both methods have been completed we will respectively create a model for each dataset and compare results.

# With Log 

In [9]:
np.mean(dataLog['area'])

1.1110257652960873

In [10]:
np.max(dataLog['area'])-np.min(dataLog['area'])

6.995619625423205

In [161]:
dataLog['MeanFire'] = np.where(dataLog['area']<np.mean(dataLog['area']), 0, 1)
dataLog['MeanFire'].value_counts()

# These are averige fires so it is normal they are distributed this way.

0    312
1    205
Name: MeanFire, dtype: int64

In [162]:
Q1 = dataLog['area'].quantile(0.25)
Q3 = dataLog['area'].quantile(0.75)
IQR = Q3 - Q1
dataLog['IQRFire'] = np.where(dataLog['area']<(Q3+1.5*IQR), 0, 1)
dataLog['IQRFire'].value_counts()

# These fires are extremely big, so it is normal they do not occur that much.
# Seems like we will perform SMOTE here as well.

0    509
1      8
Name: IQRFire, dtype: int64

In [167]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

standardscaler = StandardScaler()
normalizescaler = MinMaxScaler()

StandardData = standardscaler.fit(dataLog.drop(columns=['area','MeanFire','IQRFire','day']))
StandardData = standardscaler.transform(dataLog.drop(columns=['area','MeanFire','IQRFire','day']))
StandardData = pd.DataFrame(StandardData, columns=['month','FFMC','DMC','DC','ISI','temp','RH','wind','rain'])
StandardData['MeanFire'] = dataLog['MeanFire']
StandardData['IQRFire'] = dataLog['IQRFire']
StandardData['day'] = dataLog['day']

NormData = normalizescaler.fit(dataLog.drop(columns=['area','MeanFire','IQRFire']))
NormData = normalizescaler.transform(dataLog.drop(columns=['area','MeanFire','IQRFire']))
NormData = pd.DataFrame(NormData, columns=['month','day','FFMC','DMC','DC','ISI','temp','RH','wind','rain'])
NormData['MeanFire'] = dataLog['MeanFire']
NormData['IQRFire'] = dataLog['IQRFire']

# 2 new DFs: StandardData & NormData

StandardData.to_csv('standarddata.csv', index=False)
NormData.to_csv('normdata.csv', index=False)

# Logistic Regression normalized data

In [149]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X = NormData.drop(columns=['MeanFire','IQRFire'])
y = NormData['MeanFire']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model1 = LogisticRegression()
model1.fit(X_train, y_train)

LogisticRegression()

In [150]:
from sklearn.metrics import confusion_matrix, accuracy_score

score = model1.score(X_test,y_test)

# Predict using test data
y_pred = model1.predict(X_test)

# Generate matrix
matrix = confusion_matrix(y_test, y_pred)

print('Accuracy score is:', score)
print('Confusion Matrix:\n',matrix)

Accuracy score is: 0.6442307692307693
Confusion Matrix:
 [[59  5]
 [32  8]]


# Logistic Regression standardized data

In [79]:
X = StandardData.drop(columns=['MeanFire','IQRFire'])
y = StandardData['MeanFire']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model2 = LogisticRegression()
model2.fit(X_train, y_train)

LogisticRegression()

In [80]:
score = model2.score(X_test,y_test)

# Predict using test data
y_pred = model1.predict(X_test)

# Generate matrix
matrix = confusion_matrix(y_test, y_pred)

print('Accuracy score is:', score)
print('Confusion Matrix:\n',matrix)

Accuracy score is: 0.6346153846153846
Confusion Matrix:
 [[54 13]
 [27 10]]


# KNN normalized data

In [129]:
from sklearn.neighbors import KNeighborsClassifier

X = NormData.drop(columns=['MeanFire','IQRFire'])
y = NormData['MeanFire']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

model1 = KNeighborsClassifier(n_neighbors=2)
model1.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=2)

In [130]:
knn_y_pred = model1.predict(X_test)
knn_matrix = confusion_matrix(y_test, knn_y_pred)
knn_score = model1.score(X_test,y_test)

print('Accuracy score is:', knn_score)
print('Confusion Matrix:\n',knn_matrix)

Accuracy score is: 0.6730769230769231
Confusion Matrix:
 [[62  6]
 [28  8]]


# KNN standardized data 

In [145]:
X = StandardData.drop(columns=['MeanFire','IQRFire'])
y = StandardData['MeanFire']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

model2 = KNeighborsClassifier(n_neighbors=2)
model2.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=2)

In [146]:
knn_y_pred = model2.predict(X_test)
knn_matrix = confusion_matrix(y_test, knn_y_pred)
knn_score = model2.score(X_test,y_test)

print('Accuracy score is:', knn_score)
print('Confusion Matrix:\n',knn_matrix)

Accuracy score is: 0.6153846153846154
Confusion Matrix:
 [[57 11]
 [29  7]]


# Closer look at the 8 'big' fires

In [164]:
dataLog['IQRFire'].value_counts()

0    509
1      8
Name: IQRFire, dtype: int64

In [166]:
BigFires = dataLog.loc[(dataLog['IQRFire']==1)]
BigFires

# We need to find out more about FFMC, DMC, DC, ISI.

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,MeanFire,IQRFire
235,3,1,91.4,142.4,601.4,10.6,19.6,41,5.8,0.0,5.285637,1,1
236,4,1,92.5,121.1,674.4,8.6,18.2,46,1.8,0.0,5.307971,1,1
237,4,0,91.0,129.5,692.6,7.0,18.8,40,2.2,0.0,5.365415,1,1
238,4,1,92.5,121.1,674.4,8.6,25.1,27,4.0,0.0,6.99562,1,1
377,3,1,93.7,231.1,715.1,8.4,21.9,42,2.2,0.0,5.16838,1,1
415,3,0,94.8,222.4,698.6,13.9,27.5,27,4.9,0.0,6.61644,1,1
420,3,0,91.7,191.4,635.9,7.8,26.2,36,4.5,0.0,5.229824,1,1
479,3,0,89.2,103.9,431.6,6.4,22.6,57,4.9,0.0,5.63311,1,1
