In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import warnings
sns.set()
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
# header=1 takes the column names from the second line of the file
# (presumably the first line is a title row — verify against the raw CSV).
df = pd.read_csv(
    "Algerian_forest_fires_dataset_UPDATE.csv",
    header=1,
)
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


In [3]:
# Remove the two rows labelled 122 and 123 (presumably the second region's
# title line and a repeated header embedded in the CSV — verify against the
# raw file), then renumber the rows 0..n-1.
df = df.drop(index=[122, 123]).reset_index(drop=True)

# Tag each row with its region. `.loc` slices include both end labels, so the
# first region stops at 121 and the second starts at 122 with no overlap.
df.loc[:121, 'region'] = 'bejaia'
df.loc[122:, 'region'] = 'Sidi-Bel Abbes'

# Assemble one datetime column from the day/month/year parts, then drop the parts.
df['date'] = pd.to_datetime(df[['day', 'month', 'year']])
df = df.drop(columns=['day', 'month', 'year'])

# Strip stray whitespace from the column names so attribute/label access works.
df.columns = [name.strip() for name in df.columns]

# Strip stray whitespace from the class labels.
df['Classes'] = df['Classes'].str.strip()

# Convert the literal strings 'nan' / 'null' into real missing values.
# `replace` returns a new frame, so the result has to be assigned back;
# the unassigned calls previously discarded it.
df = df.replace(['nan', 'null'], np.nan)
df

Unnamed: 0,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,region,date
0,29,57,18,0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,bejaia,2012-06-01
1,29,61,13,1.3,64.4,4.1,7.6,1,3.9,0.4,not fire,bejaia,2012-06-02
2,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,bejaia,2012-06-03
3,25,89,13,2.5,28.6,1.3,6.9,0,1.7,0,not fire,bejaia,2012-06-04
4,27,77,16,0,64.8,3,14.2,1.2,3.9,0.5,not fire,bejaia,2012-06-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,30,65,14,0,85.4,16,44.5,4.5,16.9,6.5,fire,Sidi-Bel Abbes,2012-09-26
240,28,87,15,4.4,41.1,6.5,8,0.1,6.2,0,not fire,Sidi-Bel Abbes,2012-09-27
241,27,87,29,0.5,45.9,3.5,7.9,0.4,3.4,0.2,not fire,Sidi-Bel Abbes,2012-09-28
242,24,54,18,0.1,79.7,4.3,15.2,1.7,5.1,0.7,not fire,Sidi-Bel Abbes,2012-09-29


In [4]:
# Hand-corrected values for row 165 (presumably this record is malformed in
# the raw CSV — verify against the source file). Applied one column at a time.
row_165_fix = {
    'DC': 14.6,
    'ISI': 9,
    'BUI': 12.5,
    'FWI': 10.4,
    'Classes': 'fire',
}
for column, value in row_165_fix.items():
    df.loc[165, column] = value

In [5]:
# Count the missing values in every column

df.isna().sum()

Temperature    0
RH             0
Ws             0
Rain           0
FFMC           0
DMC            0
DC             0
ISI            0
BUI            0
FWI            0
Classes        0
region         0
date           0
dtype: int64

In [6]:
# Cast the measurement columns to explicit numeric dtypes

integer_columns = ['RH', 'Temperature', 'Ws']
float_columns = ['Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI']

numeric_dtypes = {column: np.int64 for column in integer_columns}
numeric_dtypes.update({column: np.float64 for column in float_columns})
df = df.astype(numeric_dtypes)

### Label Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder

# One encoder per column: refitting a single encoder on `region` would
# overwrite the learned `Classes` mapping and make it impossible to
# inverse_transform predictions back to 'fire' / 'not fire' later.
# LabelEncoder numbers labels in sorted order, so 'fire' -> 0, 'not fire' -> 1.
class_encoder = LabelEncoder()
region_encoder = LabelEncoder()

df['Classes'] = class_encoder.fit_transform(df['Classes'])
df['region'] = region_encoder.fit_transform(df['region'])

# Backward-compatible alias: the original single `encoder` ended up fitted on `region`.
encoder = region_encoder
df.head()

Unnamed: 0,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,region,date
0,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,1,1,2012-06-01
1,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,1,1,2012-06-02
2,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,1,1,2012-06-03
3,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,1,1,2012-06-04
4,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,1,1,2012-06-05


### Outliers

In [8]:
def outlier_index_zscore(data, threshold=3):
    """Locate values whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    data : array-like
        Numeric values to score (e.g. one DataFrame column).
    threshold : float, default 3
        Cut-off on the absolute z-score; values strictly above it are flagged.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where``: for 1-D input, a one-element tuple whose
        first item holds the *positional* indices of the flagged values.
        These are positions, not index labels — they only coincide with the
        labels when the data carries a default 0..n-1 index.
    """
    z = np.abs(stats.zscore(data))
    return np.where(z > threshold)

In [9]:
# Gather the outlier positions of every screened column into one flat array
# (a position may appear more than once if a row is extreme in several columns).
outlier_columns = ['Rain', 'FFMC', 'DMC', 'DC', 'BUI']
indexes = np.concatenate(
    [outlier_index_zscore(df[column])[0] for column in outlier_columns]
)

In [10]:
# Work on an independent (deep) copy so `df` keeps every row
trim_df = df.copy(deep=True)

In [11]:
# Keep only the rows whose index label is not among the flagged outliers.
# A boolean mask is used instead of a `set` indexer: sets are rejected by
# `.loc` in pandas >= 2.0 and carry no guaranteed order, whereas the mask
# preserves the original row order and is safe to re-run.
trim_df = trim_df.loc[~trim_df.index.isin(indexes)]

### Train/Test split

In [12]:
# Drop the date column before modelling. Rebinding (rather than inplace) with
# errors='ignore' keeps this cell idempotent: re-running it no longer raises
# KeyError once the column is already gone.
trim_df = trim_df.drop(columns='date', errors='ignore')

In [13]:
# Independent features: every column except the target
X = trim_df.drop(columns='Classes')
# Dependent feature: the encoded class label
y = trim_df.loc[:, 'Classes']

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [15]:
# Shapes of the four split parts, in the order they were returned
tuple(part.shape for part in (x_train, x_test, y_train, y_test))


((154, 11), (77, 11), (154,), (77,))

### Boosting (AdaBoost)

In [22]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, classification_report

In [17]:
# Seed the ensemble so the fitted model and the scores below are reproducible
# on a fresh Restart & Run All.
abc_model = AdaBoostClassifier(random_state=42)

In [19]:
# Train the default AdaBoost model on the training split
abc_model.fit(X=x_train, y=y_train)

In [20]:
# Predicted class codes for the held-out rows
y_pred = abc_model.predict(X=x_test)

In [24]:
# Accuracy on the training split; a perfect score here suggests the model has overfit
abc_model.score(X=x_train, y=y_train)

1.0

### Performance metrics

In [27]:
# Accuracy of the baseline model on the held-out split
accuracy_score(y_true=y_test, y_pred=y_pred)

0.974025974025974

In [29]:
# Display names for the encoded classes, listed in code order (0, then 1)
class_labels = ['Fire', 'Not fire']
baseline_report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=class_labels,
)
print(baseline_report)

              precision    recall  f1-score   support

        Fire       1.00      0.96      0.98        49
    Not fire       0.93      1.00      0.97        28

    accuracy                           0.97        77
   macro avg       0.97      0.98      0.97        77
weighted avg       0.98      0.97      0.97        77



### Hyperparameter Tuning

In [30]:
from sklearn.model_selection import RandomizedSearchCV

In [32]:
# Candidate values for the two AdaBoost hyperparameters being searched
grid_param = dict(
    n_estimators=[90, 100, 115, 130],
    learning_rate=[0.0001, 0.01, 0.1, 0.5],
)

In [34]:
# Randomized search over `grid_param` with 3-fold cross-validation.
# random_state pins which parameter combinations get sampled, so the chosen
# model and the scores below are reproducible across runs.
random_search = RandomizedSearchCV(
    estimator=abc_model,
    param_distributions=grid_param,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

In [36]:
# Run the hyperparameter search on the training split
random_search.fit(X=x_train, y=y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [37]:
# Held-out predictions from the search object
rs_predit = random_search.predict(X=x_test)

In [38]:
# Score of the search object on the training split
random_search.score(X=x_train, y=y_train)

0.9935064935064936

In [39]:
# Accuracy of the tuned model on the held-out split
accuracy_score(y_true=y_test, y_pred=rs_predit)

0.961038961038961

In [41]:
# Use the same class display names as the baseline report so the two tables
# can be compared row-for-row (code 0 -> 'Fire', code 1 -> 'Not fire').
print(classification_report(y_test, rs_predit, target_names=['Fire', 'Not fire']))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        49
           1       0.90      1.00      0.95        28

    accuracy                           0.96        77
   macro avg       0.95      0.97      0.96        77
weighted avg       0.96      0.96      0.96        77

