# Algeria Forest Fires Prediction

## Import Libraries 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score
from pandas_profiling import ProfileReport
from sklearn.svm import SVC
from sklearn.metrics import classification_report

## Import Dataset

In [2]:
forest_fires_data = pd.read_csv('dataset/Algerian_forest_fires.csv')
forest_fires_data.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Bejaia Region Dataset
day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
01,06,2012,29,57,18,0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
02,06,2012,29,61,13,1.3,64.4,4.1,7.6,1,3.9,0.4,not fire
03,06,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
04,06,2012,25,89,13,2.5,28.6,1.3,6.9,0,1.7,0,not fire


### Remove Header

In [3]:
forest_fires_data = pd.read_csv('dataset/Algerian_forest_fires.csv', header=1)
forest_fires_data.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


## Exploratory Data Analysis

<b>View Columns<b>

In [4]:
forest_fires_data.columns

Index(['day', 'month', 'year', 'Temperature', ' RH', ' Ws', 'Rain ', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes  '],
      dtype='object')

<b> Remove whitespaces from column names <b>

In [5]:
for column in forest_fires_data.columns:
    new_column = column.strip()
    forest_fires_data.rename(columns={column: new_column}, inplace=True)
forest_fires_data.columns

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes'],
      dtype='object')

<b>First top 5 rows from forest_fires_data<b>

In [6]:
forest_fires_data.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


<b>Describe forest_fires_data<b>

In [7]:
forest_fires_data.describe()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
count,246,245,245,245,245,245,245,245.0,245.0,245,245.0,245,245.0,244
unique,33,5,2,20,63,19,40,174.0,167.0,199,107.0,175,128.0,9
top,1,8,2012,35,55,14,0,88.9,7.9,8,1.1,3,0.4,fire
freq,8,62,244,29,10,43,133,8.0,5.0,5,8.0,5,12.0,131


<b>Dataset Info<b>

In [8]:
forest_fires_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246 entries, 0 to 245
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   day          246 non-null    object
 1   month        245 non-null    object
 2   year         245 non-null    object
 3   Temperature  245 non-null    object
 4   RH           245 non-null    object
 5   Ws           245 non-null    object
 6   Rain         245 non-null    object
 7   FFMC         245 non-null    object
 8   DMC          245 non-null    object
 9   DC           245 non-null    object
 10  ISI          245 non-null    object
 11  BUI          245 non-null    object
 12  FWI          245 non-null    object
 13  Classes      244 non-null    object
dtypes: object(14)
memory usage: 27.0+ KB


<b>Get dimension and number of of the dataset<b>

In [9]:
print(f'Dimension is {forest_fires_data.ndim}')

Dimension is 2


In [10]:
print(f'Rows and Columns {forest_fires_data.shape}')

Rows and Columns (246, 14)


<b>Find Missing Values<b>

In [11]:
forest_fires_data.isnull().sum()

day            0
month          1
year           1
Temperature    1
RH             1
Ws             1
Rain           1
FFMC           1
DMC            1
DC             1
ISI            1
BUI            1
FWI            1
Classes        2
dtype: int64

In [12]:
sns.heatmap(forest_fires_data.isnull(), xticklabels=True, yticklabels=False, cmap='viridis')

<AxesSubplot:>

<b>Inference</b>

Null values present in every column except 'day'. Maximum null values were 2. Classes column has 2 null values while other columns have 1 null value.

<b>Find row wise null values</b>

In [13]:
forest_fires_data[forest_fires_data.isnull().any(axis=1)]

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
122,Sidi-Bel Abbes Region Dataset,,,,,,,,,,,,,
167,14,7.0,2012.0,37.0,37.0,18.0,0.2,88.9,12.9,14.6 9,12.5,10.4,fire,


**Inference:**

From above data it is clear that from index 0 to 121,the data is related to Bejaia Region and from index 122, data is related to Sidi-Bel Abbes Region. So for partitioning this,  a new column is added named **Region** and for *Bejaia Region* is labelled as 1 and for *Sidi-Bel Abbes Region* is labelled as 2. Then deleting 122nd row since all other values are null.

On 167th row, it is clearly visible that in column DC, the values are misplaced. So shift the values to right. So that the row also get cleared.

In [14]:
forest_fires_data.loc[:122, 'Region'] = 1 # 1 for Bejaia Region
forest_fires_data.loc[122:, 'Region'] = 2 # 2 for Sidi Bel Abbes Region
forest_fires_data.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,1.0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,1.0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,1.0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,1.0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,1.0


<b>Delete 122nd Record</b>

In [15]:
forest_fires_data.drop(122, inplace=True)

<b>Splitting Column DC of 167th Row</b>

In [16]:
DC_Split=forest_fires_data.iloc[166]['DC'].split(' ')
forest_fires_data.loc[167, 'DC'] = DC_Split[0]

In [17]:
forest_fires_data.iloc[[166]]

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
167,14,7,2012,37,37,18,0.2,88.9,12.9,14.6,12.5,10.4,fire,,2.0


<b>Save ISI column value to new variable</b>

In [18]:
ISI_Col = forest_fires_data.loc[167, 'ISI']

# 2nd part of splitted DC is assigned to ISI
forest_fires_data.loc[167, 'ISI'] = DC_Split[1]

<b>Save BUI column value to new variable</b>

In [19]:
BUI_Col = forest_fires_data.loc[167, 'BUI']

# Assign ISI_Col value to BUI
forest_fires_data.loc[167, 'BUI'] = ISI_Col

<b>Save FWI column value to a variable<b>

In [20]:
FWI_Col = forest_fires_data.loc[167, 'FWI']

# Assign BUI_Col value to FWI
forest_fires_data.loc[167, 'FWI'] = BUI_Col

# Assign FWI_Col value to Classes
forest_fires_data.loc[167, 'Classes'] = FWI_Col

In [21]:
forest_fires_data.loc[[167]]

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
167,14,7,2012,37,37,18,0.2,88.9,12.9,14.6,9,12.5,10.4,fire,2.0


<b>Check Missing Values<b>

In [22]:
forest_fires_data.isnull().sum()

day            0
month          0
year           0
Temperature    0
RH             0
Ws             0
Rain           0
FFMC           0
DMC            0
DC             0
ISI            0
BUI            0
FWI            0
Classes        0
Region         0
dtype: int64

**Inference:**

No missing Values in the dataset.

<b>Get All Unique Value<b>

In [23]:
forest_fires_data.nunique()

day             32
month            5
year             2
Temperature     20
RH              63
Ws              19
Rain            40
FFMC           174
DMC            167
DC             199
ISI            107
BUI            174
FWI            128
Classes          9
Region           2
dtype: int64

In [24]:
for feature in forest_fires_data.columns:
    # choose the features whose unique values are less than 10.
    if len(forest_fires_data[feature].unique()) <= 10:
        print(f'Unique values of {feature} are {forest_fires_data[feature].unique()}')

Unique values of month are ['06' '07' '08' '09' 'month']
Unique values of year are ['2012' 'year']
Unique values of Classes are ['not fire   ' 'fire   ' 'fire' 'fire ' 'not fire' 'not fire ' 'Classes  '
 'not fire     ' 'not fire    ']
Unique values of Region are [1. 2.]


<b>Inference<b>

    * In month column there are 5 unique values, in which there is a value called 'month'. Remove the row containing column value 'month'.
    * In year column there are 2 unique values, in which there is a value called 'year'. Remove the row containing column value 'year'.
    * In Classes column, there are whitespaces on both ends. So remove them.
    * In Region column, 2 unique value 1 and 2.

<b>Find Row with value month under month column<b>

In [25]:
forest_fires_data[forest_fires_data['month']=='month']

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region
123,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,2.0


In [26]:
# Remove this invalid row,
forest_fires_data.drop(123, inplace=True)

In [27]:
# check above row is removed or not
forest_fires_data[forest_fires_data['month']=='month']

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region


<b>Find row with valuer year under year column<b>

In [28]:
forest_fires_data[forest_fires_data['year'] == 'year']

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region


**Inference:**

    * No rows exist with that value.

<b>Remove whitespace from Classes column<b>

In [29]:
for col_value in forest_fires_data['Classes'].unique():
    new_col_val = col_value.strip() # remove white spaces
    forest_fires_data.loc[forest_fires_data['Classes'] == col_value, 'Classes'] = new_col_val # assign updated column values to Classes

In [30]:
forest_fires_data['Classes'].unique()

array(['not fire', 'fire'], dtype=object)

<b>Convert Column data types to int32 type<b>

In [31]:
forest_fires_data.dtypes

day             object
month           object
year            object
Temperature     object
RH              object
Ws              object
Rain            object
FFMC            object
DMC             object
DC              object
ISI             object
BUI             object
FWI             object
Classes         object
Region         float64
dtype: object

Convert datatypes of day, month, year, Temperature, RH, Ws and Region to int32 type.

In [32]:
features = ['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Region']
for feature in features:
    forest_fires_data[feature] = forest_fires_data[feature].astype('int32')
forest_fires_data.dtypes

day             int32
month           int32
year            int32
Temperature     int32
RH              int32
Ws              int32
Rain           object
FFMC           object
DMC            object
DC             object
ISI            object
BUI            object
FWI            object
Classes        object
Region          int32
dtype: object

Convert datatypes of Rain, FFMC, DMC, DC, ISI, BUI and FWI to float64 type.

In [33]:
features = forest_fires_data.columns[6:13].tolist()
features

['Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI']

In [34]:
for feature in features:
    forest_fires_data[feature] = forest_fires_data[feature].astype('float64')
forest_fires_data.dtypes

day              int32
month            int32
year             int32
Temperature      int32
RH               int32
Ws               int32
Rain           float64
FFMC           float64
DMC            float64
DC             float64
ISI            float64
BUI            float64
FWI            float64
Classes         object
Region           int32
dtype: object

<b>Fixing Duplicate Rows</b>

In [35]:
forest_fires_data.loc[forest_fires_data.duplicated(), :]

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region


**Inference:**

No duplciate records in dataset.

<b>Map month number to month name</b>

In [36]:
forest_fires_data['month']

0      6
1      6
2      6
3      6
4      6
      ..
241    9
242    9
243    9
244    9
245    9
Name: month, Length: 244, dtype: int32

In [37]:
month_names = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December'
}

In [38]:
forest_fires_data['month'] = forest_fires_data['month'].map(month_names)

In [39]:
forest_fires_data['month']

0           June
1           June
2           June
3           June
4           June
         ...    
241    September
242    September
243    September
244    September
245    September
Name: month, Length: 244, dtype: object

<b>Map Classes column with 0 and 1<b>

Add a new column Class with mapped values.

* 0 = not Fire
* 1 = fire

In [40]:
forest_fires_data['Classes']

0      not fire
1      not fire
2      not fire
3      not fire
4      not fire
         ...   
241        fire
242    not fire
243    not fire
244    not fire
245    not fire
Name: Classes, Length: 244, dtype: object

In [41]:
forest_fires_data['Class']=forest_fires_data['Classes'].map({
    'not fire': 0,
    'fire': 1
})

In [42]:
forest_fires_data[['Class', 'Classes']].head()

Unnamed: 0,Class,Classes
0,0,not fire
1,0,not fire
2,0,not fire
3,0,not fire
4,0,not fire


## Data Visualization

<b>Plot a Correlation Matrix<b>

In [43]:
matrix = forest_fires_data.corr()
plt.figure(figsize=(12,12))
sns.heatmap(matrix, annot=True, cmap='RdYlGn')

<AxesSubplot:>

<b>Find outliers in numerical features using boxplot</b>

In [44]:
forest_fires_data.dtypes

day              int32
month           object
year             int32
Temperature      int32
RH               int32
Ws               int32
Rain           float64
FFMC           float64
DMC            float64
DC             float64
ISI            float64
BUI            float64
FWI            float64
Classes         object
Region           int32
Class            int64
dtype: object

Get numerical features

In [45]:
numerical_features = forest_fires_data.select_dtypes(include=['int32', 'int64', 'float64']).columns
numerical_features

Index(['day', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC', 'DMC', 'DC',
       'ISI', 'BUI', 'FWI', 'Region', 'Class'],
      dtype='object')

In [46]:
for feature in numerical_features:
    # sns.set(rc={'figure.figsize':(12,12)})
    sns.boxplot(x=forest_fires_data[feature])
    plt.xlabel(feature)
    plt.ylabel(feature)
    plt.show()

  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()


**Inference:**

Outliers in Temparature, Ws, Rain, FFMC, DMC, DC, ISI, BUI and FWI .

<b>Relationship b/w not fire and fire category<b>

In [47]:
sns.set(rc={'figure.figsize':(10,10)})
sns.countplot(x='Classes', data=forest_fires_data)

<AxesSubplot:xlabel='Classes', ylabel='count'>

**Inference:**

More records have *fire* category than *not fire* category.

<b>Scatterplot to show relationship between Month and Temparature<b>

In [48]:
sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='month', 
                y='Temperature', 
                hue='Classes',
                size='Temperature', 
                sizes=(20, 200),
                legend="full",
                data=forest_fires_data)

<AxesSubplot:xlabel='Classes', ylabel='count'>

**Inference:**

    *Highest temperature is recorded in month of August when there was Fire.
    *Lowest temperature is recorded in month of September when there was No Fire.

<b>Scatterplot to show relationship between Month and Rain<b>

In [49]:
forest_fires_data['Rain'].head()

0     0.0
1     1.3
2    13.1
3     2.5
4     0.0
Name: Rain, dtype: float64

In [50]:
sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x='month', 
                y='Rain', 
                hue='Classes', 
                size='Rain', 
                sizes=(20, 200),
                data=forest_fires_data)

<AxesSubplot:xlabel='Classes', ylabel='count'>

**Inference:**

    *Highest Rain is recorded in month of August when there was No Fire.
    *Lowest Rain is recorded in every month when there was Fire.

<b>Histogram to analyze distribution of data b/w features and Classes features<b>

In [51]:
forest_fires_data.columns
# sns.histplot(x='Temperature', data=forest_fires_data, kde=True, hue='Classes')

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes', 'Region', 'Class'],
      dtype='object')

<b>Histogram to analyze distribution of data b/w Rain and Classes features</b>

In [52]:
features = forest_fires_data.columns[3:13].tolist()
for feature in features:
    sns.histplot(
        x=feature,
        data=forest_fires_data,
        kde=True,
        hue='Classes',
    )
    plt.show()

  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()


<b>Group and plot the data by Classes feature<b>

In [53]:
forest_fires_data.groupby('Classes')[features].mean().T.plot(legend=True, kind='bar')

<AxesSubplot:>

<b>Create a pandas profile report of dataset<b>

In [54]:
pf=ProfileReport(forest_fires_data)
pf.to_notebook_iframe()

In [55]:
pf.to_file('df_report.html')

## Logistic Regression

<b>Split the Data into Dependent and Independent features<b>

In [56]:
forest_fires_data.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region,Class
0,1,June,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,1,0
1,2,June,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,1,0
2,3,June,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,1,0
3,4,June,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,1,0
4,5,June,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,1,0


<b>Update month name values to number format<b>

In [57]:
forest_fires_data['month'] = forest_fires_data['month'].map({
    'January': 1,
    'February': 2,
    'March': 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12
})

In [58]:
X = forest_fires_data.drop(['Classes', 'Class', 'year'], axis=1) # independent features
y = forest_fires_data['Class'] # independent feature

In [59]:
X.head()

Unnamed: 0,day,month,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
0,1,6,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,1
1,2,6,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,1
2,3,6,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,1
3,4,6,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,1
4,5,6,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,1


In [60]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [62]:
X_train.shape

(163, 13)

In [63]:
X_test.shape

(81, 13)

In [64]:
X_train.head()

Unnamed: 0,day,month,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
114,23,9,32,54,11,0.5,73.7,7.9,30.4,1.2,9.6,0.7,1
65,5,8,34,65,13,0.0,86.8,11.1,29.7,5.2,11.5,6.1,1
134,11,6,31,42,21,0.0,90.6,18.2,30.5,13.4,18.0,16.7,2
209,25,8,34,40,18,0.0,92.1,56.3,157.5,14.3,59.5,31.1,2
164,11,7,34,56,15,2.9,74.8,7.1,9.5,1.6,6.8,0.8,2


In [65]:
y_train.head()

114    0
65     1
134    1
209    1
164    0
Name: Class, dtype: int64

<b>Apply Logistic Regression<b>

Logistic Regression is used to predict the class of the data.

In [66]:
regressor = LogisticRegression()
regressor

LogisticRegression()

In [67]:
from sklearn.model_selection import GridSearchCV
parameters={'C': [0.5,1,2,3,5,6,7,9,10,30,45], 'penalty': ['l2', 'elsaticnet']}
log_regressor = GridSearchCV(
            regressor,
            cv=5, 
            param_grid=parameters,
            scoring='accuracy'
            )
log_regressor.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [0.5, 1, 2, 3, 5, 6, 7, 9, 10, 30, 45],
                         'penalty': ['l2', 'elsaticnet']},
             scoring='accuracy')

In [68]:
print(log_regressor.best_params_)

{'C': 0.5, 'penalty': 'l2'}


In [69]:
print(log_regressor.best_score_)

0.9672619047619048


**Inference:**

    *Logistic regressor giving the best accuracy score of 0.967 or 96.7%

<b>Predict the Test Data<b>

In [70]:
prediction_log = log_regressor.predict(X_test)
prediction_log

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=int64)

<b>f1 score, accuracy score and classification report of model<b>

In [71]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report

In [72]:
confusion_matrix(y_test, prediction_log)

array([[28,  0],
       [ 0, 53]], dtype=int64)

In [73]:
f1_score(y_test, prediction_log)

1.0

In [74]:
accuracy_score(y_test, prediction_log)

1.0

In [75]:
print(classification_report(y_test, prediction_log))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        53

    accuracy                           1.00        81
   macro avg       1.00      1.00      1.00        81
weighted avg       1.00      1.00      1.00        81



**Inference:**

    *Logistic regresion gives f1 score of 1.0 or 100%.
    *Logistic regresion gives accuracy score of 1.0 or 100%.

## KNN

<b>Standardize the Variables<b>

Because the KNN classifier predicts the class of a given test observation by identifying the observations that are nearest to it, the scale of the variables matters. Any variables that are on a large scale will have a much larger effect on the distance between the observations, and hence on the KNN classifier, than variables that are on a small scale.

In [76]:
from sklearn.preprocessing import StandardScaler

In [77]:
scalar = StandardScaler()

In [78]:
scalar.fit(forest_fires_data.drop(['Classes', 'Class'], axis=1))

StandardScaler()

In [79]:
scaled_features = scalar.transform(forest_fires_data.drop(['Classes', 'Class'], axis=1))

In [80]:
scaled_features

array([[-1.67527776, -1.35052592,  0.        , ..., -0.93655635,
        -0.88345707, -1.        ],
       [-1.56173115, -1.35052592,  0.        , ..., -0.9012768 ,
        -0.89694665, -1.        ],
       [-1.44818455, -1.35052592,  0.        , ..., -0.98594772,
        -0.9374154 , -1.        ],
       ...,
       [ 1.39048054,  1.35052592,  0.        , ..., -0.93655635,
        -0.92392582,  1.        ],
       [ 1.50402714,  1.35052592,  0.        , ..., -0.81660589,
        -0.85647791,  1.        ],
       [ 1.61757374,  1.35052592,  0.        , ..., -0.83777362,
        -0.88345707,  1.        ]])

In [81]:
features = ['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Region']
pd.DataFrame(scaled_features, columns=features).head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
0,-1.675278,-1.350526,0.0,-0.874735,-0.332478,0.88999,-0.381223,-0.851801,-0.913365,-0.877239,-0.834479,-0.936556,-0.883457,-1.0
1,-1.561731,-1.350526,0.0,-0.874735,-0.063185,-0.892913,0.270307,-0.942658,-0.856651,-0.877239,-0.906836,-0.901277,-0.896947,-1.0
2,-1.448185,-1.350526,0.0,-1.702005,1.350607,2.316313,6.184192,-2.151758,-0.986283,-0.88776,-1.075669,-0.985948,-0.937415,-1.0
3,-1.334638,-1.350526,0.0,-1.977761,1.821872,-0.892913,0.871719,-3.444727,-1.083506,-0.891969,-1.148026,-1.056507,-0.950905,-1.0
4,-1.221091,-1.350526,0.0,-1.426248,1.01399,0.176829,-0.381223,-0.914702,-0.945773,-0.738355,-0.858598,-0.901277,-0.883457,-1.0


In [82]:
forest_fires_data.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes,Region,Class
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire,1,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire,1,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire,1,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire,1,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire,1,0


<b>Pairplot of data with respect to target variable<b>

In [83]:
sns.pairplot(forest_fires_data.drop(['Classes'], axis=1), hue='Class')



<seaborn.axisgrid.PairGrid at 0x22084a3a7f0>

<b>Train Test Split the Data<b>

In [84]:
from sklearn.model_selection import train_test_split

In [85]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, forest_fires_data['Class'], test_size=0.33, random_state=42)

In [86]:
X_train

array([[ 0.82274752,  1.35052592,  0.        , ..., -0.49908995,
        -0.85647791, -1.        ],
       [-1.22109134,  0.45017531,  0.        , ..., -0.36502766,
        -0.12804046, -1.        ],
       [-0.53981172, -1.35052592,  0.        , ...,  0.09360648,
         1.30185526,  1.        ],
       ...,
       [-1.67527776,  1.35052592,  0.        , ..., -1.04945091,
        -0.9374154 , -1.        ],
       [ 1.39048054, -0.45017531,  0.        , ..., -0.06867945,
         0.26315743,  1.        ],
       [-0.53981172,  1.35052592,  0.        , ..., -1.00711545,
        -0.91043623, -1.        ]])

In [87]:
y_train

114    0
65     1
134    1
209    1
164    0
      ..
106    0
14     0
92     0
181    1
102    0
Name: Class, Length: 163, dtype: int64

### Apply KNN

k-nearest neighbors is a classification algorithm that is used to predict the class of a new observation based on the observations of the training set.

In [88]:
from sklearn.neighbors import KNeighborsClassifier

In [89]:
knn = KNeighborsClassifier(n_neighbors=1)

In [90]:
knn

KNeighborsClassifier(n_neighbors=1)

In [91]:
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

<b>Prediction<b>

In [92]:
prediction = knn.predict(X_test)

In [93]:
prediction

array([1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=int64)

<b>Generate classification report and confusion_matrix<b>

In [94]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report

In [95]:
confusion_matrix(y_test, prediction)

array([[25,  3],
       [ 8, 45]], dtype=int64)

In [96]:
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.76      0.89      0.82        28
           1       0.94      0.85      0.89        53

    accuracy                           0.86        81
   macro avg       0.85      0.87      0.86        81
weighted avg       0.88      0.86      0.87        81



In [97]:
f1_score(y_test, prediction)

0.8910891089108911

In [98]:
accuracy_score(y_test, prediction)

0.8641975308641975

**Inference:**

    *knn gives f1 score of 0.89 or 89.1%
    *knn gives accuracy score of 0.864 or 86.4%


### Choose a K Value

<b>Find accuracy rate<b>

In [99]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
accuracy_rate = []
X = forest_fires_data.drop(['Classes', 'Class', 'year'], axis=1) # independent features
y = forest_fires_data['Class'] # dependent feature
for n in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=n)
    scores = cross_val_score(knn, X, y, cv=3, scoring='accuracy')
    accuracy_rate.append(scores.mean())
print(accuracy_rate)

[0.8526046371574827, 0.8323296195924922, 0.8852755194218608, 0.8770450667469638, 0.8933554150356319, 0.8891900030111413, 0.8933554150356319, 0.8973702699989962, 0.8606343470842116, 0.8647495734216601, 0.8688146140720666, 0.8688647997591087, 0.8770450667469637, 0.8729298404095153, 0.8770450667469638, 0.8688146140720666, 0.8729298404095153, 0.8646993877346181, 0.8729298404095153, 0.8646993877346181, 0.8729298404095153, 0.8646993877346181, 0.8729298404095153, 0.8729298404095153, 0.8688146140720666, 0.8646993877346181, 0.8728294690354311, 0.8523537087222723, 0.8728294690354311, 0.8728294690354311, 0.8687142426979825, 0.8523537087222723, 0.8646993877346181, 0.8564689350597209, 0.8646993877346181, 0.8524038944093144, 0.8524038944093144, 0.8524038944093144, 0.8482886680718659]


<b>Plot the Accuracy rate<b>

In [100]:
plt.figure(figsize=(10,6))
plt.plot(range(1,40), 
         accuracy_rate, 
         color='blue', 
         linestyle='dashed',
         marker='o',
         markerfacecolor='red',
         markersize=10
         )
plt.title('Accuracy Rate vs K Value')

Text(0.5, 1.0, 'Accuracy Rate vs K Value')

**Inference:**

Here we can observe when k=0, accurac rate is between 0.85 and 0.86. When the k value increases, the accuracy rate had a steep fall at first. When k value further increased, the accuracy rate increased. The maximum accuracy rate reached at 0.90 or 90.0% when k value is 8. The minimum accuracy rate reached at 0.83 or 83.0% when k value is 2.

<b>Find error rate<b>

In [101]:
error_rate = []
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    error_rate.append(np.mean(y_pred != y_test))

<b>Plot the error rate<b>

In [102]:
plt.figure(figsize=(10,6))
plt.plot(range(1,40), 
         error_rate, 
         color='blue', 
         linestyle='dashed',
         marker='o',
         markerfacecolor='red',
         markersize=10
         )
plt.title('Error Rate vs K Value')

Text(0.5, 1.0, 'Error Rate vs K Value')

**Inference:**

Here we can observe when k=0, error rate is between 0.13 and 0.14. When the k value increases, the error rate had a steep rise at first. When k value further increased, the error rate decreased. The maximum error rate reached at 0.18 or 18.0% when k value is 2. The minimum error rate reached at 0.1 or 1% when k value is near to 30.

## Decision Tree Classifier

In [103]:
classifier = DecisionTreeClassifier(criterion='entropy')

In [104]:
classifier

DecisionTreeClassifier(criterion='entropy')

In [105]:
X = forest_fires_data.drop(['Classes', 'Class', 'year'], axis=1) # independent features
y = forest_fires_data['Class'] # dependent feature

In [106]:
X.head()

Unnamed: 0,day,month,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
0,1,6,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,1
1,2,6,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,1
2,3,6,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,1
3,4,6,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,1
4,5,6,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,1


In [107]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

In [108]:
classifier.fit(X, y)

DecisionTreeClassifier(criterion='entropy')

In [109]:
tree_parameters = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
classifier_new = GridSearchCV(DecisionTreeClassifier(), tree_parameters, cv=5)
classifier_new.fit(X, y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30,
                                       40, 50, 70, 90, 120, 150]})

In [110]:
print(classifier_new.best_params_)

{'criterion': 'gini', 'max_depth': 11}


In [111]:
print(classifier_new.best_score_)

0.9795068027210885


Inference:

    *Decision tree classifier gives best score of 0.98 or 98.0% when max_depth is 11 and criterion is gini.

<b>Construct a decison tree<b>

In [112]:
from sklearn import tree
plt.figure(figsize=(15,10))
tree.plot_tree(classifier_new, filled=True)

AttributeError: 'GridSearchCV' object has no attribute 'tree_'

<b>Train Test Split<b>

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
pd.DataFrame(X_train).head()

Unnamed: 0,day,month,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
114,23,9,32,54,11,0.5,73.7,7.9,30.4,1.2,9.6,0.7,1
65,5,8,34,65,13,0.0,86.8,11.1,29.7,5.2,11.5,6.1,1
134,11,6,31,42,21,0.0,90.6,18.2,30.5,13.4,18.0,16.7,2
209,25,8,34,40,18,0.0,92.1,56.3,157.5,14.3,59.5,31.1,2
164,11,7,34,56,15,2.9,74.8,7.1,9.5,1.6,6.8,0.8,2


In [None]:
y_train.head()

114    0
65     1
134    1
209    1
164    0
Name: Class, dtype: int64

<b>Result prediction<b>

In [None]:
prediction_res = classifier_new.predict(X_test)

In [None]:
prediction_res

array([1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1], dtype=int64)

<b>Generate f1 score,accuracy score, confusion matrix and classification_report<b>

In [None]:
f1_score(y_test, prediction_res)

1.0

In [None]:
confusion_matrix(y_test, prediction_res)

array([[28,  0],
       [ 0, 53]], dtype=int64)

In [None]:
accuracy_score(y_test, prediction_res)

1.0

In [None]:
print(classification_report(y_test, prediction_res))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       1.00      1.00      1.00        53

    accuracy                           1.00        81
   macro avg       1.00      1.00      1.00        81
weighted avg       1.00      1.00      1.00        81



**Inference:**

    *Decison tree classifier gives f1 score of 1.0 or 100%.
    *Decison tree classifier gives accuracy score of 1.0 or 100%.

## SVM

SVM is a supervised machine learning algorithm that is used to predict the class of a new observation based on the observations of the training set.

<b>Split data into dependent and independent features<b>

In [None]:
X = forest_fires_data.drop(['Classes', 'Class', 'year'], axis=1) # independent features
y = forest_fires_data['Class'] # dependent feature

In [None]:
X.head()

Unnamed: 0,day,month,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
0,1,6,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,1
1,2,6,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,1
2,3,6,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,1
3,4,6,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,1
4,5,6,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,1


In [None]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Class, dtype: int64

<b>Train test split the data<b>

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
X_train.head()

Unnamed: 0,day,month,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
173,20,7,36,50,16,0.0,89.9,32.7,71.0,9.5,32.6,17.3,2
89,29,8,35,48,18,0.0,90.1,54.2,220.4,12.5,67.4,30.2,1
159,6,7,35,42,15,0.3,84.7,15.5,45.1,4.3,16.7,6.3,2
221,6,9,34,71,14,6.5,64.5,3.3,9.1,1.0,3.5,0.4,2
236,21,9,35,34,17,0.0,92.2,23.6,97.3,13.8,29.4,21.6,2


In [None]:
y_test.head()

64     1
63     0
55     1
111    1
227    1
Name: Class, dtype: int64

<b>Use SVC Classifier with rbf kernel<b>

In [None]:
from sklearn.svm import SVC

In [None]:
classifier = SVC(kernel='rbf')

In [None]:
classifier.fit(X_train, y_train)

SVC()

<b>Get prediction,  accuracy score, f1 score and report of classifier when using rbf as kernel<b>

In [None]:
prediction_rbf = classifier.predict(X_test)
prediction_rbf

array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report

In [None]:
accuracy_score(y_test, prediction_rbf)

0.9344262295081968

In [None]:
f1_score(y_test, prediction_rbf)

0.9374999999999999

In [None]:
print(classification_report(y_test, prediction_rbf))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93        28
           1       0.97      0.91      0.94        33

    accuracy                           0.93        61
   macro avg       0.93      0.94      0.93        61
weighted avg       0.94      0.93      0.93        61



**Inference:**

    *SVC classifier gives f1 score of 0.937 or 93.7% when kernel is rbf.
    *SVC classifier gives accuracy score of 0.934 or 93.4% when kernel is rbf.

<b>Use SVC Classifier with linear kernel<b>

In [None]:
classifier_linear = SVC(kernel='linear')
classifier_linear

SVC(kernel='linear')

In [None]:
classifier_linear.fit(X_train, y_train)

SVC(kernel='linear')

<b>Get predited result<b>

In [None]:
prediction_linear = classifier_linear.predict(X_test)
prediction_linear

array([1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0], dtype=int64)

<b>Generate accuracy and f1 score, classification report of the classifier<b>

In [None]:
accuracy_score(y_test, prediction_linear)

0.9508196721311475

In [None]:
f1_score(y_test, prediction_linear)

0.9538461538461539

In [None]:
print(classification_report(y_test, prediction_linear))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95        28
           1       0.97      0.94      0.95        33

    accuracy                           0.95        61
   macro avg       0.95      0.95      0.95        61
weighted avg       0.95      0.95      0.95        61



**Inference:**

    *SVC classifier gives f1 score of 0.95 or 95.08% when kernel is polynomial.
    *SVC classifier gives accuracy score of 0.953 or 95.38% when kernel is polynomial.

**Conclusion:**

    *Comparing both the svc classifiers,  svc classifier with kernel as linear gives better accuracy score.

## AdaBoost

AdaBoost classifier builds a strong classifier by combining multiple poorly performing classifiers so that you will get high accuracy strong classifier. The basic concept behind Adaboost is to set the weights of classifiers and training the data sample in each iteration such that it ensures the accurate predictions of unusual observations. 

Adaboost should meet two conditions:
- The classifier should be trained interactively on various weighed training examples.
- In each iteration, it tries to provide an excellent fit for these examples by minimizing training error.

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# create ababoost classifier
adaBoost = AdaBoostClassifier(n_estimators=50, learning_rate=1)

# train adaboost classifier
adaBoost.fit(X_train, y_train)

AdaBoostClassifier(learning_rate=1)

In [None]:
X_train.head()

Unnamed: 0,day,month,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Region
61,1,8,36,45,14,0.0,78.8,4.8,10.2,2.0,4.7,0.9,1
148,25,6,34,70,16,0.0,86.0,12.8,25.6,5.4,12.7,6.7,2
52,23,7,27,66,22,0.4,68.2,10.5,71.3,1.8,15.4,2.1,1
66,6,8,32,75,14,0.0,86.4,13.0,39.1,5.2,14.2,6.8,1
26,27,6,34,53,18,0.0,89.0,21.6,80.3,9.2,25.8,15.0,1


In [None]:
y_test.head()

64     1
63     0
55     1
111    1
227    1
Name: Class, dtype: int64

<b>Prediction<b>

In [None]:
# predict the response for test dataset
prediction_adaBoost = adaBoost.predict(X_test)

In [None]:
prediction_adaBoost

array([1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1], dtype=int64)

<b>Evaluate Model<b>

In [None]:
print(accuracy_score(y_test, prediction_adaBoost))

0.9864864864864865


In [None]:
print(f1_score(y_test, prediction_adaBoost))

0.9876543209876543


In [None]:
print(classification_report(y_test, prediction_adaBoost))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        34
           1       0.98      1.00      0.99        40

    accuracy                           0.99        74
   macro avg       0.99      0.99      0.99        74
weighted avg       0.99      0.99      0.99        74



**Inference:**

    * The accuracy of the model is around 98%
    * The f1 score is around 98%

## Final Conclusion

Model Accuracy Scores:

|Models|Logistic Regreesion | KNN | Decision Tree| SVM | AdaBoost |
|:-:|:-:|:-:|:-:|:-:|:-:|
|Accuarcy| 100% | 86% | 100% | 95% | 98% |

## Prediction

In [113]:
import pickle

# choose decision tree as the classifier sinxe it provides the best accuracy 100%.
pickle.dump(classifier_new, open('model.pkl','wb'))

In [124]:
# train test split
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.33, random_state=42)


<b>Laod and predict the result<b>

In [123]:
# load the model from disk
loaded_model = pickle.load(open('model.pkl','rb'))

# make predictions using the loaded model
result = loaded_model.predict(X_test)

<b>Get user input and predict the class<b>

In [125]:
fires_test_data={
    "day" : 18,
    "month" : 5,
    "temperature" : 38,
    "RH" : 72,
    'Ws' : 18,
    'Rain' : 0,
    'FFMC' : 62.8,
    'DMC' : 3,
    'DC' : 16.7,
    'ISI' : 1.8,
    'BUI' : 3.3,
    'FWI' :  0.6,
    'Region' :2
}

In [130]:
predict_result = loaded_model.predict([list(fires_test_data.values())])
if(predict_result[0] == 1):
    print("Fire")
else:
    print("No Fire")

No Fire


<Give user input for prediction>