## Predicting Forest Fires with Regression ##
### Tess and Jalin ###

## 1. Import the necessary libraries for EDA and Regression.

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

## 2. Import the Dataset.

In [19]:
dataset=pd.read_csv('forestfires.csv')
dataset.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


## 3. Check for Missing Values.

In [20]:
dataset.isnull().sum()

X        0
Y        0
month    0
day      0
FFMC     0
DMC      0
DC       0
ISI      0
temp     0
RH       0
wind     0
rain     0
area     0
dtype: int64

## 4. Split the dataset into Independent and Dependent variables.

In [21]:
dataset.columns

Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain', 'area'],
      dtype='object')

In [25]:
X=dataset.drop('area',axis=1)
y=dataset[['area']]

## 5. Handling categorical variables.

In [27]:
X=pd.get_dummies(dataset[['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain']],drop_first=True)
X.head()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_may,month_nov,month_oct,month_sep,day_mon,day_sat,day_sun,day_thu,day_tue,day_wed
0,7,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,0,0,0,0,0,0,0
1,7,4,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,0,1,0,0,0,0,0,1,0
2,7,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,0,1,0,0,1,0,0,0,0
3,8,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,0,0,0,0,0,0,0
4,8,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,0,0,1,0,0,0


## 6. Splitting the dataset into the Training set, Test set and validation set.

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_rest,y_train,y_rest=train_test_split(X,y,
                                              test_size=.2,
                                              random_state=45)
X_test,X_val,y_test,y_val=train_test_split(X_rest,y_rest,
                                          test_size=.5,
                                          random_state=45)

## 7. Training the Multiple Linear Regression model on the Training set

In [None]:
from sklearn.linear_model import LinearRegression
regressor=LinearRegression()
regressor.fit(X_train.values,y_train)

## 8. Intercept and Coefficient

In [38]:
print('Intercept:',regressor.intercept_)
print('Coefficients: ',regressor.coef_)

Intercept: [-22.36049418]
Coefficients:  [[  2.50361999   1.01670929  -0.15942752   0.24030324  -0.1284822
   -0.65486319   1.4754396   -0.13010435   3.01150248  -2.86345623
   41.93859594  44.6972293    8.81609287  13.53538741  24.29784607
    1.19173865  -8.21311775 -17.09353071   0.52447676  70.00634724
   69.30323691   5.27281176  23.90182478   5.2294092   11.98105028
    3.2184349    6.23103906]]


In [39]:
X_train.columns

Index(['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain',
       'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'day_mon', 'day_sat', 'day_sun', 'day_thu', 'day_tue',
       'day_wed'],
      dtype='object')

## 9. Validation Set

In [40]:
y_predval=regressor.predict(X_val.values)

In [41]:
y_validate=y_val.to_numpy()

In [46]:
np.hstack((y_predval,y_validate)).round()

array([[ 18.,   0.],
       [ 47.,   0.],
       [ 48.,   0.],
       [-11.,   2.],
       [  7.,   6.],
       [  6.,   2.],
       [ 47.,  86.],
       [ 28.,  11.],
       [-12.,   0.],
       [  3.,   0.],
       [-20.,   0.],
       [-14.,  38.],
       [ 18.,  14.],
       [  5.,   0.],
       [ -1.,   0.],
       [  3.,  32.],
       [ -1.,   0.],
       [-11.,   0.],
       [ 16.,   0.],
       [ 35.,   3.],
       [  9.,   5.],
       [ 32.,   0.],
       [ -4.,   0.],
       [ 65.,   4.],
       [ 19.,  29.],
       [ 37.,   6.],
       [ 21.,   5.],
       [ 28.,   0.],
       [ 10.,   0.],
       [ 24.,   3.],
       [ 31.,  15.],
       [ 16.,  56.],
       [-13.,   1.],
       [ 13.,  26.],
       [-17.,   0.],
       [  7.,   7.],
       [ 22.,   2.],
       [ 29.,   0.],
       [ 12.,   0.],
       [ -6.,   0.],
       [  1.,   1.],
       [ 13.,   0.],
       [ -7.,  10.],
       [ -2.,   4.],
       [ 28.,  39.],
       [ 14.,   0.],
       [  9.,  36.],
       [  2.,

## 10. Predicting the Test Set Results.

In [47]:
y_pred=regressor.predict(X_test.values)

## 11. RMSE and R-Squared

In [51]:
from sklearn.metrics import mean_squared_error, r2_score
import math
print(f"R-squared: {r2_score(y_test,y_pred):.2f}")
print(f"MSE: {mean_squared_error(y_test,y_pred):.2f}")
print(f"RMSE: {math.sqrt(mean_squared_error(y_test,y_pred)):.2f}")

R-squared: -0.22
MSE: 1226.85
RMSE: 35.03
