# Regression Metrics 

In [1]:
# Data Mining 
import numpy as np
import pandas as pd 

# Model Building
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Regression Metrics 
import sklearn.metrics as metrics

In [2]:
df = pd.read_csv("forestfires.csv")
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


## Building MLR Models to Predict Area Affected by Forest Fires

### _Model 1 : Using Temperature & Rain as Predictor Variables_

In [3]:
# Defining x and y variables 
x1 = df[['temp','rain']]
y1 = df['area']

# Splitting into training and testing sets
X1_train, X1_test, y1_train, y1_test = train_test_split(x1, y1,test_size=0.1)

# Fitting linear regression model on training data 
linreg1 = LinearRegression()
linreg1.fit(X1_train, y1_train)

# Making predictions on testing data 
y_pred1 = linreg1.predict(X1_test)

### _Model 2 : Using Engineered Feature "Summer Months", Temperature, & Rain as Predictor Variables_

In [4]:
# Engineering new feature 'Summer'

# Grabbing summer months 
summer = ['may','jun','jul','aug']

# Creating function to change 'Month' to binary variable column 
def summer_flag(x):
    summer = []
    if x in summer: 
        return 1
    else: 
        return 0

# Applying function to 'month' column
df['month'] = df['month'].apply(summer_flag)

# Inspecting
df['month'].head(5)

0    0
1    0
2    0
3    0
4    0
Name: month, dtype: int64

In [5]:
# Defining x and y variables 
x2 = df[['month','temp','rain']]
y2 = df['area']

# Splitting into training and testing sets
X2_train, X2_test, y2_train, y2_test = train_test_split(x2, y2,test_size=0.1)

# Fitting linear regression model on training data 
linreg2 = LinearRegression()
linreg2.fit(X2_train, y2_train)

# Making predictions on testing data 
y_pred2 = linreg2.predict(X2_test)

## Regression Metrics
Once models have been build models, we want to evaluate their performance. There are six different regressionn metrics that assist in doing so : 
>  ### Mean Squared Error 
   - Represents average distance squared from the predicted value
   - Very common metric; part of other calculations
   - Not in originnal units of Y & heavily affected by outliers
   - Goal : Get MSE as close to 0 as possible 

> ### Root Mean Squared Error
    - Represents average distance from the predicted value
    - Pretty common; part of other calculations
    - In original units of Y but still heavily affected by outliers
    - Goal : Get RMSE as close to 0 as possible 

> ### Mean Squared Log Error
    - Represents average distance from the predicted value
    - Not in original units of Y, but Logarithm accounts for large values of predicted and observed values
    - Goal : Get MSLE as close to 0 as possible 
> - Median Absolute Error
    - Represents median distance from the predicted value 
    - In original units of Y & nnoy heavily affected by outliers
    - Goal : Get MedAE  as close to 0 as possible
> - $R^2$ Score 
    - Interpretation :  An $R^2$ value of 0.8 means that 80% of the variability in the data are explained by our model
    - Common metric 
    - As you add more variables, $R^2$ will never decrease
    - Goal: Get $R^2$ as close to 1 as possible
> - Adjusted $R^2$ Score  
    - Interpretation :  Isn't exactly the same as $R^2$, although we often treat it as such
    - Penalizes for "bad" variables
    - Goal : Get $R^2_{adj}$ as close to 1 as possible
    
- The metrics detailed above can be easily accessed utilizing sklearn and pandas, the formulas for which are shown below : 

### Model 1: 

In [None]:
r2_score = metrics.r2_score(y1_test,y_pred1)
mean_squared_error = metrics.mean_squared_error(y1_test,y_pred1)
root_mean_squared_error = (metrics.mean_squared_error(y1_test,y_pred1)) ** 0.5
mean_squared_absolute_error = metrics.mean_absolute_error(y1_test,y_pred1)

In [None]:
# Printing Regression Metrics
print('r2_score:',r2_score)
print('mean squared error:',mean_squared_error)
print('root mean squared error:',root_mean_squared_error)
print('mean squared absolute error:',mean_squared_absolute_error)

### _Model 2:_

In [None]:
r2_score = metrics.r2_score(y2_test,y_pred2)
mean_squared_error = metrics.mean_squared_error(y2_test,y_pred2)
root_mean_squared_error = (metrics.mean_squared_error(y2_test,y_pred2)) ** 0.5
mean_squared_absolute_error = metrics.mean_absolute_error(y2_test,y_pred2)

In [None]:
# Printing Regression Metrics
print('r2_score:',r2_score)
print('mean squared error:',mean_squared_error)
print('root mean squared error:',root_mean_squared_error)
print('mean squared absolute error:',mean_squared_absolute_error)

## Model Picking Based on Regression Metrics 
Comparing the four regression metrics produced above to select the best performing model 

### _r2 score-Model 2 Wins_
   - Model 1 : -0.19
   - Model 2 : -0.08

### _MSE - Model 1 Wins_ 
   - Model 1 : 299.04
   - Model 2 : 1633.25

### _RMSE - Model 1 Wins_
   - Model 1 : 17.29
   - Model 2 : 40.41

### _MSAE - Model 1 Wins_ 
   - Model 1 : 14.18
   - Model 2 : 21.18
    
# ---> Model 1 is the winner <---