# ML Workflow - Error evaluation (Regression)

![Image](./img/scikit_learn.png)

In [None]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import pickle
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [None]:
# Housing dataset

# Keep the raw sklearn Bunch under its own name so that `housing` only ever
# refers to the DataFrame inspected by the following cells.
housing_bunch = datasets.fetch_california_housing(as_frame=True)
description = housing_bunch.DESCR

# With as_frame=True the Bunch already exposes `frame`: the feature columns
# plus the target column in one DataFrame, so no manual index merge is needed.
housing = housing_bunch.frame
housing

In [None]:
# Column names, dtypes and non-null counts of the housing frame
housing.info()

In [None]:
# Summary statistics for the columns of the housing frame
housing.describe()

In [None]:
# Show the dataset description text captured from the DESCR attribute
print(description)

---

In [None]:
# Load datasets for ML

# return_X_y=True returns the features and the target as an (X, y) pair
X, y = datasets.fetch_california_housing(return_X_y=True)
# Alternative: uncomment to use a synthetic regression problem instead
#X, y = make_regression(n_samples=1000, n_features=10, random_state=42)

# Quick check of the feature and target shapes
print(X.shape, y.shape)

In [None]:
# Create train and test (validation) samples

# test_size=0.2 holds out 20% of the rows for evaluation; random_state fixes
# the shuffle so the split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

In [None]:
%%time

# Train the model and make predictions

regressor = LinearRegression()
# Alternative model: uncomment to compare against a random forest
#regressor = RandomForestRegressor()
# Keep the estimator's parameters (as returned by get_params) for the printout below
hyperparameters = regressor.get_params()
# Fit on the training split only
regressor.fit(X_train, y_train)
# Predictions for the held-out split; the metric cells below compare these with y_test
y_pred = regressor.predict(X_test)

# Report the model, its parameters, and the true vs. predicted targets
print('Model:', regressor, '\n')
print('Model hyperparameters:', hyperparameters, '\n')
print('Ground truth target:', y_test, '\n')
print('Predicted target:', y_pred, '\n')

---

## [Error evaluation (Regression)](https://scikit-learn.org/stable/modules/model_evaluation.html)

![Image](./img/error_regression.jpeg)

### MSE (Mean Square Error)

Mean Squared Error measures the average squared difference between the predicted values and the actual values.

__Potential problems:__ Very sensitive to outliers.

![Image](./img/mse.JPG)

In [None]:
# MSE calculation

# Mean of the squared differences between y_test and y_pred (displayed, not stored)
mean_squared_error(y_test, y_pred)

__Interpretation:__ The smaller the mean squared error, the closer you are to finding the best fit.

---

### RMSE (Root Mean Square Error)

Root Mean Squared Error is just the square root of the mean squared error. It is often preferred over MSE because it is easier to interpret: it has the same units as the target. It can be read as (approximately) the standard deviation of the residuals (prediction errors).

__Potential problems:__ It gives a relatively high weight to large errors.

![Image](./img/rmse.JPG)

In [None]:
# RMSE calculation

# Square root of the MSE (via **0.5); stored in `rmse` because the plotting cell below reuses it
rmse = mean_squared_error(y_test, y_pred)**0.5
rmse

__Interpretation:__ The smaller the root mean squared error, the closer the predictions are to the actual values. RMSE is one of the metrics most commonly used in Kaggle competitions!

In [None]:
# Visual analysis

# Compare the first ten test samples with their predictions. The scalar RMSE is
# broadcast to every row, so it is drawn as a flat reference line.
# reset_index() returns a new frame (no in-place mutation) and adds the 'index'
# column that is used as the x axis.
check = pd.DataFrame({'Ground truth': y_test[:10],
                      'Predictions': y_pred[:10],
                      'RMSE': rmse}).reset_index()

ax = check.plot(x='index',
                y=['Ground truth', 'Predictions', 'RMSE'],
                kind='line',
                figsize=(15, 10))
# Label the figure so it can be read on its own when the notebook is skimmed
ax.set(title='Ground truth vs. predictions (first 10 test samples)',
       xlabel='Test sample',
       ylabel='Target value');

---

### R2 (R-squared)

R-squared is a statistical measure that represents what proportion of the variance of a dependent variable is explained by the independent variables (e.g.: an R-squared of 60% means that 60% of the variance in the target is explained by the regression model).

__Potential problems:__ On the training data, R-squared never decreases when a new feature is added, which makes it difficult to tell whether a model with fewer features would actually do better.

![Image](./img/r2.JPG)

In [None]:
# R2 calculation

# R-squared of the predictions against the held-out targets
r2 = r2_score(y_test, y_pred)
r2

In [None]:
# Another way

# The estimator's own score() method on the test split; for scikit-learn
# regressors this is R-squared, so it should match the r2_score result
r2_ = regressor.score(X_test, y_test)
r2_

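__Interpretation:__ R-squared typically ranges from 0 to 1 (i.e.: from 0% to 100%), where 0 means that the model explains none of the variance in the target and 1 means that it explains all of it. On held-out data it can even be negative, which means the model does worse than always predicting the mean of the target.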

---