# Introduction 


- **Regression metrics**



# Import modules

In [3]:
import numpy as np
from pandas import read_csv
from sklearn.model_selection import KFold
from IPython.display import Markdown, display
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression

# Read-in the data

In [4]:
'''names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO',
'B', 'LSTAT', 'MEDV']
dataframe = read_csv(filename, delim_whitespace=True, names=names) 
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
print("Data shape: ", dataframe.shape)
print("Labels shape: ", Y.shape)
print(array)'''

from sklearn.datasets import load_boston
X, Y = load_boston(return_X_y = True)
Y = np.array(Y)
print("Inputs shape: ", X.shape)
print("Labels shape", Y.shape)

Inputs shape:  (506, 13)
Labels shape (506,)


# MEAN ABSOLUTE ERROR

In [None]:
"""
The Mean Absolute Error (or MAE) is the sum of the absolute differences between
predictions and actual values. It gives an idea of how wrong the predictions were. 
The measure gives an idea of the magnitude of the error, but no idea of the direction
(e.g. over or under predicting).
"""

In [5]:
kfold = KFold(n_splits=10, shuffle = True, random_state=7)
model = LinearRegression()
results = cross_val_score(model, X, Y, cv=kfold, scoring = 'neg_mean_absolute_error')
print("MAE: ",  results.mean(), " with standard deviation of: ",  results.std())

MAE:  -3.3870077451158886  with standard deviation of:  0.6666977115119672


# MEAN SQUARED ERROR

In [None]:
"""
The Mean Squared Error (or MSE) is much like the mean absolute error in that
it provides a gross idea of the magnitude of error. Taking the square root of
the mean squared error converts the units back to the original units of the 
output variable and can be meaningful for description and presentation.
"""

In [6]:
kfold = KFold(n_splits=10, shuffle = True, random_state=7)
model = LinearRegression()
results = cross_val_score(model, X, Y, cv=kfold, scoring = 'neg_mean_squared_error') 
print("MSE: ",  results.mean(), " with standard deviation of: ",  results.std())

MSE:  -23.746501811313365  with standard deviation of:  11.143430110698096


# R2 METRIC

In [None]:
"""
The R2 (or R Squared) metric provides an indication of the goodness of fit 
of a set of predictions to the actual values. In statistical literature this
measure is called the coefficient of determination. This is a value between 
0 and 1 for no-fit and perfect fit respectively.
"""

In [7]:
results = cross_val_score(model, X, Y, cv=kfold, scoring='r2')
print("MSE: ",  results.mean(), " with standard deviation of: ",  results.std())

MSE:  0.7181683241114103  with standard deviation of:  0.09866585171842626


# Getting multiple scores at the same time


- What if we want to get two score at the same time?
- Running two CVs is not efficient and scikilearn offer an alternative:
    - `cross_val_score` accepts only *1* score at the time
    - `cross_validate` accepts *more* than score at the time



In [9]:
result = cross_validate(model, X, Y, scoring=[
    "neg_mean_squared_error", "r2"], n_jobs=-1, return_train_score=True)

In [10]:
result

{'fit_time': array([0.00199413, 0.00197506, 0.0018239 , 0.0020647 , 0.00152826]),
 'score_time': array([0.00137496, 0.00121808, 0.00131679, 0.00129914, 0.00085592]),
 'test_neg_mean_squared_error': array([-12.46030057, -26.04862111, -33.07413798, -80.76237112,
        -33.31360656]),
 'train_neg_mean_squared_error': array([-24.5892302 , -22.24092194, -21.19051839, -12.91756328,
        -22.73718934]),
 'test_r2': array([ 0.63919994,  0.71386698,  0.58702344,  0.07923081, -0.25294154]),
 'train_r2': array([0.74652533, 0.72763185, 0.69498059, 0.84181027, 0.73545537])}

# References


- [cross_validate](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html)

