# Regression model evaluation metrics 

In [102]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [104]:
housing = fetch_california_housing()
housing 

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [106]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [108]:
housing_df['Target'] = housing['target'] 

In [110]:
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [122]:
# make an x and y
X = housing_df.drop("Target", axis = 1 )
y = housing_df["Target"]

# pick a model, train and test 
model = RandomForestRegressor(n_estimators=100)

#train
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
model.fit(X_train, y_train)

In [123]:
# test the model 
model.score(x_test, y_test)

0.8075018864106014

In [126]:
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [128]:
y_test.mean()

2.074760133236434

In [130]:
y_test

20155    3.250
10227    1.489
13762    1.030
13961    2.265
2080     0.669
         ...  
10714    2.316
13008    1.440
3732     2.622
2472     0.756
14280    0.875
Name: Target, Length: 4128, dtype: float64

In [136]:
from sklearn.metrics import r2_score 

# fill an array with y_test
y_test_mean = np.full(len(y_test), y_test.mean())
y_test_mean

array([2.07476013, 2.07476013, 2.07476013, ..., 2.07476013, 2.07476013,
       2.07476013])

In [140]:
r2_score(y_true = y_test,
        y_pred = y_test_mean)

0.0

In [142]:
r2_score(y_true = y_test,
        y_pred = y_test)

1.0

# Mean Absolute Errors

In [153]:
from sklearn.metrics import mean_absolute_error

y_preds = model.predict(x_test)
mae = mean_absolute_error(y_test, y_preds)
mae

0.33039329576065907

In [155]:
y_preds

array([2.5248401, 2.14657  , 0.99407  , ..., 2.63634  , 0.7671   ,
       0.9283   ])

In [157]:
y_test

20155    3.250
10227    1.489
13762    1.030
13961    2.265
2080     0.669
         ...  
10714    2.316
13008    1.440
3732     2.622
2472     0.756
14280    0.875
Name: Target, Length: 4128, dtype: float64

In [159]:
df = pd.DataFrame(data = {"actual values" : y_test,
                         "predicted values" : y_preds})
df["differences"] = df["predicted values"] - df["actual values"]
df.head(10)

Unnamed: 0,actual values,predicted values,differences
20155,3.25,2.52484,-0.72516
10227,1.489,2.14657,0.65757
13762,1.03,0.99407,-0.03593
13961,2.265,1.62688,-0.63812
2080,0.669,0.65279,-0.01621
14165,1.676,1.24099,-0.43501
9342,4.143,3.36969,-0.77331
3754,1.902,1.99816,0.09616
11373,2.738,2.57209,-0.16591
16126,5.00001,3.11943,-1.88058


In [161]:
df["differences"].mean()

0.01273873272771274

In [165]:
np.abs(df["differences"]).mean()

0.33039329576065907

# Mean Squared Error 

In [172]:
from sklearn.metrics import mean_squared_error

y_preds = model.predict(x_test)
mse = mean_absolute_error(y_test, y_preds)
mse

0.33039329576065907

In [178]:
df["squared_differences"] = np.square(df["differences"])
df.head()

Unnamed: 0,actual values,predicted values,differences,squared_differences
20155,3.25,2.52484,-0.72516,0.525857
10227,1.489,2.14657,0.65757,0.432398
13762,1.03,0.99407,-0.03593,0.001291
13961,2.265,1.62688,-0.63812,0.407197
2080,0.669,0.65279,-0.01621,0.000263


# Finally using the scoring

In [185]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
h_d = pd.read_csv("data/heart-disease.csv")

np.random.seed(42)
# create an X and y
X = h_d.drop("target", axis = 1)
y = h_d["target"]

clf = RandomForestClassifier(n_estimators=100)


In [191]:
np.random.seed(42)

cv_acc = cross_val_score(clf, X, y, cv = 5, scoring=None)
cv_acc

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])

# Cross validation accuracy

In [200]:
print(f"This is the accuracy of the CVA {np.mean (cv_acc) * 100 : .2f} %")

This is the accuracy of the CVA  82.48 %


In [204]:
np.random.seed(42)

cv_acc = cross_val_score(clf, X, y, cv = 5, scoring="accuracy")
cv_acc

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])

In [206]:
print(f"This is the accuracy of the CVA {np.mean (cv_acc) * 100 : .2f} %")

This is the accuracy of the CVA  82.48 %


# Precision

In [213]:
cv_precision = cross_val_score(clf, X, y, cv = 5, scoring = "precision")
cv_precision

array([0.76315789, 0.90322581, 0.83870968, 0.79411765, 0.74358974])

In [217]:
print(f"This is the accuracy of the precision {np.mean (cv_precision) * 100 : .2f} %")

This is the accuracy of the precision  80.86 %


# Recall 

In [220]:
cv_recall = cross_val_score(clf, X, y, cv = 5, scoring = "recall")
cv_recall 

array([0.87878788, 0.84848485, 0.78787879, 0.78787879, 0.90909091])

In [224]:
print(f"This is the accuracy of the recall {np.mean (cv_recall ) * 100 : .2f} %")

This is the accuracy of the recall  84.24 %


# Regression 

In [233]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = h_d.drop("target", axis = 1)
y = h_d["target"]

model = RandomForestRegressor(n_estimators=100)

In [241]:
np.random.seed(42)

cv_r2 = cross_val_score(model, X, y, cv =3, scoring = None)

np.mean(cv_r2)

0.11597576013513518

In [None]:
# Metrics Accuracy