# Regression model evaluation metrics 

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [3]:
housing = fetch_california_housing()
housing 

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [4]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [5]:
housing_df['Target'] = housing['target'] 

In [6]:
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [7]:
# make an x and y
X = housing_df.drop("Target", axis = 1 )
y = housing_df["Target"]

# pick a model, train and test 
model = RandomForestRegressor(n_estimators=100)

#train
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
model.fit(X_train, y_train)

In [8]:
# test the model 
model.score(x_test, y_test)

0.8195422035550294

In [9]:
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [10]:
y_test.mean()

2.0448585271317827

In [11]:
y_test

309      1.000
15312    1.063
6658     1.714
11726    0.993
4318     3.216
         ...  
14432    2.294
20576    0.984
17230    4.436
7281     0.972
12312    1.153
Name: Target, Length: 4128, dtype: float64

In [12]:
from sklearn.metrics import r2_score 

# fill an array with y_test
y_test_mean = np.full(len(y_test), y_test.mean())
y_test_mean

array([2.04485853, 2.04485853, 2.04485853, ..., 2.04485853, 2.04485853,
       2.04485853])

In [13]:
r2_score(y_true = y_test,
        y_pred = y_test_mean)

0.0

In [14]:
r2_score(y_true = y_test,
        y_pred = y_test)

1.0

# Mean Absolute Errors

In [16]:
from sklearn.metrics import mean_absolute_error

y_preds = model.predict(x_test)
mae = mean_absolute_error(y_test, y_preds)
mae

0.3197032987403103

In [17]:
y_preds

array([1.05694  , 1.8548303, 2.02979  , ..., 3.2185706, 1.20036  ,
       1.3589501])

In [18]:
y_test

309      1.000
15312    1.063
6658     1.714
11726    0.993
4318     3.216
         ...  
14432    2.294
20576    0.984
17230    4.436
7281     0.972
12312    1.153
Name: Target, Length: 4128, dtype: float64

In [19]:
df = pd.DataFrame(data = {"actual values" : y_test,
                         "predicted values" : y_preds})
df["differences"] = df["predicted values"] - df["actual values"]
df.head(10)

Unnamed: 0,actual values,predicted values,differences
309,1.0,1.05694,0.05694
15312,1.063,1.85483,0.79183
6658,1.714,2.02979,0.31579
11726,0.993,0.99802,0.00502
4318,3.216,3.33091,0.11491
1677,1.477,1.09979,-0.37721
18193,2.327,3.14721,0.82021
3530,3.118,3.00968,-0.10832
5375,2.375,2.856451,0.481451
3355,0.715,0.88927,0.17427


In [20]:
df["differences"].mean()

0.0052812980135654814

In [21]:
np.abs(df["differences"]).mean()

0.3197032987403103

# Mean Squared Error 

In [23]:
from sklearn.metrics import mean_squared_error

y_preds = model.predict(x_test)
mse = mean_absolute_error(y_test, y_preds)
mse

0.3197032987403103

In [24]:
df["squared_differences"] = np.square(df["differences"])
df.head()

Unnamed: 0,actual values,predicted values,differences,squared_differences
309,1.0,1.05694,0.05694,0.003242
15312,1.063,1.85483,0.79183,0.626995
6658,1.714,2.02979,0.31579,0.099723
11726,0.993,0.99802,0.00502,2.5e-05
4318,3.216,3.33091,0.11491,0.013204


# Finally using the scoring

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
h_d = pd.read_csv("data/heart-disease.csv")

np.random.seed(42)
# create an X and y
X = h_d.drop("target", axis = 1)
y = h_d["target"]

clf = RandomForestClassifier(n_estimators=100)


In [27]:
np.random.seed(42)

cv_acc = cross_val_score(clf, X, y, cv = 5, scoring=None)
cv_acc

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])

# Cross validation accuracy

In [29]:
print(f"This is the accuracy of the CVA {np.mean (cv_acc) * 100 : .2f} %")

This is the accuracy of the CVA  82.48 %


In [30]:
np.random.seed(42)

cv_acc = cross_val_score(clf, X, y, cv = 5, scoring="accuracy")
cv_acc

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])

In [31]:
print(f"This is the accuracy of the CVA {np.mean (cv_acc) * 100 : .2f} %")

This is the accuracy of the CVA  82.48 %


# Precision

In [33]:
cv_precision = cross_val_score(clf, X, y, cv = 5, scoring = "precision")
cv_precision

array([0.76315789, 0.90322581, 0.83870968, 0.79411765, 0.74358974])

In [34]:
print(f"This is the accuracy of the precision {np.mean (cv_precision) * 100 : .2f} %")

This is the accuracy of the precision  80.86 %


# Recall 

In [36]:
cv_recall = cross_val_score(clf, X, y, cv = 5, scoring = "recall")
cv_recall 

array([0.87878788, 0.84848485, 0.78787879, 0.78787879, 0.90909091])

In [37]:
print(f"This is the accuracy of the recall {np.mean (cv_recall ) * 100 : .2f} %")

This is the accuracy of the recall  84.24 %


# Regression 

In [39]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = h_d.drop("target", axis = 1)
y = h_d["target"]

model = RandomForestRegressor(n_estimators=100)

In [40]:
np.random.seed(42)

cv_r2 = cross_val_score(model, X, y, cv =3, scoring = None)

np.mean(cv_r2)

0.11597576013513518

# Metrics Accuracy

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


# Using different evaluation metrics as Sckit-Learn functions

In [127]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [129]:
np.random.seed(42)
# create X and y
X = h_d.drop("target", axis = 1)
y =  h_d["target"]

# split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# create a model 
clf = RandomForestClassifier()

# fit the model 
clf.fit(X_train, y_train)

# make predictions
y_preds = clf.predict(X_test)
# evaluate model using evaluation functons
print("Classification the test")
print(f"Accuracy is here {accuracy_score(y_test, y_preds) * 100:.2f}%")
print(f"Precision is here {precision_score(y_test, y_preds) * 100:.2f}%")
print(f"Recall is here {recall_score(y_test, y_preds) * 100:.2f}%")
print(f"F1 is here {f1_score(y_test, y_preds) * 100:.2f}%")

Classification the test
Accuracy is here 85.25%
Precision is here 84.85%
Recall is here 87.50%
F1 is here 86.15%


In [131]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [149]:
np.random.seed(42)
# create X and y

X = h_d.drop("target", axis = 1)
y = h_d["target"]

# split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# create a model
reg_model = RandomForestRegressor()

# fit the model 
reg_model.fit(X_train, y_train)

# make a prediction
reg_model.predict(X_test)

# Evaluate using the regression 
print("Regression metrics")
print(f"R2_score: {r2_score(y_test, y_preds) * 100:.2f}")
print(f"MAE: {mean_absolute_error(y_test, y_preds) * 100:.2f}")
print(f"MSE: {mean_squared_error(y_test, y_preds) * 100:.2f}")

Regression metrics
R2_score: 40.84
MAE: 14.75
MSE: 14.75
