# Regression model evaluation metrics 

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [3]:
housing = fetch_california_housing()
housing 

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [4]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [5]:
housing_df['Target'] = housing['target'] 

In [6]:
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [7]:
# make an x and y
X = housing_df.drop("Target", axis = 1 )
y = housing_df["Target"]

# pick a model, train and test 
model = RandomForestRegressor(n_estimators=100)

#train
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
model.fit(X_train, y_train)

In [8]:
# test the model 
model.score(x_test, y_test)

0.8124824138786475

In [9]:
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [10]:
y_test.mean()

2.0545058333333333

In [11]:
y_test

7388     1.12500
10548    2.07500
2603     0.62800
12617    2.04600
19476    1.63200
          ...   
537      2.00000
18885    0.92800
18262    5.00001
10704    0.74800
7005     1.59200
Name: Target, Length: 4128, dtype: float64

In [12]:
from sklearn.metrics import r2_score 

# fill an array with y_test
y_test_mean = np.full(len(y_test), y_test.mean())
y_test_mean

array([2.05450583, 2.05450583, 2.05450583, ..., 2.05450583, 2.05450583,
       2.05450583])

In [13]:
r2_score(y_true = y_test,
        y_pred = y_test_mean)

0.0

In [14]:
r2_score(y_true = y_test,
        y_pred = y_test)

1.0

# Mean Absolute Errors

In [16]:
from sklearn.metrics import mean_absolute_error

y_preds = model.predict(x_test)
mae = mean_absolute_error(y_test, y_preds)
mae

0.3301478964874033

In [17]:
y_preds

array([1.15669  , 2.833361 , 0.72405  , ..., 4.1118117, 1.1605604,
       1.75414  ])

In [18]:
y_test

7388     1.12500
10548    2.07500
2603     0.62800
12617    2.04600
19476    1.63200
          ...   
537      2.00000
18885    0.92800
18262    5.00001
10704    0.74800
7005     1.59200
Name: Target, Length: 4128, dtype: float64

In [19]:
df = pd.DataFrame(data = {"actual values" : y_test,
                         "predicted values" : y_preds})
df["differences"] = df["predicted values"] - df["actual values"]
df.head(10)

Unnamed: 0,actual values,predicted values,differences
7388,1.125,1.15669,0.03169
10548,2.075,2.833361,0.758361
2603,0.628,0.72405,0.09605
12617,2.046,2.37943,0.33343
19476,1.632,2.08751,0.45551
9554,0.846,0.80185,-0.04415
393,2.213,2.58696,0.37396
15786,1.875,2.32798,0.45298
18668,3.223,3.17163,-0.05137
16481,1.467,1.20146,-0.26554


In [20]:
df["differences"].mean()

0.01224898078972824

In [21]:
np.abs(df["differences"]).mean()

0.3301478964874033

# Mean Squared Error 

In [23]:
from sklearn.metrics import mean_squared_error

y_preds = model.predict(x_test)
mse = mean_absolute_error(y_test, y_preds)
mse

0.3301478964874033

In [24]:
df["squared_differences"] = np.square(df["differences"])
df.head()

Unnamed: 0,actual values,predicted values,differences,squared_differences
7388,1.125,1.15669,0.03169,0.001004
10548,2.075,2.833361,0.758361,0.575111
2603,0.628,0.72405,0.09605,0.009226
12617,2.046,2.37943,0.33343,0.111176
19476,1.632,2.08751,0.45551,0.207489


# Finally using the scoring

In [26]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
h_d = pd.read_csv("data/heart-disease.csv")

np.random.seed(42)
# create an X and y
X = h_d.drop("target", axis = 1)
y = h_d["target"]

clf = RandomForestClassifier(n_estimators=100)


In [27]:
np.random.seed(42)

cv_acc = cross_val_score(clf, X, y, cv = 5, scoring=None)
cv_acc

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])

# Cross validation accuracy

In [29]:
print(f"This is the accuracy of the CVA {np.mean (cv_acc) * 100 : .2f} %")

This is the accuracy of the CVA  82.48 %


In [30]:
np.random.seed(42)

cv_acc = cross_val_score(clf, X, y, cv = 5, scoring="accuracy")
cv_acc

array([0.81967213, 0.90163934, 0.83606557, 0.78333333, 0.78333333])

In [31]:
print(f"This is the accuracy of the CVA {np.mean (cv_acc) * 100 : .2f} %")

This is the accuracy of the CVA  82.48 %


# Precision

In [33]:
cv_precision = cross_val_score(clf, X, y, cv = 5, scoring = "precision")
cv_precision

array([0.76315789, 0.90322581, 0.83870968, 0.79411765, 0.74358974])

In [34]:
print(f"This is the accuracy of the precision {np.mean (cv_precision) * 100 : .2f} %")

This is the accuracy of the precision  80.86 %


# Recall 

In [36]:
cv_recall = cross_val_score(clf, X, y, cv = 5, scoring = "recall")
cv_recall 

array([0.87878788, 0.84848485, 0.78787879, 0.78787879, 0.90909091])

In [37]:
print(f"This is the accuracy of the recall {np.mean (cv_recall ) * 100 : .2f} %")

This is the accuracy of the recall  84.24 %


# Regression 

In [39]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = h_d.drop("target", axis = 1)
y = h_d["target"]

model = RandomForestRegressor(n_estimators=100)

In [40]:
np.random.seed(42)

cv_r2 = cross_val_score(model, X, y, cv =3, scoring = None)

np.mean(cv_r2)

0.11597576013513518

# Metrics Accuracy

# Using different evaluation metrics as Sckit-Learn functions

In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [44]:
np.random.seed(42)
# create X and y
X = h_d.drop("target", axis = 1)
y =  h_d["target"]

# split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# create a model 
clf = RandomForestClassifier()

# fit the model 
clf.fit(X_train, y_train)

# make predictions
y_preds = clf.predict(X_test)
# evaluate model using evaluation functons
print("Classification the test")
print(f"Accuracy is here {accuracy_score(y_test, y_preds) * 100:.2f}%")
print(f"Precision is here {precision_score(y_test, y_preds) * 100:.2f}%")
print(f"Recall is here {recall_score(y_test, y_preds) * 100:.2f}%")
print(f"F1 is here {f1_score(y_test, y_preds) * 100:.2f}%")

Classification the test
Accuracy is here 85.25%
Precision is here 84.85%
Recall is here 87.50%
F1 is here 86.15%


In [45]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [46]:
np.random.seed(42)
# create X and y

X = h_d.drop("target", axis = 1)
y = h_d["target"]

# split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# create a model
reg_model = RandomForestRegressor()

# fit the model 
reg_model.fit(X_train, y_train)

# make a prediction
reg_model.predict(X_test)

# Evaluate using the regression 
print("Regression metrics")
print(f"R2_score: {r2_score(y_test, y_preds):.2f}")
print(f"Mean_Absolute_Error: {mean_absolute_error(y_test, y_preds) * 100:.2f}")
print(f"Mean_Squared_Error: {mean_squared_error(y_test, y_preds) * 100:.2f}")

Regression metrics
R2_score: 0.41
Mean_Absolute_Error: 14.75
Mean_Squared_Error: 14.75


# Improving the model 

In [48]:
from sklearn.ensemble import RandomForestClassifier

# make a model 
clf = RandomForestClassifier(n_estimators=100)


In [49]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

# hyperparameters tuning 

In [51]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_preds(y_true, y_preds):
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    
    metric_dict = {
        "accuracy": round(accuracy, 2),
        "precision": round(precision, 2),
        "recall": round(recall, 2),
        "f1": round(f1, 2)
    }
    
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Precision: {precision * 100:.2f}%")
    print(f"Recall: {recall * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")
    
    return metric_dict


In [52]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

#shuffle the data 
h_d_shuffled = h_d.sample(frac = 1)

#split the data in x and y

X = h_d_shuffled.drop("target", axis = 1)
y = h_d_shuffled["target"]

#split the data 
train_split = round(0.7 * len(h_d_shuffled))
valid_split = round(train_split + 0.15 * len(h_d_shuffled))

X_train, y_train = X[:train_split], y[:train_split]
X_valid, y_valid = X[train_split:valid_split], y[train_split:valid_split]
X_test, y_test = X[valid_split:], y[valid_split:]

len(X_train) , len(X_valid), len(X_test)

(212, 45, 46)

In [53]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

#Make a prediction 
y_preds = clf.predict(X_valid)

#evaluate 
baseline_metrics = evaluate_preds(y_valid, y_preds)

Accuracy: 82.22%
Precision: 81.48%
Recall: 88.00%
F1 Score: 84.62%


In [54]:
clf_2 = RandomForestClassifier(n_estimators=100)
clf_2.fit(X_train, y_train)

#make a pred
y_preds_2 = clf_2.predict(X_valid)

# evaluate 
clf_2_metrics = evaluate_preds(y_valid, y_preds_2)

Accuracy: 84.44%
Precision: 84.62%
Recall: 88.00%
F1 Score: 86.27%


In [55]:
clf_3 = RandomForestClassifier(n_estimators=100, max_depth=10)
clf_3.fit(X_train, y_train)

#make a pred
y_preds_3 = clf_3.predict(X_valid)

# evaluate 
clf_3_metrics = evaluate_preds(y_valid, y_preds_3)

Accuracy: 80.00%
Precision: 80.77%
Recall: 84.00%
F1 Score: 82.35%


# hyperparameter tuning with RandomizedSearchCV

In [99]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

# Set the random seed for reproducibility
np.random.seed(42)

# Split into X and y
X = h_d_shuffled.drop("target", axis=1)
y = h_d_shuffled["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10,
                            cv=5,
                            verbose=2,
                            random_state=42)
# Fit RandomizedSearchCV
rs_clf.fit(X_train, y_train);

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=6, n_estimators=10; total time=   0.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.1s
[CV] END max_depth=10, mi

# make a predict using the best hyp

In [100]:
rs_y_preds = rs_clf.predict(X_test)

#evaluate 
rs_metrics = evaluate_preds(y_test, rs_y_preds)

Accuracy: 81.97%
Precision: 75.76%
Recall: 89.29%
F1 Score: 81.97%


# hyperparameter tuning with GirdSearchCV

In [91]:
grid

{'n_estimators': [10, 100, 200, 500, 1000, 1200],
 'max_depth': [None, 5, 10, 20, 30],
 'min_samples_split': [2, 4, 6],
 'min_samples_leaf': [1, 2, 4]}

In [104]:
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
grid = {
    "n_estimators": [10, 100, 200, 500, 1000, 1200],
    "max_depth": [None, 5, 10, 20, 30],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 4]
}

# Set the random seed for reproducibility
np.random.seed(42)

# Split into X and y
X = h_d_shuffled.drop("target", axis=1)
y = h_d_shuffled["target"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup GridSearchCV
gs_clf = GridSearchCV(estimator=clf,
                      param_grid=grid,
                      cv=5,
                      verbose=2,
                      n_jobs=-1)  # Use all available cores

# Fit GridSearchCV
# gs_clf.fit(X_train, y_train)

# Get the best parameters
print("Best parameters found:", gs_clf.best_params_)

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

In [None]:
compare_metrics = pd.DataFrame({"baseline": baseline_metrics,
                               "clf_2": clf_2_metrics,
                               "random search" : rs_metrics,
                               "grid search" : gs_metrics})