# Evaluate Metrics of Regressor Models
Evaluate any kind of models models (with feature eng and without it)

**IMPORTANT**: The list of models to evaluate is the same, but each model could have its own feature eng, but the Input (the data_X) and the Output (the prediction) follow the same structure, so it is necesary only one notebook to evaluate the differents notebooks of training (if it is not logic for you thinking in the kaggle competitions).

In this notebook, there are a parameter "folder_models" and in this folder are located the pkl of each model

The list of Metrics to evaluate are:


**Group 1 R2**
- R2

**Group 2 MSE**
- MSE

**Group 3 RMSE**
- RMSE
- RMSE MEAN RATIO
- RMSE IQR RATIO

**Group 4 MAE**
- MAE
- MAE MEAN RATIO
- MAE IQR RATIO

# RUN

In [1]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt


# metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from scipy.stats import iqr

### 0. Global params

In [2]:
# define folder where the models were saved. There are the same models accepted by gurobi but the feature eng changed

# list of folder with models = ['basic', 'scaler', 'poly_2', 'poly_3']
folder_models = 'basic'

### 1. Load data

In [3]:
### DEFINE LIST FEARTURES - TARGET (order data to have the same order in the features always)
list_features = ['AveOccup', 'Latitude', 'Population', 'AveBedrms', 'HouseAge', 'Longitude', 'AveRooms', 'MedInc']
target = 'Price'

In [4]:
### LOAD DATA
X_train = pd.read_pickle('artifacts/data/X_train.pkl')
X_test = pd.read_pickle('artifacts/data/X_test.pkl')
y_train = pd.read_pickle('artifacts/data/y_train.pkl')
y_test = pd.read_pickle('artifacts/data/y_test.pkl')

In [5]:
print('shape data')
print('\n\n TRAIN')
print('X_train: ', X_train.shape)
print('y_train: ', y_train.shape)

print('\n\n TEST')
print('X_test: ', X_test.shape)
print('y_test: ', y_test.shape)

shape data


 TRAIN
X_train:  (14540, 8)
y_train:  (14540, 1)


 TEST
X_test:  (3636, 8)
y_test:  (3636, 1)


### 2. Load Models
Load all the models in a dictory

In [6]:
## define list of models - list to have always the same order.
#### In this example, the strings in the list are the same with the models were saved
list_models_names = [
    "lr",
    "ridge",
    "lasso",
    
    "tree_simple",
    "tree_default",
    
    "rf_simple",
    "rf_default",

    "gb_simple",
    "gb_default",

    "xgb_simple",
    "xgb_default",

    "mlp_simple",
    "mlp_default"
]

In [7]:
# define path to folder models
path_folder_models = f'artifacts/models/{folder_models}/'

In [8]:
### load models
dict_models = {}
for model_name in list_models_names:
    print(f'loading model: {model_name}')
    path_model = path_folder_models + f'{model_name}.pkl'
    with open(path_model, 'rb') as artifact:
        dict_models[model_name] = pickle.load(artifact)

loading model: lr
loading model: ridge
loading model: lasso
loading model: tree_simple
loading model: tree_default
loading model: rf_simple
loading model: rf_default
loading model: gb_simple
loading model: gb_default
loading model: xgb_simple
loading model: xgb_default
loading model: mlp_simple
loading model: mlp_default


### 3. Define Functions to calculate metrics

In [9]:
# show version scikit-learn - since version 1.4 some codes to evaluate metrics changed
!pip show scikit-learn

Name: scikit-learn
Version: 1.3.1
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /opt/anaconda3/envs/data-science-python-3-10/lib/python3.10/site-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: mlflow


In [10]:
def calculate_metrics_regressors_models(y, y_pred, model_name, decimals_round = None):
    """
    Calculate a certain number of metrics to evaluate regression models. The metrics are rounded to X decimals

    Args
        y (dataframe): y true
        y_pred (dataframe): y predicted with the model. In this codes are passed y_pred instead of X
        model_name (string): name of the model. This name is used when the metrics are saved to identify the model of these metrics
        decimals_round = Number of decimals to round the values. Defult None, no round the values.

    Return
        metrics_regressors (dataframe): dataframe with the metrics of the model in this datasets. Row: name metrics. Columns: value metrics
    """

    #### R2
    r2 = r2_score(y, y_pred)
    
    #### MSE
    mse = mean_squared_error(y, y_pred, squared = True)
    
    #### RMSE
    rmse = mean_squared_error(y, y_pred, squared = False)
    
    #### RMSE_MEAN_RATIO
    # rmse mean ratio: rmse / mean_y_true
    rmse_mean_ratio = rmse / y.mean().values[0]
    
    #### RMSE_IQR_RATIO
    # rmse iqr ratio: rmse / iqr_y_true
    rmse_iqr_ratio = rmse / iqr(y)
    
    #### MAE
    mae = mean_absolute_error(y, y_pred)
    
    #### MAE_RATIO
    mae_mean_ratio = mae / y.mean().values[0]
    
    #### MAE_IQR_RATIO
    mae_iqr_ratio = mae / iqr(y)
    
    
    
    #### JOIN INTO ONE DATAFRAME
    # create dataframe
    metrics_regressors = pd.DataFrame(index = [model_name])
    
    # add metrics
    metrics_regressors['r2'] = r2
    metrics_regressors['mse'] = mse
    metrics_regressors['rmse'] = rmse
    metrics_regressors['rmse_mean_ratio'] = rmse_mean_ratio
    metrics_regressors['rmse_iqr_ratio'] = rmse_iqr_ratio
    metrics_regressors['mae'] = mae
    metrics_regressors['mae_mean_ratio'] = mae_mean_ratio
    metrics_regressors['mae_iqr_ratio'] = mae_iqr_ratio
    
    # round
    metrics_regressors = metrics_regressors.astype('float')
    if decimals_round:
        metrics_regressors = metrics_regressors.round(decimals_round)


    return metrics_regressors

In [11]:
# show examples
calculate_metrics_regressors_models(y = y_train,
                                    y_pred = dict_models['lr'].predict(X_train),
                                    model_name = 'lr',
                                    decimals_round = 3
                                   )

Unnamed: 0,r2,mse,rmse,rmse_mean_ratio,rmse_iqr_ratio,mae,mae_mean_ratio,mae_iqr_ratio
lr,0.681,0.398,0.631,0.3,0.443,0.471,0.224,0.331


### 4. Calculate metrics train

In [12]:
### calculate metrics for all models, TRAIN DATA
metrics_train = pd.DataFrame()
for m_name in list_models_names:
    print(f'calculating metrics: {m_name}')

    # calcualte metrics
    y_pred_train = dict_models[m_name].predict(X_train)
    metrics_aux = calculate_metrics_regressors_models(y = y_train,
                                                      y_pred = y_pred_train,
                                                      model_name = m_name,
                                                      decimals_round = 3
                                                     )

    # append ouput dataframe
    metrics_train = pd.concat([metrics_train, metrics_aux], axis = 0)

calculating metrics: lr
calculating metrics: ridge
calculating metrics: lasso
calculating metrics: tree_simple
calculating metrics: tree_default
calculating metrics: rf_simple
calculating metrics: rf_default
calculating metrics: gb_simple
calculating metrics: gb_default
calculating metrics: xgb_simple
calculating metrics: xgb_default


ValueError: feature_names mismatch: ['AveOccup', 'Latitude', 'Longitude', 'AveBedrms', 'AveRooms', 'MedInc', 'Population', 'HouseAge'] ['AveBedrms', 'AveOccup', 'AveRooms', 'Longitude', 'Population', 'HouseAge', 'MedInc', 'Latitude']

In [None]:
metrics_train

### 5. Calculate metrics test

In [None]:
### calculate metrics for all models, TEST DATA
metrics_test = pd.DataFrame()
for m_name in list_models_names:
    print(f'calculating metrics: {m_name}')

    # calcualte metrics
    y_pred_test = dict_models[m_name].predict(X_test)
    metrics_aux = calculate_metrics_regressors_models(y = y_test,
                                                      y_pred = y_pred_test,
                                                      model_name = m_name,
                                                      decimals_round = 3
                                                     )

    # append ouput dataframe
    metrics_test = pd.concat([metrics_test, metrics_aux], axis = 0)

In [None]:
metrics_test

### 6. Save Metrics
Save metrics in a excel

In [None]:
metrics_train.to_excel(f'artifacts/metrics/{folder_models}/metrics_train.xlsx')
metrics_test.to_excel(f'artifacts/metrics/{folder_models}/metrics_test.xlsx')