# Plots predictions
Plots of predictions of the models with any kind of transformation

**IMPORTANT**: The list of models to evaluate is the same, but each model could have its own feature eng, but the Input (the data_X) and the Output (the prediction) follow the same structure, so it is necesary only one notebook to evaluate the differents notebooks of training (if it is not logic for you thinking in the kaggle competitions).

In this notebook, there are a parameter "folder_models" and in this folder are located the pkl of each model

In [None]:
import os

# Fix the root path used to save outputs: move the working directory to the
# parent of the current directory so that the relative 'artifacts/...' paths
# used in the rest of the notebook are resolved from there.
# os.path.dirname is used instead of splitting/joining on '\\' so that this
# also works with POSIX path separators, not only with Windows ones.
actual_path = os.path.abspath(os.getcwd())
root_path = os.path.dirname(actual_path)
os.chdir(root_path)
print('root path: ', root_path)

## RUN

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px

### 0. Global params

In [None]:
# Sub-folder that holds the saved models to evaluate. Every folder contains the
# same models (the ones accepted by gurobi); only the feature engineering
# applied to them changes.
#
# Available folders: 'basic', 'scaler', 'poly_2', 'poly_3'
folder_models = 'poly_3'

### 1. Load data

In [None]:
### FEATURES AND TARGET
# The features are listed explicitly so that they are always used in the same order.
target = 'Price'
list_features = [
    'AveOccup',
    'Latitude',
    'Population',
    'AveBedrms',
    'HouseAge',
    'Longitude',
    'AveRooms',
    'MedInc',
]

In [None]:
### LOAD DATA
# Read the pickled train/test splits (features and target) from artifacts/data.
X_train, X_test, y_train, y_test = (
    pd.read_pickle('artifacts/data/X_train.pkl'),
    pd.read_pickle('artifacts/data/X_test.pkl'),
    pd.read_pickle('artifacts/data/y_train.pkl'),
    pd.read_pickle('artifacts/data/y_test.pkl'),
)

In [None]:
# Report the shape of every split.
# The two spaces after each colon reproduce print()'s default separator.
print('shape data')

print('\n\n TRAIN')
print(f'X_train:  {X_train.shape}')
print(f'y_train:  {y_train.shape}')

print('\n\n TEST')
print(f'X_test:  {X_test.shape}')
print(f'y_test:  {y_test.shape}')

In [None]:
# Keep only the first 100 rows of the train data so the notebook runs fast.
X_train, y_train = X_train[0:100], y_train[0:100]

### 2. Load Models
Load all the models in a dictory

In [None]:
## Names of the models to evaluate, kept in a list so the order is always the same.
#### Each name is also the file name (without the .pkl extension) under which the model was saved.
list_models_names = [
    "lr", "ridge", "lasso",
    "tree_simple", "tree_default",
    "rf_simple", "rf_default",
    "gb_simple", "gb_default",
    "xgb_simple", "xgb_default",
    "mlp_simple", "mlp_default",
]

In [None]:
# Folder with the pkl files of the selected group of models (keeps the trailing '/').
path_folder_models = 'artifacts/models/{}/'.format(folder_models)

In [None]:
### Load every model listed in list_models_names into a dictionary {model name: model}
# NOTE: pickle.load can execute arbitrary code, so only load pkl files that come from a trusted source.
dict_models = dict()
for model_name in list_models_names:
    print(f'loading model: {model_name}')
    path_model = f'{path_folder_models}{model_name}.pkl'
    with open(path_model, mode = 'rb') as artifact:
        dict_models[model_name] = pickle.load(artifact)

### 3. Hist Errors y_true vs y_pred (individual plot)

In [None]:
def hist_errors_predictions(y, y_pred, title_plot, n_bins = 10):
    """
    Build a matplotlib histogram of the absolute prediction errors: abs(y - y_pred)

    Args:
        y (dataframe): true values of the target
        y_pred (dataframe): predicted values of the target
        title_plot (string): text appended at the end of the plot title
        n_bins (integer): number of bins of the histogram. Default = 10

    Return
        fig (figure matplotlib): figure that holds the histogram (to show, save, etc)
    """
    # size of the error, regardless of its sign
    abs_errors = np.abs(y - y_pred)

    # new figure; gca() creates its axes because the figure is still empty
    fig = plt.figure()
    axes = fig.gca()

    # histogram with its labels and title
    axes.hist(abs_errors, bins = n_bins)
    axes.set_xlabel('Error')
    axes.set_ylabel('Freq')
    axes.set_title(f'Histogram of Errors in Predictions:  abs(y - y_pred) - {title_plot}')

    return fig

In [None]:
# example: histogram of errors of one model ('lr') on the train data

model_example = dict_models['lr']
y_example_true = y_train
y_example_pred = model_example.predict(X_train)
# wrap the predictions in a dataframe with the same index and columns as y_true,
# so that the subtraction y - y_pred pairs each prediction with its true value
y_example_pred = pd.DataFrame(y_example_pred, index = y_example_true.index, columns = y_example_true.columns)

# plot
fig_example = hist_errors_predictions(y = y_example_true, 
                                        y_pred = y_example_pred, 
                                        title_plot = 'train - linear regression'
                                   )

# save plot (the output folder is created first because savefig does not create missing folders)
os.makedirs(f'artifacts/plots_predictions_true_pred/{folder_models}', exist_ok = True)
fig_example.savefig(f'artifacts/plots_predictions_true_pred/{folder_models}/hist_errors_train_lr.png', dpi = 500)

### 4. Hist Errors y_true vs y_pred (multiple plots - one plot for model)

In [None]:
def plot_errors_predictions_subplots_models(dict_models, X, y, n_bins = 10, figsize = (10, 70)):
    """
    Plot one histogram of the absolute prediction errors, abs(y_true - y_pred), for each model saved in
    dict_models. The histograms are stacked in one column of a single matplotlib figure, following the
    order of the models in the dictionary.

    Args:
        dict_models (dictionary): python dictionary {model name: model}. Each model must have a predict(X) method
        X (dataframe): dataframe with features
        y (dataframe): dataframe with target (y_true)
        n_bins (integer): number of bins in each histogram. Default = 10
        figsize (tuple): (width, height) of the whole figure in inches. Default = (10, 70)

    Return
        fig (figure matplotlib): figure to show, download, etc

    Raises
        ValueError: if dict_models is empty (there is nothing to plot)
    """
    number_models = len(dict_models)
    if number_models == 0:
        raise ValueError('dict_models is empty: there is no model to plot')

    # create subplots: one row per model.
    # squeeze = False keeps "ax" as a 2D array even when there is only one model
    # (by default plt.subplots returns a single Axes in that case, which can not be indexed)
    fig, ax = plt.subplots(number_models, 1, figsize = figsize, dpi = 300, squeeze = False)

    for index, (model_name, model) in enumerate(dict_models.items()):
        ax_model = ax[index, 0]

        # calculate y_pred, with the same index and columns as y
        y_pred = model.predict(X)
        y_pred = pd.DataFrame(y_pred, index = y.index, columns = y.columns)

        # calculate error in abs value
        errors = np.abs(y - y_pred)

        # plot histogram of the errors
        ax_model.hist(errors, bins = n_bins)

        # add names to axis
        ax_model.set_xlabel('Error')
        ax_model.set_ylabel('Freq')

        # layout
        ax_model.set_title(f'Hist Errors - Model: {model_name}')

    # automatically adjust the spacing of the stacked subplots to avoid overlapping titles and labels
    fig.tight_layout()
    return fig

In [None]:
# generate plots data TRAIN: one histogram of errors per model
fig_true_pred_models_train = plot_errors_predictions_subplots_models(dict_models = dict_models, 
                                                                   X = X_train, 
                                                                   y = y_train
                                                                  )

# save plot (the output folder is created first because savefig does not create missing folders)
os.makedirs(f'artifacts/plots_predictions_true_pred/{folder_models}', exist_ok = True)
fig_true_pred_models_train.savefig(f'artifacts/plots_predictions_true_pred/{folder_models}/hist_errors_models_train.png', dpi = 300)

In [None]:
# generate plots data TEST: one histogram of errors per model
fig_true_pred_models_test = plot_errors_predictions_subplots_models(dict_models = dict_models, 
                                                                   X = X_test, 
                                                                   y = y_test
                                                                  )

# save plot (the output folder is created first because savefig does not create missing folders)
os.makedirs(f'artifacts/plots_predictions_true_pred/{folder_models}', exist_ok = True)
fig_true_pred_models_test.savefig(f'artifacts/plots_predictions_true_pred/{folder_models}/hist_errors_models_test.png', dpi = 300)

### TODO: make this plots in plotly

### 5. Plot errors ALL MODELS together
This plot is made with plotly to interact with it (activate, deactivate plots for example)

In [None]:
def plot_errors_predictions_together(dict_models, X, y):
    """
    Plot in plotly the histograms of the absolute prediction errors, abs(y_true - y_pred), of every model,
    all together in one only plot (one color per model) to interact with it

    Args:
        dict_models (dictionary): python dictionary {model name: model}. Each model must have a predict(X) method
        X (dataframe): dataframe with features
        y (dataframe): dataframe with target (y_true)

    Return
        fig (figure plotly): fig of plotly with the plot generated

    Raises
        ValueError: if dict_models is empty (there is nothing to plot)
    """
    if len(dict_models) == 0:
        raise ValueError('dict_models is empty: there is no model to plot')

    # generate one dataframe per model with columns: error, Model
    list_df_error = []
    for model_name, model in dict_models.items():
        # calculate y_pred, with the same index and columns as y
        y_pred = model.predict(X)
        y_pred = pd.DataFrame(y_pred, index = y.index, columns = y.columns)

        # calculate error in abs value
        errors = np.abs(y - y_pred)

        # save dataframe of this model
        list_df_error.append(pd.DataFrame({'error': errors.values.flatten(), 'Model': model_name}))

    # concatenate only once at the end: growing a dataframe with concat inside the loop copies
    # all the previous rows in each iteration, and it starts by concatenating an empty dataframe
    df_error = pd.concat(list_df_error, ignore_index = True)

    # plot histogram of the errors, one color per model.
    # x is given explicitly so the plot does not depend on plotly guessing which column to use
    fig = px.histogram(df_error, x = 'error', color = 'Model', title = 'Errors abs(y_true - y_pred) by Model')

    return fig

In [None]:
# PLOT DATA TRAIN: histograms of errors of all the models in one plotly figure
fig_true_pred_all_models_train = plot_errors_predictions_together(dict_models = dict_models, 
                                                                       X = X_train, 
                                                                       y = y_train
                                                                      )

# show
#fig_true_pred_all_models_train

# save (the output folder is created first because write_html does not create missing folders)
os.makedirs(f'artifacts/plots_predictions_true_pred/{folder_models}', exist_ok = True)
fig_true_pred_all_models_train.write_html(f'artifacts/plots_predictions_true_pred/{folder_models}/hist_errors_all_models_train.html')

In [None]:
# PLOT DATA TEST: histograms of errors of all the models in one plotly figure
fig_true_pred_all_models_test = plot_errors_predictions_together(dict_models = dict_models, 
                                                                       X = X_test, 
                                                                       y = y_test
                                                                      )

# show
fig_true_pred_all_models_test.show()

# save (the output folder is created first because write_html does not create missing folders)
os.makedirs(f'artifacts/plots_predictions_true_pred/{folder_models}', exist_ok = True)
fig_true_pred_all_models_test.write_html(f'artifacts/plots_predictions_true_pred/{folder_models}/hist_errors_all_models_test.html')