## data basic features - train one linear regression model

Content:
- data: contains the basic features (units_sold, peak); in addition, price features are added with the prices of all regions. The data is also split so that different models can be trained for each region

- Model: artifact that contains the model and the feature engineering (some feature engineering was done previously, but for the purpose of this example, more feature engineering is done in this part and "compiled" together with the model)

- TRAIN ONLY ONE LR MODEL TO FIT ALL THE DATA FOR ALL REGIONS (the model is not split by region)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

### 0. Root repo

In [None]:
import os
# fix root path to save outputs
# Move the working directory one level up so the relative paths used later in
# the notebook (e.g. 'artifacts/...') resolve from there.
# os.path.dirname is separator-agnostic; splitting on '\\' only worked on
# Windows and produced an empty path (and a failing chdir) everywhere else.
# NOTE: every re-run of this cell moves one more level up.
actual_path = os.path.abspath(os.getcwd())
root_path = os.path.dirname(actual_path)
os.chdir(root_path)
print('root path: ', root_path)

### 1. Read data

In [None]:
# Path of the pickled dataset, relative to the current working directory.
path_data_prices_regions = 'artifacts/data/data_prices_regions.pkl'

# Load the pickled object (used below as a DataFrame).
# NOTE(review): unpickling executes arbitrary code — only load files from a trusted source.
data = pd.read_pickle(path_data_prices_regions)

# Preview the first rows.
data.head()

### 2. Generate X, y, list features, list segmentation data

In [None]:
""" Create list regions """
# Distinct values of the 'region' column, in ascending order
list_regions = sorted(data['region'].unique().tolist())
list_regions

In [None]:
""" Define features and target """
# target
target = 'units_sold'
list_target = [target]

# One price column per region, named 'price_<region>' in lower case
list_columns_prices_regions = [
    f'price_{name_region}'.lower() for name_region in list_regions
]

# Features are set manually: region, peak and the price column of every region.
# (The former "all columns except the target" list was built from a set, so its
# order was not deterministic, and it was overwritten right away; it is removed.)
list_features = ['region', 'peak'] + list_columns_prices_regions


print('list_features: ', list_features)
print('list_target: ', list_target)

In [None]:
""" create data X - features // y - target """
# Select the feature columns (X) and the target column (y) by label
data_X = data.loc[:, list_features]
data_y = data.loc[:, list_target]

In [None]:
data_X.head()

In [None]:
data_y.head()

### 3. Split - train - test

In [None]:
# 70% of the rows go to train, the rest to test; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data_X,
    data_y,
    train_size=0.7,
    random_state=42,
)

In [None]:
# Report the (rows, columns) of every split
print('shapes')
for split_label, split_frame in (
    ('X_train: ', X_train),
    ('y_train: ', y_train),
    ('X_test: ', X_test),
    ('y_test: ', y_test),
):
    print(split_label, split_frame.shape)

In [None]:
X_train.head(2)

In [None]:
y_train.head(2)

In [None]:
X_test.head(2)

In [None]:
y_test.head(2)

### 4. Pipeline processing data
- Region: string, transformed into categorical (one-hot encoded) features
- Peak: used as is
- Prices: continuous variables, standardized

In [None]:
# Column-wise preprocessing with sklearn:
#   - region: one-hot encoded, dropping the first category
#   - price columns: standardized
#   - peak: passed through unchanged
# Every other column is dropped.
feat_transform = make_column_transformer(
    (OneHotEncoder(drop="first"), ["region"]),
    (StandardScaler(), list_columns_prices_regions),
    ("passthrough", ["peak"]),
    remainder='drop',
    verbose_feature_names_out=False,  # keep the original column names
)

In [None]:
X_train

In [None]:
# shape output
feat_transform.fit_transform(X_train).shape

In [None]:
# example output
feat_transform.fit_transform(X_train)[0, :]

In [None]:
# original example output
X_train.iloc[0, :]

### 5. Pipeline processing data + train model

In [None]:
# Chain the preprocessing transformer and a linear regression into a single estimator
model_pipeline = make_pipeline(feat_transform, LinearRegression())

### 6. Train with cross validation

In [None]:
# get scores of cross validation - R2
# 5-fold cross validation on the training split. No `scoring` is passed, so the
# pipeline's own score method is used (R2 for scikit-learn regressors).
scores = cross_val_score(model_pipeline, X_train, y_train, cv=5)
scores

In [None]:
pd.DataFrame(scores).describe()

### 7. Train model with all train dataset

In [None]:
# Fit the preprocessing steps and the regression on the whole training split
model_pipeline.fit(X_train, y_train)

## 8. Evaluate Performance Model

## -> Performance all data
Evaluate performance of the model with all the train and test data

#### 8.0 Get y_train_pred , y_test_pred

In [None]:
# Predict on both splits and wrap the raw outputs in DataFrames.
# NOTE: the frames get a default integer index, not the index of X_train / X_test.
y_train_pred = pd.DataFrame(model_pipeline.predict(X_train))
y_test_pred = pd.DataFrame(model_pipeline.predict(X_test))

#### 8.1. Evaluate performance model - metrics

In [None]:
def print_metrics_evaluation(y_train, y_train_pred, y_test, y_test_pred):
    """
    Print regression metrics (R2, MAE, MSE, RMSE) for the train and test data.

    Args:
        y_train: true target values of the train set
        y_train_pred: predicted target values for the train set
        y_test: true target values of the test set
        y_test_pred: predicted target values for the test set

    Returns:
        None. Every metric is printed rounded to 3 decimals.
    """

    def _mse(y_true, y_pred):
        return mean_squared_error(y_true, y_pred)

    def _rmse(y_true, y_pred):
        # `squared=False` was deprecated in scikit-learn 1.4 and later removed.
        # Taking the root of each output's MSE and averaging reproduces it.
        mse_per_output = mean_squared_error(y_true, y_pred, multioutput='raw_values')
        return (mse_per_output ** 0.5).mean()

    # (printed title, label prefix, metric function)
    metrics = (
        ('R2', 'r2', r2_score),
        ('MAE', 'mae', mean_absolute_error),
        ('MSE', 'mse', _mse),
        ('RMSE', 'rmse', _rmse),
    )

    for title, prefix, metric in metrics:
        # float(...) + built-in round works whether the metric returns a numpy
        # scalar or a plain Python float (which has no .round method).
        value_train = round(float(metric(y_train, y_train_pred)), 3)
        value_test = round(float(metric(y_test, y_test_pred)), 3)

        print(f'\n{title}')
        print(f'{prefix}_train: ', value_train)
        print(f'{prefix}_test: ', value_test)

In [None]:
print_metrics_evaluation(y_train,  y_train_pred, y_test, y_test_pred)

In [None]:
# # r2 - score - OLD
# r2_train = model_pipeline.score(X_train, y_train).round(3)
# r2_test = model_pipeline.score(X_test, y_test).round(3)

# print('r2_train: ', r2_train)
# print('r2_test: ', r2_test)

#### 8.2 Evaluate performance model - y true vs y_predicted

In [None]:
#### OLD
# def plot_y_true_vs_y_pred(df_y_true, df_y_pred):
#     """
#     Plot y_true vs y_pred. Both in format dataframe
#     """
#     ### plot scatter plot
#     fig_plot = plt.scatter(df_y_true, 
#                            df_y_pred,
#                           alpha = 0.3,
#                           marker = 'x',
#                           label = 'y_true vs y_pred')

    
#     ### add bisectriz 
#     x = np.linspace(df_y_true.min()[0], df_y_true.max()[0], df_y_true.shape[0])
#     y = x  # Bisectriz: y = x
#     plt.plot(x, y, label='Bisectriz', color='red', alpha = 0.3)
    
#     ### add names to axis
#     plt.xlabel('Y true')
#     plt.ylabel('Y pred')
#     plt.legend()


#     fig.show()
#     #return fig_plot

def plot_y_true_vs_y_pred(df_y_true, df_y_pred, title_plot):
    """
    Scatter plot of y_true vs y_pred with the line y = x as reference.

    Args:
        df_y_true: DataFrame with the true target values (first column is used
            for the range of the reference line)
        df_y_pred: DataFrame with the predicted values, one row per row of df_y_true
        title_plot: title of the plot

    Returns:
        The matplotlib figure that contains the plot.
    """
    fig, ax = plt.subplots()
    ax.scatter(df_y_true, df_y_pred, alpha=0.3, marker='x', label='y_true vs y_pred')

    # Add bisectriz (y = x). Two end points are enough to draw a straight line.
    # The first column is selected by position with .iloc; the former
    # `df.min()[0]` relied on integer-position access to a label-indexed Series,
    # which pandas has deprecated.
    true_values = df_y_true.iloc[:, 0]
    lower, upper = true_values.min(), true_values.max()
    ax.plot([lower, upper], [lower, upper], label='Bisectriz', color='red', alpha=0.3)

    # Add names to axis
    ax.set_xlabel('Y true')
    ax.set_ylabel('Y pred')

    ax.set_title(title_plot)
    ax.legend()

    return fig

In [None]:
# plot TRAIN: y_true vs y_pred on the training split
plot_true_pred_train = plot_y_true_vs_y_pred(
    df_y_true=y_train,
    df_y_pred=y_train_pred,
    title_plot='TRAIN DATA',
)

In [None]:
# plot TEST: y_true vs y_pred on the test split
plot_true_pred_test = plot_y_true_vs_y_pred(
    df_y_true=y_test,
    df_y_pred=y_test_pred,
    title_plot='TEST DATA',
)

## -> Performance by region
Evaluate performance of the model with the data segmented. In this example, divide by region

#### 8.3 Evaluate performance model by region - metrics by region

In [None]:
def get_metric_each_region(X, y, model, list_regions):
    """
    Compute R2, MAE, MSE and RMSE of the model separately for each region.

    Args:
        X: DataFrame of features; must contain a 'region' column
        y: DataFrame with the target, row-aligned with X (same rows, same order)
        model: fitted estimator exposing predict(X)
        list_regions: names of the regions to evaluate

    Returns:
        DataFrame with one row per region in list_regions and the columns
        region, r2, mae, mse, rmse (metrics rounded to 3 decimals). A region
        without rows in X gets NaN metrics. X and y are not modified.
    """
    metric_columns = ['r2', 'mae', 'mse', 'rmse']
    rows = []

    for region_name in list_regions:
        # Positional boolean mask: selects the same rows in X and y without
        # adding a helper column to the caller's y.
        mask = (X['region'] == region_name).to_numpy()

        if not mask.any():
            # predict() would fail on an empty frame; report missing metrics instead
            rows.append({'region': region_name,
                         **dict.fromkeys(metric_columns, float('nan'))})
            continue

        X_filter_region = X.loc[mask]
        # errors='ignore': tolerate a y that already carries a 'region' column
        y_filter_region = y.loc[mask].drop(columns='region', errors='ignore')

        # predict
        y_filter_region_pred = model.predict(X_filter_region)

        # `squared=` was deprecated in scikit-learn 1.4 and later removed: get
        # the MSE of each output and derive MSE / RMSE from it (mean of the
        # values / mean of their roots, i.e. the former squared=True / False).
        mse_per_output = mean_squared_error(
            y_filter_region, y_filter_region_pred, multioutput='raw_values'
        )

        # float(...) + built-in round also works when a metric returns a plain float
        rows.append({
            'region': region_name,
            'r2': round(float(r2_score(y_filter_region, y_filter_region_pred)), 3),
            'mae': round(float(mean_absolute_error(y_filter_region, y_filter_region_pred)), 3),
            'mse': round(float(mse_per_output.mean()), 3),
            'rmse': round(float((mse_per_output ** 0.5).mean()), 3),
        })

    # save as df
    return pd.DataFrame(rows, columns=['region'] + metric_columns)

In [None]:
# metrics of each region - TRAIN (copies are passed, so the callee cannot alter X_train / y_train)
get_metric_each_region(
    X=X_train.copy(),
    y=y_train.copy(),
    model=model_pipeline,
    list_regions=list_regions,
)

In [None]:
# metrics of each region - TEST (copies are passed, so the callee cannot alter X_test / y_test)
get_metric_each_region(
    X=X_test.copy(),
    y=y_test.copy(),
    model=model_pipeline,
    list_regions=list_regions,
)

#### 8.4 Evaluate y_true vs y_pred by region (individual plot)

In [None]:
def plot_y_true_vs_y_pred_each_regions(X, y, model, list_regions):
    """
    Plot y_true vs y_pred for each region, one figure per region.
    Uses the individual function plot_y_true_vs_y_pred for every plot.

    Args:
        X: DataFrame of features; must contain a 'region' column
        y: DataFrame with the target, row-aligned with X (same rows, same order)
        model: fitted estimator exposing predict(X)
        list_regions: names of the regions to plot

    Returns:
        None. Regions without rows in X are skipped. X and y are not modified.
    """
    for region_name in list_regions:
        # Positional boolean mask: selects the same rows in X and y without
        # adding a helper column to the caller's y.
        mask = (X['region'] == region_name).to_numpy()

        if not mask.any():
            # nothing to predict or plot (predict() would fail on an empty frame)
            continue

        X_filter_region = X.loc[mask]
        # errors='ignore': tolerate a y that already carries a 'region' column
        y_filter_region = y.loc[mask].drop(columns='region', errors='ignore')

        # predict
        y_filter_region_pred = pd.DataFrame(model.predict(X_filter_region))

        # plot
        plot_y_true_vs_y_pred(
            df_y_true=y_filter_region,
            df_y_pred=y_filter_region_pred,
            title_plot=f'y_true vs y_pred for region: {region_name}',
        )

In [None]:
# DATA TRAIN: one y_true vs y_pred figure per region
plot_y_true_vs_y_pred_each_regions(
    X=X_train.copy(),
    y=y_train.copy(),
    model=model_pipeline,
    list_regions=list_regions,
)

In [None]:
# DATA TEST: one y_true vs y_pred figure per region
plot_y_true_vs_y_pred_each_regions(
    X=X_test.copy(),
    y=y_test.copy(),
    model=model_pipeline,
    list_regions=list_regions,
)

#### 8.5 Evaluate y_true vs y_pred by region (one plot true vs pred - colored by region)

In [None]:
# NOTE(review): these module-level names mirror the parameters of the function
# defined in the next cell and look like scratch variables for stepping through
# its body by hand; no visible cell reads them — confirm before removing.
X = X_train.copy()
y = y_train.copy()
model = model_pipeline

In [None]:
def plot_y_true_vs_y_pred_color_regions(X, y, model, list_regions):
    """
    Plot y_true vs y_pred for all the data in one plot, one scatter series per region.

    Args:
        X: DataFrame of features; must contain a 'region' column
        y: DataFrame with the target, row-aligned with X (same rows, same order)
        model: fitted estimator exposing predict(X)
        list_regions: names of the regions to plot

    Returns:
        None. Draws on the current matplotlib figure. Regions without rows in X
        are skipped. X and y are not modified.
    """
    # Target values only; errors='ignore' tolerates a y that already carries a
    # 'region' column.
    y_true = y.drop(columns='region', errors='ignore')

    for region_name in list_regions:
        # Positional boolean mask: selects the same rows in X and y without
        # adding a helper column to the caller's y.
        mask = (X['region'] == region_name).to_numpy()

        if not mask.any():
            # nothing to predict or plot (predict() would fail on an empty frame)
            continue

        # predict
        y_filter_region_pred = pd.DataFrame(model.predict(X.loc[mask]))

        # scatter series of this region
        plt.scatter(y_true.loc[mask],
                    y_filter_region_pred,
                    alpha=0.3,
                    marker='x',
                    label=f'region: {region_name}')

    ### add names to axis (once, not on every iteration)
    plt.xlabel('Y true')
    plt.ylabel('Y pred')

    ### add bisectriz (y = x). Two end points are enough for a straight line.
    # The range comes from the numeric target column only; it used to be taken
    # from y after the string 'region' column had been added, through a
    # deprecated integer-position lookup (`y.min()[0]`).
    true_values = y_true.iloc[:, 0]
    lower, upper = true_values.min(), true_values.max()
    plt.plot([lower, upper], [lower, upper], label='Bisectriz', color='red')

    # title
    plt.title('y_true vs y_pred')
    plt.legend()

In [None]:
### DATA TRAIN: single plot, one scatter series per region
plot_y_true_vs_y_pred_color_regions(
    X=X_train.copy(),
    y=y_train.copy(),
    model=model_pipeline,
    list_regions=list_regions,
)

In [None]:
### DATA TEST: single plot, one scatter series per region
plot_y_true_vs_y_pred_color_regions(
    X=X_test.copy(),
    y=y_test.copy(),
    model=model_pipeline,
    list_regions=list_regions,
)

#### 8.X Evaluate model - business logic
Evaluate the performance of the model according to business logic