# Script for evaluating a trained model.

The test data goes through the same transformation process that was applied to the training data, so that the model's predictions work correctly. The results therefore reflect the true effectiveness of the trained model.

In [1]:
# Import libraries 
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, GroupKFold, cross_val_score
import lightgbm as ltb
import matplotlib.pyplot as plt 
import joblib
import time 
import warnings
warnings.filterwarnings("ignore")
import sys
t1 = time.time()

    Given a fitted model, makes a prediction and returns the results.
    
    Parameters:
        model: Fitted model object.
        X: Data with the independent variables to predict on.
        y: Data with the dependent variable to compare with the prediction.
        thresholds: List of thresholds; for each of them, the percentage of
        records whose absolute error is at or below the threshold is computed.
        verbose: Defines whether or not the output is displayed.
    
    Returns:
        results: Pandas Dataframe with the results of the predictions:
            - REAL: Real value of dependent variable.
            - PRED: Prediction value of dependent variable.
            - PERCENTAGE_ERROR: Percentage error of each row, computed from REAL and PRED.
            - ABSOLUTE_ERROR: Absolute error of each row, computed from REAL and PRED.
            - R2_SCORE: R-squared score computed from REAL and PRED.

In [2]:
def eval_model(model, X, y, thresholds=[5], verbose=1):
    """Predict with a fitted model and return a per-row results table.

    Both ``y`` and the model predictions are raised to the third power before
    any metric is computed, so the metrics are expressed on the cubed scale
    (the inverse of a cube-root transform applied to the target).

    Parameters:
        model: Fitted model object exposing ``predict(X)``.
        X: Data with the independent variables to predict on.
        y: Data with the dependent variable to compare with the prediction.
        thresholds: Iterable of thresholds. For each one, the percentage of
            rows whose ABSOLUTE_ERROR is less than or equal to the threshold
            is computed. The default list is only read, never mutated.
        verbose: When truthy, the aggregate metrics are printed.

    Returns:
        results: pandas DataFrame with one row per sample and the columns:
            - REAL: Cubed value of ``y``.
            - PRED: Cubed value of the prediction.
            - PERCENTAGE_ERROR: ``|PRED - REAL| * 100 / REAL`` per row.
            - ABSOLUTE_ERROR: ``|PRED - REAL| * 100`` per row.
            - R2_SCORE: R squared of REAL vs PRED (same value on every row).
            - '% REG ERROR < <threshold>': one column per threshold holding
              the percentage of rows within it (same value on every row).
              These columns are always present, regardless of ``verbose``.
    """
    # Undo the cube-root transform on both the prediction and the target.
    y_pred = np.power(model.predict(X), 3)
    y = np.power(y, 3)

    # Computed once and reused by the printed summary and the results table.
    r2 = r2_score(y, y_pred)

    if verbose:
        porc_error = abs(y_pred - y)*100/y
        # Rows with REAL == 0 give inf (or NaN for 0/0); keep them out of the mean.
        porcentual_mean_error = np.mean(porc_error[np.isfinite(porc_error)])
        print("\nTEST: Absolute Error:", mean_absolute_error(y, y_pred))
        print("Porcentual Error:", porcentual_mean_error)
        print("STD Error:", np.std(abs(y - y_pred)))
        print("R2 Score:", r2) #Coef determ

    results = pd.DataFrame(np.array(y), columns = ['REAL'])
    results['PRED'] = y_pred
    row_error = np.abs(results['PRED'] - results['REAL'])
    results['PERCENTAGE_ERROR'] = row_error*100/results['REAL']
    # NOTE(review): the x100 presumably turns a 0-1 share into percentage
    # points so it can be compared with the thresholds - confirm.
    results['ABSOLUTE_ERROR'] = row_error*100
    results['R2_SCORE'] = r2

    n_rows = len(results)
    for threshold in thresholds:
        # Vectorised count; NaN errors compare False and are not counted as hits.
        hits = int((results['ABSOLUTE_ERROR'] <= threshold).sum())
        porcentual_hits = hits*100/n_rows
        if verbose:
            print(str(porcentual_hits)+str("% registers are with less than"), str(threshold)+str("% of absolute error."))
        # Stored outside the verbose check so the returned schema does not
        # depend on whether printing was requested.
        results['% REG ERROR < '+str(threshold)] = porcentual_hits

    return results

In [3]:
# Locations of every file the notebook reads or writes; all of them live in
# the same data folder, so the folder prefix is spelled out once.
DATA_DIR = '../02_Data/'

# Inputs
INPUT = DATA_DIR + 'prepared_train.csv'
INPUT_FEATS = DATA_DIR + 'features.npy'

# Outputs
OUTPUT_MODEL = DATA_DIR + 'model.pkl'
OUTPUT_SC = DATA_DIR + 'sc_X.bin'
OUTPUT_TRAIN_RES = DATA_DIR + 'train_results.csv'
OUTPUT_VAL_RES = DATA_DIR + 'val_results.csv'

In [4]:
# Load the stored array of feature names (presumably written by the
# feature-selection step) and turn it into a plain Python list.
features = np.load(file=INPUT_FEATS).tolist()

In [5]:
# Load the pipe-separated dataset, then renumber its rows 0..n-1
data = pd.read_csv(INPUT, sep='|')
data = data.reset_index(drop=True)

In [6]:
# X: the selected features plus the two identifier columns (still needed
# further down to split and group the rows); y: the target column.
X = data.loc[:, features + ['CUSTOMER_ID','BRANDFAMILY_ID']]
y = data.loc[:, 'QUOTA_SELLOUT']

In [7]:
# Getting unique customers and shuffling them reproducibly.
# A fixed seed keeps the train/validation split (and therefore the saved
# model and result files) identical across "Restart & Run All" executions.
SEED = 42
customers = X['CUSTOMER_ID'].drop_duplicates().reset_index(drop=True)
index = np.random.RandomState(SEED).permutation(len(customers))
# The index was just reset to 0..n-1, so positional selection is exact here.
customers = customers.iloc[index].reset_index(drop=True)

In [8]:
# Separate train and val customers by ratio
ratio = 0.9
train = customers[:int(np.round(len(customers)*ratio))]
val = customers[int(np.round(len(customers)*ratio)):]

In [25]:
# Display the customer IDs assigned to the validation split (bare last
# expression, rendered as the cell output).
val

4925    36050037
4926    28007268
4927    15130222
4928    46040689
4929    47030119
          ...   
5467    48001911
5468     8090326
5469    27070085
5470    46002556
5471    14000418
Name: CUSTOMER_ID, Length: 547, dtype: int64

In [26]:
# Display the customer IDs assigned to the training split (bare last
# expression, rendered as the cell output).
train

0       46030177
1       20010427
2        3040342
3        7005093
4       48020325
          ...   
4920     8030322
4921     3010501
4922    15000599
4923    31000037
4924    28007227
Name: CUSTOMER_ID, Length: 4925, dtype: int64

In [9]:
# Boolean row masks over X: True where the row's customer belongs to the split
customer_ids = X['CUSTOMER_ID']
index_train = customer_ids.isin(train)
index_val = customer_ids.isin(val)

# Training rows (features and target), renumbered from 0
X_train = X[index_train].reset_index(drop=True)
y_train = y[index_train].reset_index(drop=True)

# Validation rows (features and target), renumbered from 0
X_val = X[index_val].reset_index(drop=True)
y_val = y[index_val].reset_index(drop=True)

In [10]:
# 5-fold grouped cross-validation keyed by customer: the customer ID of each
# training row is the group label handed to the splitter.
group_KFold = GroupKFold(n_splits=5)
groups = X_train['CUSTOMER_ID']

In [11]:
# Strip the two identifier columns from every feature frame; they were only
# needed for splitting and grouping.
X_train = X_train.drop(columns=['CUSTOMER_ID','BRANDFAMILY_ID'])
X_val = X_val.drop(columns=['CUSTOMER_ID','BRANDFAMILY_ID'])
X = X.drop(columns=['CUSTOMER_ID','BRANDFAMILY_ID'])

In [12]:
# Standardise the features: the scaler is fitted on the training rows only
# and the same fitted scaler is then applied to both splits.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_val = sc_X.transform(X_val)

In [13]:
# Persist the fitted scaler (compressed) so the same scaling can be re-applied later
joblib.dump(value=sc_X, filename=OUTPUT_SC, compress=True)

['../02_Data/sc_X.bin']

In [14]:
# Move both targets to cube-root space (intended to bring the distribution
# closer to a gaussian); eval_model cubes the values back before scoring.
y_train = y_train.pow(float(1)/3)
y_val = y_val.pow(float(1)/3)

In [15]:
# Switch between hyper-parameter search (True) and the fixed configuration (False)
grid_search = False

if not grid_search:
    # Fixed, predefined LightGBM configuration
    model = ltb.LGBMRegressor(
        boosting_type='gbdt',
        class_weight=None,
        colsample_bytree=1.0,
        importance_type='split',
        learning_rate=0.1,
        max_depth=-1,
        min_child_samples=20,
        min_child_weight=0.001,
        min_split_gain=0.0,
        n_estimators=100,
        n_jobs=-1,
        num_leaves=31,
        objective=None,
        random_state=None,
        reg_alpha=0.0,
        reg_lambda=0.0,
        silent=True,
        subsample=1.0,
        subsample_for_bin=200000,
        subsample_freq=0,
    )
else:
    # Candidate values sampled by the randomized search
    param_grid = {
        'learning_rate': [0.01, 0.1],
        'n_estimators': [50, 100, 200],
        'num_leaves': [12, 16, 31],
        'random_state' : [501],
        'colsample_bytree' : [0.65, 0.66, 1.0],
        'subsample' : [0.75, 1.0],
        'reg_alpha' : [0.0, 1.0, 1.2],
        'reg_lambda' : [0.0, 1.0, 1.2],
    }

    # Base estimator whose parameters are searched
    estimator = ltb.LGBMRegressor()

    # Randomized search over 50 sampled configurations, scored with
    # customer-grouped cross-validation
    search = RandomizedSearchCV(
        estimator = estimator,
        param_distributions = param_grid,
        n_iter = 50,
        cv = group_KFold.split(X_train, y_train, groups),
        verbose=1,
        random_state=27,
        n_jobs = -1,
    )
    search.fit(X_train, y_train)

    # Keep only the best estimator found by the search
    model = search.best_estimator_

# R squared of the chosen model under customer-grouped cross-validation on
# the training data (run for both settings of the switch)
scores = cross_val_score(
    model,
    X_train,
    y_train,
    cv=group_KFold.split(X_train, y_train, groups),
    scoring='r2',
)

if not grid_search:
    # The predefined model has not been trained yet: fit it on the full training data
    model.fit(X_train, y_train)

In [16]:
# Persist the trained model so it can be loaded later without retraining
joblib.dump(value=model, filename=OUTPUT_MODEL)

['../02_Data/model.pkl']

In [17]:
# Score the model on both splits using the same set of error thresholds;
# verbose=1 prints the metric summary for each split.
error_thresholds = [1, 2, 5, 10]
res_train = eval_model(model, X_train, y_train, error_thresholds, verbose=1)
res_val = eval_model(model, X_val, y_val, error_thresholds, verbose=1)


TEST: Absolute Error: 0.01505422293243398
Porcentual Error: 45.486742884187365
STD Error: 0.02375558758113329
R2 Score: 0.9005759901691001
59.771229674359255% registers are with less than 1% of absolute error.
78.03592529944602% registers are with less than 2% of absolute error.
93.68093079184928% registers are with less than 5% of absolute error.
98.71718162078861% registers are with less than 10% of absolute error.

TEST: Absolute Error: 0.014784552250040281
Porcentual Error: 45.47092892662934
STD Error: 0.023145922883882626
R2 Score: 0.8994779543760659
59.877656353101756% registers are with less than 1% of absolute error.
78.3015158499878% registers are with less than 2% of absolute error.
94.01501387495368% registers are with less than 5% of absolute error.
98.83079787762924% registers are with less than 10% of absolute error.


In [18]:
# Prepend the identifying columns of each row to its evaluation results.
# Both sides are renumbered 0..n-1, so the merge pairs them up by position.
key_columns = ['CUSTOMER_ID','BRANDFAMILY_ID','CAL_DATE','CAL_DATE_end']

train_keys = data.loc[index_train, key_columns].reset_index(drop=True)
val_keys = data.loc[index_val, key_columns].reset_index(drop=True)

res_train = train_keys.merge(res_train, left_index=True, right_index=True)
res_val = val_keys.merge(res_val, left_index=True, right_index=True)

# Mean cross-validated R squared, repeated on every training row
res_train['CV_R2_SCORE'] = scores.mean()

In [19]:
# Write both result tables as pipe-separated files, without the row index
res_train.to_csv(path_or_buf=OUTPUT_TRAIN_RES, index=False, sep='|')
res_val.to_csv(path_or_buf=OUTPUT_VAL_RES, index=False, sep='|')

In [20]:
# Wall-clock time since t1 was taken in the imports cell, reported in hours
t2 = time.time()
elapsed_hours = round((t2 - t1) / 3600, 2)
print("Time to execute script:", str(elapsed_hours), "h")

Time to execute script: 0.11 h
