# Model 

### Prepare modules and data

In [1]:
# Import packages
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Autoreload
%load_ext autoreload
%autoreload 2

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Import data
from bimbo.data import Bimbo

data = Bimbo().get_data()

# Load in the train and test datasets
train_df = data['train']
# test_df = data['test']
# cliente_tabla_df = data['cliente_tabla']
# producto_tabla_df = data['producto_tabla']
town_state_df = data['town_state']
# sample_submission_df = data['sample_submission']

# Delete data
import gc
del data
gc.collect()

Memory usage of dataframe is 0.04 MB
Memory usage after optimization is: 0.11 MB
Decreased by -184.7%
Memory usage of dataframe is 14.27 MB
Memory usage after optimization is: 19.51 MB
Decreased by -36.7%
Memory usage of dataframe is 373.80 MB
Memory usage after optimization is: 120.15 MB
Decreased by 67.9%
Memory usage of dataframe is 0.02 MB
Memory usage after optimization is: 0.02 MB
Decreased by 6.0%
Memory usage of dataframe is 6225.47 MB
Memory usage after optimization is: 2122.32 MB
Decreased by 65.9%
Memory usage of dataframe is 106.80 MB
Memory usage after optimization is: 33.38 MB
Decreased by 68.7%


42

In [2]:
# Product-cluster lookup, read from CSV and joined on Producto_ID below
products_id_clusters_df = pd.read_csv('../data/producto_clusters.csv')

# Two left joins, so every training row is kept:
# first the product clusters (on Producto_ID), then the town/state table (on Agencia_ID)
new_train_df = (
    train_df
    .merge(products_id_clusters_df, how='left', on='Producto_ID')
    .merge(town_state_df, how='left', on='Agencia_ID')
)

### Feature engineering

In [3]:
from bimbo.preprocessing import *
from bimbo.feature_engineering import *

In [4]:
# Split on the Semana column: rows below 8 train, 8 validates, 9 is the hold-out
train_df = new_train_df.loc[new_train_df['Semana'].lt(8)]
val_df = new_train_df.loc[new_train_df['Semana'].eq(8)]
test_df = new_train_df.loc[new_train_df['Semana'].eq(9)]

# Semana-9 target, captured here (before preprocessing) for later use
final_y_test = test_df['Demanda_uni_equil']

# The cleanup below is left disabled, so new_train_df stays in memory
# (the plotting cell at the end of the notebook still references it).
# del new_train_df
# gc.collect()

14

In [5]:
# Run the project's data_preprocess step on each of the three weekly splits.
# NOTE(review): every line rebinds its own input, so re-running this cell feeds
# already-preprocessed frames back in. Re-run the split cell above first.
train_df = data_preprocess(train_df)
val_df = data_preprocess(val_df)
test_df = data_preprocess(test_df)

In [6]:
# Apply change_type_to_categ2 to each split.
# Judging by its name it casts columns to categorical dtype - TODO confirm in the bimbo package.
# NOTE(review): same rebinding pattern as the preprocessing cell; not safe to re-run in isolation.
train_df = change_type_to_categ2(train_df)
val_df = change_type_to_categ2(val_df)
test_df = change_type_to_categ2(test_df)

In [7]:
# Build the feature table from the Semana < 8 rows only; it is attached to the
# Semana-8 validation rows in the next cell.
temp = feature_engineering2(train_df)

In [8]:
# Attach the feature table `temp` to the Semana-8 rows; 'val' tells merge_feature2 which split it is handling.
# `val` is later written to CSV and split into X / y for XGBoost, so it is expected
# to contain a 'log_demanda_uni_equil' column.
val = merge_feature2(val_df, temp, 'val')

In [9]:
# Persist the validation features (without the index) so they can be reloaded
# via the commented-out read_csv cell further down.
val.to_csv('../data/processed_val_4.csv', index=False)

In [10]:
# Stack the Semana < 8 rows and the Semana-8 rows, then rebuild the feature
# table on the combined history (it is attached to the Semana-9 rows next).
result = pd.concat([train_df, val_df])
temp = feature_engineering2(result)

In [11]:
# Attach the rebuilt feature table (Semana < 8 plus Semana 8) to the Semana-9 rows;
# 'test' tells merge_feature2 which split it is handling.
test = merge_feature2(test_df, temp, 'test')

In [12]:
# Persist the test features (without the index) next to the validation file.
test.to_csv('../data/processed_test_4.csv', index=False)

In [13]:
# Optional shortcut (disabled): reload the feature files written by the two
# to_csv cells above instead of recomputing them.
# val = pd.read_csv('../data/processed_val_4.csv')
# test = pd.read_csv('../data/processed_test_4.csv')

### Bruno's idea: a naive baseline (week-8 demand used as the week-9 prediction) to measure the improvement of our model

In [21]:
# Semana-8 rows: the five ID columns plus the demand, renamed with an _S8 suffix
df_1 = (
    val_df
    .loc[:, ['Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil']]
    .rename(columns={"Demanda_uni_equil": "Demanda_uni_equil_S8"})
)
df_1.head(3)

Unnamed: 0,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Demanda_uni_equil_S8
53364883,1110,7,3301,15766,1212,4
53364884,1110,7,3301,15766,1216,5
53364885,1110,7,3301,15766,1220,1


In [22]:
# Semana-9 rows: the five ID columns plus the demand, renamed with an _S9 suffix
df_2 = (
    test_df
    .loc[:, ['Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil']]
    .rename(columns={"Demanda_uni_equil": "Demanda_uni_equil_S9"})
)
df_2.head(3)

Unnamed: 0,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Demanda_uni_equil_S9
63771751,1110,7,3301,15766,1212,1
63771752,1110,7,3301,15766,1238,2
63771753,1110,7,3301,15766,1240,2


In [23]:
# Pair each Semana-8 row with the Semana-9 row sharing the same five IDs.
# Left join: Semana-8 rows without a Semana-9 match get NaN in Demanda_uni_equil_S9.
result = df_1.merge(
    df_2,
    how='left',
    on=['Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID'],
)

In [26]:
# Keep only complete rows, i.e. discard any row holding at least one missing value
result = result.dropna(axis=0, how='any')

In [28]:
# Relative absolute error of the week-8 value against the week-9 value: |S8 - S9| / S9.
# NOTE(review): rows where Demanda_uni_equil_S9 is 0 yield inf (or NaN for 0/0); nothing here filters them.
result['relative_analysis'] = (
    result['Demanda_uni_equil_S8']
    .sub(result['Demanda_uni_equil_S9'])
    .abs()
    .div(result['Demanda_uni_equil_S9'])
)

In [29]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
)

# Baseline: the Semana-9 demand is the truth, the Semana-8 demand is the "prediction"
y_true = result['Demanda_uni_equil_S9']
y_pred = result['Demanda_uni_equil_S8']

# Compute all four scores first ...
median_relative_value = result['relative_analysis'].median(skipna=True)
mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
rmsle = np.sqrt(mean_squared_log_error(y_true, y_pred))

# ... then report them in the same order as before
print(f'median relative value: {median_relative_value}')
print(f'mae: {mae}')
print(f'rmse: {rmse}')
print(f'rmsle: {rmsle}')

median relative value: 0.4025974025974026
mae: 3.7452860051098864
rmse: 12.121574469249182
rmsle: 0.564177389534406


### Model with XGBoost

In [30]:
# Separate the target column from the remaining feature columns of `val`
y = val['log_demanda_uni_equil']
X = val.drop(columns='log_demanda_uni_equil')

In [31]:
from bimbo.xgboost_models import *

In [32]:
# Fit the model through the project's build_model helper (available via a star import,
# presumably bimbo.xgboost_models). The log below reports mae/rmse on two evaluation
# sets and early stopping on validation_1-rmse.
model = build_model(X, y)

[0]	validation_0-mae:0.805051	validation_0-rmse:1.02639	validation_1-mae:0.804493	validation_1-rmse:1.02569
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 1 rounds.
[1]	validation_0-mae:0.613926	validation_0-rmse:0.792932	validation_1-mae:0.613452	validation_1-rmse:0.79245
[2]	validation_0-mae:0.498906	validation_0-rmse:0.647591	validation_1-mae:0.49851	validation_1-rmse:0.647247
[3]	validation_0-mae:0.431862	validation_0-rmse:0.561709	validation_1-mae:0.431603	validation_1-rmse:0.561481
[4]	validation_0-mae:0.394195	validation_0-rmse:0.513794	validation_1-mae:0.39401	validation_1-rmse:0.513646
[5]	validation_0-mae:0.372927	validation_0-rmse:0.488002	validation_1-mae:0.372772	validation_1-rmse:0.487915
[6]	validation_0-mae:0.360615	validation_0-rmse:0.474276	validation_1-mae:0.360519	validation_1-rmse:0.474235
[7]	validation_0-mae:0.35371	validation_0-rmse:0.467055	validation_1-mae:0.35

[70]	validation_0-mae:0.33856	validation_0-rmse:0.452623	validation_1-mae:0.339099	validation_1-rmse:0.453544
[71]	validation_0-mae:0.338521	validation_0-rmse:0.452587	validation_1-mae:0.339074	validation_1-rmse:0.453518
[72]	validation_0-mae:0.338512	validation_0-rmse:0.452562	validation_1-mae:0.33907	validation_1-rmse:0.453502
[73]	validation_0-mae:0.338487	validation_0-rmse:0.452535	validation_1-mae:0.339059	validation_1-rmse:0.453479
[74]	validation_0-mae:0.338489	validation_0-rmse:0.452518	validation_1-mae:0.33906	validation_1-rmse:0.453466
[75]	validation_0-mae:0.338451	validation_0-rmse:0.452475	validation_1-mae:0.339036	validation_1-rmse:0.453438
[76]	validation_0-mae:0.338409	validation_0-rmse:0.452432	validation_1-mae:0.338989	validation_1-rmse:0.453405
[77]	validation_0-mae:0.338388	validation_0-rmse:0.452406	validation_1-mae:0.338971	validation_1-rmse:0.453387
[78]	validation_0-mae:0.338347	validation_0-rmse:0.45234	validation_1-mae:0.338945	validation_1-rmse:0.453335
[79]	

[144]	validation_0-mae:0.336915	validation_0-rmse:0.450423	validation_1-mae:0.337997	validation_1-rmse:0.452146
[145]	validation_0-mae:0.336908	validation_0-rmse:0.450409	validation_1-mae:0.337992	validation_1-rmse:0.452143
[146]	validation_0-mae:0.336892	validation_0-rmse:0.450383	validation_1-mae:0.337987	validation_1-rmse:0.452132
[147]	validation_0-mae:0.336876	validation_0-rmse:0.450362	validation_1-mae:0.337979	validation_1-rmse:0.452122
[148]	validation_0-mae:0.336867	validation_0-rmse:0.450348	validation_1-mae:0.337966	validation_1-rmse:0.452115
[149]	validation_0-mae:0.336845	validation_0-rmse:0.450321	validation_1-mae:0.337958	validation_1-rmse:0.452099
[150]	validation_0-mae:0.336843	validation_0-rmse:0.4503	validation_1-mae:0.337963	validation_1-rmse:0.452086
[151]	validation_0-mae:0.336835	validation_0-rmse:0.450279	validation_1-mae:0.337957	validation_1-rmse:0.452077
[152]	validation_0-mae:0.336816	validation_0-rmse:0.450255	validation_1-mae:0.337942	validation_1-rmse:0.4

In [33]:
# Persist the fitted model; per the output below, save_model writes
# model_fulldataset_1.pickle.dat and model_fulldataset_1.joblib.dat.
save_model(model, "model_fulldataset_1")

Saved model to: model_fulldataset_1.pickle.dat
Saved model to: model_fulldataset_1.joblib.dat


### Data analysis

In [None]:
# library & dataset
import seaborn as sns

# Scatter plot (fit_reg=False turns the regression line off), coloured by the `hue` factor.
# NOTE(review): "sepal_length", "sepal_width" and "species" are column names from
# seaborn's iris example; no visible cell in this notebook adds them to new_train_df.
# TODO: replace them with real columns of new_train_df before running this cell.
sns.lmplot(x="sepal_length", y="sepal_width", data=new_train_df, fit_reg=False, hue='species', legend=False)

# Move the legend to an empty part of the plot
plt.legend(loc='lower right')

# `sns.plt` is not part of seaborn's public API (current releases raise
# AttributeError), so show the figure through the pyplot alias `plt`
# imported in the first cell.
plt.show()