In [1]:
# # Reminder
# Optional setup, kept commented out: uncomment to run the notebook from the project root,
# presumably so that the local `bimbo` package imported below can be found.
# NOTE(review): the path is specific to one machine — adjust it before enabling. The export
# cells write to '../data/...', which is relative to the working directory, so enabling the
# chdir also changes where those CSV files are written.
# import os, sys

# os.chdir('/Users/herve/code/hervelao/bakery-products')
# sys.path.append('/Users/herve/code/hervelao/bakery-products')

# Create features

In [2]:
# Import packages
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Autoreload: reload imported modules before each cell executes, so edits to local
# packages are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# Warnings
# NOTE(review): this silences every warning for the rest of the session, including
# library warnings that may point at real data problems — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Import data
from bimbo.data import Bimbo

# Fetch all tables in one call; the result is indexed by table name below
data = Bimbo().get_data()

# Bind each table to its own variable, grouped in pairs
train_df, test_df = data['train'], data['test']
cliente_tabla_df, producto_tabla_df = data['cliente_tabla'], data['producto_tabla']
town_state_df, sample_submission_df = data['town_state'], data['sample_submission']

### Prepare data

In [3]:
# Count how many rows of the training data fall in each week ('Semana')
from collections import Counter

rows_per_week = Counter(train_df['Semana'])
rows_per_week

Counter({3: 11165207,
         4: 11009593,
         5: 10615397,
         6: 10191837,
         7: 10382849,
         8: 10406868,
         9: 10408713})

Therefore, we have **seven weeks'** worth of data (weeks 3 to 9).

We are going to validate on the last week of our train_df, i.e. `train_df['Semana'] == 9`, and use the earlier weeks (`train_df['Semana'] < 9`) for training.

In [17]:
# Hold out the final week for validation and keep every earlier week for training
VALIDATION_WEEK = 9
semana_col = train_df['Semana']

new_train_df = train_df[semana_col < VALIDATION_WEEK]
val_df = train_df[semana_col == VALIDATION_WEEK]

In [18]:
# Preview the first rows of the training split (weeks before 9)
new_train_df.head(n=3)

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
0,3,1110,7,3301,15766,1212,3,25.14,0,0.0,3
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,4
2,3,1110,7,3301,15766,1238,4,39.32,0,0.0,4


In [19]:
# Preview the first rows of the validation split (week 9)
val_df.head(n=3)

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil
63771751,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1
63771752,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2
63771753,9,1110,7,3301,15766,1240,2,16.76,0,0.0,2


### Feature Engineering

In [12]:
# Project helpers used in the cells below.
# NOTE(review): data_preprocess, change_type_to_categ, feature_engineering and merge_feature are
# not defined or explicitly imported anywhere in this notebook, so they presumably come from these
# wildcard imports — importing them by name would make their origin explicit.
from bimbo.preprocessing import *
from bimbo.feature_engineering import *

#### for the training/validation data

In [20]:
# Pre-process the training and validation splits.
# Judging by the frames displayed in the next cells, data_preprocess adds two columns,
# log_venta_hoy and log_demanda_uni_equil, whose values match log1p (the natural logarithm
# of one plus the input) of Venta_hoy and Demanda_uni_equil.
# Why log1p rather than log? log(0) is undefined and values close to zero are mapped to large
# negative numbers, whereas log1p(0) == 0 and the result is non-negative for non-negative inputs.
new_train_df = data_preprocess(new_train_df)

val_df = data_preprocess(val_df)

In [21]:
# Preview the training split after pre-processing (note the added log_* columns)
new_train_df.head(n=3)

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,log_venta_hoy,log_demanda_uni_equil
0,3,1110,7,3301,15766,1212,3,25.14,0,0.0,3,3.263467,1.386294
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,4,3.541539,1.609438
2,3,1110,7,3301,15766,1238,4,39.32,0,0.0,4,3.696848,1.609438


In [22]:
# Preview the validation split after pre-processing (note the added log_* columns)
val_df.head(n=3)

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,log_venta_hoy,log_demanda_uni_equil
63771751,9,1110,7,3301,15766,1212,1,8.38,0,0.0,1,2.23858,0.693147
63771752,9,1110,7,3301,15766,1238,2,19.66,0,0.0,2,3.028199,1.098612
63771753,9,1110,7,3301,15766,1240,2,16.76,0,0.0,2,2.876949,1.098612


In [23]:
# Here, we change the features ['Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID'] into categories
# NOTE(review): only the training split is converted; val_df is not passed through
# change_type_to_categ — confirm that merge_feature copes with the differing dtypes.
new_train_df = change_type_to_categ(new_train_df)

In [24]:
# Aggregate the existing columns of the training split (weeks before 9) into new features
# (presumably the mean_*/std_* columns that appear after the merge below — TODO confirm)
temp = feature_engineering(new_train_df)

In [25]:
# Merge the aggregated training features onto the validation rows.
# As the preview below shows, the result keeps the targets Demanda_uni_equil and
# log_demanda_uni_equil alongside the merged feature columns.
val = merge_feature(val_df, temp, 'val')

In [26]:
# Preview the merged validation frame (targets plus aggregated features)
val.head(n=3)

Unnamed: 0,Demanda_uni_equil,log_demanda_uni_equil,mean_due_agencia,mean_due_canal,mean_due_ruta,mean_due_cliente,mean_due_prod_age,mean_due_prod_rut,mean_due_prod_cli,mean_due_prod_can,mean_due_prod_cli_age,mean_vh_agencia,std_due_acrcp,mean_due_acrcp
0,1,0.693147,2.010012,1.996967,2.08378,1.719464,1.587694,1.556331,1.599232,1.512998,1.599232,4.113977,0.165949,1.599232
1,2,1.098612,2.010012,1.996967,2.08378,1.719464,1.680625,1.553152,1.212066,1.601887,1.212066,4.113977,0.320511,1.212066
2,2,1.098612,2.010012,1.996967,2.08378,1.719464,1.931563,1.803948,1.746179,1.819226,1.746179,4.113977,0.500848,1.746179


In [27]:
# Persist the validation features as CSV, without the index column
val.to_csv(path_or_buf='../data/processed_val.csv', index=False)

#### for the test data

In [28]:
# Pre-process the full training set (all weeks, including week 9) for the test-time features
real_train_df = data_preprocess(train_df)

In [30]:
# Same category conversion as in the validation pipeline, now applied to the full training set
real_train_df = change_type_to_categ(real_train_df)

In [31]:
# Recompute the aggregated features from all training weeks.
# Note: this rebinds `temp`, replacing the features built earlier from the weeks before 9.
temp = feature_engineering(real_train_df)

In [32]:
# Same merge as for validation, this time attaching the features to the real test set
test = merge_feature(test_df, temp, 'test')

In [35]:
# NaN values appear where a test row's combination of features was never seen in the
# training data, so there is no aggregate to merge in.
# (std_due_acrcp can be NaN even when the matching mean is present — presumably a group
# with a single observation; TODO confirm.)
test.head(n=3)

Unnamed: 0,mean_due_agencia,mean_due_canal,mean_due_ruta,mean_due_cliente,mean_due_prod_age,mean_due_prod_rut,mean_due_prod_cli,mean_due_prod_can,mean_due_prod_cli_age,mean_vh_agencia,std_due_acrcp,mean_due_acrcp
0,1.433683,1.529564,1.321099,1.179669,1.901905,1.630124,1.609438,2.068613,1.609438,3.295887,,1.609438
1,1.440236,1.529564,1.392361,1.25245,1.017037,1.006477,,1.192182,,3.351941,,
2,1.552585,1.529564,1.511835,1.76314,1.527452,1.355031,1.098612,1.422284,1.098612,3.370756,0.0,1.098612


In [34]:
# Persist the test features as CSV, without the index column
test.to_csv(path_or_buf='../data/processed_test.csv', index=False)