In [None]:
# # Reminder (optional, machine-specific): the commented-out lines below switch the
# # working directory to the project root and add it to sys.path -- presumably so that
# # the `bimbo` package can be imported. Adapt the absolute path before uncommenting.
# import os, sys

# os.chdir('/Users/herve/code/hervelao/bakery-products')
# sys.path.append('/Users/herve/code/hervelao/bakery-products')

# Create features

In [1]:
# Import packages
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Autoreload: mode 2 re-imports all modules before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Warnings
# NOTE(review): this silences *every* warning for the rest of the session, including
# pandas' SettingWithCopyWarning and deprecation notices. Consider filtering only the
# specific categories that are known to be noise.
import warnings
warnings.filterwarnings('ignore')

# Import data
from bimbo.data import Bimbo

# Fetch all the tables in a single call; the result is indexed by table name below.
data = Bimbo().get_data()

# Bind each table to its own name for the cells that follow
# (train/test sets first, then the lookup tables and the sample submission).
train_df, test_df = data['train'], data['test']
cliente_tabla_df, producto_tabla_df = data['cliente_tabla'], data['producto_tabla']
town_state_df, sample_submission_df = data['town_state'], data['sample_submission']

### Prepare data

In [2]:
# Tally how many training rows fall in each week ('Semana').
from collections import Counter

rows_per_week = Counter(train_df['Semana'])
rows_per_week

Counter({3: 11165207,
         4: 11009593,
         5: 10615397,
         6: 10191837,
         7: 10382849,
         8: 10406868,
         9: 10408713})

Therefore, we have **seven weeks'** worth of data (weeks 3 to 9).

We are going to validate on the last week of `train_df`, i.e. the rows where `train_df['Semana'] == 9`; the earlier weeks (3 to 8) are used to build the features.

In [11]:
# Hold out the final week (Semana 9) as the validation target; the earlier weeks
# form the history that the features are computed from.
semana = train_df['Semana']
val_X = train_df[semana < 9]
val_y = train_df[semana == 9]

### Feature Engineering

#### For the validation data

In [12]:
from bimbo.preprocessing import *
from bimbo.feature_engineering import *
from bimbo.utils import *

In [13]:
# Add the log-transformed target column `log_demanda_uni_equil` to val_X and val_y.
# np.log1p(x) returns the natural logarithm of (1 + x), element-wise.
# Why log1p rather than log? A demand of 0 would give log(0) = -inf, and values close
# to 0 map to large negative numbers; log1p(0) == 0, so a non-negative demand always
# yields a non-negative result.
# NOTE(review): data_preprocess comes from the bimbo package; presumably it applies this
# same transform to val_X -- confirm.
val_X = data_preprocess(val_X)

# val_y is a boolean-mask slice of train_df. Writing a column into such a slice triggers
# pandas' SettingWithCopyWarning (hidden here by the global warnings filter), so build a
# new frame with .assign instead. Wrapping the Series in pd.DataFrame is not needed.
val_y = val_y.assign(log_demanda_uni_equil=np.log1p(val_y['Demanda_uni_equil']))

In [14]:
# Convert the identifier features ['Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID', 'Producto_ID']
# into categories (change_type2cate is a helper from the bimbo package).
val_X = change_type2cate(val_X)

In [16]:
# Aggregate the existing features over the history weeks (means, plus a standard
# deviation judging by the resulting column names).
# These aggregates are how the time-series nature of the data is exposed to XGBoost.
# NOTE(review): `temp` is reused further down for the test-set features; a more
# descriptive name per stage would make the notebook safer to re-run out of order.
temp = feature_engineering(val_X)

In [23]:
# Merge all the aggregated features in `temp` with the validation target rows (val_y).
# NOTE(review): the third argument ('val') presumably selects the validation variant of
# the merge -- confirm in the bimbo package.
val = merge_feature(val_y, temp, 'val')

In [24]:
# Preview the first rows of the merged validation frame.
val.head()

Unnamed: 0,Demanda_uni_equil,log_demanda_uni_equil,mean_due_age,mean_due_can,mean_due_rut,mean_due_cli,mean_due_pa,mean_due_pr,mean_due_pcli,mean_due_pcan,mean_due_pca,mean_vh_age,sd_due_acrcp,mean_due_acrcp
0,1,0.693147,2.010012,1.996967,2.08378,1.719464,1.587694,1.556331,1.599232,1.512998,1.599232,4.113977,0.165949,1.599232
1,2,1.098612,2.010012,1.996967,2.08378,1.719464,1.680625,1.553152,1.212066,1.601887,1.212066,4.113977,0.320511,1.212066
2,2,1.098612,2.010012,1.996967,2.08378,1.719464,1.931563,1.803948,1.746179,1.819226,1.746179,4.113977,0.500848,1.746179
3,1,0.693147,2.010012,1.996967,2.08378,1.719464,1.655291,1.937759,1.174876,1.782192,1.174876,4.113977,0.274877,1.174876
4,10,2.397895,2.010012,1.996967,2.08378,1.719464,2.266708,1.903195,1.82895,1.979976,1.82895,4.113977,0.71154,1.82895


In [25]:
# Persist the validation features; the row index is not written to the file.
val.to_csv(path_or_buf='../data/processed_val.csv', index=False)

#### For the test data

In [26]:
# Apply the same preprocessing step as for the validation history, now to all training weeks.
# NOTE(review): this rebinds train_df, so the raw frame is no longer available afterwards;
# re-running the validation-split cell after this one would operate on preprocessed data.
train_df = data_preprocess(train_df)

In [27]:
# Convert the identifier columns of the full training set to categories
# (same helper as used for the validation history).
train_df = change_type2cate(train_df)

In [28]:
# Recompute the aggregated features, this time from all training weeks.
# Note: this overwrites the `temp` that held the validation features.
temp = feature_engineering(train_df)

In [30]:
# Merge the aggregated training features with the test rows.
# NOTE(review): the third argument ('test') presumably selects the test variant of the
# merge -- confirm in the bimbo package.
test = merge_feature(test_df, temp, 'test')

In [31]:
# Keep the row ids aside, then remove the two non-feature columns from the test frame.
test_id = test['id']
# Reassign instead of inplace=True: the frame returned by merge_feature is not mutated,
# and `columns=` states the axis explicitly.
test = test.drop(columns=['id', 'Semana'])

In [33]:
# Preview the first rows of the test feature frame.
# NOTE(review): the preview shows missing values in several aggregate columns --
# presumably key combinations that never occur in the training weeks; confirm and decide
# how they should be handled before modelling.
test.head()

Unnamed: 0,mean_due_age,mean_due_can,mean_due_rut,mean_due_cli,mean_due_pa,mean_due_pr,mean_due_pcli,mean_due_pcan,mean_due_pca,mean_vh_age,sd_due_acrcp,mean_due_acrcp
0,1.433683,1.529564,1.321099,1.179669,1.901905,1.630124,1.609438,2.068613,1.609438,3.295887,,1.609438
1,1.440236,1.529564,1.392361,1.25245,1.017037,1.006477,,1.192182,,3.351941,,
2,1.552585,1.529564,1.511835,1.76314,1.527452,1.355031,1.098612,1.422284,1.098612,3.370756,0.0,1.098612
3,1.38793,1.529564,1.567975,1.341171,0.742755,1.059591,,1.031969,,3.211019,,
4,1.496363,1.529564,1.617854,1.341662,,,,1.31075,,3.41091,,


In [32]:
# Persist the test features; the row index is not written to the file.
test.to_csv(path_or_buf='../data/processed_test.csv', index=False)