# Baseline model | predicting 2-week blocks (WIP)

In [1]:
import zipfile as zp
import pandas as pd
import numpy as np

import lightgbm as lgb
import xgboost as xgb

from sklearn.metrics import mean_squared_log_error

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots

In [2]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

## Reading data

In [3]:
DATA_FILE = "1.0v.zip"

def read_data(data_dir = "../main/datasets/", data_file = DATA_FILE):
    """Returns the data, in order infos, items, orders"""
    
    with zp.ZipFile(data_dir + DATA_FILE) as z:
        dfs = []
        for name in ["infos", "items", "orders"]:
            dfs.append(pd.read_csv(z.open(f"1.0v/{name}.csv"), sep = "|"))
    
    return dfs

In [4]:
infos_df, items_df, orders_df = read_data()

## Processing data

In [5]:
N_WEEKS = 2

# Creating date feature in format YYYY-MM-DD
orders_df['time'] = pd.to_datetime(orders_df['time'])
orders_df['date'] = orders_df['time'].dt.date
orders_df['month'] = orders_df['time'].dt.month
orders_df['day'] = orders_df['time'].dt.day

# Tranforming date to nº of days since first date
basedate = orders_df['date'].min()
orders_df['date'] = (orders_df['date'] - basedate).dt.days

# Transforming date to 'nº of the n-week blocks - we'll work with 2 weeks for now
orders_df['date'] = orders_df['date'] // (7 * N_WEEKS)

In [6]:
# Grouping orders by day and itemID, getting the sum of orders and mean of salesPrice
orders_by_date = orders_df.groupby(['date', 'itemID'], as_index=False).agg({'order':'sum', 
                                                                            'salesPrice':'mean'})
# Creating dataframe in the usual timeseries format
timeseries = orders_by_date.pivot(index='itemID', columns='date')['order']
timeseries = timeseries.fillna(0)

## Taking a look at our data

In [7]:
orders_by_date.head()

Unnamed: 0,date,itemID,order,salesPrice
0,0,3,1,14.04
1,0,5,2,7.84
2,0,9,2,199.84
3,0,11,3,2.13
4,0,12,1,2.11


#### Must remember to use salesPrice later ***

In [8]:
timeseries.head()

date,0,1,2,3,4,5,6,7,8,9,10,11,12
itemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.0,2.0,314.0,35.0,2.0,1.0,1.0,2.0,300.0,3.0,30.0,0.0,3.0
2,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0
3,1.0,95.0,90.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,139.0
4,0.0,1.0,0.0,0.0,1.0,0.0,2.0,42.0,1.0,1.0,5.0,0.0,145.0
5,2.0,1.0,0.0,1.0,0.0,0.0,0.0,127.0,4.0,39.0,3.0,2.0,1.0


In [9]:
checking = orders_df[(orders_df['date'] == 2) & (orders_df['itemID'] == 1)]
print(f'Verificando produto 1 no bloco 2, vendas = {checking["order"].sum()}')
    
checking = orders_df[(orders_df['date'] == 1) & (orders_df['itemID'] == 3)]
print(f'Verificando produto 3 no bloco 1, vendas = {checking["order"].sum()}')

Verificando produto 1 no bloco 2, vendas = 314
Verificando produto 3 no bloco 1, vendas = 95


## Melting data for modelling

Now each line will become: **itemID, date (2-week-block), orders.**<br>

#### IMPORTANT
Every product must exist every day, even when orders = 0

In [10]:
melt = pd.melt(timeseries.reset_index(), id_vars = 'itemID', var_name = 'date', value_name='orders')

In [11]:
melt

Unnamed: 0,itemID,date,orders
0,1,0,0.0
1,2,0,0.0
2,3,0,1.0
3,4,0,0.0
4,5,0,2.0
...,...,...,...
127915,10450,12,6.0
127916,10459,12,0.0
127917,10460,12,0.0
127918,10462,12,0.0


#### Can merge salesPrice and dates at this point later (using orders_by_date)

## Creating features

In [12]:
LAST_N_BLOCKS = 3

for i in range(1, LAST_N_BLOCKS+1):
    melt[f'last_{i}_week_orders'] = melt.groupby(['itemID'])['orders'].shift(i)
    melt[f'last_{i}_week_diff'] = melt.groupby(['itemID'])[f'last_{i}_week_orders'].diff()

melt = melt.fillna(0)

In [13]:
melt.tail()

Unnamed: 0,itemID,date,orders,last_1_week_orders,last_1_week_diff,last_2_week_orders,last_2_week_diff,last_3_week_orders,last_3_week_diff
127915,10450,12,6.0,150.0,150.0,0.0,0.0,0.0,0.0
127916,10459,12,0.0,0.0,0.0,0.0,0.0,0.0,-1.0
127917,10460,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127918,10462,12,0.0,0.0,0.0,0.0,0.0,0.0,0.0
127919,10463,12,0.0,0.0,-1.0,1.0,1.0,0.0,0.0


In [14]:
melt['date'] = melt['date'].astype(np.int64)

## Validation

#### Later: try using date as categorical feature

In [33]:
params = {
          "objective" : "poisson",
          "metric" :"rmse",
          # "force_row_wise" : True,
          "learning_rate" : 0.1,
          # "sub_row" : 0.75,
          # "bagging_freq" : 1,
          # "lambda_l2" : 0.1,
          # "metric": ["rmse"],
          'verbosity': 1,
          # 'num_leaves': 128,
          # "min_data_in_leaf": 100,
         }

### Without log

In [34]:
mean_errors = []

# Starting validation: 8th 2-week block
for date in range(10, 13):
    train = melt[melt['date'] < date]
    valid = melt[melt['date'] == date]
    
    xtr, xts = train.drop(['orders'], axis=1), valid.drop(['orders'], axis=1)
    ytr, yts = train['orders'].values, valid['orders'].values
    
    lgbtrain = lgb.Dataset(xtr, label = ytr)
    lgbvalid = lgb.Dataset(xts, label = yts)
    
    num_round = 300
    model = lgb.train(params, lgbtrain, num_round, valid_sets = [lgbtrain, lgbvalid], 
                      verbose_eval=20, early_stopping_rounds=20)
    
    preds = model.predict(xts, num_iteration=model.best_iteration)
    
    error = rmsle(yts, preds)
    print(f'Block {date}: {error}')
    mean_errors.append(error)

print(f'Mean error: {np.mean(mean_errors)}')

Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 98.9025	valid_1's rmse: 115.843
[40]	training's rmse: 96.4748	valid_1's rmse: 115.308
[60]	training's rmse: 94.6076	valid_1's rmse: 115.085
[80]	training's rmse: 93.1088	valid_1's rmse: 115.004
[100]	training's rmse: 91.657	valid_1's rmse: 115.01
Early stopping, best iteration is:
[87]	training's rmse: 92.5332	valid_1's rmse: 114.969
Block 10: 2.415872278214388
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 100.224	valid_1's rmse: 106.775
[40]	training's rmse: 97.7739	valid_1's rmse: 106.248
[60]	training's rmse: 96.2621	valid_1's rmse: 105.834
[80]	training's rmse: 94.7408	valid_1's rmse: 105.698
[100]	training's rmse: 93.2116	valid_1's rmse: 105.731
[120]	training's rmse: 91.6931	valid_1's rmse: 105.628
[140]	training's rmse: 90.2327	valid_1's rmse: 105.647
Early stopping, best iteration is:
[125]	training's rmse: 91.3512	valid_1's rmse: 105.616
Block 11: 2.478556229

### With log

In [35]:
mean_errors = []

# Starting validation: 8th 2-week block
for date in range(10, 13):
    train = melt[melt['date'] < date]
    valid = melt[melt['date'] == date]
    
    xtr, xts = train.drop(['orders'], axis=1), valid.drop(['orders'], axis=1)
    ytr, yts = train['orders'].values, valid['orders'].values
    
    lgbtrain = lgb.Dataset(xtr, label = np.log1p(ytr))
    lgbvalid = lgb.Dataset(xts, label = np.log1p(yts))
    
    num_round = 300
    model = lgb.train(params, lgbtrain, num_round, valid_sets = [lgbtrain, lgbvalid], 
                      verbose_eval=20, early_stopping_rounds=20)
    
    preds = np.exp(model.predict(xts, num_iteration=model.best_iteration))
    
    error = rmsle(yts, preds)
    print(f'Block {date}: {error}')
    mean_errors.append(error)

print(f'Mean error: {np.mean(mean_errors)}')

Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 1.31725	valid_1's rmse: 1.65727
[40]	training's rmse: 1.28122	valid_1's rmse: 1.64754
[60]	training's rmse: 1.25871	valid_1's rmse: 1.6432
[80]	training's rmse: 1.2413	valid_1's rmse: 1.63587
[100]	training's rmse: 1.23133	valid_1's rmse: 1.63368
[120]	training's rmse: 1.22035	valid_1's rmse: 1.63362
Early stopping, best iteration is:
[115]	training's rmse: 1.22245	valid_1's rmse: 1.63294
Block 10: 1.635145638128516
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 1.34942	valid_1's rmse: 1.62988
[40]	training's rmse: 1.31339	valid_1's rmse: 1.61949
[60]	training's rmse: 1.28947	valid_1's rmse: 1.61504
[80]	training's rmse: 1.27229	valid_1's rmse: 1.60954
[100]	training's rmse: 1.26064	valid_1's rmse: 1.60813
Early stopping, best iteration is:
[97]	training's rmse: 1.26184	valid_1's rmse: 1.60697
Block 11: 1.6308406311205224
Training until validation scores don't improve 