# Simple Baseline
The main goal of this notebook is to create a good starting baseline.

In [1]:
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector, promotionAggregation
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

NUMBER_OF_LAGS = 4

sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v  1.0v.zip


## Defining metrics

Baseline_score function

In [2]:
def baseline_score(prediction, target, simulatedPrice):
    prediction = prediction.astype(int)
    
#     print("Prediction", prediction)
#     print("Difference", prediction - target)
#     print("Maximum", np.maximum(prediction - target, 0))
#     print("Argsort", np.sort((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice))
    return np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice)

Evaluation Metric

In [3]:
def feval(prediction, dtrain):
    prediction = prediction.astype(int)
    target = dtrain.get_label()
    simulatedPrice = dtrain.get_weight()
    return 'feval', -np.sum((prediction - np.maximum(prediction - target, 0) * 1.6)  * simulatedPrice), False

Objective Metric

In [4]:
def gradient(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight()
    print(np.count_nonzero((predt > y) == False))
    return -2 * (predt - np.maximum(predt - y, 0) * 1.6) * (1 - (predt > y) * 1.6) * sp

def hessian(predt, dtrain):
    y = dtrain.get_label()
    sp = dtrain.get_weight() 
    return -2 * ((1 - (predt > y) * 1.6) ** 2) * sp

def objective(predt, dtrain):
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

## Preparing our dataset
These steps were already seen on ```../pre-processing-features``` notebooks.

In [5]:
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [6]:
# Changing our time signatures, 
# adding our promotion feature 
# and aggregating our data by weeks...
process_time(orders)
orders = promo_detector(orders)
df = promotionAggregation(orders, items)

In [7]:
def prepareOrders(orders, items):
    """This function is responsible for adding in our 'orders' dataframe
    the items that were not sold. THIS IS NOT MODULARIZED, THUS YOU
    SHOULD CHANGE THE CODE TO BETTER SUIT YOUR DATASET FEATURES
    """
    
    df = orders.copy()
    
    # Getting the IDs that were never sold
    not_sold_items = items[np.logical_not(
        items.itemID.isin(sorted(orders['itemID'].unique())))]

    new_rows = []
    weeks_database = orders['group_backwards'].unique()

    for idd in df['itemID'].unique():
        orders_id = df[df.itemID == idd]
        example = orders_id.iloc[0]

        # finding weeks without itemID sales
        weeks_id = orders_id['group_backwards'].unique()
        weeks_without_id = np.setdiff1d(weeks_database, weeks_id)

        # creating new row
        for w in weeks_without_id:
            new_rows.append({'itemID': idd,
                             'group_backwards': w,
                             'salesPrice_mean': 0,
                             'customerRating': example['customerRating'],
                             'category1': example['category1'],
                             'category2': example['category2'],
                             'category3': example['category3'],
                             'recommendedRetailPrice': example['recommendedRetailPrice'],
                             'orderSum': 0,
                             'manufacturer': example['manufacturer'],
                             'brand': example['brand'],
                             'promotion_mean': 0
                             })
    #  Adding rows in every week with the IDs of the
    # items that were never sold.
    df = df.append(new_rows)
    not_sold_orders = pd.DataFrame()
    for i in range(1, 14):
        aux = not_sold_items.copy()
        aux['group_backwards'] = i
        aux['salesPrice_mean'] = 0
        aux['promotion_mean'] = 0
        aux['orderSum'] = 0
        not_sold_orders = pd.concat([not_sold_orders, aux], axis=0)
    df = pd.concat([df, not_sold_orders], axis=0).sort_values(
        ['group_backwards', 'itemID'], ascending=[False, True], ignore_index=True)
    return df

In [8]:
df = prepareOrders(df, items)

In [9]:
# This cell lags and diffs our features 'orderSum' and "promotion"

shifting = df.copy()

for i in range(1, NUMBER_OF_LAGS + 1):
    # Carrying the data of weeks t-1
    shifting[f'orderSum_{i}'] = shifting.groupby('itemID')['orderSum'].shift(i)
    shifting[f'promotion_mean_{i}'] = shifting.groupby('itemID')['promotion_mean'].shift(i)
    
    # Getting the difference of the orders and promotions between weeks t-1 and t-2...
    shifting[f'orderSum_diff_{i}'] = shifting.groupby('itemID')[f'orderSum_{i}'].diff()
    shifting[f'promotion_mean_diff_{i}'] = shifting.groupby('itemID')[f'promotion_mean_{i}'].diff()
shifting.fillna(0, inplace=True)

## Maximum error
The maximum error we could get in this dataset would be just guessing the mean of our sales from weeks 1 to 12, and that's what the cell below is computing.

In [10]:
worst_possible_prediction = shifting.loc[shifting.group_backwards < 13]['orderSum'].mean()
prediction = np.full(shifting.loc[shifting.group_backwards == 13]['orderSum'].shape, worst_possible_prediction) # Array filled with the mean...
target = shifting.loc[shifting.group_backwards == 13]['orderSum']
print("Guessing the mean of 'orderSum' for all items in target", mse(target, prediction) ** 0.5)

Guessing the mean of 'orderSum' for all items in target 90.29706562119341


## Dataset Splitting
All my experiments will use weeks 13 to 3 as a train set, week 2 as our validation set and week 1 as a test set.

In [67]:
train = shifting.loc[shifting.group_backwards >= 3]
val = shifting.loc[shifting.group_backwards == 1]
test = shifting.loc[shifting.group_backwards == 1]

weights = infos.set_index('itemID')['simulationPrice'].to_dict()

w_train = train['itemID'].map(weights)
w_val = val['itemID'].map(weights)

In [68]:
# I recommend to the other members of the team keeping the
# datatypes of our datasets as Pandas DataFrames instead of Numpy,
# since It will easier to use Boosting Analysis frameworks
y_train = train['orderSum']
y_val = val['orderSum']
X_train = train.drop(columns=["orderSum"])
X_val = val.drop(columns=["orderSum"])

## Testing which week achieves the best score as a baseline

In [13]:
for i in range(13, 3, -1):
    print("PROFIT", baseline_score(y_train.loc[X_train.group_backwards == i].values, y_val.values, infos['simulationPrice']))

PROFIT -1509592.3780000003
PROFIT -1304199.3880000003
PROFIT -1481778.3940000003
PROFIT -1785598.9080000005
PROFIT -3202476.704000001
PROFIT -3393890.3020000006
PROFIT -3316071.3480000007
PROFIT -3036484.416
PROFIT -2664976.0720000006
PROFIT -3030506.4560000002


## Testing Tobias's version

In [92]:
# y_val.values[1]


Unnamed: 0,group_backwards,itemID,orderSum,promotion_mean,salesPrice_mean,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,13,1,0,0.0,0.00,0.0,1.0,4.38,1.0,1.0,1.0,8.84
1,13,2,0,0.0,0.00,0.0,2.0,3.00,1.0,2.0,1.0,16.92
2,13,3,1,0.0,14.04,0.0,3.0,5.00,1.0,3.0,1.0,15.89
3,13,4,0,0.0,0.00,0.0,2.0,4.44,1.0,2.0,1.0,40.17
4,13,5,2,0.0,7.84,0.0,2.0,2.33,1.0,1.0,1.0,17.04
...,...,...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0,0.0,0.00,180.0,253.0,0.00,8.0,44.0,8.0,56.57
136015,1,10460,0,0.0,0.00,0.0,253.0,0.00,8.0,44.0,8.0,163.81
136016,1,10461,0,0.0,0.00,0.0,253.0,0.00,8.0,44.0,8.0,128.01
136017,1,10462,0,0.0,0.00,180.0,253.0,0.00,8.0,44.0,8.0,166.97


In [59]:
df.loc[df.group_backwards == 1]['orderSum'].values

1

In [26]:
tobias = pd.read_csv('/home/joaopedromattos/Downloads/Telegram Desktop/tobias_sub.csv', sep='|')

In [55]:
baseline_score(tobias['demandPrediction'].values, df.loc[df.group_backwards == 1]['orderSum'].values, infos['simulationPrice'])

-16568699.568000002

## Testing with the items that sell in every week...

We'll import the variable ```ordersBaseline``` from ```'./dora/pre-processing-features/Relevance Feature Baseline'```, which consists of a binary array of the length equal to the number of distincts itemIDs. This array consists entirely of zeros in every position, except of those indexes that correspond to the ID of an item that is on the set of items that were sold every week of the train.

In [17]:
# Variable taken from './dora/pre-processing-features/Relevance Feature Baseline'
%store -r ordersBaseline

In [18]:
 baseline_score(ordersBaseline, y_val.values, infos['simulationPrice'])

Prediction [0 0 0 ... 0 0 0]
Difference [ 0  0 -1 ...  0  0  0]
Maximum [0 0 0 ... 0 0 0]
Argsort [-8.04 -2.85  0.   ...  1.7  12.51 15.34]


18.66

In [19]:
%store -r all_baselines

In [22]:
for i in all_baselines:
    print("Profit:", baseline_score(i, y_val.values, infos['simulationPrice']))

Prediction [0 0 0 ... 0 0 0]
Difference [ 0  0 -1 ...  0  0  0]
Maximum [0 0 0 ... 0 0 0]
Argsort [ 0.    0.    0.   ...  0.    0.   12.51]
Profit: 12.51
Prediction [0 0 0 ... 0 0 0]
Difference [ 0  0 -1 ...  0  0  0]
Maximum [0 0 0 ... 0 0 0]
Argsort [ 0.    0.    0.   ...  0.   12.51 15.34]
Profit: 27.85
Prediction [0 0 0 ... 0 0 0]
Difference [ 0  0 -1 ...  0  0  0]
Maximum [0 0 0 ... 0 0 0]
Argsort [-2.85  0.    0.   ...  0.   12.51 15.34]
Profit: 25.0
Prediction [0 0 0 ... 0 0 0]
Difference [ 0  0 -1 ...  0  0  0]
Maximum [0 0 0 ... 0 0 0]
Argsort [-2.85  0.    0.   ...  1.7  12.51 15.34]
Profit: 26.7
Prediction [0 0 0 ... 0 0 0]
Difference [ 0  0 -1 ...  0  0  0]
Maximum [0 0 0 ... 0 0 0]
Argsort [-8.04 -2.85  0.   ...  1.7  12.51 15.34]
Profit: 18.66
Prediction [0 0 0 ... 0 0 0]
Difference [ 0  0 -1 ...  0  0  0]
Maximum [0 0 0 ... 0 0 0]
Argsort [-30.9   -8.04  -2.85 ...   1.7   12.51  15.34]
Profit: -12.240000000000006
Prediction [0 0 0 ... 0 0 0]
Difference [ 0  0 -1 ...  0  

In [30]:
last_week_mean = df.loc[df.group_backwards == 3]['orderSum'].mean()

In [32]:
mean_baseline = np.array([last_week_mean] * df['itemID'].nunique())

In [34]:
baseline_score(mean_baseline, y_val.values, infos['simulationPrice'])

Prediction [25 25 25 ... 25 25 25]
Difference [25 25 24 ... 25 25 25]
Maximum [25 25 24 ... 25 25 25]
Argsort [-135826.05 -132196.05 -127941.15 ...   32996.75   54757.75   58243.5 ]


-14602345.287999999

### Utilities

**Predicting at test time**

In [13]:
y_test = test['orderSum']
X_test = test.drop(columns=["orderSum"])
final_predictions = model.predict(X_test)

In [None]:
final_predictions[final_predictions < 0].

**Creating our Kaggle CSV**

In [None]:
final = pd.Series(0, index=np.arange(1, len(items)+1))
final[items.itemID] = final_predictions.astype(int)

final.to_csv("lgbm_kaggle_df.csv", header=["demandPrediction"],
            index_label="itemID", sep="|")

**Saving our model in disk**

In [None]:
now = datetime.now().strftime("%d-%m-%Y-%Hh%Mm%Ss")
modelName = 'lgbm-' + now
bst.save_model(modelName)

# Baseline naive searching
The objective of this notebook is simple: Create a simple baseline that has low loss.