# Sin and Cos Encoding
This notebook creates a function that encodes an entire year as 26 week-pairs.

In [1]:
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector, promo_detector_fixed, promotionAggregation, dataset_builder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

# Lag-count setting. It is not referenced by any cell visible in this part of
# the notebook — presumably consumed further down or left over (TODO confirm).
NUMBER_OF_LAGS = 4

# Add the datasets directory to the module search path.
# NOTE(review): `from utils import ...` earlier in this same cell runs BEFORE
# this append, so this path cannot be what makes `utils` importable — confirm
# whether the append is still needed.
sys.path.append("../../main/datasets/")
# IPython shell escape: list the contents of the datasets directory.
!ls  ../../main/datasets/

1.0v.zip


<hr>

## Defining metrics

Baseline_score function

In [2]:
def baseline_score(prediction, target, simulatedPrice):
    """Score a set of predictions against the observed targets.

    The prediction is first truncated to integers with ``astype(int)``. Each
    entry then contributes ``(p - 1.6 * max(p - t, 0)) * price``, i.e. every
    unit predicted above the target is charged at 1.6x. The contributions are
    summed into a single number.

    Parameters
    -------------
    prediction : numpy array of predicted values (must support ``astype``)
    target : array-like of observed values, broadcastable against ``prediction``
    simulatedPrice : array-like of per-entry prices used as multipliers

    Return
    -------------
    The summed score as a numpy scalar.
    """
    units = prediction.astype(int)

    # Only the part of the prediction that exceeds the target is penalised.
    overshoot = np.maximum(units - target, 0)
    per_item = (units - overshoot * 1.6) * simulatedPrice
    return np.sum(per_item)

Evaluation Metric

In [3]:
def feval(prediction, dtrain):
    """Custom evaluation metric computed from a dataset object.

    Labels are read with ``dtrain.get_label()`` and per-row prices with
    ``dtrain.get_weight()``. The prediction is truncated to integers and
    scored as ``sum((p - 1.6 * max(p - label, 0)) * price)``.

    Parameters
    -------------
    prediction : numpy array of raw predictions
    dtrain : object exposing ``get_label()`` and ``get_weight()``

    Return
    -------------
    A 3-tuple ``('feval', score, True)``.
    NOTE(review): the trailing ``True`` looks like the "higher is better" flag
    of LightGBM's custom-metric convention — confirm the intended consumer.
    """
    units = prediction.astype(int)
    labels = dtrain.get_label()
    prices = dtrain.get_weight()

    # Units predicted above the label are charged at 1.6x.
    overshoot = np.maximum(units - labels, 0)
    score = np.sum((units - overshoot * 1.6) * prices)
    return 'feval', score, True

Objective Metric

In [4]:
def gradient(predt, dtrain):
    """First-order term of the custom objective.

    With ``f = predt - 1.6 * max(predt - y, 0)`` and ``f' = 1 - 1.6 * (predt > y)``,
    this returns ``-2 * f * f' * sp`` element-wise, where ``y`` comes from
    ``dtrain.get_label()`` and ``sp`` from ``dtrain.get_weight()``.

    Parameters
    -------------
    predt : numpy array of current predictions
    dtrain : object exposing ``get_label()`` and ``get_weight()``

    Return
    -------------
    Array with one gradient value per prediction.
    """
    labels = dtrain.get_label()
    prices = dtrain.get_weight()

    payoff = predt - np.maximum(predt - labels, 0) * 1.6
    # Slope of the payoff: 1 below the label, 1 - 1.6 once above it.
    slope = 1 - (predt > labels) * 1.6
    return -2 * payoff * slope * prices

def hessian(predt, dtrain):
    """Second-order term of the custom objective.

    Returns ``-2 * (1 - 1.6 * (predt > y)) ** 2 * sp`` element-wise, where
    ``y`` comes from ``dtrain.get_label()`` and ``sp`` from ``dtrain.get_weight()``.

    NOTE(review): for non-negative ``sp`` this value is never positive;
    gradient-boosting libraries generally expect a non-negative hessian —
    confirm this sign is intended.

    Parameters
    -------------
    predt : numpy array of current predictions
    dtrain : object exposing ``get_label()`` and ``get_weight()``

    Return
    -------------
    Array with one hessian value per prediction.
    """
    labels = dtrain.get_label()
    prices = dtrain.get_weight()

    # Slope of the payoff: 1 below the label, 1 - 1.6 once above it.
    slope = 1 - (predt > labels) * 1.6
    return -2 * (slope ** 2) * prices

def objective(predt, dtrain):
    """Custom objective: bundle the first- and second-order terms.

    Parameters
    -------------
    predt : numpy array of current predictions
    dtrain : object exposing ``get_label()`` and ``get_weight()``

    Return
    -------------
    A ``(grad, hess)`` tuple produced by ``gradient`` and ``hessian``.
    """
    # The gradient is evaluated before the hessian, then both are returned as a pair.
    return gradient(predt, dtrain), hessian(predt, dtrain)

<hr>

## Building our dataset
This notebook makes this step cleaner than in previous versions, so it'll be tidier and shorter than before!

In [5]:
# Load the three raw tables through the project helper `utils.read_data`.
infos, items, orders = read_data("../../main/datasets/")
# Print the shapes so a wrong or partial load is noticed immediately.
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [6]:
# Convert the time information in `orders` via the project helper.
# The return value is discarded, so this presumably mutates `orders` in
# place — TODO confirm against utils.process_time.
process_time(orders)

In [7]:
# Build the working frame from orders and items with the project helper.
# Judging by the output displayed later, it carries columns such as
# `group_backwards`, `itemID` and `orderSum`.
df = dataset_builder(orders, items)

<hr>

## Feature building

In [9]:
def time_encoder(data, col, max_val):
    """Encode a periodic column as a sine/cosine pair.

    Each value in ``data[col]`` is mapped onto the unit circle using the angle
    ``2 * pi * value / max_val``, so values a whole period apart receive the
    same encoding (up to floating-point rounding).

    Parameters
    -------------
    data : A pandas DataFrame with all the dataset. It is modified IN PLACE.
    col : A string corresponding to the name of the column that will be encoded
    max_val : Size of the time-window of encoding (the period). Must be non-zero.

    Return
    -------------
    The same DataFrame object that was passed in (not a copy), now carrying two
    extra columns: ``col + '_sin'`` and ``col + '_cos'``.

    Raises
    -------------
    ValueError : if ``max_val`` is zero, which would otherwise fill the new
                 columns with NaN/inf without any error.
    """
    # A zero period makes the division below produce inf/NaN features silently.
    if np.any(np.equal(max_val, 0)):
        raise ValueError("max_val must be non-zero")

    # Compute the angle once and reuse it for both projections.
    angle = 2 * np.pi * data[col] / max_val
    data[col + '_sin'] = np.sin(angle)
    data[col + '_cos'] = np.cos(angle)
    return data

In [10]:
# Encode `group_backwards` with a period of 26 (the 26 week-pairs of a year).
# The call adds `group_backwards_sin` / `group_backwards_cos` to `df` in place;
# the returned frame is what this cell displays.
time_encoder(df, 'group_backwards', 26)

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,group_backwards_sin,group_backwards_cos
0,13,1,0.0,0,1,4.38,1,1,1,8.84,-3.216245e-16,-1.000000
1,13,2,0.0,0,2,3.00,1,2,1,16.92,-3.216245e-16,-1.000000
2,13,3,1.0,0,3,5.00,1,3,1,15.89,-3.216245e-16,-1.000000
3,13,4,0.0,0,2,4.44,1,2,1,40.17,-3.216245e-16,-1.000000
4,13,5,2.0,0,2,2.33,1,1,1,17.04,-3.216245e-16,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0.0,180,253,0.00,8,44,8,56.57,2.393157e-01,0.970942
136015,1,10460,0.0,0,253,0.00,8,44,8,163.81,2.393157e-01,0.970942
136016,1,10461,0.0,0,253,0.00,8,44,8,128.01,2.393157e-01,0.970942
136017,1,10462,0.0,180,253,0.00,8,44,8,166.97,2.393157e-01,0.970942
