# PromoSniffer
This notebook tries to work out how a "promotion" is defined in the data.
**Spoiler:** I failed...

In [1]:
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector, promo_detector_fixed, promotionAggregation, dataset_builder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

# Presumably the number of lagged features to build — not referenced in the
# cells visible here; TODO confirm it is still needed.
NUMBER_OF_LAGS = 4

# NOTE(review): this runs after `from utils import ...` above, so it cannot
# influence that import — confirm whether it is still required.
sys.path.append("../../main/datasets/")
# List the dataset folder as a quick check that the relative path resolves.
!ls  ../../main/datasets/

1.0v.zip


<hr>

## Defining metrics

Baseline_score function

In [2]:
def baseline_score(prediction, target, simulatedPrice):
    """Return the price-weighted score of a set of predictions.

    Predictions are truncated to integers. Each item contributes
    ``(units - 1.6 * overshoot) * simulatedPrice``, where ``overshoot`` is
    the amount by which the truncated prediction exceeds ``target``
    (zero when it does not).

    Parameters
    ----------
    prediction : array-like supporting ``.astype``
        Predicted quantities.
    target : array-like
        Observed quantities, broadcastable against ``prediction``.
    simulatedPrice : array-like
        Per-item price used as the weight.

    Returns
    -------
    The sum of the per-item contributions.
    """
    units = prediction.astype(int)
    # Only the part of the prediction above the target is penalised.
    overshoot = np.maximum(units - target, 0)
    per_item = (units - overshoot * 1.6) * simulatedPrice
    return np.sum(per_item)

Evaluation Metric

In [3]:
def feval(prediction, dtrain):
    """Custom evaluation callback computing the price-weighted score.

    Parameters
    ----------
    prediction : array-like supporting ``.astype``
        Raw model predictions; truncated to integers before scoring.
    dtrain : object exposing ``get_label()`` and ``get_weight()``
        ``get_label()`` supplies the targets and ``get_weight()`` supplies
        the per-row simulated prices.

    Returns
    -------
    tuple
        ``('feval', score, True)``. The three-element shape presumably
        follows LightGBM's ``(name, value, is_higher_better)`` convention —
        TODO confirm against the training call.
    """
    units = prediction.astype(int)
    labels = dtrain.get_label()
    prices = dtrain.get_weight()

    # Units predicted above the label are penalised at 1.6x.
    overshoot = np.maximum(units - labels, 0)
    score = np.sum((units - overshoot * 1.6) * prices)
    return 'feval', score, True

<hr>

## Building our dataset
This notebook makes this step cleaner than the previous versions, so it'll be tidier and shorter than before!

In [4]:
# Load the three raw tables from the shared datasets folder.
infos, items, orders = read_data("../../main/datasets/")
# Print the shape of each table as a quick sanity check on the load.
print("Sanity checks...", *(table.shape for table in (infos, items, orders)))

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [5]:
# Changing our time signatures.
# NOTE(review): the return value is discarded, so process_time presumably
# modifies `orders` in place — confirm in utils.
process_time(orders)

In [6]:
# Build the modelling frame from the orders and the item metadata.
# NOTE(review): judging by the frame displayed further down, this looks like
# one row per (group_backwards, itemID) with an orderSum column — confirm in
# utils.dataset_builder.
df = dataset_builder(orders, items)

<hr>

## Feature building

**First appearance Feature**

In [7]:
# Sort so that each item's largest group_backwards comes first; the first row
# per item is then that largest value (presumably the item's earliest week,
# since the groups count backwards — TODO confirm).
orders_sorted_by_week = orders.sort_values(by='group_backwards', ascending=False)
weeks_grouped_by_items = orders_sorted_by_week.groupby('itemID', as_index=False)
items_first_appearance = (
    weeks_grouped_by_items.first()
    .loc[:, ['itemID', 'group_backwards']]
    .rename(columns={'group_backwards': 'first_appearance'})
)

In [8]:
# Inspect the per-item first-appearance table.
items_first_appearance

Unnamed: 0,itemID,first_appearance
0,1,12
1,2,9
2,3,13
3,4,12
4,5,13
...,...,...
9835,10450,2
9836,10459,4
9837,10460,6
9838,10462,5


In [9]:
# Default every row to "not new"; matching rows are set to 1 in a later cell.
df['is_new'] = 0

In [10]:
# Attach each item's first_appearance to every row of that item. The left join
# keeps items without any match, and validate='m:1' makes pandas raise if
# items_first_appearance holds duplicate itemIDs.
df = df.merge(items_first_appearance, on='itemID', how='left', validate='m:1')

In [11]:
# An item is "new" in the row whose group equals its first appearance.
# Rows with a missing first_appearance never compare equal and stay at 0.
appears_this_group = df['first_appearance'].eq(df['group_backwards'])
df.loc[appears_this_group, 'is_new'] = 1

In [12]:
# Default every row to 0; rows preceding an item's first appearance are
# flagged in the next cell.
df['not_selling_yet'] = 0

In [13]:
# Flag rows whose group_backwards is larger than the item's first_appearance.
# NOTE(review): the flag value is 2 rather than 1 — confirm this is intended.
# NOTE(review): rows with a missing first_appearance compare False and keep 0,
# so items without a first_appearance are never flagged here.
before_first_appearance = df['first_appearance'].lt(df['group_backwards'])
df.loc[before_first_appearance, 'not_selling_yet'] = 2

In [14]:
# Inspect the merged frame. Because the merge was a left join, items with no
# entry in items_first_appearance show NaN in first_appearance.
df

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,first_appearance,not_selling_yet
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,12.0,2
1,13,2,0.0,0,2,3.00,1,2,1,16.92,0,9.0,2
2,13,3,1.0,0,3,5.00,1,3,1,15.89,1,13.0,0
3,13,4,0.0,0,2,4.44,1,2,1,40.17,0,12.0,2
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1,13.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0.0,180,253,0.00,8,44,8,56.57,0,4.0,0
136015,1,10460,0.0,0,253,0.00,8,44,8,163.81,0,6.0,0
136016,1,10461,0.0,0,253,0.00,8,44,8,128.01,0,,0
136017,1,10462,0.0,180,253,0.00,8,44,8,166.97,0,5.0,0


In [15]:
# first_appearance was only needed to derive the two flags; remove it so it
# is not carried forward as a feature.
df = df.drop(columns=['first_appearance'])

In [16]:
# Confirm first_appearance is gone while is_new and not_selling_yet remain.
df

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,not_selling_yet
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,2
1,13,2,0.0,0,2,3.00,1,2,1,16.92,0,2
2,13,3,1.0,0,3,5.00,1,3,1,15.89,1,0
3,13,4,0.0,0,2,4.44,1,2,1,40.17,0,2
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0.0,180,253,0.00,8,44,8,56.57,0,0
136015,1,10460,0.0,0,253,0.00,8,44,8,163.81,0,0
136016,1,10461,0.0,0,253,0.00,8,44,8,128.01,0,0
136017,1,10462,0.0,180,253,0.00,8,44,8,166.97,0,0


**How many new items appear in each week (expanding our leak)**

In [17]:
# Count the rows flagged is_new within each group_backwards; the result is
# indexed by group_backwards with a single how_many_new column.
new_items_in_a_given_week = (
    df.groupby('group_backwards')['is_new']
    .sum()
    .to_frame('how_many_new')
)

In [18]:
# Broadcast the per-group count onto every row of that group. The right-hand
# key is the group_backwards index level of new_items_in_a_given_week.
df = df.merge(new_items_in_a_given_week, on=['group_backwards'], validate='m:1')

**What's the definition of a promotion???**

In [19]:
# how_many_new was merged in as one value per group_backwards, so the mean
# within each group simply recovers that value.
df.groupby('group_backwards', as_index=False)[['how_many_new']].mean()

Unnamed: 0,group_backwards,how_many_new
0,1,728
1,2,727
2,3,794
3,4,671
4,5,785
5,6,661
6,7,716
7,8,909
8,9,785
9,10,533


In [20]:
# Add each item's recommendedRetailPrice to the infos table.
retail_prices = items.loc[:, ['itemID', 'recommendedRetailPrice']]
new_infos = infos.merge(retail_prices, on='itemID')

In [21]:
# Keep only the items that have a promotion value.
# Uses notna() rather than negating isna() with unary minus: `-` on a boolean
# mask only works because pandas special-cases it, and NumPy rejects it.
promotionItems = new_infos[new_infos['promotion'].notna()]

In [22]:
def _most_frequent(values):
    """Return the first entry of value_counts(), i.e. the most frequent value."""
    return values.value_counts().index[0]


# Most frequently observed salesPrice per item, indexed by itemID.
salesPriceMode = orders.groupby(['itemID']).agg(
    salesPriceMode=('salesPrice', _most_frequent)
)

In [23]:
# Attach the most frequent sales price to each promoted item. The left join
# keeps promoted items that have no entry in salesPriceMode (they get NaN).
promotionItems = promotionItems.merge(salesPriceMode, on=['itemID'], how='left')

In [24]:
# Gap between the simulation price and the item's most frequent sales price;
# NaN wherever salesPriceMode is missing.
promotionItems['priceDifference'] = promotionItems['simulationPrice'].sub(
    promotionItems['salesPriceMode']
)

In [26]:
# Inspect the promotion table. salesPriceMode and priceDifference are NaN for
# items absent from salesPriceMode, because the merge was a left join.
promotionItems

Unnamed: 0,itemID,simulationPrice,promotion,recommendedRetailPrice,salesPriceMode,priceDifference
0,26,14.27,2018-07-13,23.62,14.25,0.02
1,27,1.64,2018-07-13,5.44,1.64,0.00
2,29,56.13,2018-07-13,29.19,56.13,0.00
3,35,337.84,"2018-07-01,2018-07-02,2018-07-13",173.86,368.13,-30.29
4,58,277.72,2018-07-13,326.30,,
...,...,...,...,...,...,...
1838,10447,44.99,2018-07-10,257.88,,
1839,10448,63.00,2018-07-10,275.08,,
1840,10450,44.43,2018-07-10,36.78,55.65,-11.22
1841,10455,62.84,2018-07-11,1037.50,,


In [120]:
# Reference view of the item metadata.
# NOTE(review): this cell's execution count (120) is out of sequence with the
# surrounding cells, so it was presumably run ad hoc — re-run the notebook
# top to bottom before sharing.
items

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,1,0,1,4.38,1,1,1,8.84
1,2,0,2,3.00,1,2,1,16.92
2,3,0,3,5.00,1,3,1,15.89
3,4,0,2,4.44,1,2,1,40.17
4,5,0,2,2.33,1,1,1,17.04
...,...,...,...,...,...,...,...,...
10458,10459,180,253,0.00,8,44,8,56.57
10459,10460,0,253,0.00,8,44,8,163.81
10460,10461,0,253,0.00,8,44,8,128.01
10461,10462,180,253,0.00,8,44,8,166.97
