# New Item Flag

In [1]:
import numpy as np
import pandas as pd
from utils import read_data, process_time, merge_data, promo_detector, promo_detector_fixed, promotionAggregation, dataset_builder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime

# Presumably the number of lagged periods used for lag features; it is not
# referenced in the cells visible in this part of the notebook — TODO confirm.
NUMBER_OF_LAGS = 4

# NOTE(review): this append runs after the `from utils import ...` line above,
# so it cannot influence that import; it only extends the module search path
# for imports executed later. The appended path is the datasets directory —
# confirm it is actually needed on sys.path.
sys.path.append("../../main/datasets/")
!ls  ../../main/datasets/

1.0v.zip


<hr>

## Defining metrics

`baseline_score` function

In [2]:
def baseline_score(prediction, target, simulatedPrice):
    """Return the price-weighted score of integer-truncated predictions.

    Predictions are cast with ``astype(int)``, so fractional parts are
    dropped. Every unit predicted above ``target`` is multiplied by 1.6 and
    subtracted from the prediction, which means an over-predicted unit
    contributes -0.6 instead of +1 before weighting.

    Args:
        prediction: Object exposing ``astype`` (e.g. a NumPy array).
        target: Values compared element-wise against the predictions.
        simulatedPrice: Per-element weights multiplied into the score.

    Returns:
        The sum over all elements of the weighted, penalised predictions.
    """
    units = prediction.astype(int)

    # Only the part of the prediction that exceeds the target is penalised.
    excess = np.maximum(units - target, 0)

    return np.sum((units - excess * 1.6) * simulatedPrice)

Evaluation Metric

In [3]:
def feval(prediction, dtrain):
    """Custom evaluation callback computing the price-weighted score.

    The target is read from ``dtrain.get_label()`` and the per-row price
    from ``dtrain.get_weight()``. Predictions are truncated to integers with
    ``astype(int)``; each unit predicted above the target is multiplied by
    1.6 and subtracted from the prediction before weighting.

    Args:
        prediction: Object exposing ``astype`` (e.g. a NumPy array).
        dtrain: Dataset object exposing ``get_label()`` and ``get_weight()``.

    Returns:
        A ``('feval', score, True)`` tuple.
        NOTE(review): this looks like the (name, value, is_higher_better)
        shape used by LightGBM custom metrics — confirm against the
        training call that consumes it.
    """
    units = prediction.astype(int)
    demand = dtrain.get_label()
    price = dtrain.get_weight()

    # Only the part of the prediction that exceeds the label is penalised.
    excess = np.maximum(units - demand, 0)
    score = np.sum((units - excess * 1.6) * price)

    return 'feval', score, True

<hr>

## Building our dataset
This notebook makes this step cleaner than in the previous versions, so it will be tidier and shorter than before!

In [4]:
# Load the three objects returned by read_data from the datasets directory and
# print their shapes as a quick check that the load produced what we expect.
infos, items, orders = read_data("../../main/datasets/")
print("Sanity checks...", infos.shape, items.shape, orders.shape)

Sanity checks... (10463, 3) (10463, 8) (2181955, 5)


In [5]:
# Changing our time signatures.
# NOTE(review): the return value is discarded, so process_time presumably
# modifies `orders` in place (e.g. adding the `group_backwards` column that
# later cells read) — confirm in utils.
process_time(orders)

In [6]:
# Build the working frame from orders and items. When displayed later in this
# notebook it carries group_backwards, itemID, orderSum and the item attribute
# columns; presumably one row per (group_backwards, itemID) — TODO confirm.
df = dataset_builder(orders, items)

<hr>

## Feature building

In [7]:
# For every item, find the largest `group_backwards` value among its orders:
# sorting descending and taking the first value per item yields that maximum,
# which is stored as the item's `first_appearance`.
orders_sorted_by_week = orders.sort_values(by='group_backwards', ascending=False)
weeks_grouped_by_items = orders_sorted_by_week.groupby('itemID', as_index=False)
items_first_appearance = (
    weeks_grouped_by_items
    .first()[['itemID', 'group_backwards']]
    .rename(columns={'group_backwards': 'first_appearance'})
)

In [8]:
# Display the per-item first-appearance table for a visual check.
items_first_appearance

Unnamed: 0,itemID,first_appearance
0,1,12
1,2,9
2,3,13
3,4,12
4,5,13
...,...,...
9835,10450,2
9836,10459,4
9837,10460,6
9838,10462,5


In [9]:
# Start with every row flagged as not new; matching rows are set to 1 in a
# later cell once first_appearance has been merged in.
df['is_new'] = 0

In [10]:
# Attach each item's first_appearance to every row of df. The left join keeps
# all df rows (items missing from items_first_appearance get NaN), and
# validate='m:1' makes pandas raise if itemID is not unique on the right side.
# Gotcha: running this while df already has a first_appearance column yields
# suffixed duplicates (first_appearance_x / first_appearance_y).
df = df.merge(
    items_first_appearance,
    on='itemID',
    how='left',
    validate='m:1',
)

In [11]:
# Flag the rows whose group is the item's first appearance.
# NOTE(review): first_appearance is NaN for items that have no row in
# items_first_appearance (left merge); the comparison is False there, so those
# rows keep is_new == 0 — confirm this is intended.
df.loc[df['first_appearance'] == df['group_backwards'], 'is_new'] = 1

In [12]:
# Initialise the second flag to 0 for every row; it is set in the next cell.
df['not_selling_yet'] = 0

In [13]:
# Mark rows whose group_backwards is larger than the item's first_appearance,
# i.e. groups that lie on the other side of the first appearance from the
# rows that follow it.
# NOTE(review): the flag is set to 2 rather than 1. The saved output at the end
# of this notebook shows is_new containing 2, so this value looks carried over
# from an earlier three-state is_new encoding — confirm 2 is still wanted.
# NOTE(review): rows with NaN first_appearance compare False and stay 0.
df.loc[df['first_appearance'] < df['group_backwards'], 'not_selling_yet'] = 2

In [14]:
# Display the frame with both flags and the first_appearance helper column.
df

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new,first_appearance,not_selling_yet
0,13,1,0.0,0,1,4.38,1,1,1,8.84,0,12.0,2
1,13,2,0.0,0,2,3.00,1,2,1,16.92,0,9.0,2
2,13,3,1.0,0,3,5.00,1,3,1,15.89,1,13.0,0
3,13,4,0.0,0,2,4.44,1,2,1,40.17,0,12.0,2
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1,13.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0.0,180,253,0.00,8,44,8,56.57,0,4.0,0
136015,1,10460,0.0,0,253,0.00,8,44,8,163.81,0,6.0,0
136016,1,10461,0.0,0,253,0.00,8,44,8,128.01,0,,0
136017,1,10462,0.0,180,253,0.00,8,44,8,166.97,0,5.0,0


In [14]:
# first_appearance is dropped here after the flag cells above have used it.
# The drop is in place, so re-running this cell without re-running the merge
# raises KeyError because the column is already gone.
df.drop(columns=['first_appearance'], inplace=True)

In [15]:
# Display the final frame.
# NOTE(review): the saved output under this cell has no not_selling_yet column
# and shows is_new == 2, which the cells above cannot produce; together with
# the repeated In [14] label this points to stale output from an earlier
# revision — restart the kernel and run all cells to refresh it.
df

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,is_new
0,13,1,0.0,0,1,4.38,1,1,1,8.84,2
1,13,2,0.0,0,2,3.00,1,2,1,16.92,2
2,13,3,1.0,0,3,5.00,1,3,1,15.89,1
3,13,4,0.0,0,2,4.44,1,2,1,40.17,2
4,13,5,2.0,0,2,2.33,1,1,1,17.04,1
...,...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0.0,180,253,0.00,8,44,8,56.57,0
136015,1,10460,0.0,0,253,0.00,8,44,8,163.81,0
136016,1,10461,0.0,0,253,0.00,8,44,8,128.01,0
136017,1,10462,0.0,180,253,0.00,8,44,8,166.97,0
