In [219]:
import math
import datetime
import pandas as pd
import numpy as np
import time
from statistics import median, stdev
from scipy.stats import iqr

from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import OrdinalEncoder

import timeit
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is wanted.
warnings.filterwarnings('ignore')

# Remove pandas' display truncation: None means no limit on shown columns / rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# All inputs are pipe-separated CSV files that live in the same data directory.
data_path = '../data/'
items, infos, orders = (
    pd.read_csv(path_template.format(data_path), sep='|')
    for path_template in ('{}items.csv', '{}infos.csv', '{}orders0206_train.csv')
)

In [4]:
# Parse the raw timestamp text into proper datetimes.
orders['daytime'] = pd.to_datetime(orders["time"])

# Date
# Calendar day of each order line (time of day dropped).
orders["date"] = orders['daytime'].dt.date

In [5]:
# Preview the first rows of the order lines, including the derived daytime/date columns.
orders.head()

Unnamed: 0,time,transactID,itemID,order,salesPrice,daytime,date
0,2018-01-01 00:01:56,2278968,450,1,17.42,2018-01-01 00:01:56,2018-01-01
1,2018-01-01 00:01:56,2278968,83,1,5.19,2018-01-01 00:01:56,2018-01-01
2,2018-01-01 00:07:11,2255797,7851,2,20.47,2018-01-01 00:07:11,2018-01-01
3,2018-01-01 00:09:24,2278968,450,1,17.42,2018-01-01 00:09:24,2018-01-01
4,2018-01-01 00:09:24,2278968,83,1,5.19,2018-01-01 00:09:24,2018-01-01


In [180]:
# Total units ordered per item and calendar day.
# The aggregation is given by name ('sum') instead of the builtin `sum`: pandas
# deprecates passing the builtin callable, and warnings are silenced in this
# notebook, so that deprecation would otherwise go unnoticed.
aggregated_orders = (
    orders
    .groupby(['itemID', 'date'])
    .agg(total_orders=pd.NamedAgg(column='order', aggfunc='sum'))
    .reset_index()
)

In [181]:
# Preview the per-item, per-day order totals.
aggregated_orders.head()

Unnamed: 0,itemID,date,total_orders
0,1,2018-01-23,1
1,1,2018-01-25,1
2,1,2018-01-29,307
3,1,2018-01-30,3
4,1,2018-01-31,1


In [130]:
# add 0 sales for missing dates per itemID
# Each item's series runs from its first ordering day up to 2018-06-01; days
# without orders get total_orders == 0.
# The per-item frames are collected in a list and concatenated once, so the
# cell gives the same result on every run (no dependence on a df_timeseries
# left over in the kernel) and avoids repeated, quadratic concatenation.
last_day = datetime.date(2018, 6, 1)
item_frames = []
for prod, item_rows in aggregated_orders.groupby('itemID', sort=False):
    s = item_rows[["date", "total_orders"]].set_index("date")
    idx = pd.date_range(s.index.min(), last_day)
    # reindex needs matching index types: convert the date objects to a DatetimeIndex
    s.index = pd.DatetimeIndex(s.index)
    s = s.reindex(idx, fill_value=0)
    s["itemID"] = prod
    item_frames.append(s)
df_timeseries = pd.concat(item_frames, ignore_index=False)

In [131]:
# Copy the frame's index into a regular 'date' column, then replace the index
# with a plain positional one.
df_timeseries = df_timeseries.assign(date=df_timeseries.index).reset_index(drop=True)

In [183]:
# Rebind df_timeseries to the per-day aggregate (same object, no copy), so the
# following steps only see days on which an item was actually ordered.
# NOTE(review): this replaces whatever df_timeseries held before, including a
# zero-filled version if one was built earlier — confirm that is intended.
df_timeseries = aggregated_orders

In [184]:
# Distinct item identifiers, in order of first appearance in the frame.
unique_itemIDs = list(pd.unique(df_timeseries['itemID']))

In [185]:
# Map every itemID to the list of its daily order totals (rows kept in frame order).
# One grouped pass replaces re-filtering the whole frame once per item;
# sort=False keeps the keys in order of first appearance, matching the order
# of df_timeseries['itemID'].unique().
daily_orders = {
    itemID: list(item_totals)
    for itemID, item_totals in df_timeseries.groupby('itemID', sort=False)['total_orders']
}



In [186]:
# Map every itemID to the list of dates on which it has a row (rows kept in frame order).
# One grouped pass replaces re-filtering the whole frame once per item;
# sort=False keeps the keys in order of first appearance, matching the order
# of df_timeseries['itemID'].unique().
days = {
    itemID: list(item_dates)
    for itemID, item_dates in df_timeseries.groupby('itemID', sort=False)['date']
}

In [187]:
# One row per item: its id, its list of daily totals and the matching list of days.
# The two dicts are paired positionally, so they must share the same key order.
daily_orders_item = pd.DataFrame(
    {
        'itemID': [*daily_orders.keys()],
        'list_daily_orders': [*daily_orders.values()],
        'list_ordering_days': [*days.values()],
    }
)


In [188]:
# Preview the per-item lists of daily totals and ordering days.
daily_orders_item.head()

Unnamed: 0,itemID,list_daily_orders,list_ordering_days
0,1,"[1, 1, 307, 3, 1, 2, 1, 1, 1, 27, 3, 2, 1, 1, ...","[2018-01-23, 2018-01-25, 2018-01-29, 2018-01-3..."
1,2,"[1, 1, 1, 2]","[2018-02-24, 2018-02-26, 2018-05-27, 2018-05-29]"
2,3,"[1, 89, 2, 1, 1, 1, 1, 90, 1, 1, 2]","[2018-01-10, 2018-01-18, 2018-01-19, 2018-01-2..."
3,4,"[1, 1, 2, 42, 1, 1, 4, 1]","[2018-01-18, 2018-02-28, 2018-04-08, 2018-04-2..."
4,5,"[1, 1, 1, 1, 124, 2, 1, 4, 39, 2, 1]","[2018-01-06, 2018-01-07, 2018-01-21, 2018-02-1..."


In [205]:
# Flatten the per-item lists into one long list of daily totals across all items.
all_orders = [
    order
    for item_orders in daily_orders_item['list_daily_orders']
    for order in item_orders
]

In [190]:
def identify_outliers(item):
    """Label each daily order total of one item as inlier or outlier.

    Parameters
    ----------
    item : mapping or pandas.Series
        Must provide ``item['list_daily_orders']``, a sequence of numbers.

    Returns
    -------
    numpy.ndarray
        One integer per observation: the labels produced by
        ``LocalOutlierFactor.fit_predict`` (1 for inliers, -1 for outliers).
        Items with fewer than two observations are returned as all inliers.
    """
    samples = np.asarray(item['list_daily_orders']).reshape(-1, 1)
    # The neighbour search needs at least one other point besides the sample
    # itself; with 0 or 1 observations fitting raises instead of returning
    # labels, so such items are reported as all inliers without fitting.
    if len(samples) < 2:
        return np.ones(len(samples), dtype=int)
    clf = LocalOutlierFactor()
    return clf.fit_predict(samples)

In [217]:
# Fit one outlier model on the daily totals of all items pooled together,
# using 50 neighbours, and keep the label assigned to every observation.
clf = LocalOutlierFactor(n_neighbors=50)
predictions = clf.fit_predict(
    np.reshape(np.asarray(all_orders), (-1, 1))  # single feature column
)

In [222]:
# Count how many observations received each label.
unique, counts = np.unique(predictions, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{-1: 791, 1: 76293}

In [None]:
# Run the per-item outlier detection on every row; each row receives the
# result returned by identify_outliers for that item's daily totals.
daily_orders_item['is_promotion'] = daily_orders_item.apply(identify_outliers, axis=1)

In [48]:
# Median of each item's daily order totals.
daily_orders_item['median'] = daily_orders_item['list_daily_orders'].apply(median)

In [49]:
# Spread between the 25th and 90th percentile, and a bound two spreads above the median.
daily_orders_item['iqr_90'] = daily_orders_item['list_daily_orders'].apply(iqr, rng=(25, 90))
daily_orders_item['upper_bound_90'] = daily_orders_item['median'] + 2 * daily_orders_item['iqr_90']

# Same construction with the classic 25th-75th percentile range.
daily_orders_item['iqr_75'] = daily_orders_item['list_daily_orders'].apply(iqr, rng=(25, 75))
daily_orders_item['upper_bound_75'] = daily_orders_item['median'] + 2 * daily_orders_item['iqr_75']

In [53]:
# 25th-75th percentile range per item, with the bound placed four ranges above the median.
# This overwrites any iqr_75 / upper_bound_75 columns already present.
daily_orders_item['iqr_75'] = daily_orders_item['list_daily_orders'].apply(iqr, rng=(25, 75))
daily_orders_item['upper_bound_75'] = daily_orders_item['median'] + 4 * daily_orders_item['iqr_75']

In [51]:
# 25th-90th percentile range per item, with the bound placed 1.5 ranges above the median.
daily_orders_item['iqr_90_'] = daily_orders_item['list_daily_orders'].apply(iqr, rng=(25, 90))
daily_orders_item['upper_bound_90_'] = daily_orders_item['median'] + 1.5 * daily_orders_item['iqr_90_']

In [59]:
# Sample standard deviation of each item's daily totals; stdev needs at least
# two values, so items with a single observation get 0.
daily_orders_item['std'] = daily_orders_item['list_daily_orders'].apply(
    lambda item_orders: stdev(item_orders) if len(item_orders) > 1 else 0
)


In [None]:
# Remove the two-spread 25-90 columns from the frame.
daily_orders_item = daily_orders_item.drop(columns=['iqr_90', 'upper_bound_90'])

In [66]:
def _select_upper_bound(row):
    """Pick the promotion threshold for one item row.

    Uses upper_bound_90_ when it does not exceed the item's largest daily
    total; otherwise falls back to upper_bound_75 for items whose standard
    deviation is above 10, and to NaN (no threshold) for all others.
    """
    if row['upper_bound_90_'] <= max(row['list_daily_orders']):
        return row['upper_bound_90_']
    if row['std'] > 10:
        # NOTE(review): this fallback is not compared against the item's maximum,
        # so it can exceed every observed value — confirm that is intended.
        return row['upper_bound_75']
    return np.nan


daily_orders_item['upper_bound'] = daily_orders_item.apply(_select_upper_bound, axis=1)



In [67]:
# Inspect the first 100 items together with their statistics and the selected upper bound.
daily_orders_item.head(100)

Unnamed: 0,itemID,list_daily_orders,list_ordering_days,median,iqr_75,upper_bound_75,iqr_90_,upper_bound_90_,std,upper_bound
0,1,"[1, 1, 307, 3, 1, 2, 1, 1, 1, 27, 3, 2, 1, 1, ...","[2018-01-23, 2018-01-25, 2018-01-29, 2018-01-3...",1.0,1.0,5.0,26.6,40.9,76.984868,40.9
1,2,"[1, 1, 1, 2]","[2018-02-24, 2018-02-26, 2018-05-27, 2018-05-29]",1.0,0.25,2.0,0.7,2.05,0.5,
2,3,"[1, 89, 2, 1, 1, 1, 1, 90, 1, 1, 2]","[2018-01-10, 2018-01-18, 2018-01-19, 2018-01-2...",1.0,1.0,5.0,88.0,133.0,35.712997,5.0
3,4,"[1, 1, 2, 42, 1, 1, 4, 1]","[2018-01-18, 2018-02-28, 2018-04-08, 2018-04-2...",1.0,1.5,7.0,14.4,22.6,14.332157,22.6
4,5,"[1, 1, 1, 1, 124, 2, 1, 4, 39, 2, 1]","[2018-01-06, 2018-01-07, 2018-01-21, 2018-02-1...",1.0,2.0,9.0,38.0,58.0,37.521872,58.0
5,6,"[1, 1, 178, 1, 2, 3, 2, 81]","[2018-02-14, 2018-04-22, 2018-05-11, 2018-05-1...",2.0,21.5,88.0,109.1,165.65,64.608574,165.65
6,7,[1],[2018-02-04],1.0,0.0,1.0,0.0,1.0,0.0,1.0
7,8,"[1, 1, 1, 99, 1, 24, 2, 1, 93]","[2018-01-18, 2018-01-21, 2018-02-15, 2018-03-2...",1.0,23.0,93.0,93.2,140.8,41.093728,93.0
8,9,"[1, 1, 1, 1, 6]","[2018-01-01, 2018-01-12, 2018-02-03, 2018-02-1...",1.0,0.0,1.0,3.0,5.5,2.236068,5.5
9,10,"[1, 355, 1, 1, 1, 1, 60, 1, 1, 1, 1, 3, 1, 1, 1]","[2018-02-15, 2018-02-19, 2018-02-21, 2018-02-2...",1.0,0.0,1.0,36.2,55.3,91.54208,55.3


In [125]:
# Spot check: statistics and selected upper bound for item 5010.
daily_orders_item[daily_orders_item['itemID'] == 5010]

Unnamed: 0,itemID,list_daily_orders,list_ordering_days,median,iqr_75,upper_bound_75,iqr_90_,upper_bound_90_,std,upper_bound
4133,5010,"[1017, 2, 3, 3, 657, 1]","[2018-04-30, 2018-05-04, 2018-05-06, 2018-05-0...",3.0,491.25,1968.0,834.75,1255.125,445.84291,1968.0


In [70]:
# Attach each item's upper bound to its daily rows (default inner join on itemID).
df_timeseries = df_timeseries.merge(
    daily_orders_item[['itemID', 'upper_bound']],
    on='itemID',
)

In [72]:
# Flag a day as a promotion (1) when its total exceeds the item's upper bound.
# Comparisons against a missing (NaN) bound are False, so those days get 0.
df_timeseries['is_promotion'] = (
    df_timeseries['total_orders'] > df_timeseries['upper_bound']
).astype('int64')



In [75]:
# From here on df_timeseries only holds the days flagged as promotions.
df_timeseries = df_timeseries.loc[df_timeseries['is_promotion'].eq(1)]

In [85]:
# For every item in infos, list the dates of its rows in df_timeseries;
# items without any row get an empty list.
train_dates_by_item = {
    item_id: list(item_dates)
    for item_id, item_dates in df_timeseries.groupby('itemID')['date']
}
infos['promotion_train'] = infos['itemID'].map(
    lambda item_id: train_dates_by_item.get(item_id, [])
)

In [81]:
# Preview the item master data.
# NOTE(review): no visible cell adds a promotion_train column to `items` (it is
# assigned to `infos`), so a saved output showing that column is presumably stale — confirm.
items.head()

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,promotion_train
0,1,0,1,4.38,1,1,1,8.84,"[2018-01-29, 2018-04-23]"
1,2,0,2,3.0,1,2,1,16.92,[]
2,3,0,3,5.0,1,3,1,15.89,"[2018-01-18, 2018-02-08]"
3,4,0,2,4.44,1,2,1,40.17,[2018-04-22]
4,5,0,2,2.33,1,1,1,17.04,[2018-04-17]


In [88]:
# Replace empty date lists with NaN so items without promotions show as missing.
# Expects every entry to still be a list: len() fails on an existing NaN.
infos['promotion_train'] = infos['promotion_train'].map(
    lambda dates: np.nan if len(dates) == 0 else dates
)

In [89]:
# Preview infos with the promotion_train column added.
infos.head()

Unnamed: 0,itemID,simulationPrice,promotion,promotion_train
0,1,3.43,,"[2018-01-29, 2018-04-23]"
1,2,9.15,,
2,3,14.04,,"[2018-01-18, 2018-02-08]"
3,4,14.1,,[2018-04-22]
4,5,7.48,,[2018-04-17]


In [124]:
# Spot check: rows currently held in df_timeseries for item 5010.
df_timeseries[df_timeseries['itemID'] == 5010]

Unnamed: 0,itemID,date,total_orders,upper_bound,is_promotion


In [93]:
# NOTE(review): test_orders is not loaded in any cell visible here — it must be
# read before this cell runs on a fresh kernel.
# Parse the raw timestamp text into proper datetimes.
test_orders['daytime'] = pd.to_datetime(test_orders["time"])

# Date
# Calendar day of each order line (time of day dropped).
test_orders["date"] = test_orders['daytime'].dt.date

In [94]:
# Total units ordered per item and calendar day in the test orders.
# Note that this rebinds aggregated_orders to the test-period aggregate.
# The aggregation is given by name ('sum') instead of the builtin `sum`: pandas
# deprecates passing the builtin callable, and warnings are silenced in this
# notebook, so that deprecation would otherwise go unnoticed.
aggregated_orders = (
    test_orders
    .groupby(['itemID', 'date'])
    .agg(total_orders=pd.NamedAgg(column='order', aggfunc='sum'))
    .reset_index()
)


In [96]:
# Attach each item's upper bound (default inner join on itemID, so items
# missing from daily_orders_item drop out) ...
aggregated_orders = aggregated_orders.merge(
    daily_orders_item[['itemID', 'upper_bound']],
    on='itemID',
)
# ... and flag a day as a promotion (1) when its total exceeds that bound.
# Comparisons against a missing (NaN) bound are False, so those days get 0.
aggregated_orders['is_promotion'] = (
    aggregated_orders['total_orders'] > aggregated_orders['upper_bound']
).astype('int64')



In [113]:
# (Re)build promotion_train: for every item in infos, the dates of its rows in
# df_timeseries; items without any row get an empty list.
dates_by_train_item = {
    item_id: list(item_dates)
    for item_id, item_dates in df_timeseries.groupby('itemID')['date']
}
infos['promotion_train'] = infos['itemID'].map(
    lambda item_id: dates_by_train_item.get(item_id, [])
)



In [114]:
# For every item in infos, list the test-period dates flagged as promotions;
# items without any flagged day get an empty list.
# Only rows with is_promotion == 1 are used. Without this filter every date on
# which the item was ordered at all would be reported as a promotion.
promoted_test_days = aggregated_orders.loc[aggregated_orders['is_promotion'] == 1]
test_dates_by_item = {
    item_id: list(item_dates)
    for item_id, item_dates in promoted_test_days.groupby('itemID')['date']
}
infos['promotion_test'] = infos['itemID'].map(
    lambda item_id: test_dates_by_item.get(item_id, [])
)


In [108]:
# Replace empty date lists with NaN so items without train promotions show as missing.
# Expects every entry to still be a list: len() fails on an existing NaN.
infos['promotion_train'] = infos['promotion_train'].map(
    lambda train_dates: np.nan if len(train_dates) == 0 else train_dates
)


In [106]:
# Replace empty date lists with NaN so items without test promotions show as missing.
# Expects every entry to still be a list: len() fails on an existing NaN.
infos['promotion_test'] = infos['promotion_test'].map(
    lambda test_dates: np.nan if len(test_dates) == 0 else test_dates
)


In [122]:
# Preview the first rows of infos.
infos.head()

Unnamed: 0,index,itemID,simulationPrice,promotion,promotion_train,promotion_test,predicted_promotion
0,0,1,3.43,,"[2018-01-29, 2018-04-23]",[],2018-01-29
1,0,1,3.43,,"[2018-01-29, 2018-04-23]",[],2018-04-23
2,1,2,9.15,,[],[],
3,2,3,14.04,,"[2018-01-18, 2018-02-08]",[2018-06-06],2018-01-18
4,2,3,14.04,,"[2018-01-18, 2018-02-08]",[2018-06-06],2018-02-08


In [116]:
def _dates_or_empty(value):
    """Return value when it is a list of dates, otherwise an empty list.

    promotion_train / promotion_test may hold NaN instead of a list for items
    without promotions; adding NaN to a list raises TypeError, so such
    placeholders are treated as "no dates".
    """
    return value if isinstance(value, list) else []


# Combined promotion dates per item: train dates followed by test dates.
infos['predicted_promotion'] = infos.apply(
    lambda row: _dates_or_empty(row['promotion_train']) + _dates_or_empty(row['promotion_test']),
    axis=1,
)



In [121]:
# One row per (item, predicted promotion date); items with an empty list keep
# a single row with a missing date.
# reset_index() keeps the pre-explode row label as a new 'index' column.
# NOTE(review): that column is then part of infos from here on — confirm it is wanted.
infos = (
    infos
    .explode('predicted_promotion')
    .reset_index()
)

In [123]:
# Write the result next to the input data, pipe-separated like the inputs,
# without the DataFrame's own index.
infos.to_csv(
    f'{data_path}infos_promotions.csv',
    sep='|',
    index=False,
)