Determine promotions based on the IQR / z-score of items in the same soldWeeklyBin

In [1]:
import math
import datetime
import pandas as pd
import numpy as np
import time
from statistics import median, stdev
from scipy.stats import iqr

from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import OrdinalEncoder

import timeit
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation / FutureWarning messages emitted by the libraries used below.
warnings.filterwarnings('ignore')

# Remove pandas' display truncation so frames are rendered with all columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Widen the notebook container to 90% of the browser window so wide frames fit.
# `display` and `HTML` are taken from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
# Read the four pipe-separated input tables that live under `data_path`.
data_path = '../data/'
_csv_templates = ('{}items.csv', '{}ex_items_all.csv', '{}infos.csv', '{}orders.csv')
items, ex_items, infos, orders = (
    pd.read_csv(template.format(data_path), sep='|') for template in _csv_templates
)

In [4]:
# Parse the raw 'time' strings once, keep the full timestamp in 'daytime' and the
# calendar day (datetime.date objects) in 'date'.
order_timestamps = pd.to_datetime(orders["time"])
orders['daytime'] = order_timestamps
orders["date"] = order_timestamps.dt.date

In [5]:
# Preview the first rows of orders, including the derived 'daytime' and 'date' columns.
orders.head()

Unnamed: 0,time,transactID,itemID,order,salesPrice,daytime,date
0,2018-01-01 00:01:56,2278968,450,1,17.42,2018-01-01 00:01:56,2018-01-01
1,2018-01-01 00:01:56,2278968,83,1,5.19,2018-01-01 00:01:56,2018-01-01
2,2018-01-01 00:07:11,2255797,7851,2,20.47,2018-01-01 00:07:11,2018-01-01
3,2018-01-01 00:09:24,2278968,450,1,17.42,2018-01-01 00:09:24,2018-01-01
4,2018-01-01 00:09:24,2278968,83,1,5.19,2018-01-01 00:09:24,2018-01-01


In [6]:
# Total number of units ordered per item and calendar day.
# The aggregation is given as the string 'sum' rather than the Python builtin `sum`:
# recent pandas versions warn that a builtin callable will stop being mapped to the
# optimised groupby sum, and that warning is hidden by the global warning filter.
aggregated_orders = (
    orders
    .groupby(['itemID', 'date'])
    .agg(total_orders=pd.NamedAgg(column='order', aggfunc='sum'))
    .reset_index()
)

In [7]:
# Preview the per-item, per-day order totals.
aggregated_orders.head()

Unnamed: 0,itemID,date,total_orders
0,1,2018-01-23,1
1,1,2018-01-25,1
2,1,2018-01-29,307
3,1,2018-01-30,3
4,1,2018-01-31,1


In [8]:
# Attach the per-item attributes from ex_items to every (item, day) row.
item_attributes = ex_items.set_index("itemID")
df_timeseries = aggregated_orders.join(item_attributes, on="itemID")
df_timeseries.head()

Unnamed: 0,itemID,date,total_orders,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,minSalesPrice,maxSalesPrice,meanSalesPrice,minSalesPriceCounts,maxSalesPriceCounts,unitsSold,revenue,relRevenue,dateFirstSell,dateLastSell,soldDaily,soldWeekly,soldMonthly,recentlySold,soldWeeklyContinuous
0,1,2018-01-23,1,0,1,4.38,1,1,1,8.84,3.11,3.43,3.111661,690.0,3.0,693.0,2156.19,2.4e-05,2018-01-23,2018-06-26,0.206452,False,1.0,4.0,0.73913
1,1,2018-01-25,1,0,1,4.38,1,1,1,8.84,3.11,3.43,3.111661,690.0,3.0,693.0,2156.19,2.4e-05,2018-01-23,2018-06-26,0.206452,False,1.0,4.0,0.73913
2,1,2018-01-29,307,0,1,4.38,1,1,1,8.84,3.11,3.43,3.111661,690.0,3.0,693.0,2156.19,2.4e-05,2018-01-23,2018-06-26,0.206452,False,1.0,4.0,0.73913
3,1,2018-01-30,3,0,1,4.38,1,1,1,8.84,3.11,3.43,3.111661,690.0,3.0,693.0,2156.19,2.4e-05,2018-01-23,2018-06-26,0.206452,False,1.0,4.0,0.73913
4,1,2018-01-31,1,0,1,4.38,1,1,1,8.84,3.11,3.43,3.111661,690.0,3.0,693.0,2156.19,2.4e-05,2018-01-23,2018-06-26,0.206452,False,1.0,4.0,0.73913


In [9]:
# Distinct item ids in order of first appearance in the time series.
unique_itemIDs = list(pd.unique(df_timeseries['itemID']))

In [10]:
# Map each itemID to the list of its daily order totals (in row order).
# A single groupby replaces one full-frame boolean filter per item, which scaled
# with (number of items x number of rows). `sort=False` keeps the groups in order of
# first appearance, i.e. the same key order as `df_timeseries['itemID'].unique()`.
daily_orders = (
    df_timeseries
    .groupby('itemID', sort=False)['total_orders']
    .agg(list)
    .to_dict()
)



In [11]:
# Map each itemID to the list of dates on which it was ordered (in row order).
# A single groupby replaces one full-frame boolean filter per item, which scaled
# with (number of items x number of rows). `sort=False` keeps the groups in order of
# first appearance, i.e. the same key order as `df_timeseries['itemID'].unique()`.
days = (
    df_timeseries
    .groupby('itemID', sort=False)['date']
    .agg(list)
    .to_dict()
)

In [12]:
# One row per item: its id, the list of daily order totals and the list of order days.
# The day lists are taken positionally from `days`, which is expected to share the
# key order of `daily_orders`.
item_ids = list(daily_orders)
daily_orders_item = pd.DataFrame({
    'itemID': item_ids,
    'list_daily_orders': [daily_orders[item_id] for item_id in item_ids],
    'list_ordering_days': list(days.values()),
})


In [13]:
# Preview the per-item lists of daily order totals and ordering days.
daily_orders_item.head()

Unnamed: 0,itemID,list_daily_orders,list_ordering_days
0,1,"[1, 1, 307, 3, 1, 2, 1, 1, 1, 27, 3, 2, 1, 1, ...","[2018-01-23, 2018-01-25, 2018-01-29, 2018-01-3..."
1,2,"[1, 1, 1, 2]","[2018-02-24, 2018-02-26, 2018-05-27, 2018-05-29]"
2,3,"[1, 89, 2, 1, 1, 1, 1, 90, 1, 1, 2, 1, 1, 138, 1]","[2018-01-10, 2018-01-18, 2018-01-19, 2018-01-2..."
3,4,"[1, 1, 2, 42, 1, 1, 4, 1, 142, 2, 1]","[2018-01-18, 2018-02-28, 2018-04-08, 2018-04-2..."
4,5,"[1, 1, 1, 1, 124, 2, 1, 4, 39, 2, 1, 1, 1, 1]","[2018-01-06, 2018-01-07, 2018-01-21, 2018-02-1..."


In [14]:
# Enrich the per-item lists with the ex_items attributes, then cut
# 'soldWeeklyContinuous' into as many bins as there are labels (1..5).
item_attributes_by_id = ex_items.set_index("itemID")
daily_orders_item_categ = daily_orders_item.join(item_attributes_by_id, on="itemID")
bin_labels = list(range(1, 6))
daily_orders_item_categ["soldWeeklyBin"] = pd.cut(
    daily_orders_item_categ['soldWeeklyContinuous'],
    len(bin_labels),
    labels=bin_labels,
)

In [15]:
def flatten(l):
    """Return one flat list holding the elements of every sub-iterable of ``l``, in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat
# Pool the daily order totals of all items that fall into the same soldWeeklyBin.
daily_orders_category = {}
for bin_label in bin_labels:
    in_bin = daily_orders_item_categ['soldWeeklyBin'] == bin_label
    per_item_lists = list(daily_orders_item_categ.loc[in_bin]['list_daily_orders'])
    daily_orders_category[bin_label] = flatten(per_item_lists)

# One row per bin: the bin label and the pooled list of daily order totals.
daily_orders_cat = pd.DataFrame({
    'soldWeeklyBin': list(daily_orders_category),
    'list_daily_orders': list(daily_orders_category.values()),
})

In [16]:
# Per-bin statistics of the pooled daily order totals and two candidate thresholds.
#
# Dead work has been removed: 'iqr_75' / 'upper_bound_75' used to be computed with a
# factor of 2 and then immediately overwritten with a factor of 4, and 'iqr_90' /
# 'upper_bound_90' were computed only to be dropped again. The resulting columns
# (median, iqr_75, upper_bound_75, iqr_90_, upper_bound_90_, std) and their order are
# the same as before.
NARROW_RANGE_FACTOR = 4  # multiplier for the 25th-75th percentile range
WIDE_RANGE_FACTOR = 2    # multiplier for the 25th-90th percentile range

daily_orders_cat['median'] = daily_orders_cat.apply(lambda row: median(row['list_daily_orders']), axis=1)

# Threshold based on the classic inter-quartile range (25th-75th percentile).
daily_orders_cat['iqr_75'] = daily_orders_cat.apply(lambda row: iqr(row['list_daily_orders'], rng=(25, 75)), axis=1)
daily_orders_cat['upper_bound_75'] = daily_orders_cat.apply(lambda row: row['median'] + NARROW_RANGE_FACTOR * row['iqr_75'], axis=1)

# Threshold based on the wider 25th-90th percentile range.
daily_orders_cat['iqr_90_'] = daily_orders_cat.apply(lambda row: iqr(row['list_daily_orders'], rng=(25, 90)), axis=1)
daily_orders_cat['upper_bound_90_'] = daily_orders_cat.apply(lambda row: row['median'] + WIDE_RANGE_FACTOR * row['iqr_90_'], axis=1)

# Sample standard deviation; defined as 0 when a bin holds fewer than two values.
daily_orders_cat['std'] = daily_orders_cat.apply(lambda row: stdev(row['list_daily_orders']) if len(row['list_daily_orders']) > 1 else 0, axis=1)

In [17]:
def _select_upper_bound(row):
    """Pick the promotion threshold for one bin.

    Uses the 25-90 based bound when it does not exceed the bin's largest daily
    order total; otherwise falls back to the 25-75 based bound if the standard
    deviation is above 10, and to NaN when it is not.
    """
    if row['upper_bound_90_'] <= max(row['list_daily_orders']):
        return row['upper_bound_90_']
    if row['std'] > 10:
        return row['upper_bound_75']
    return np.nan


daily_orders_cat['upper_bound'] = daily_orders_cat.apply(_select_upper_bound, axis=1)

In [18]:
# Inspect the per-bin statistics and the selected upper bound.
daily_orders_cat.head(5)

Unnamed: 0,soldWeeklyBin,list_daily_orders,median,iqr_75,upper_bound_75,iqr_90_,upper_bound_90_,std,upper_bound
0,1,"[1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 2, 1, ...",1.0,9.0,37.0,39.0,79.0,66.201207,79.0
1,2,"[1, 1, 1, 2, 1, 89, 2, 1, 1, 1, 1, 90, 1, 1, 2...",1.0,4.0,17.0,49.0,99.0,63.454749,99.0
2,3,"[1, 355, 1, 1, 1, 1, 60, 1, 1, 1, 1, 3, 1, 1, ...",1.0,2.0,9.0,52.0,105.0,82.247679,105.0
3,4,"[1, 1, 307, 3, 1, 2, 1, 1, 1, 27, 3, 2, 1, 1, ...",1.0,2.0,9.0,49.0,99.0,101.673324,99.0
4,5,"[4, 1, 1, 1, 3, 1, 1, 32, 1, 1, 1, 1, 1, 11, 1...",1.0,2.0,9.0,51.0,103.0,137.707603,103.0


In [19]:
# Add each item's soldWeeklyBin and that bin's upper bound to the per-item frame.
# The frame is reassigned to its own name, so any 'soldWeeklyBin' / 'upper_bound'
# columns left over from an earlier run of this cell are dropped first; without that
# a second execution fails because DataFrame.join refuses overlapping column names.
daily_orders_item = (
    daily_orders_item
    .drop(columns=['soldWeeklyBin', 'upper_bound'], errors='ignore')
    .join(daily_orders_item_categ.set_index("itemID")[["soldWeeklyBin"]], on="itemID")
    .join(daily_orders_cat.set_index("soldWeeklyBin")[["upper_bound"]], on="soldWeeklyBin")
)
daily_orders_item.head()

Unnamed: 0,itemID,list_daily_orders,list_ordering_days,soldWeeklyBin,upper_bound
0,1,"[1, 1, 307, 3, 1, 2, 1, 1, 1, 27, 3, 2, 1, 1, ...","[2018-01-23, 2018-01-25, 2018-01-29, 2018-01-3...",4,99.0
1,2,"[1, 1, 1, 2]","[2018-02-24, 2018-02-26, 2018-05-27, 2018-05-29]",2,99.0
2,3,"[1, 89, 2, 1, 1, 1, 1, 90, 1, 1, 2, 1, 1, 138, 1]","[2018-01-10, 2018-01-18, 2018-01-19, 2018-01-2...",2,99.0
3,4,"[1, 1, 2, 42, 1, 1, 4, 1, 142, 2, 1]","[2018-01-18, 2018-02-28, 2018-04-08, 2018-04-2...",2,99.0
4,5,"[1, 1, 1, 1, 124, 2, 1, 4, 39, 2, 1, 1, 1, 1]","[2018-01-06, 2018-01-07, 2018-01-21, 2018-02-1...",2,99.0


In [20]:
# get upper-bound for test data
daily_orders_item_categ.groupby("soldWeeklyBin").min()[["soldWeeklyContinuous"]]

Unnamed: 0_level_0,soldWeeklyContinuous
soldWeeklyBin,Unnamed: 1_level_1
1,0.08
2,0.266667
3,0.45
4,0.636364
5,0.818182


In [21]:
# Attach each item's upper bound to ex_items; items without one (no row in
# daily_orders_item, or a NaN bound) receive the mean of the per-bin upper bounds.
# A stale 'upper_bound' column from an earlier run of this cell is dropped first,
# because DataFrame.join raises on overlapping column names.
ex_items = (
    ex_items
    .drop(columns=['upper_bound'], errors='ignore')
    .join(daily_orders_item.set_index("itemID")[["upper_bound"]], on="itemID", how="left")
)
ex_items["upper_bound"] = ex_items["upper_bound"].fillna(daily_orders_cat["upper_bound"].mean())
ex_items.head()

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,minSalesPrice,maxSalesPrice,meanSalesPrice,minSalesPriceCounts,maxSalesPriceCounts,unitsSold,revenue,relRevenue,dateFirstSell,dateLastSell,soldDaily,soldWeekly,soldMonthly,recentlySold,soldWeeklyContinuous,upper_bound
0,1,0,1,4.38,1,1,1,8.84,3.11,3.43,3.111661,690.0,3.0,693.0,2156.19,2.418927e-05,2018-01-23,2018-06-26,0.206452,False,1.0,4.0,0.73913,99.0
1,2,0,2,3.0,1,2,2,16.92,9.15,9.15,9.15,5.0,5.0,5.0,45.75,5.132475e-07,2018-02-24,2018-05-29,0.042105,False,0.5,32.0,0.266667,99.0
2,3,0,3,5.0,1,3,3,15.89,9.89,14.04,12.733253,91.0,240.0,331.0,4269.59,4.78985e-05,2018-01-10,2018-06-29,0.087719,False,0.833333,1.0,0.4,99.0
3,4,0,2,4.44,1,2,2,40.17,13.01,14.1,13.798895,53.0,145.0,198.0,2734.03,3.067178e-05,2018-01-18,2018-06-28,0.067901,False,0.833333,2.0,0.416667,99.0
4,5,0,2,2.33,1,1,1,17.04,7.48,7.84,7.735556,52.0,128.0,180.0,1392.48,1.562157e-05,2018-01-06,2018-06-23,0.08284,False,0.833333,7.0,0.4,99.0


In [22]:
# Number of rows in ex_items after adding the upper bound.
len(ex_items)

10463

In [23]:
# Spot-check a single item (itemID 5010): its daily order lists, bin and upper bound.
daily_orders_item[daily_orders_item['itemID'] == 5010]

Unnamed: 0,itemID,list_daily_orders,list_ordering_days,soldWeeklyBin,upper_bound
4752,5010,"[1017, 2, 3, 3, 657, 1, 1, 1475, 502, 1, 2, 2,...","[2018-04-30, 2018-05-04, 2018-05-06, 2018-05-0...",5,103.0


In [24]:
# Bring each item's upper bound onto the (item, day) rows.
# An 'upper_bound' column left over from an earlier run of this cell is dropped
# first; otherwise the merge would emit 'upper_bound_x' / 'upper_bound_y' and the
# following cell, which reads 'upper_bound', would fail.
df_timeseries = pd.merge(
    df_timeseries.drop(columns=['upper_bound'], errors='ignore'),
    daily_orders_item[['itemID', 'upper_bound']],
    on='itemID',
)

In [25]:
# Flag a day as a promotion (1) when its order total exceeds the item's upper bound,
# else 0. The comparison is vectorised instead of a row-wise `apply`; a NaN upper
# bound compares as False and therefore yields 0, exactly as before.
df_timeseries['is_promotion'] = (
    df_timeseries['total_orders'] > df_timeseries['upper_bound']
).astype('int64')



In [26]:
# From here on df_timeseries only holds the (item, day) rows flagged as promotions.
promotion_mask = df_timeseries['is_promotion'].eq(1)
df_timeseries = df_timeseries.loc[promotion_mask]

In [27]:
# For every item in infos, collect the dates of its flagged promotion days
# (an empty list when the item has none).
# The dates are grouped once and then looked up per item, instead of filtering the
# whole df_timeseries frame again for every row of infos. Each row gets its own
# fresh list object.
promotion_days_by_item = (
    df_timeseries
    .groupby('itemID', sort=False)['date']
    .agg(list)
    .to_dict()
)
infos['promotion_train'] = infos['itemID'].map(
    lambda item_id: list(promotion_days_by_item.get(item_id, ()))
)

In [28]:
# Preview the raw items table.
items.head()

Unnamed: 0,itemID,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,1,0,1,4.38,1,1,1,8.84
1,2,0,2,3.0,1,2,1,16.92
2,3,0,3,5.0,1,3,1,15.89
3,4,0,2,4.44,1,2,1,40.17
4,5,0,2,2.33,1,1,1,17.04


In [29]:
# Spot-check the rows flagged as promotions for itemID 5010.
df_timeseries[df_timeseries['itemID'] == 5010]

Unnamed: 0,itemID,date,total_orders,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice,minSalesPrice,maxSalesPrice,meanSalesPrice,minSalesPriceCounts,maxSalesPriceCounts,unitsSold,revenue,relRevenue,dateFirstSell,dateLastSell,soldDaily,soldWeekly,soldMonthly,recentlySold,soldWeeklyContinuous,upper_bound,is_promotion
50905,5010,2018-04-30,1017,95,135,4.33,5,26,39,9.01,5.95,8.22,6.860426,1090.0,1017.0,4249.0,29218.06,0.000328,2018-04-30,2018-06-29,0.295082,False,1.0,1.0,0.888889,103.0,1
50909,5010,2018-05-23,657,95,135,4.33,5,26,39,9.01,5.95,8.22,6.860426,1090.0,1017.0,4249.0,29218.06,0.000328,2018-04-30,2018-06-29,0.295082,False,1.0,1.0,0.888889,103.0,1
50912,5010,2018-06-10,1475,95,135,4.33,5,26,39,9.01,5.95,8.22,6.860426,1090.0,1017.0,4249.0,29218.06,0.000328,2018-04-30,2018-06-29,0.295082,False,1.0,1.0,0.888889,103.0,1
50913,5010,2018-06-13,502,95,135,4.33,5,26,39,9.01,5.95,8.22,6.860426,1090.0,1017.0,4249.0,29218.06,0.000328,2018-04-30,2018-06-29,0.295082,False,1.0,1.0,0.888889,103.0,1
50918,5010,2018-06-21,566,95,135,4.33,5,26,39,9.01,5.95,8.22,6.860426,1090.0,1017.0,4249.0,29218.06,0.000328,2018-04-30,2018-06-29,0.295082,False,1.0,1.0,0.888889,103.0,1


In [30]:
# Preview infos with the 'promotion_train' date lists.
infos.head()

Unnamed: 0,itemID,simulationPrice,promotion,promotion_train
0,1,3.43,,"[2018-01-29, 2018-04-23]"
1,2,9.15,,[]
2,3,14.04,,[2018-06-28]
3,4,14.1,,[2018-06-19]
4,5,7.48,,[2018-04-17]


In [98]:
# Concatenate the training-period and test-period promotion dates per item.
# No cell visible in this notebook creates a 'promotion_test' column, and this
# cell's execution count (98) is out of sequence, so the unguarded assignment raises
# a KeyError on a fresh top-to-bottom run. It is therefore only executed when the
# column is actually present.
# NOTE(review): if this branch ever runs, the rename in the following cell would
# yield a second 'predicted_promotion' column - confirm the intended flow.
if 'promotion_test' in infos.columns:
    infos['predicted_promotion'] = infos.apply(lambda row: row['promotion_train'] + row['promotion_test'], axis=1)



In [32]:
# Expose the training-period promotion dates under the name 'predicted_promotion'.
infos = infos.rename(columns={'promotion_train': 'predicted_promotion'})

In [33]:
# Preview infos after the column rename.
infos.head()

Unnamed: 0,itemID,simulationPrice,promotion,predicted_promotion
0,1,3.43,,"[2018-01-29, 2018-04-23]"
1,2,9.15,,[]
2,3,14.04,,[2018-06-28]
3,4,14.1,,[2018-06-19]
4,5,7.48,,[2018-04-17]


In [36]:
# One row per (item, promotion date); items with an empty list keep a single row
# with a missing date. The previous row labels are kept as an 'index' column by
# reset_index().
exploded_infos = infos.explode('predicted_promotion')
infos = exploded_infos.reset_index()

In [37]:
# Persist the exploded promotion table as a pipe-separated file in the data directory.
output_path = f'{data_path}infos_promotions_all.csv'
infos.to_csv(output_path, sep='|', index=False)