# SETTINGS

In [None]:
########## LIBRARIES

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats

import os
import time
import datetime
import random
import multiprocessing
import pickle
import warnings
import gc
from tqdm import tqdm

In [None]:
########## HELPER FUNCTIONS

# Shell escape: installs/upgrades the `dptools` package on every run of this cell.
# The version is not pinned, so the helpers may change between runs.
!pip install --upgrade dptools
# Star import: pulls every public name of dptools into the notebook namespace.
# `save_csv_version`, used in the export cell, presumably comes from here - TODO confirm.
from dptools import *

In [None]:
########## SETTINGS

# silence all Python warnings for the rest of the session
# (this also hides deprecation notices raised by the cells below)
warnings.filterwarnings('ignore')
# do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# use matplotlib's dark theme for every figure
plt.style.use('dark_background')
# render figures inline in the cell output
%matplotlib inline
# make sure automatic garbage collection is switched on
gc.enable()

# DATA IMPORT

In [None]:
# load the prepared order and item tables (both are read as gzip-compressed CSV)
orders_path = '../data/prepared/orders_v1.csv'
items_path  = '../data/prepared/items_v1.csv'

orders = pd.read_csv(orders_path, compression = 'gzip')
items  = pd.read_csv(items_path,  compression = 'gzip')

# report (rows, columns) of each table, one per line
print(orders.shape, items.shape, sep = '\n')

In [None]:
# parse timestamp columns: values are cast to string first, then pandas infers the format
orders['time'] = pd.to_datetime(orders['time'].astype('str'), infer_datetime_format = True)

# the three promotion date columns of the item table get the same treatment
for promo_col in ['promotion_0', 'promotion_1', 'promotion_2']:
    items[promo_col] = pd.to_datetime(items[promo_col].astype('str'), infer_datetime_format = True)

# ADD FEATURES: ITEMS

In [None]:
# simulation price relative to the recommended retail price
sim_price = items['simulationPrice']
rrp_price = items['recommendedRetailPrice']
items['price_ratio'] = sim_price.div(rrp_price)

# summary statistics of the new feature (displayed as the cell output)
items['price_ratio'].describe()

# ADD FEATURES: ORDERS

In [None]:
##### AGGREGATE ORDERS BY DAY

# day index within the year, taken from the order timestamp
orders['day_of_year'] = orders['time'].dt.dayofyear

# group once per (item, day) and derive the three daily statistics from it
by_item_day = orders.groupby(['itemID', 'day_of_year'])

agg_order      = by_item_day['order'].sum()         # total ordered units
agg_salesPrice = by_item_day['salesPrice'].mean()   # average sales price
agg_transactID = by_item_day['transactID'].count()  # number of transactions

# replace the transaction-level table with the daily one (columns side by side)
daily_stats = [agg_order, agg_salesPrice, agg_transactID]
orders = pd.concat(daily_stats, axis = 1).reset_index()

orders.head()

In [None]:
##### ADD MISSING ZEROES

# Expand the daily aggregates to a full (itemID x day_of_year) grid so that
# item-days without any sale appear explicitly with order = 0.
# Only days that occur in `orders` for at least one item become part of the grid.
#
# Implementation notes:
# - sum() keeps scalar values per (item, day); the previous unique() stored
#   one-element arrays that had to be coerced to int afterwards.
# - unstack(fill_value = 0) fills the gaps while pivoting, so no chained
#   in-place fillna on a column selection is needed.
print(orders.shape)
agg_orders = (
    orders.groupby(['itemID', 'day_of_year'])['order'].sum()
          .unstack('day_of_year', fill_value = 0)   # wide: one column per day, gaps -> 0
          .stack()                                  # back to long: one row per (item, day)
          .rename('order')
          .reset_index()
)
agg_orders['order'] = agg_orders['order'].astype(int)
print(agg_orders.shape)

In [None]:
# sanity check on the zero-filled table: average and total ordered units
order_counts = agg_orders['order']
print(order_counts.mean())
print(order_counts.sum())

In [None]:
##### COMPUTING TARGETS AND FEATURES

# For every reference day this cell builds one row per item with
#   - target:               units ordered in the `days_target` days after the reference day
#   - order_sum_last_<k>:   units ordered in the k days up to and including the reference day
#   - order_count_last_<k>: number of those k days with at least one unit ordered
# The per-day frames are stacked into `orders`, which replaces the daily table.

# parameters
days_input  = [7, 14, 21, 28]   # look-back window lengths in days
days_target = 14                # look-ahead window length in days

# preparations
day_first = np.max(days_input)                                  # first day with a full longest look-back window
day_last  = agg_orders['day_of_year'].max() - days_target      # last day with a full look-ahead window
# NOTE(review): range() below excludes `day_last` itself although its target window
# is still complete - confirm whether dropping that day is intended.


def _orders_in_window(daily_orders, day_min, day_max):
    """Return the per-item groups of `order` for day_min <= day_of_year <= day_max (both inclusive)."""
    in_window = daily_orders['day_of_year'].between(day_min, day_max)
    return daily_orders.loc[in_window].groupby('itemID')['order']


# computations
daily_frames = []
for day_of_year in tqdm(range(day_first, day_last)):

    # compute target over the days following the reference day;
    # the frame is indexed by itemID so the features below align by item, not by position
    tmp_df = _orders_in_window(agg_orders,
                               day_of_year + 1,
                               day_of_year + days_target).sum().to_frame('target')

    # compute features over each look-back window ending on the reference day
    for day_input in days_input:
        tmp_df_input = _orders_in_window(agg_orders,
                                         day_of_year - day_input + 1,
                                         day_of_year)
        tmp_df['order_sum_last_'   + str(day_input)] = tmp_df_input.sum()
        tmp_df['order_count_last_' + str(day_input)] = tmp_df_input.agg(lambda x: (x > 0).sum())

    # back to a flat frame: itemID, day_of_year, target, features...
    tmp_df = tmp_df.reset_index()
    tmp_df.insert(1, column = 'day_of_year', value = day_of_year)

    # collect; concatenating once after the loop avoids re-copying all previous rows every iteration
    daily_frames.append(tmp_df)

# merge data
if not daily_frames:
    raise ValueError('no reference day has both a full input and a full target window; '
                     'check days_input / days_target against the available days')
orders = pd.concat(daily_frames, axis = 0)

# MERGE DATA SETS

In [None]:
# shapes before the join, one per line
print(orders.shape, items.shape, sep = '\n')

# attach the item attributes to every order row; the left join keeps all rows of `orders`
df = orders.merge(items, how = 'left', on = 'itemID')
print(df.shape)

# the source tables are no longer needed once merged
del orders, items

# EXPORT

In [None]:
# write the merged frame to disk; index = False and gzip compression are handed to the helper
# per the original author's note, save_csv_version() automatically adds a version number
# to prevent overwriting earlier exports
export_path = '../data/prepared/df.csv'
save_csv_version(export_path, df, index = False, compression = 'gzip')

# final (rows, columns) of the exported frame
print(df.shape)