# SETTINGS

In [None]:
########## LIBRARIES

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats

import os
import time
import datetime
import random
import multiprocessing
import pickle
import warnings
import gc

In [None]:
########## SETTINGS

# silence every warning for the rest of the session (this also hides deprecation notices)
warnings.filterwarnings('ignore')
# do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# apply matplotlib's 'dark_background' style sheet to all figures
plt.style.use('dark_background')
# IPython magic (not plain Python): selects the inline backend so figures appear in the cell output
%matplotlib inline
# make sure automatic garbage collection is switched on
gc.enable()

# DATA IMPORT

In [None]:
# load the prepared item and order tables, then report (rows, columns) of each
items = pd.read_csv('../data/prepared/items_v1.csv')
orders = pd.read_csv('../data/prepared/orders_v1.csv')
print(items.shape, orders.shape, sep = '\n')

In [None]:
# preview the first five rows of the items table
items.head(n = 5)

In [None]:
# preview the first five rows of the orders table
orders.head(n = 5)

In [None]:
# convert dates
# Every promotion column is parsed in place. Writing all three results into
# items['time'] would keep only the last one (promotion_2) and leave the
# promotion columns themselves unconverted.
for date_col in ['promotion_0', 'promotion_1', 'promotion_2']:
    items[date_col] = pd.to_datetime(items[date_col].astype('str'), infer_datetime_format = True)

# order timestamps are needed as datetimes for the time-based partitioning
orders['time'] = pd.to_datetime(orders['time'].astype('str'), infer_datetime_format = True)

# PARTITIONING

In [None]:
# report the earliest and latest order timestamp and the whole days between them
first_order = orders['time'].min()
last_order  = orders['time'].max()
print(first_order)
print(last_order)
span_days = (last_order - first_order).days
print('Time interval of {} days'.format(span_days))

In [None]:
##### SETUP PARTITIONING INDEX
#
# Builds `num_folds` rolling train/validation splits over the order dates.
# Fold 0 validates on the last `valid_days` calendar days of the data and
# trains on the `train_days` days right before them; each further fold shifts
# both windows `step_days` days into the past.
# Result: `train_idx` / `valid_idx` are lists (one entry per fold) holding the
# `orders` index labels that fall into the respective window.

# set windows
valid_days = 14  # no. days in each validation fold
train_days = 66  # no. days in each training fold
step_days  = 7   # no. days between folds i and i+1

# number of folds
num_folds = 15

# placeholders
train_idx = []
valid_idx = []

# Compare on calendar days rather than raw timestamps. If the window bounds
# carried the time of day of the latest order, every order between `t_end` and
# `v_start` (a full 24 hours) would land in neither set and each window would
# be about one day short. With midnight-only timestamps this is a no-op.
order_days = orders['time'].dt.normalize()
last_day   = order_days.max()

# partitioning loop
for fold in range(num_folds):

    # validation dates (both bounds inclusive)
    v_end   = last_day - pd.DateOffset(days = step_days * fold)
    v_start = v_end - pd.DateOffset(days = valid_days - 1)

    # training dates (both bounds inclusive, ending the day before validation starts)
    t_end   = v_start - pd.DateOffset(days = 1)
    t_start = t_end - pd.DateOffset(days = train_days - 1)

    # extract index
    in_train = (order_days >= t_start) & (order_days <= t_end)
    in_valid = (order_days >= v_start) & (order_days <= v_end)
    train_idx.append(list(orders.index[in_train]))
    valid_idx.append(list(orders.index[in_valid]))

    # print information
    print('-' * 55)
    print('FOLD {}/{}'.format(fold + 1, num_folds))
    print('-' * 55)
    print('- train period: {} -- {} (n = {})'.format(str(t_start)[0:10], str(t_end)[0:10], len(train_idx[fold])))
    print('- valid period: {} -- {} (n = {})'.format(str(v_start)[0:10], str(v_end)[0:10], len(valid_idx[fold])))
    print('-' * 55)
    print('')

In [None]:
# convert to numpy array
# The folds contain different numbers of orders, so the per-fold index lists
# are ragged. np.asarray() on ragged nested lists raises a ValueError on recent
# NumPy versions, so the object arrays are built explicitly: one cell per fold,
# each cell holding that fold's list of index labels.
train_folds, valid_folds = train_idx, valid_idx
train_idx = np.empty(len(train_folds), dtype = object)
valid_idx = np.empty(len(valid_folds), dtype = object)
for fold_no, fold_rows in enumerate(train_folds):
    train_idx[fold_no] = fold_rows
for fold_no, fold_rows in enumerate(valid_folds):
    valid_idx[fold_no] = fold_rows

# row 0 = training folds, row 1 = validation folds -> shape (2, number of folds)
part_idx = np.vstack((train_idx, valid_idx))

In [None]:
# write the partitioning index to disk; the file name encodes the number of
# folds, the training window length and the step size (np.save appends '.npy')
part_path = '../data/partitioning/part_idx_f{}_t{}_s{}'.format(num_folds, train_days, step_days)
np.save(part_path, part_idx)
print(part_idx.shape)