In [1]:
import numpy as np
import pandas as pd
import datetime
from statsmodels.graphics.tsaplots import plot_acf

#https://openbase.com/python/tsextract
from tsextract.feature_extraction.extract import build_features
from tsextract.domain.statistics import median, mean, skew, kurtosis
from tsextract.domain.temporal import abs_energy


In [2]:
# Load the raw order records; the feature builder defined below reads the
# `date` and `quantity` columns of this frame.
df = pd.read_csv("item_orders.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
def get_time_series_supervised(df_in, filter_in, frequency = "Daily"):
    """
    Build a supervised-learning feature table from order quantities.

    The rows of ``df_in`` selected by ``filter_in`` are summed per period
    (day, week or month) on the ``quantity`` column, and the resulting series
    is passed to ``build_features`` with ``target_lag=1``. Calendar columns
    derived from the output index are appended afterwards.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input records. Must contain a ``date`` column formatted as
        ``%Y-%m-%d`` and a ``quantity`` column. The frame is not modified.
    filter_in : boolean mask
        Row filter applied as ``df_in[filter_in]``.
        Example: ``df_in.group1 == "Running shoes"``.
    frequency : str, default "Daily"
        One of ``"Daily"``, ``"Weekly"`` or ``"Monthly"``.

    Returns
    -------
    pandas.DataFrame
        The output of ``build_features`` plus a ``week_number`` column (ISO
        week of each index entry) and, for daily data only, a ``week_day``
        column (0 = Monday).

    Raises
    ------
    ValueError
        If ``frequency`` is not one of the supported values.
    """
    # (lag window, statistic window) for every supported frequency.
    window_sizes = {
        "Daily": (28, 28),   # 4 weeks, based on auto-correlation plot on model_selection_ARIMA
        "Weekly": (8, 8),    # 2 months
        "Monthly": (3, 12),  # 1 quarter of lags, mean over 1 year
    }
    # Validate first: an unknown frequency used to fall through every branch
    # and fail later with an UnboundLocalError on `df_out`.
    if frequency not in window_sizes:
        raise ValueError(
            "frequency must be one of {}, got {!r}".format(list(window_sizes), frequency)
        )
    lag_window, stat_window = window_sizes[frequency]

    # `.assign` returns a new frame, so the date conversion never writes into
    # the caller's dataframe or a filtered slice of it (no SettingWithCopyWarning).
    orders = df_in[filter_in].assign(
        date=lambda frame: pd.to_datetime(frame["date"], format="%Y-%m-%d")
    )

    # Only `quantity` is used downstream, so aggregate just that column.
    if frequency == "Daily":
        quantity = orders.groupby("date")["quantity"].sum()
    elif frequency == "Weekly":
        # Weeks end on Saturday, the weekday of the 2019-06-01 anchor date.
        # Spelled out literally so it does not depend on the locale's "%a".
        quantity = orders.resample("W-SAT", on="date")["quantity"].sum()
        quantity = quantity.iloc[1:]  # dropping the first row as it is an incomplete week
    else:  # "Monthly"
        quantity = orders.resample("M", on="date")["quantity"].sum()

    # TODO: a second, shorter "window_statistic" (7 days / 4 weeks / 3 months)
    # was originally listed under the same dict key, so it was silently
    # overwritten and never produced a feature. Only the effective entry is
    # kept here so the generated columns stay unchanged; check whether
    # build_features supports several window statistics in one request.
    features_request = {
        "window": [lag_window],
        "window_statistic": [stat_window, mean],
        "momentum_statistic": [lag_window, 1, mean],
        "difference": [lag_window, 1],
        "difference_statistic": [lag_window, 1, mean],
    }
    df_out = build_features(quantity, features_request, target_lag=1)

    # Calendar features derived from the index of the feature table.
    if frequency == "Daily":
        df_out["week_day"] = [x.weekday() for x in df_out.index]  # weekday
    df_out["week_number"] = [x.isocalendar()[1] for x in df_out.index]  # week number

    return df_out


In [4]:
# Boolean mask that keeps every row of df (no filtering).
filter_in = np.full(len(df), True, dtype=bool)

In [5]:
# Build the daily feature table from all records.
df_supervised = get_time_series_supervised(
    df_in=df,
    filter_in=filter_in,
    frequency="Daily",
)

In [6]:
# Preview the first rows of the generated feature table.
df_supervised.head()

Unnamed: 0_level_0,T-28,T-27,T-26,T-25,T-24,T-23,T-22,T-21,T-20,T-19,...,difference_28_1-5,difference_28_1-4,difference_28_1-3,difference_28_1-2,difference_28_1-1,difference_statistic_28_1_mean,tzero,Target_Tplus1,week_day,week_number
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-06-30,136.0,5.0,30.0,2146.0,2066.0,619.0,373.0,4.0,1071.0,1420.0,...,35.0,-77.0,-125.0,-56.0,232.0,25.0,680,1058.0,6,26
2019-07-01,5.0,30.0,2146.0,2066.0,619.0,373.0,4.0,1071.0,1420.0,214.0,...,-77.0,-125.0,-56.0,232.0,378.0,38.074074,1058,1064.0,0,27
2019-07-02,30.0,2146.0,2066.0,619.0,373.0,4.0,1071.0,1420.0,214.0,1053.0,...,-125.0,-56.0,232.0,378.0,6.0,-40.074074,1064,1118.0,1,27
2019-07-03,2146.0,2066.0,619.0,373.0,4.0,1071.0,1420.0,214.0,1053.0,368.0,...,-56.0,232.0,378.0,6.0,54.0,-35.111111,1118,1130.0,2,27
2019-07-04,2066.0,619.0,373.0,4.0,1071.0,1420.0,214.0,1053.0,368.0,10.0,...,232.0,378.0,6.0,54.0,12.0,18.925926,1130,1030.0,3,27


In [7]:
# Write the feature table (index included) to a CSV file in the working directory.
df_supervised.to_csv("Daily_all_supervised.csv")

In [8]:
# Sanity check: (rows, columns) of the feature table.
df_supervised.shape

(928, 62)