### Add lag features

In [None]:
def add_lag_features(X, y, column_id, column_sort, feature_dict, time_windows, n_jobs = 32, disable_progressbar = False):
    """
    Create lag features for y and add them to X.

    Parameters:
    -----------
    X: pandas.DataFrame
    feature matrix to which the time series features are added. The frame
    passed in by the caller is left untouched; a copy is used internally.
    y: pandas.DataFrame,
    time series to compute the features for. It is joined column-wise with
    the id/time columns derived from X, so it has to share the row index of X.
    column_id: list,
    list of column names to group by, e.g. ["shop","product"]. The string
    representations of these columns are joined with "_" to form one id per
    row. If set to None, all rows get the same id, i.e. either there should
    be nothing to group by or each group should be represented by a separate
    target column in y.
    column_sort: str,
    column name used as time axis. If None, it is filled with an increasing
    number, meaning that the order of the passed dataframe is used as "time"
    for the time series.
    feature_dict: dict,
    dictionary containing feature calculator names with the corresponding
    parameters (passed to extract_features as default_fc_parameters).
    time_windows: list of tuples,
    each tuple (min_timeshift, max_timeshift) represents the time shifts for
    one time window to compute, e.g. [(7,7),(1,14)] for two time windows:
    a) a time window with a fixed size of 7 and b) a time window that starts
    with size 1 and increases up to 14. The window then shifts by 1 for each
    step. The feature names of a window are suffixed with "_<max_timeshift>".
    n_jobs: int,
    number of parallel jobs handed to roll_time_series and extract_features.
    disable_progressbar: bool,
    handed to roll_time_series and extract_features.

    Returns:
    --------
    X: pandas.DataFrame
    the columns of the input X plus the created features and an 'id' column.
    Rows containing any missing value (e.g. rows without a complete feature
    set) are dropped.
    y: pandas.DataFrame
    the columns of the input y restricted to the rows kept in X.
    """

    # Work on a copy so that the helper columns 'id' and 'time' added below do
    # not leak into the DataFrame owned by the caller.
    X = X.copy()

    if column_id is None:
        X['id'] = 1
    else:
        X['id'] = X[column_id].astype(str).agg('_'.join, axis = 1)

    if column_sort is None:
        X['time'] = range(X.shape[0])
    else:
        X['time'] = X[column_sort].copy()

    y = pd.concat([y, X[['id', 'time']]], axis = 1)
    X = X.set_index(['id', 'time'])

    for window in time_windows:

        # create time series for given time window
        df_rolled = roll_time_series(y,
                                     column_id = "id",
                                     column_sort = "time",
                                     min_timeshift = window[0] - 1,
                                     max_timeshift = window[1] - 1,
                                     n_jobs = n_jobs,
                                     disable_progressbar = disable_progressbar)

        # The rolled id is a pair whose second entry is incremented by one, so
        # the features of a window get attached to the following time step
        # (this is what makes them lag features). Assumes numeric time values.
        df_rolled['id'] = df_rolled['id'].apply(lambda x: (x[0], x[1] + 1))

        # create lag features for given time window
        df_features = extract_features(df_rolled,
                                       column_id = "id",
                                       column_sort = "time",
                                       default_fc_parameters = feature_dict,
                                       n_jobs = n_jobs,
                                       disable_progressbar = disable_progressbar)

        # add time window to feature name for clarification
        df_features.columns = [name + "_" + str(window[1]) for name in df_features.columns.to_list()]

        # add features for given time window to feature matrix
        X = pd.concat([X, df_features], axis = 1)

    y = y.set_index(['id', 'time'])
    y_column_names = y.columns.to_list()

    # Join features and targets so that dropping incomplete rows removes them
    # from X and y consistently.
    df = pd.concat([X, y], axis = 1).dropna()
    df.index.names = ['id', 'time']
    df = df.reset_index(drop = False).drop(['time'], axis = 1)

    y = df[y_column_names]
    X = df.drop(y_column_names, axis = 1)

    return X, y

In [None]:
def loadDataYaz(testDays = 28, returnXY = True, daysToCut = 0, unstacked = False, disable_progressbar = False):
    """
    Load the Yaz demand data, add lag features and split it into train and test.

    The raw table is reshaped to one row per item and day, trailing days are
    optionally cut, every row is labelled 'train' or 'test', the demand of each
    item is divided by its maximum training demand, lag features are created
    with add_lag_features and weekday/month/year/item are one-hot encoded.

    Parameters:
    -----------
    testDays: int or float,
    if an int, the number of most recent days (by dayIndex) labelled 'test'.
    Otherwise the value is multiplied with the number of rows of the first id
    to obtain that number of days.
    returnXY: bool,
    if True, the train/test arrays are returned in addition to the data frame.
    daysToCut: int,
    number of most recent days removed from the data before labelling.
    unstacked: bool,
    if True, the demand features, targets and scaling values of the items are
    placed side by side as separate columns per item instead of stacked rows.
    disable_progressbar: bool,
    handed to add_lag_features.

    Returns:
    --------
    data: pandas.DataFrame
    targets and encoded features side by side.
    XTrain, yTrain, XTest, yTest: numpy arrays
    only returned if returnXY is True; 'label' (and 'id' in the stacked case)
    are removed from the feature arrays.
    """

    # LOAD DATA
    # The stream is closed as soon as the CSV has been parsed.
    with pkg_resources.resource_stream(__name__, 'datasets/dataYaz_unprocessed.csv') as dataStream:
        data = pd.read_csv(dataStream)

    #---

    # DAY INDEX
    data = data.reset_index().rename(columns = {'index': 'dayIndex'})

    #---

    # SEPARATE DEMAND OF DIFFERENT ITEMS
    X = data.iloc[:, 0:11]
    y = data.iloc[:, 12:]

    dataList = list()
    for col in y.columns:
        # Copy per item so that the shared feature slice is never written to.
        X_temp = X.copy()
        X_temp["item"] = col
        data_temp = pd.concat([X_temp, y[col]], axis = 1).rename(columns = {col: "demand"})
        dataList.append(data_temp)

    data = pd.concat(dataList, axis = 0)

    #---

    # ID FEATURE AND SORTING
    data['id'] = data['item']
    data = data.sort_values(by = ['id', 'dayIndex'], axis = 0).reset_index(drop = True)

    #---

    # CUT DAYS DEPENDING ON DAYSTOCUT
    cutOffDate = data.dayIndex.max() - daysToCut
    data = data[data['dayIndex'] <= cutOffDate].reset_index(drop = True)

    #---

    # LABEL
    if isinstance(testDays, int):
        nDaysTest = testDays
    else:
        tsSizes = data.groupby(['id']).size()
        nDaysTest = int(tsSizes.iloc[0] * testDays)

    cutoffDateTest = data.dayIndex.max() - nDaysTest
    data['label'] = np.where(data['dayIndex'] <= cutoffDateTest, 'train', 'test')

    #---

    # NORMALIZE DEMAND
    # Only training rows determine the scaling value to avoid test leakage.
    scalingData = data[data.label == 'train'].groupby('id')['demand'].agg('max').reset_index()
    scalingData = scalingData.rename(columns = {'demand': 'scalingValue'})
    data = pd.merge(data, scalingData, on = 'id')

    data['demand'] = data.demand / data.scalingValue

    #---

    # DEMAND LAG FEATURES
    y = pd.DataFrame(data['demand'])
    X = data.drop(columns = ['demand'])

    # set lag features
    fc_parameters = MinimalFCParameters()

    # delete length features
    del fc_parameters['length']

    # create lag features
    X, y = add_lag_features(X = X,
                            y = y,
                            column_id = ['id'],
                            column_sort = 'dayIndex',
                            feature_dict = fc_parameters,
                            time_windows = [(7, 7), (14, 14), (28, 28)],
                            n_jobs = 6,
                            disable_progressbar = disable_progressbar)

    #---

    # CREATE UNSTACKED MULTIDIMENSIONAL DEMAND VECTOR IF DESIRED
    if unstacked:
        colsDemand = [column for column in X.columns if 'demand__' in column]
        colsOther = [column for column in X.columns if 'demand__' not in column]

        # The non-demand columns are taken from the rows of the first id only.
        firstId = X['id'].iloc[0]
        generalData = X.loc[X['id'] == firstId, colsOther].reset_index(drop = True).drop(['id', 'scalingValue'], axis = 1)

        XList = [generalData]
        yList = list()
        scalingValueList = list()

        for item in np.unique(X['id']):
            isItem = X['id'] == item
            XItem = X[isItem].reset_index(drop = True)
            yItem = y[isItem].reset_index(drop = True)

            newColNames = {col: col + '_' + item for col in colsDemand}

            XList.append(XItem[colsDemand].rename(columns = newColNames))
            yList.append(yItem.rename(columns = {'demand': 'demand_' + item}))
            scalingValueList.append(XItem[['scalingValue']].rename(columns = {'scalingValue': 'scalingValue_' + item}))

        X = pd.concat(XList, axis = 1)
        y = pd.concat(yList, axis = 1)
        scalingValues = pd.concat(scalingValueList, axis = 1)

        X = pd.concat([X, scalingValues], axis = 1)

    #---

    # DATE DUMMY VARIABLES
    X['year'] = X['year'].apply(lambda x: str(int(x)))

    X = pd.concat([X,
                   pd.get_dummies(X['weekday'], prefix = 'weekday'),
                   pd.get_dummies(X['month'], prefix = 'month'),
                   pd.get_dummies(X['year'], prefix = 'year')], axis = 1).drop(['weekday', 'month', 'year'], axis = 1)

    X = pd.concat([X, pd.get_dummies(X['item'], prefix = 'item')], axis = 1).drop(['item'], axis = 1)

    #---

    # SPLIT INTO TRAIN AND TEST DATA
    data = pd.concat([y, X], axis = 1)

    if unstacked:
        XArray = np.array(X.drop(['label'], axis = 1))
        yArray = np.array(y)
    else:
        XArray = np.array(X.drop(['label', 'id'], axis = 1))
        yArray = np.ravel(y)

    isTrain = (data['label'] == 'train').to_numpy()
    isTest = (data['label'] == 'test').to_numpy()

    XTrain = XArray[isTrain]
    yTrain = yArray[isTrain]

    XTest = XArray[isTest]
    yTest = yArray[isTest]

    #---

    if returnXY:
        return data, XTrain, yTrain, XTest, yTest
    else:
        return data


In [None]:
def loadDataBakery2(testDays = 28, returnXY = True, daysToCut = 0, disable_progressbar = False):
    """
    Load the bakery demand data, add lag features and split it into train and test.

    Store/item combinations with at least 20 percent zero-demand rows are
    removed, trailing days are optionally cut, every row is labelled 'train' or
    'test', the demand of each id is divided by its maximum training demand,
    lag features are created with add_lag_features and weekday/month/year/item
    are one-hot encoded.

    Parameters:
    -----------
    testDays: int or float,
    if an int, the number of most recent days (by dayIndex) labelled 'test'.
    Otherwise the value is multiplied with the number of rows of the first id
    to obtain that number of days.
    returnXY: bool,
    if True, the train/test arrays are returned in addition to the data frame.
    daysToCut: int,
    number of most recent days removed from the data before labelling.
    disable_progressbar: bool,
    handed to add_lag_features.

    Returns:
    --------
    data: pandas.DataFrame
    targets and encoded features side by side.
    XTrain, yTrain, XTest, yTest: numpy arrays
    only returned if returnXY is True; 'label' and 'id' are removed from the
    feature arrays.
    """

    # LOAD RAW DATA
    # The stream is closed as soon as the CSV has been parsed.
    with pkg_resources.resource_stream(__name__, 'datasets/dataBakery_unprocessed.csv') as dataStream:
        data = pd.read_csv(dataStream)

    #---

    # RENAME AND DROP COLUMNS
    data = data.drop(columns = ["temp_min", "temp_max"])

    data = data.rename(columns = {"date_short": "date",
                                  "shop_no": "store",
                                  "product_no": "item",
                                  "temp_avg_celsius": "temperature",
                                  "rain_mm": "rain"})

    #---

    # REMOVE INTERMITTENT DEMAND
    # Collect the row labels of all store/item instances with at least
    # 20 percent zero sales and drop them in a single step.
    intermittentIndex = []
    for _, dataGroup in data.groupby(["store", "item"]):
        nZero = int((dataGroup["demand"] == 0).sum())
        if nZero / len(dataGroup) >= 0.2:
            intermittentIndex.extend(dataGroup.index)

    data = data.drop(index = intermittentIndex)

    #---

    # CALENDAR FEATURES
    data['date'] = pd.to_datetime(data['date'], format = '%Y-%m-%d')
    data['year'] = data['date'].dt.year

    #---

    # ID Feature
    data['id'] = data['store'].astype(str) + '_' + data['item'].astype(str)

    #---

    # DAY INDEX
    data['dayIndex'] = data['date'].apply(getDayIndex)

    #---

    # CUT DAYS DEPENDING ON DAYSTOCUT
    cutOffDate = data.dayIndex.max() - daysToCut
    data = data[data['dayIndex'] <= cutOffDate].reset_index(drop = True)

    #---

    # LABEL
    if isinstance(testDays, int):
        nDaysTest = testDays
    else:
        tsSizes = data.groupby(['id']).size()
        nDaysTest = int(tsSizes.iloc[0] * testDays)

    cutoffDateTest = data.dayIndex.max() - nDaysTest
    data['label'] = np.where(data['dayIndex'] <= cutoffDateTest, 'train', 'test')

    #---

    # NOTE: the rows are not re-sorted by id/dayIndex at this point (a sort was
    # previously disabled here).

    #---

    # NORMALIZE DEMAND
    # Only training rows determine the scaling value to avoid test leakage.
    scalingData = data[data.label == 'train'].groupby('id')['demand'].agg('max').reset_index()
    scalingData = scalingData.rename(columns = {'demand': 'scalingValue'})
    data = pd.merge(data, scalingData, on = 'id')

    data['demand'] = data.demand / data.scalingValue

    #---

    # DEMAND LAG FEATURES
    y = pd.DataFrame(data['demand'])
    X = data.drop(columns = ['demand'])

    # set lag features
    fc_parameters = MinimalFCParameters()

    # delete length features
    del fc_parameters['length']

    # create lag features
    X, y = add_lag_features(X = X,
                            y = y,
                            column_id = ['id'],
                            column_sort = 'dayIndex',
                            feature_dict = fc_parameters,
                            time_windows = [(7, 7), (14, 14), (28, 28)],
                            n_jobs = 32,
                            disable_progressbar = disable_progressbar)

    #---

    # DATE AND ITEM DUMMY VARIABLES
    X['year'] = X['year'].apply(lambda x: str(int(x)))

    X = pd.concat([X,
                   pd.get_dummies(X['weekday'], prefix = 'weekday'),
                   pd.get_dummies(X['month'], prefix = 'month'),
                   pd.get_dummies(X['year'], prefix = 'year')], axis = 1).drop(['weekday', 'month', 'year'], axis = 1)

    X = pd.concat([X, pd.get_dummies(X['item'], prefix = 'item')], axis = 1).drop(['item'], axis = 1)

    # NOTE(review): store dummies used to be computed into a frame that was
    # never used afterwards; that dead computation was removed. 'store' stays
    # in X as a plain column - confirm whether it should be one-hot encoded.

    #---

    # SPLIT INTO TRAIN AND TEST DATA
    data = pd.concat([y, X], axis = 1)
    XArray = np.array(X.drop(['label', 'id'], axis = 1))
    yArray = np.ravel(y)

    isTrain = (data['label'] == 'train').to_numpy()
    isTest = (data['label'] == 'test').to_numpy()

    XTrain = XArray[isTrain]
    yTrain = yArray[isTrain]

    XTest = XArray[isTest]
    yTest = yArray[isTest]

    #---

    if returnXY:
        return data, XTrain, yTrain, XTest, yTest
    else:
        return data