In [1]:
cd ..

C:\Users\guest0\joeran\M5Forecast\uncertainty


In [2]:
cd ..

C:\Users\guest0\joeran\M5Forecast


In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os, gc
import matplotlib.pyplot as plt
from tqdm import tqdm

from tensorflow.keras.layers import (Dense, Dropout, Flatten, Input, BatchNormalization, Lambda, 
                                     concatenate, Embedding, Reshape)
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras import backend as K
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import Callback, LearningRateScheduler

import wandb
from wandb.keras import WandbCallback

# own imports
from evaluation import Referee, load_data, select_dates, select_day_nums
from train import BatchCreator, Logger, plot_confidence_series, plot_some_confidence_intervals
from model_builder import (get_pinball_losses, get_simple_dist_model, get_simple_dense_model,
                           get_variable_dist_model, get_direct_dist_model)
from flow import model_predict, denorm_preds, warp_preds_to_ref_form, plot_some, evaluate_model
from preprocess import preprocess, get_features, pandas_cat_data, reset_categorical_features


# Setup file paths
data_dir = 'data/'
os.environ['DATA_DIR'] = data_dir
sub_dir = 'submissions_uncertainty/'
plt.rcParams['font.size'] = 13

# Hardcode requested quantiles
quantiles = [0.005, 0.025, 0.165, 0.25, 0.5, 0.75, 0.835, 0.975, 0.995]

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Implement logic for obtaining the features for the validation set

In [4]:
# Load in the data set
calendar, sales_train_validation, sell_prices = load_data()

sales_true = select_dates(sales_train_validation, day_end=1913, num_days=28, include_metadata=True)
sales_train = select_dates(sales_train_validation, day_start=1, num_days=1913-28, include_metadata=True)

ref = Referee(sales_true, sales_train, sell_prices, calendar, verbose=True)

Initializing Referee
Converting true sales to quantile form
Calculating weights for each level...
Calculating scale for each level...
Finished setup.


## Setup

For each level:  

1. Read the (prepared) dataset
2. Get the validation batch creator
3. Get the denormalised predictions
4. Save to file

Then:

1. Combine predictions

In [5]:
import os, gc
import numpy as np
import pandas as pd

from flow import restore_tags_converted_sales, read_converted_sales
from lightgbm_kernel import read_data, encode_categorical, reduce_mem_usage
from feature_extraction import aggregate_adapted_fe

from sklearn.preprocessing import LabelEncoder

In [6]:
# Set input sizes for the model
input_shapes = {
    1: (178,),
    2: (181,),
    3: (187,),
    4: (181,),
    5: (188,),
    6: (184,),
    7: (191,),
    8: (194,),
    9: (201,),
    10: (3237,),
    11: (3240,),
    12: (3255,),
}

# select whether to perform the category-reset step of the preprocessing
# reset_categories = {d: False if d in [3] else True for d in range(1, 1+12)}
reset_categories = {d: True for d in range(1, 1+12)}

In [7]:
from preprocess import categorical_features
level = 3
DAYS_PRED = 28
prediction_lag = DAYS_PRED
n_years = 6

In [8]:
verbose = True

In [9]:
for level in range(1, 1+12):
    # read data for pipeline from lightgbm-poisson-w-scaled-pinball-loss.ipynb
    calendar, sell_prices, sales_train_val, submission = read_data()  # with memory reduction

    # read comverted sales
    converted_sales = read_converted_sales(level=level, data_dir=data_dir)

    ### Replace demand with normalised sales
    sales_train_val = converted_sales

    ## Count
    NUM_ITEMS = sales_train_val.shape[0]  # 1 / ... / 70 / ... / 30,240
    if DAYS_PRED is None:
        DAYS_PRED = submission.shape[1] - 1  # 28
    print(NUM_ITEMS, DAYS_PRED)
    
    nrows = int(365 * n_years * NUM_ITEMS)

    ## Encode categorical features
    calendar = encode_categorical(
        calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
    ).pipe(reduce_mem_usage)

    sales_train_val = encode_categorical(
        sales_train_val, categorical_features[level],
    ).pipe(reduce_mem_usage)

    sell_prices = encode_categorical(sell_prices, ["item_id", "store_id"]).pipe(
        reduce_mem_usage
    )

    ## Reshape
    sales_train_val = pd.melt(sales_train_val,
                              id_vars=['id', *categorical_features[level]],
                              var_name='day', value_name='demand')
    print('Melted sales train validation has {} rows and {} columns'.format(*sales_train_val.shape))
    
    #################################### ADD EVAL ###
    
    # select which days to add
    day_start = 1914
    day_end = 1941
    n_days = day_end - day_start + 1

    # select a 'prototype' for the new days
    df_bit = sales_train_val[sales_train_val.day == 'd_1913'].copy()
    df_bit['demand'] = -1
    df_bit = df_bit.reset_index(drop=True)
    
    # repeat the prototype for n days
    df_eval = pd.concat([df_bit]*n_days, ignore_index=True)

    # set the day to the corresponding values, e.g. d_1914, ..., d_1941
    validation_d_cols = ['d_%d'%d for d in range(day_start, day_end+1)]
    df_eval.day = np.repeat(validation_d_cols, df_bit.index.size)
    
    # merge with 'normal' train/val sales
    sales_train_val = pd.concat((sales_train_val, df_eval), ignore_index=True)
    #################################### END OF ADD EVAL ###
    
    print("Selecting {} rows ({:.1%})".format(nrows, nrows / sales_train_val.index.size))
    data = sales_train_val.iloc[-nrows:, :]

    ## Add calendar features
    # calendarの結合
    # drop some calendar features(不要な変数の削除:weekdayやwdayなどはdatetime変数から後ほど作成できる。)
    calendar.drop(['weekday', 'wday', 'month', 'year'],
                  inplace=True, axis=1)

    # notebook crash with the entire dataset (maybe use tensorflow, dask, pyspark xD)(dayとdをキーにdataに結合)
    data = pd.merge(data, calendar, how='left', left_on=['day'], right_on=['d'])
    data.drop(['d', 'day'], inplace=True, axis=1)

    # add sell price if all of the columns 'store_id', 'item_id', 'wm_yr_wk' are available
    # sell price
    if np.prod([col in data.columns for col in ['store_id', 'item_id', 'wm_yr_wk']]):
        # get the sell price data (this feature should be very important)
        data = data.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
        print('Our final dataset to train has {} rows and {} columns'.format(data.shape[0], data.shape[1]))

    # memory
    del sell_prices
    gc.collect()

    data = reduce_mem_usage(aggregate_adapted_fe(data, DAYS_PRED=DAYS_PRED))
    
    if reset_categories[level]:
        print("Converting to pandas data..") if verbose else None
        # set categorical features
        data, available_cat_features = pandas_cat_data(data)

        print("Resetting categories..") if verbose else None
        # reset categorical features, set NaN events as additional category, set NaN shift/std/mean/kurt/skew to zero
        data = reset_categorical_features(data, available_cat_features)

    # select features
    features = get_features(level=level, prediction_lag=prediction_lag,
                            sell_price_features=('sell_price' in data.columns))
    print(features) if verbose else None
    
    #################################### SAVE EVAL ###
    eval_df = data[(data['date'] >= '2016-04-24') & (data['date'] <= '2016-05-22')]
    
    fn = data_dir + 'eval_prep/level_{}_simple_fe_{}_{}_normalised_demand_lag_{}.pickle'.format(
        level, eval_df.date.min().date().strftime("%Y_%m_%d"), eval_df.date.max().date().strftime("%Y_%m_%d"),
        DAYS_PRED
    )
    print("Saving to file..")
    eval_df.to_pickle(fn)
    print("Finished.")
    
    del data
    gc.collect()

Reading files...
Mem. usage decreased to  0.12 Mb (41.9% reduction)
Calendar has 1969 rows and 14 columns
Mem. usage decreased to 130.48 Mb (37.5% reduction)
Sell prices has 6841121 rows and 4 columns
Sales train validation has 30490 rows and 1919 columns
Mem. usage decreased to  0.00 Mb (74.3% reduction)
1 28
Mem. usage decreased to  0.08 Mb (36.9% reduction)
Mem. usage decreased to  0.00 Mb (0.0% reduction)
Mem. usage decreased to 45.67 Mb (41.7% reduction)
Melted sales train validation has 1913 rows and 3 columns
Selecting 2190 rows (112.8%)
Mem. usage decreased to  0.15 Mb (53.6% reduction)
Converting to pandas data..
Resetting categories..
['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'snap_CA', 'snap_TX', 'snap_WI', 'shift_t28', 'shift_t29', 'shift_t30', 'rolling_std_t7', 'rolling_std_t30', 'rolling_std_t60', 'rolling_std_t90', 'rolling_std_t180', 'rolling_mean_t7', 'rolling_mean_t30', 'rolling_mean_t60', 'rolling_mean_t90', 'rolling_mean_t180', 'rolling_skew_t

FileNotFoundError: [Errno 2] No such file or directory: 'data/eval_prep/level_1_simple_fe_2016_04_24_2016_05_22_normalised_demand_lag_28.pickle'

In [None]:
print(f'done')

In [None]:
# redid lavel 3 with reset categories = true