In [10]:
import darts
print(darts.__version__)


#!pip install --upgrade numpy
import numpy as np
import time

 
from darts import TimeSeries
from darts.utils.timeseries_generation import gaussian_timeseries, linear_timeseries, sine_timeseries
from darts.models import LightGBMModel, CatBoostModel, Prophet, RNNModel, TFTModel, NaiveSeasonal, ExponentialSmoothing, NHiTSModel
from darts.metrics import mape, smape, rmse, rmsle
from darts.dataprocessing import Pipeline
from darts.dataprocessing.transformers import Scaler, StaticCovariatesTransformer, MissingValuesFiller, InvertibleMapper
from darts.utils.timeseries_generation import datetime_attribute_timeseries
from darts.utils.statistics import check_seasonality, plot_acf, plot_residuals_analysis, plot_hist
from darts.utils.likelihood_models import QuantileRegression
from darts.utils.missing_values import fill_missing_values
# from darts.models import MovingAverage

import optuna
from optuna.integration import PyTorchLightningPruningCallback
from optuna.visualization import (
    plot_optimization_history,
    plot_contour,
    plot_param_importances,
)


from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from tqdm import tqdm

import sklearn
from sklearn import preprocessing
import pandas as pd
import torch
import matplotlib.pyplot as plt


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Fix the torch and NumPy global random seeds to the same constant.
torch.manual_seed(1); np.random.seed(1)  # for reproducibility
import gc

0.24.0


# Data Loading

In [12]:
# Load all Datasets
# Single place to change if the CSV files live somewhere else; every path
# below is resolved relative to this directory.
DATA_DIR = '../Data'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_holidays_events = pd.read_csv(f'{DATA_DIR}/holidays_events.csv')
df_oil = pd.read_csv(f'{DATA_DIR}/oil.csv')
df_stores = pd.read_csv(f'{DATA_DIR}/stores.csv')
df_transactions = pd.read_csv(f'{DATA_DIR}/transactions.csv')
df_sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')


In [22]:
# Preview the first rows of the stores table loaded from stores.csv.
df_stores.head()

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


# Merging

In [18]:
# Distinct product families (from the training data) and store numbers
# (from the stores table); used later to iterate per family / per store.
family_list = df_train['family'].unique()
store_list = df_stores['store_nbr'].unique()

# Attach the store attributes to every training row, order the rows by
# store, family and date, and only then cast the grouping / static columns
# to strings (the sort therefore runs on the pre-cast column values).
_string_columns = ["store_nbr", "family", "city", "state", "type", "cluster"]
train_merged = (
    df_train
    .merge(df_stores, on='store_nbr')
    .sort_values(['store_nbr', 'family', 'date'])
    .astype({column: 'str' for column in _string_columns})
)

# Test rows without the promotion column, ordered by store and family.
df_test_dropped = df_test.drop(columns=['onpromotion'])
df_test_sorted = df_test_dropped.sort_values(by=['store_nbr', 'family'])



In [33]:
# NOTE(review): `df_family` is only assigned inside the per-family loop of a
# later cell, so this cell fails on a fresh kernel (Restart & Run All).
# It also displays the whole frame; consider moving it below that loop and
# showing `df_family.head()` instead, or removing the cell.
df_family

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster
32,32,2013-01-01,1,SEAFOOD,0.000000,0,Quito,Pichincha,D,13
65,1814,2013-01-02,1,SEAFOOD,38.029000,0,Quito,Pichincha,D,13
98,3596,2013-01-03,1,SEAFOOD,17.366001,0,Quito,Pichincha,D,13
131,5378,2013-01-04,1,SEAFOOD,29.907001,0,Quito,Pichincha,D,13
164,7160,2013-01-05,1,SEAFOOD,24.842000,0,Quito,Pichincha,D,13
...,...,...,...,...,...,...,...,...,...,...
2778467,2993627,2017-08-11,54,SEAFOOD,0.000000,0,El Carmen,Manabi,C,3
2778500,2995409,2017-08-12,54,SEAFOOD,1.000000,1,El Carmen,Manabi,C,3
2778533,2997191,2017-08-13,54,SEAFOOD,2.000000,0,El Carmen,Manabi,C,3
2778566,2998973,2017-08-14,54,SEAFOOD,0.000000,0,El Carmen,Manabi,C,3


In [41]:
# Build, for every product family, the list of Darts TimeSeries extracted
# from that family's rows, and store it in `family_TS_dict` keyed by family.
family_TS_dict = {}
for family in family_list:
    df_family = train_merged.loc[train_merged['family'] == family]
    list_of_TS_family = TimeSeries.from_group_dataframe(
        df_family,
        time_col="date",
        group_cols=["store_nbr", "family"],  # individual time series are extracted by grouping `df` by `group_cols`
        static_cols=["city", "state", "type", "cluster"],  # also extract these additional columns as static covariates
        value_cols="sales",  # target variable
        fill_missing_dates=True,
        freq='D',
    )

    # `TimeSeries.astype` returns a new series. The previous
    # `for ts in ...: ts = ts.astype(np.float32)` only rebound the loop
    # variable, so the cast never reached the list; rebuild the list instead.
    list_of_TS_family = [ts.astype(np.float32) for ts in list_of_TS_family]

    # Order the series by their first static-covariate value parsed as an int
    # (presumably the store number, as "store_nbr" is the first group column
    # -- TODO confirm).
    list_of_TS_family = sorted(
        list_of_TS_family,
        key=lambda ts: int(ts.static_covariates_values()[0, 0]),
    )
    family_TS_dict[family] = list_of_TS_family

In [43]:
# NOTE(review): leftover debugging cell. `train_filler` is first assigned in
# the transform cell that follows, so evaluating it here raises NameError on
# a top-to-bottom run; remove this cell or move it after that assignment.
train_filler

NameError: name 'train_filler' is not defined

In [44]:
# Transform the Sales Data
#
# For every family, a fresh preprocessing pipeline is fitted on that family's
# list of series. Both the fitted pipeline and the transformed series are
# stored under the family key.

family_pipeline_dict = {}
family_TS_transformed_dict = {}
for key in family_TS_dict:
    # Step 1: fill missing values.
    train_filler = MissingValuesFiller(
        verbose=False,
        n_jobs=-1,
        name="Fill NAs",
    )
    # Step 2: encode categorical static covariates with a one-hot encoder.
    # NOTE(review): the previous inline comment said "OneHotEncoder would be
    # better but takes longer", although OneHotEncoder is what is used here;
    # presumably a leftover from an earlier encoder choice -- confirm.
    static_cov_transformer = StaticCovariatesTransformer(
        verbose=False,
        transformer_cat=sklearn.preprocessing.OneHotEncoder(),
        name="Encoder",
    )
    # Step 3: log1p transform, inverted with expm1.
    log_transformer = InvertibleMapper(
        np.log1p,
        np.expm1,
        verbose=False,
        n_jobs=-1,
        name="Log-Transform",
    )
    # Step 4: scaling with the Scaler's default settings.
    train_scaler = Scaler(
        verbose=False,
        n_jobs=-1,
        name="Scaling",
    )

    train_pipeline = Pipeline(
        [
            train_filler,
            static_cov_transformer,
            log_transformer,
            train_scaler,
        ]
    )

    # Fit on this family's series and keep both results for later use.
    training_transformed = train_pipeline.fit_transform(family_TS_dict[key])
    family_pipeline_dict[key] = train_pipeline
    family_TS_transformed_dict[key] = training_transformed

# Create TimeSeries objects (Darts) 1782

# Extract every (store_nbr, family) group of the merged training frame as its
# own Darts TimeSeries, carrying the store attributes as static covariates.
list_of_TS = TimeSeries.from_group_dataframe(
    train_merged,
    time_col="date",
    group_cols=["store_nbr", "family"],  # individual time series are extracted by grouping `df` by `group_cols`
    static_cols=["city", "state", "type", "cluster"],  # also extract these additional columns as static covariates
    value_cols="sales",  # target variable
    fill_missing_dates=True,
    freq='D',
)
# `TimeSeries.astype` returns a new series. The previous
# `for ts in list_of_TS: ts = ts.astype(np.float32)` only rebound the loop
# variable and left the list unchanged, so rebuild the list from the cast series.
list_of_TS = [ts.astype(np.float32) for ts in list_of_TS]