#### Biblioteki

In [1]:
from datetime import datetime
import datetime as dt

import glob

import numpy as np

import math

import os

import pandas as pd

# Biblioteki sktime
import sktime
from sktime.forecasting.arima import AutoARIMA
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.compose import TransformedTargetForecaster, EnsembleForecaster
from sktime.forecasting.fbprophet import Prophet
from sktime.forecasting.model_selection import (ForecastingRandomizedSearchCV, ForecastingGridSearchCV, SingleWindowSplitter)
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.model_selection._split import temporal_train_test_split

# metryki
from sklearn.metrics import mean_squared_error
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error, mean_squared_percentage_error,MeanSquaredError
from sktime.transformations.series.feature_selection import FeatureSelection
from sktime.transformations.series.outlier_detection import HampelFilter
from sktime.utils.plotting import plot_series
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go

from xgboost import XGBRegressor

In [2]:
import pickle
from tqdm.auto import tqdm

In [3]:
from utils import wykres, compute_metrics, convert_and_preprocess_daily_to_monthly, convert_weekly_to_daily, convert_weekly_to_daily_xgb, convert_weekly_to_monthly, convert_weekly_to_monthly_xgb, wczytaj_dane_oczyszczone, get_most_important_cols, get_most_important_cols2, generate_confidence_intervals, wczytaj_dane_odstajace, wczytaj_dane_inwentaryzacje

import warnings
# Hide every FutureWarning raised for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import DataConversionWarning
# Hide scikit-learn's DataConversionWarning as well.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

`Lista ze sklepami`

In [None]:
# Store identifier used when building output file names. The master loop
# rebinds this name for every store it processes.
STORE = "all"

# Date range handed to the data readers.
# TODO: pick a suitable range -- data quality is poor at the start of the series.
START_DATE = ''
END_DATE = ''

# Kinds of series modelled for each store.
MODEL_TYPES = ["oczyszczone", "odstajace", "inwentaryzacje"]

# Number of trailing observations held out as the test set, per model type.
TEST_SIZE_DICT = dict(oczyszczone=31, odstajace=7, inwentaryzacje=7)

# Stores iterated over by the master loop.
ALL_STORES = [...]

#### Pozostałe funkcje

In [5]:
def select_best_model(metrics_dict, metric='RMSE'):
    """Return the name of the model with the lowest value of ``metric``.

    ``metrics_dict`` maps a model name to a mapping of metric name -> value.
    Ties are resolved in favour of the model that appears first in
    ``metrics_dict``; an empty mapping raises ``IndexError``.
    """
    ranked_names = sorted(metrics_dict, key=lambda name: metrics_dict[name][metric])
    return ranked_names[0]

def save_best_model(best_model, model_type, mode="test"):
    """Pickle the winning forecaster and the feature names it was given.

    The forecaster objects and feature frames are NOT passed in: they are
    read from the notebook globals set by the training loop
    (``forecaster_arimax`` / ``forecaster_prophet`` / ``gscv_x``,
    ``X_test_selected`` / ``X_test_selected2``) together with ``STORE``.

    Parameters
    ----------
    best_model : str
        One of "ARIMAX", "Prophet" or "XGBoost". Any other value writes
        nothing.
    model_type : str
        Model type label used in the output file names.
    mode : str, default "test"
        Sub-directory of ``models/`` to write into.

    Files written
    -------------
    models/{mode}/{STORE}_{model_type}_best_model.pkl
    models/{mode}/selected_features/{STORE}_{model_type}_features.pkl
    """
    # Resolve both payloads before touching the disk, so a missing global
    # cannot leave a model file behind without its features file.
    if best_model == "ARIMAX":
        model, features = forecaster_arimax, X_test_selected.columns.tolist()
    elif best_model == "Prophet":
        model, features = forecaster_prophet, X_test_selected2.columns.tolist()
    elif best_model == "XGBoost":
        model, features = gscv_x, X_test_selected2.columns.tolist()
    else:
        return

    model_path = f'models/{mode}/{STORE}_{model_type}_best_model.pkl'
    features_path = f'models/{mode}/selected_features/{STORE}_{model_type}_features.pkl'

    # Creating the features directory also creates models/{mode}.
    os.makedirs(os.path.dirname(features_path), exist_ok=True)

    # Context managers make sure the files are flushed and closed.
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(features_path, 'wb') as features_file:
        pickle.dump(features, features_file)

def get_data_reader(model_type):
    """Return the data-loading function registered for ``model_type``.

    Unknown model types yield ``None``.
    """
    readers = {
        "oczyszczone": wczytaj_dane_oczyszczone,
        "odstajace": wczytaj_dane_odstajace,
        "inwentaryzacje": wczytaj_dane_inwentaryzacje,
    }
    return readers.get(model_type)

# Master pętla

In [8]:
# Per-model-type settings:
#   ARIMAX.sp                -- seasonal period passed to AutoARIMA
#   XGBoost.window_lengths   -- candidate window lengths for the reduction forecaster
#   XGBoost.cv_window_length -- window length for the validation splitter
#   group_period             -- pandas period alias used when converting indexes
model_config = {
    "oczyszczone": {
        "ARIMAX": {"sp": 4},
        "XGBoost": {"window_lengths": [4, 8, 12, 32], "cv_window_length": 52},
        "group_period": 'W',
    },
    "odstajace": {
        "ARIMAX": {"sp": 1},
        "XGBoost": {"window_lengths": [2, 4, 6, 12], "cv_window_length": 6},
        "group_period": 'M',
    },
    "inwentaryzacje": {
        "ARIMAX": {"sp": 1},
        "XGBoost": {"window_lengths": [2, 4, 6, 12], "cv_window_length": 6},
        "group_period": 'M',
    },
}

In [None]:
# Flag that switches on the conversion of time indexes to pandas Periods
# before the XGBoost search is fitted. The conditions below previously tested
# an undefined name (`kamil`), which raised NameError on a fresh kernel; they
# now read this flag.
check = 'niestety'

# Collects (store, model_type) pairs for which the reader returned no data.
sklepy_bez_historii_inwentaryzacji = []

# Kept from the original cell (nothing in this cell reads it): intended for
# checking the files in the output folders so the loop need not restart.
mode = 'test'

# Make sure every output directory exists before hours of training start.
for _out_dir in ("./models/test/", "./models/pred/"):
    os.makedirs(os.path.join(_out_dir, "selected_features"), exist_ok=True)

for STORE in tqdm(ALL_STORES):
    print(f"------------------------ STORE: {STORE} ----------------------------")

    for MODEL_TYPE in MODEL_TYPES:
        # Skip combinations whose model was already saved in BOTH modes, so
        # an interrupted run can be resumed.
        model_file_name = f"{STORE}_{MODEL_TYPE}_best_model.pkl"
        already_trained = (
            os.path.isfile(os.path.join("./models/test/", model_file_name))
            and os.path.isfile(os.path.join("./models/pred/", model_file_name))
        )
        if already_trained:
            print(f"------ MODEL TYPE: {MODEL_TYPE} ALREADY IN DIRECTORY ------------")
            continue

        print(f"------ MODEL TYPE: {MODEL_TYPE} ------------")
        read_data = get_data_reader(model_type=MODEL_TYPE)

        # ---------------------- read and preprocess data ---------------------------------
        y, X = read_data(start_date=START_DATE, end_date=END_DATE, store=STORE)

        if len(y) == 0:
            sklepy_bez_historii_inwentaryzacji.append((STORE, MODEL_TYPE))
            print(f"------ MODEL TYPE: {MODEL_TYPE} HAS NO DATA !!! ------------")
            continue

        # train/test split and absolute forecasting horizon over the test index
        y_train, y_test, X_train, X_test = temporal_train_test_split(y, X, test_size=TEST_SIZE_DICT[MODEL_TYPE])
        fh = ForecastingHorizon(y_test.index, is_relative=False)

        # ---------------------- feature selection ---------------------------------
        # The fallback feature set is always needed (Prophet and XGBoost use it).
        most_important_cols2 = get_most_important_cols2(X_train=X_train, y_train=y_train, n_features=4)
        X_train_selected2, X_test_selected2 = X_train[most_important_cols2], X_test[most_important_cols2]

        # The primary selector may fail; reset its outputs explicitly so that
        # values from a previous store/model type are never reused.
        try:
            most_important_cols = get_most_important_cols(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, fh=fh)
            X_train_selected, X_test_selected = X_train[most_important_cols], X_test[most_important_cols]
        except Exception as exc:
            print(f"Primary feature selection failed ({exc!r}); using fallback features.")
            most_important_cols = None
            X_train_selected = X_test_selected = None

        # ---------------------- train models ---------------------------------
        metrics_dict = {}

        # arimax
        forecaster_arimax = AutoARIMA(sp=model_config[MODEL_TYPE]["ARIMAX"]["sp"], start_P=1, start_Q=1, max_P=8, max_Q=8, suppress_warnings=True)
        arimax_cols = most_important_cols
        if arimax_cols is not None:
            try:
                forecaster_arimax.fit(y=y_train, X=X_train_selected)
                y_pred_arimax = forecaster_arimax.predict(X=X_test_selected, fh=fh)
            except Exception as exc:
                print(f"ARIMAX failed on primary features ({exc!r}); retrying with fallback features.")
                arimax_cols = None
        if arimax_cols is None:
            arimax_cols = most_important_cols2
            forecaster_arimax.fit(y=y_train, X=X_train_selected2)
            y_pred_arimax = forecaster_arimax.predict(X=X_test_selected2, fh=fh)
        # save_best_model() pickles X_test_selected.columns for ARIMAX, so
        # point it at the feature set the model was actually fitted with.
        X_test_selected = X_test[arimax_cols]
        metrics_dict["ARIMAX"] = compute_metrics(y_test, y_pred_arimax).to_dict()['wynik']

        # prophet
        forecaster_prophet = Prophet(add_country_holidays={'country_name': 'Poland'})
        forecaster_prophet.fit(y_train, X_train_selected2)

        y_pred_prophet = forecaster_prophet.predict(X=X_test_selected2, fh=fh).rename(columns={"yhat": "VALUE"})
        metrics_dict["Prophet"] = compute_metrics(y_test, y_pred_prophet).to_dict()['wynik']

        # xgboost
        validation_size = len(fh)

        # TODO(review): window_length is hard-coded to 52 here instead of
        # model_config[MODEL_TYPE]["XGBoost"]["cv_window_length"] -- confirm intent.
        cv = SingleWindowSplitter(window_length=52, fh=validation_size)

        regressor = XGBRegressor(objective='reg:squarederror', random_state=42)

        forecaster = make_reduction(
            regressor,
            scitype="tabular-regressor",
            strategy="recursive"
        )

        # Only the model step is active; deseasonalize/detrend steps are disabled.
        pipeline = TransformedTargetForecaster(
            [
                ("model", forecaster),
            ]
        )

        # Hyper-parameter distributions sampled by the randomized search.
        # TODO(review): 'model__window_length' ignores
        # model_config[MODEL_TYPE]["XGBoost"]["window_lengths"] -- confirm intent.
        param_grid = {
            'model__window_length': [2, 4, 8, 12],
            'model__estimator__max_depth': [3, 5, 6, 10, 15, 20],
            'model__estimator__learning_rate': [0.01, 0.1, 0.2, 0.3],
            'model__estimator__subsample': np.arange(0.5, 1.0, 0.1),
            'model__estimator__colsample_bytree': np.arange(0.4, 1.0, 0.1),
            'model__estimator__colsample_bylevel': np.arange(0.4, 1.0, 0.1),
            'model__estimator__n_estimators': [100, 500, 1000],
        }

        # Marked "to be resolved" by the original author.
        gscv_x = ForecastingRandomizedSearchCV(
            pipeline,
            cv=cv,
            param_distributions=param_grid,
            n_iter=10,
            n_jobs=-1,
            random_state=42,
            verbose=3,
            error_score='raise'
        )

        # Convert the indexes used by XGBoost to periods and rebuild the horizon.
        if check == 'niestety':
            for frame in (y_train, y_test, X_train_selected2, X_test_selected2):
                frame.index = frame.index.to_period(model_config[MODEL_TYPE]["group_period"])
            fh = ForecastingHorizon(y_test.index, is_relative=False)

        gscv_x.fit(y=y_train, X=X_train_selected2)
        y_pred_xgb = gscv_x.predict(X=X_test_selected2, fh=fh)

        metrics_dict["XGBoost"] = compute_metrics(y_test, y_pred_xgb).to_dict()['wynik']

        # ---------------------- persist the winner ---------------------------------
        best_model = select_best_model(metrics_dict=metrics_dict)

        # best model as fitted on the train split
        save_best_model(best_model=best_model, model_type=MODEL_TYPE)

        # best model refitted on the full series
        if best_model == "ARIMAX":
            # Start from the feature set validated on the test split; fall
            # back to the secondary set if the full-data fit fails.
            try:
                forecaster_arimax.fit(y=y, X=X[arimax_cols])
            except Exception as exc:
                print(f"ARIMAX full-data fit failed ({exc!r}); retrying with fallback features.")
                arimax_cols = most_important_cols2
                forecaster_arimax.fit(y=y, X=X[arimax_cols])
                X_test_selected = X_test[arimax_cols]
        elif best_model == "Prophet":
            forecaster_prophet.fit(y=y, X=X[most_important_cols2])
        elif best_model == "XGBoost":
            # Same period conversion as above, applied to the full data.
            if check == 'niestety':
                for frame in (y, X):
                    frame.index = frame.index.to_period(model_config[MODEL_TYPE]["group_period"])

            gscv_x.fit(y=y, X=X[most_important_cols2])
        save_best_model(best_model=best_model, model_type=MODEL_TYPE, mode="pred")
        

In [None]:
# Show the (store, model_type) pairs collected by the master loop when the
# reader returned an empty series.
sklepy_bez_historii_inwentaryzacji