In [7]:
from datetime import datetime
import datetime as dt

import numpy as np

import math
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

import os

import pandas as pd

# Biblioteki sktime
import sktime
from sktime.forecasting.arima import AutoARIMA
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.compose import TransformedTargetForecaster, EnsembleForecaster
from sktime.forecasting.fbprophet import Prophet
from sktime.forecasting.ets import AutoETS
from sktime.forecasting.model_selection import (ForecastingRandomizedSearchCV, ForecastingGridSearchCV, SingleWindowSplitter)
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.forecasting.model_selection._split import temporal_train_test_split
from sktime.forecasting.trend import PolynomialTrendForecaster, STLForecaster
# metryki
from sklearn.metrics import mean_squared_error
from sktime.performance_metrics.forecasting import mean_absolute_percentage_error, mean_squared_percentage_error,MeanSquaredError
from sktime.transformations.series.feature_selection import FeatureSelection
from sktime.transformations.series.outlier_detection import HampelFilter
from sktime.utils.plotting import plot_series
from sklearn.ensemble import RandomForestRegressor
import plotly.express as px
import plotly.graph_objects as go

from xgboost import XGBRegressor

import pickle
from tqdm.auto import tqdm
from functools import reduce

In [8]:
from utils import convert_and_preprocess_daily_to_monthly, convert_weekly_to_daily, convert_weekly_to_daily_xgb, convert_weekly_to_monthly, convert_weekly_to_monthly_xgb, wczytaj_dane_oczyszczone, wczytaj_dane_oczyszczone_do_predykcji, get_most_important_cols, get_most_important_cols2, generate_confidence_intervals, wczytaj_dane_odstajace_do_predykcji, wczytaj_dane_inwentaryzacje_do_predykcji, wczytaj_dane_odstajace, wczytaj_dane_inwentaryzacje, wczytaj_dane_wszystkie, fh_predykcje

`Notatnik do diagnostyki modeli`

In [9]:
def get_data_reader(model_type, mode="train/test"):
    """Return the data-loading function for the given model type and mode.

    Parameters
    ----------
    model_type : str
        One of "oczyszczone", "odstajace" or "inwentaryzacje".
    mode : str, default "train/test"
        "train/test" selects the reader used for the train/test data,
        "pred" selects the reader used for the prediction data.

    Returns
    -------
    callable
        The matching reader function imported from ``utils``.

    Raises
    ------
    ValueError
        If ``mode`` or ``model_type`` is not one of the supported values.
        (Previously an unknown ``model_type`` silently returned ``None``.)
    """
    # Validate before touching the reader functions so that bad arguments
    # always produce a clear error message.
    if mode not in ("train/test", "pred"):
        raise ValueError(f"Mode {mode} not supported.")
    if model_type not in ("oczyszczone", "odstajace", "inwentaryzacje"):
        raise ValueError(f"Model type {model_type} not supported.")

    readers = {
        "train/test": {
            "oczyszczone": wczytaj_dane_oczyszczone,
            "odstajace": wczytaj_dane_odstajace,
            "inwentaryzacje": wczytaj_dane_inwentaryzacje,
        },
        "pred": {
            "oczyszczone": wczytaj_dane_oczyszczone_do_predykcji,
            "odstajace": wczytaj_dane_odstajace_do_predykcji,
            "inwentaryzacje": wczytaj_dane_inwentaryzacje_do_predykcji,
        },
    }
    return readers[mode][model_type]

def wykres(y_train, y_test=None, y_pred_test=None, y_pred_future=None):
    """Build a Plotly line chart of the training data and optional overlays.

    Every frame passed in must have a 'VALUE' column; its index is used as
    the x axis.

    Parameters
    ----------
    y_train : pandas.DataFrame
        Training series, always drawn (blue, trace name 'TRAIN').
    y_test : pandas.DataFrame, optional
        Test series (green, trace name 'TEST').
    y_pred_test : pandas.DataFrame, optional
        Predictions for the test period (orange, trace name 'PRED_TEST').
    y_pred_future : pandas.DataFrame, optional
        Predictions for the future period (red, trace name 'MAIN_PRED').

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with one 'lines+markers' trace per supplied frame.
    """

    def _value_trace(frame, label, colour):
        # One 'lines+markers' trace of the frame's 'VALUE' column.
        return go.Scatter(
            x=frame.index,
            y=frame['VALUE'],
            mode='lines+markers',
            name=label,
            line=dict(color=colour),
        )

    # The former `fig = px.line(y)` was dropped: it read a global `y` that is
    # not a parameter of this function and its result was overwritten at once.
    fig = go.Figure()
    fig.add_trace(_value_trace(y_train, 'TRAIN', 'blue'))

    optional_series = (
        (y_test, 'TEST', 'green'),
        (y_pred_test, 'PRED_TEST', 'orange'),
        (y_pred_future, 'MAIN_PRED', 'red'),
    )
    for frame, label, colour in optional_series:
        if frame is not None:
            fig.add_trace(_value_trace(frame, label, colour))
    return fig

def compute_metrics(y_true, y_pred):
    """Score predictions against the actual values.

    Computes the non-symmetric MAPE and MSPE and the RMSE (mean squared error
    with ``square_root=True``) using the sktime metric implementations.

    Parameters
    ----------
    y_true : actual values
    y_pred : predicted values

    Returns
    -------
    pandas.DataFrame
        Single column 'wynik' with rows 'MAPE', 'MSPE' and 'RMSE'.
    """
    scores = {
        'MAPE': mean_absolute_percentage_error(y_true, y_pred, symmetric=False),
        'MSPE': mean_squared_percentage_error(y_true, y_pred, symmetric=False),
        'RMSE': MeanSquaredError(square_root=True)(y_true, y_pred),
    }
    return pd.DataFrame(
        [[score] for score in scores.values()],
        index=list(scores),
        columns=['wynik'],
    )

def postprocess_pred(y_pred):
    """Cap the 'VALUE' column of a prediction frame at zero, in place.

    Every value greater than 0 is replaced by 0; values that are already
    non-positive are left unchanged. The frame passed in is modified and the
    same object is returned.
    """
    # NOTE(review): this zeroes out positive forecasts, which is only right if
    # the forecast quantity is non-positive by definition - confirm that `min`
    # (rather than `max`) is intended.
    def _cap_at_zero(value):
        return min(value, 0)

    capped = y_pred['VALUE'].apply(_cap_at_zero)
    y_pred['VALUE'] = capped
    return y_pred

In [10]:
# Settings per model type. "group_period" is the pandas frequency alias used
# below when converting indexes with to_period / to_timestamp ('W' or 'M').
# The "ARIMAX" and "XGBoost" entries are not read in this part of the notebook;
# presumably they are consumed by the training code - TODO confirm.
model_config = {
    "oczyszczone": {
        "ARIMAX": {"sp": 4},
        "XGBoost": {"window_lengths": [4, 8, 12, 32], "cv_window_length": 52},
        "group_period": 'W',
    },
    "odstajace": {
        "ARIMAX": {"sp": 1},
        "XGBoost": {"window_lengths": [2, 4, 6, 12], "cv_window_length": 6},
        "group_period": 'M',
    },
    "inwentaryzacje": {
        "ARIMAX": {"sp": 1},
        "XGBoost": {"window_lengths": [2, 4, 6, 12], "cv_window_length": 6},
        "group_period": 'M',
    },
}

In [11]:
# Date range passed to the train/test data readers.
START_DATE = '...'
END_DATE = '...'
# Start date passed to the prediction data readers.
START_PREDICTION_DATE = '...'

# Test-set size for the summed ("final") series.
TEST_SIZE_M = 7
# Test-set size per model type.
TEST_SIZE_DICT = {"oczyszczone": 31, "odstajace": 7, "inwentaryzacje": 7}

# Model types in processing order (same order as the keys of TEST_SIZE_DICT).
MODEL_TYPES = list(TEST_SIZE_DICT)

# Alternative source for the store list, kept for reference:
# ALL_STORES = pickle.load(open(f'models/all_stores.pkl', 'rb'))
ALL_STORES = [...]

In [None]:
# Diagnostic run: for every store and model type, load the pickled test and
# prediction models, predict, score the test predictions, and write one HTML
# plot per model type plus one plot of the summed predictions per store.
#
# Fixes compared with the previous version of this cell:
#   * the summed plot is built once per store, after all model types (it used
#     to run inside the model-type loop and crashed on an empty list when the
#     first model type had no data),
#   * "<store> done!" is printed for every store, not only after the last one,
#   * pickle files are closed after loading,
#   * the bare `except:` around the index conversion is narrowed,
#   * the duplicated `check` branches are merged; the redundant import and an
#     unused ForecastingHorizon were removed.
check = 'niestety'
metrics_dict = {}


def _load_pickle(path):
    """Load one pickled artifact and close the file handle afterwards."""
    # NOTE: unpickling can execute arbitrary code - only load files produced
    # by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _needs_period_index(model):
    """Return True when predictions for `model` go through the PeriodIndex round trip."""
    return check == 'niestety' and isinstance(model, ForecastingRandomizedSearchCV)


# write_html does not create missing directories.
os.makedirs("./plots", exist_ok=True)

for STORE in tqdm(ALL_STORES):
    print(f"------------------------ STORE: {STORE} ----------------------------")
    # Per-store results
    metrics_dict[STORE] = {}
    y_pred_test_all = []
    y_pred_all = []

    for MODEL_TYPE in MODEL_TYPES:
        print(f"------ MODEL TYPE: {MODEL_TYPE} ------------")
        group_period = model_config[MODEL_TYPE]["group_period"]

        #### FOR TEST ####
        read_data = get_data_reader(model_type=MODEL_TYPE, mode="train/test")
        y, X = read_data(start_date=START_DATE, end_date=END_DATE, store=STORE)

        if len(y) == 0:
            # No data for this model type (e.g. there may be no model for
            # 'inwentaryzacje'), so there is nothing to evaluate.
            continue

        y_train, y_test, X_train, X_test = temporal_train_test_split(
            y, X, test_size=TEST_SIZE_DICT[MODEL_TYPE]
        )

        # load model & features
        best_model_test = _load_pickle(f'models/test/{STORE}_{MODEL_TYPE}_best_model.pkl')
        selected_features_test = _load_pickle(
            f'models/test/selected_features/{STORE}_{MODEL_TYPE}_features.pkl'
        )

        # predict on the test period
        if _needs_period_index(best_model_test):
            # Predict on a PeriodIndex, then convert the result back to timestamps.
            X_test.index = X_test.index.to_period(group_period)
            fh = ForecastingHorizon(X_test.index, is_relative=False)
            y_pred_test = best_model_test.predict(X=X_test[selected_features_test], fh=fh)
            y_pred_test = y_pred_test.rename(columns={'yhat': "VALUE"})
            y_pred_test.index = y_pred_test.index.to_timestamp(group_period)
        else:
            y_pred_test = best_model_test.predict(
                X=X_test[selected_features_test], fh=y_test.index
            )
            y_pred_test = y_pred_test.rename(columns={'yhat': "VALUE"})

        ### FOR PRED ###

        # load model & features
        best_model_pred = _load_pickle(f'models/pred/{STORE}_{MODEL_TYPE}_best_model.pkl')
        selected_features_pred = _load_pickle(
            f'models/pred/selected_features/{STORE}_{MODEL_TYPE}_features.pkl'
        )

        # read data
        read_data = get_data_reader(model_type=MODEL_TYPE, mode="pred")
        X_do_predykcji = read_data(start_date=START_PREDICTION_DATE, store=STORE)[selected_features_pred]

        # predict the future period
        if _needs_period_index(best_model_pred):
            X_do_predykcji.index = X_do_predykcji.index.to_period(group_period)
            fh = ForecastingHorizon(X_do_predykcji.index, is_relative=False)
            y_pred = best_model_pred.predict(X=X_do_predykcji, fh=fh)
            y_pred = y_pred.rename(columns={'yhat': "VALUE"})
            y_pred.index = y_pred.index.to_timestamp(group_period)
        else:
            y_pred = best_model_pred.predict(X=X_do_predykcji, fh=fh_predykcje(freq=group_period))
            y_pred = y_pred.rename(columns={'yhat': "VALUE"})

        # The 'oczyszczone' series goes through convert_weekly_to_monthly
        # before scoring and plotting.
        if MODEL_TYPE == "oczyszczone":
            y_train = convert_weekly_to_monthly(y_train)
            y_test = convert_weekly_to_monthly(y_test)
            y_pred_test = convert_weekly_to_monthly(y_pred_test)
            y_pred = convert_weekly_to_monthly(y_pred)

        # Per-model-type metrics
        metrics_dict[STORE][MODEL_TYPE] = compute_metrics(y_test, y_pred_test)

        # Best-effort conversion of the indexes to timestamps for plotting;
        # frames whose index cannot be converted are left as they are.
        for frame in (y_train, y_test, X_train, X_test):
            try:
                frame.index = frame.index.to_timestamp(group_period)
            except (AttributeError, TypeError, ValueError):
                pass

        # plot and save to html
        wykres(y_train, y_test, y_pred_test, y_pred).write_html(f"./plots/{STORE}_{MODEL_TYPE}_wykres_eksploracyjny.html")

        y_pred_test_all.append(y_pred_test)
        y_pred_all.append(y_pred)

    # Plot of the predictions summed over all model types. Skipped when no
    # model type produced predictions, because reduce() fails on an empty list.
    if y_pred_test_all:
        y_pred_test = pd.DataFrame(reduce(lambda total, part: total + part, y_pred_test_all))
        y_pred = pd.DataFrame(reduce(lambda total, part: total + part, y_pred_all))
        y, X = wczytaj_dane_wszystkie(start_date=START_DATE, end_date=END_DATE, store=STORE)
        y_train, y_test, X_train, X_test = temporal_train_test_split(y, X, test_size=TEST_SIZE_M)

        wykres(y_train, y_test, y_pred_test, postprocess_pred(y_pred)).write_html(f"./plots/{STORE}_final_wykres_eksploracyjny.html")

    print(f"{STORE} done!")

In [None]:
# NOTE(review): the loop above only creates one key per store, so
# metrics_dict['all'] raised KeyError unless a store is literally named 'all'.
# Presumably an overview of all metrics was intended - confirm. Until then:
# show the 'all' entry when it exists, otherwise one table with a
# (store, model type) column for every evaluated model.
metrics_overview = metrics_dict.get('all')
if metrics_overview is None:
    metrics_columns = {
        (store, model_type): frame['wynik']
        for store, per_model in metrics_dict.items()
        for model_type, frame in per_model.items()
    }
    # pd.concat raises on an empty mapping, so fall back to an empty frame.
    metrics_overview = pd.concat(metrics_columns, axis=1) if metrics_columns else pd.DataFrame()
metrics_overview