In [2]:
import pandas as pd
import os
import numpy as np
import time
from tqdm.autonotebook import tqdm
from matplotlib import pyplot

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.metrics import make_scorer

from catboost import CatBoostRegressor

  from tqdm.autonotebook import tqdm


# Прогнозирование временных рядов с помощью обычных регрессионных моделей

Основная идея заключается в том, что мы воссоздаём алгоритм авторегрессии: прошлые значения ряда (лаги) подаются в обычную регрессионную модель как признаки. Плюсы данного подхода состоят в том, что мы можем выкидывать неудобные периоды (ковид и пр.), а также легко учитывать другие признаки.

In [107]:
# Load the raw demand records, then drop incomplete rows and repeated
# (Product_Code, Warehouse, Date) entries in one chained expression.
df = (
    pd.read_csv('Product_demand.csv')
    .dropna()
    .drop_duplicates(subset=['Product_Code', 'Warehouse', 'Date'])
)

In [108]:
# Column conversions: Order_Demand becomes a float, Date becomes a datetime.date.
import re
import datetime

# Number of most recent rows kept for the rest of the notebook.
N_RECENT_ROWS = 100000

# Strip parentheses and spaces from the raw demand strings, then convert to float.
# A column-wise map replaces the row-wise DataFrame.apply(axis=1): same values,
# without building a Series object for every row.
# NOTE(review): parentheses may be accounting notation for negative quantities;
# they are simply discarded here, so such values become positive - confirm intent.
df['Order_Demand'] = df['Order_Demand'].map(
    lambda raw: float(re.sub('[() ]', '', raw))
)

# Parse 'YYYY/MM/DD' strings into datetime.date objects (object dtype column).
df['Date'] = df['Date'].astype(str).map(
    lambda raw: datetime.datetime.strptime(raw, "%Y/%m/%d").date()
)

# Keep the four working columns, order chronologically and retain only the tail.
df = (
    df[['Product_Code', 'Warehouse', 'Date', 'Order_Demand']]
    .sort_values(by=['Date'], ignore_index=True)
    .iloc[-N_RECENT_ROWS:]
    .reset_index(drop=True)
)
df

Unnamed: 0,Product_Code,Warehouse,Date,Order_Demand
0,Product_1645,Whse_A,2016-03-03,8.0
1,Product_1792,Whse_A,2016-03-03,2.0
2,Product_2010,Whse_J,2016-03-03,3000.0
3,Product_1454,Whse_J,2016-03-03,1000.0
4,Product_0304,Whse_J,2016-03-03,16.0
...,...,...,...,...
99995,Product_1970,Whse_J,2017-01-06,2000.0
99996,Product_1408,Whse_J,2017-01-06,1000.0
99997,Product_0250,Whse_C,2017-01-09,148.0
99998,Product_1904,Whse_C,2017-01-09,4000.0


In [109]:
# Feature-engineering helpers start here
def lag_feature(df, col_name, lags=None):
    """Build lagged copies of ``col_name`` within each product/warehouse series.

    Lags are row offsets inside a (Product_Code, Warehouse) group, not calendar
    days, and they follow the row order of ``df`` as passed in: this function
    does not sort, so the caller must provide rows in chronological order.

    Parameters:
        df (DataFrame): must contain Product_Code, Warehouse, Date and ``col_name``.
        col_name (str): column to shift.
        lags (iterable of int, optional): row offsets to produce; defaults to
            [1, 2, 3, 5, 8, 13, 21].

    Returns:
        DataFrame: columns Product_Code, Warehouse, Date plus one
        ``{col_name}_lag_{lag}`` column per lag, in the row order of ``df``.
    """
    if lags is None:
        lags = [1, 2, 3, 5, 8, 13, 21]

    # Index and group once; this work does not depend on the lag.
    series_by_key = (
        df
        .set_index(['Product_Code', 'Warehouse', 'Date'])
        .groupby(['Product_Code', 'Warehouse'])
        [col_name]
    )
    lag_features = pd.concat(
        [series_by_key.shift(lag).rename(f'{col_name}_lag_{lag}') for lag in lags],
        axis=1,
    )
    return lag_features.reset_index()

def mean_feature(df, col_name, window_sizes=None):
    """Build rolling means of ``col_name`` within each product/warehouse series.

    Windows are counted in rows inside a (Product_Code, Warehouse) group and
    include the current row; the first ``w - 1`` rows of each group are NaN.
    Row order of ``df`` is used as-is (no sorting happens here).

    Parameters:
        df (DataFrame): must contain Product_Code, Warehouse, Date and ``col_name``.
        col_name (str): column to average.
        window_sizes (iterable of int, optional): rolling window lengths;
            defaults to [5, 7, 14].

    Returns:
        DataFrame: columns Product_Code, Warehouse, Date plus one
        ``{col_name}_mean_{w}`` column per window, in the row order of ``df``.
    """
    if window_sizes is None:
        window_sizes = [5, 7, 14]

    series_by_key = (
        df
        .set_index(['Product_Code', 'Warehouse', 'Date'])
        .groupby(['Product_Code', 'Warehouse'])
        [col_name]
    )
    # groupby(...).rolling(...) prepends the group keys to the index; the keys are
    # already index levels here, so that would duplicate the level names and make
    # reset_index() fail. transform() keeps the input index unchanged instead.
    mean_features = pd.concat(
        [
            series_by_key
            .transform(lambda values, w=w: values.rolling(w).mean())
            .rename(f'{col_name}_mean_{str(w)}')
            for w in window_sizes
        ],
        axis=1,
    )
    return mean_features.reset_index()

def add_product_rank(df):
    """Rank products by their total Order_Demand.

    Returns a frame with one row per Product_Code and a Product_Demand_Rank
    column; rank 1 is the largest total, and equal totals share a rank
    (dense ranking).
    """
    total_per_product = df.groupby('Product_Code')['Order_Demand'].sum()
    dense_rank = total_per_product.rank(method='dense', ascending=False)
    return dense_rank.rename('Product_Demand_Rank').reset_index()

def add_wh_rank(df):
    """Rank warehouses by their total Order_Demand.

    Returns a frame with one row per Warehouse and a Warehouse_Demand_Rank
    column; rank 1 is the largest total, and equal totals share a rank
    (dense ranking).
    """
    total_per_warehouse = df.groupby('Warehouse')['Order_Demand'].sum()
    dense_rank = total_per_warehouse.rank(method='dense', ascending=False)
    return dense_rank.rename('Warehouse_Demand_Rank').reset_index()

In [110]:
# Lagged Order_Demand values for every (Product_Code, Warehouse) series.
lag_features_cnt_ord = lag_feature(df, 'Order_Demand')
# Disabled experiments kept for reference. `reserved_data`, `cnt_ord` and
# `add_ds_rank` are not defined anywhere in the visible notebook.
# mean_features_cnt_ord = mean_feature(reserved_data, 'cnt_ord')

# ds_rank = add_ds_rank(reserved_data)

In [111]:
# Attach the lag columns to the demand rows; the left join keeps every row of df.
# Re-running this cell on an already merged df would add suffixed duplicate columns.
df = pd.merge(
    df,
    lag_features_cnt_ord,
    on=['Product_Code', 'Warehouse', 'Date'],
    how='left',
)

In [112]:
# Forecast horizons 1..3 and the column groups derived from them.
horizons = range(1, 4)
# One target column per horizon.
target_cols = ['target_' + str(step) for step in horizons]
# Identifier / raw-value columns that are never fed to the model.
non_feature_cols = ['Product_Code', 'Warehouse', 'Order_Demand']
# Everything that is not a model feature.
drop_cols = [*target_cols, *non_feature_cols, 'Date']


def explode_targets(concat, horizons):
    """Add one ``target_{h}`` column per horizon holding future Order_Demand.

    For each (Product_Code, Warehouse) series the demand is shifted ``h`` rows
    back, so ``target_{h}`` is the demand found ``h`` rows later in the same
    series (row offsets, not calendar days). Rows without such a future value
    get NaN. Row order of ``concat`` is used as-is; no sorting happens here.

    Parameters:
        concat (DataFrame): must contain Product_Code, Warehouse, Date, Order_Demand.
        horizons (iterable of int): row offsets to look ahead.

    Returns:
        DataFrame: ``concat`` joined (inner, on the three key columns) with the
        target columns.
    """
    keys = ['Product_Code', 'Warehouse', 'Date']
    # Index and group once; this work does not depend on the horizon.
    demand_by_series = (
        concat
        .set_index(keys)
        .groupby(['Product_Code', 'Warehouse'])
        ['Order_Demand']
    )
    targets = pd.concat(
        [demand_by_series.shift(-horizon).rename(f'target_{horizon}')
         for horizon in horizons],
        axis=1,
    )
    # `targets` carries the keys as index levels; merge resolves them by name.
    return concat.merge(targets, on=keys)

In [113]:
from dataclasses import dataclass

# Minimum per-date mean Order_Demand a date needs to be kept by build_validation.
sales_qty_mean_on_test_threshold = 10


@dataclass
class TrainingConfig:
    """Date boundaries handed to the training routine."""

    # Rows dated before this are used for training; validation starts here.
    validation_start_date: datetime.date
    # Intended last date of the validation window.
    validation_end_date: datetime.date
    # Presumably the first date of the test period - TODO confirm; nothing in
    # the visible code reads this field.
    test_start_date: datetime.date
        
def build_validation(sales, horizons, threshold=None):
    """
    Build the frame of actual values that forecasts are scored against.

    Every (Product_Code, Warehouse, Date) row whose date has a mean
    Order_Demand of at least ``threshold`` becomes a forecast origin. Each
    origin is paired with every horizon, ``target_date`` is the origin date
    plus ``horizon`` calendar days, and only pairs for which ``sales`` holds a
    row of the same series on ``target_date`` are kept (inner join).

    NOTE(review): horizons are calendar days here, while explode_targets
    shifts by rows within a series - confirm that the two are meant to match.

    :param sales:
            expected columns Product_Code, Warehouse, Date, Order_Demand
    :param horizons:
            iterable of integer horizons, in days
    :param threshold:
            minimum per-date mean Order_Demand; when None the module-level
            ``sales_qty_mean_on_test_threshold`` is used
    :return validation
            columns: Product_Code, Warehouse, created_date, horizon, target_date, Order_Demand
    """
    if threshold is None:
        threshold = sales_qty_mean_on_test_threshold

    keys = ['Product_Code', 'Warehouse', 'Date']

    # Mean demand over all series on the same date, broadcast back to the rows.
    mean_per_date = sales.groupby('Date')['Order_Demand'].transform('mean')
    forecast_origins = sales.loc[mean_per_date >= threshold, keys]

    # Actual demand, keyed by the date it was observed on.
    actuals = sales[keys + ['Order_Demand']].rename({'Date': 'target_date'}, axis=1)

    return (
        forecast_origins
        .merge(pd.Series(horizons).to_frame('horizon'), how='cross')
        .rename({'Date': 'created_date'}, axis=1)
        .assign(target_date=lambda x: x['created_date'] + pd.to_timedelta(x['horizon'], unit='d'))
        .merge(actuals, on=['Product_Code', 'Warehouse', 'target_date'])
    )

def build_simple_baselines(concat, test_date, horizons):
    """Build the validation frame from the rows dated on or after ``test_date``.

    Filters ``concat`` to ``Date >= test_date`` and passes the result, together
    with ``horizons``, to build_validation; its return value is returned as-is.
    """
    test_period = concat.query('Date >= @test_date')
    return build_validation(test_period, horizons)

def train_model_cb(df, horizon, training_config: TrainingConfig):
    """Fit a CatBoost regressor (MAE loss) for a single forecast horizon.

    Rows without a ``target_{horizon}`` value are discarded. Rows dated before
    ``validation_start_date`` form the training set; rows dated from
    ``validation_start_date`` to ``validation_end_date`` (both inclusive) form
    the validation set. Features are all columns left after removing the
    module-level ``non_feature_cols``, every target column and ``Date``.

    Parameters:
        df (DataFrame): frame with feature columns, Date and the target columns.
        horizon (int): which ``target_{horizon}`` column to learn.
        training_config (TrainingConfig): supplies the validation window.

    Returns:
        CatBoostRegressor: the fitted model.
    """
    validation_start_date = training_config.validation_start_date
    # Must come from validation_end_date: reading validation_start_date here
    # would shrink the validation window to a single day.
    validation_end_date = training_config.validation_end_date

    target_col = f'target_{horizon}'
    df = (
        df.drop(non_feature_cols, axis=1)
          .dropna(subset=[target_col])
    )

    train_df = df.query('Date < @validation_start_date')
    train_target = train_df[target_col]
    train_df_no_targets = train_df.drop(target_cols + ['Date'], axis=1)

    val_df = df.query('Date >= @validation_start_date').query('Date <= @validation_end_date')
    val_target = val_df[target_col]
    val_df_no_targets = val_df.drop(target_cols + ['Date'], axis=1)

    model = CatBoostRegressor(loss_function='MAE', logging_level="Silent")
    # Both the training and the validation data are passed as evaluation sets.
    model.fit(train_df_no_targets,
              train_target,
              eval_set=[(train_df_no_targets, train_target),
                        (val_df_no_targets, val_target)])
    return model


def predict_cb(model, horizon, concat):
    """Predict with ``model`` for every row of ``concat`` and label the result.

    The model receives ``concat`` without the module-level ``non_feature_cols``
    and without ``Date``. The returned frame keeps the index of ``concat`` and
    has the columns Product_Code, Warehouse, created_date (the original Date),
    predict_xgb (the predictions) and horizon (the given value on every row).
    """
    feature_matrix = concat.drop(non_feature_cols + ['Date'], axis=1)
    predictions = model.predict(feature_matrix)

    origins = concat[['Product_Code', 'Warehouse', 'Date']].rename(
        columns={'Date': 'created_date'}
    )
    return origins.assign(predict_xgb=predictions, horizon=horizon)

In [114]:
# Work on a copy so that the prepared frame `df` stays untouched.
# NOTE(review): the original note spoke of keeping only the last 40k rows because
# training on about a million rows is slow; no truncation happens in this cell -
# the only visible cut is the 100000-row slice taken during preprocessing.
df_test = df.copy()

In [115]:
# Add the target_<h> columns, then fix the date boundaries of the experiment.
df_test = explode_targets(df_test, horizons)
test_date = datetime.date(2016, 12, 11)

training_config = TrainingConfig(
    validation_start_date=datetime.date(2016, 12, 1),
    validation_end_date=datetime.date(2016, 12, 10),
    test_start_date=datetime.date(2016, 12, 11),
)

# Test-period rows without the targets (model input) and the frame of actual
# values the predictions are compared with.
concat_exploded_test = df_test.query('Date >= @test_date').drop(target_cols, axis=1)
baseline_forecasts = build_simple_baselines(df_test, test_date, horizons)

# One model per horizon: train it, store it, and predict the test period with it.
models = {}
predicts_xgb = []
for horizon in horizons:
    model = train_model_cb(df_test, horizon, training_config)
    models[horizon] = model
    predicts_xgb.append(predict_cb(model, horizon, concat_exploded_test))

# Put predictions next to the actual values for each (series, origin date, horizon).
baseline_forecasts = baseline_forecasts.merge(
    pd.concat(predicts_xgb),
    on=['Product_Code', 'Warehouse', 'created_date', 'horizon'],
)

In [116]:
baseline_forecasts

Unnamed: 0,Product_Code,Warehouse,created_date,horizon,target_date,Order_Demand,predict_xgb
0,Product_1263,Whse_C,2016-12-11,1,2016-12-12,50000.0,10421.885877
1,Product_1263,Whse_C,2016-12-11,2,2016-12-13,10000.0,6782.102862
2,Product_1263,Whse_C,2016-12-12,1,2016-12-13,10000.0,6116.113964
3,Product_1263,Whse_C,2016-12-11,3,2016-12-14,6000.0,15418.004183
4,Product_1263,Whse_C,2016-12-12,2,2016-12-14,6000.0,6042.864338
...,...,...,...,...,...,...,...
3598,Product_0202,Whse_C,2016-12-28,1,2016-12-29,5000.0,1590.813382
3599,Product_0608,Whse_S,2016-12-29,1,2016-12-30,80.0,16.501040
3600,Product_1263,Whse_S,2016-12-29,1,2016-12-30,100000.0,17737.980560
3601,Product_1954,Whse_J,2017-01-05,1,2017-01-06,3.0,6.453399


In [117]:
def bias_metric(y_true, y_pred) -> float:
    """Signed forecast bias as a percentage of the total predicted volume.

    Computed as ``sum(y_pred - y_true) / sum(y_pred) * 100``. For a positive
    prediction total the result is positive when the predictions add up to
    more than the observations and negative when they add up to less.

    Parameters:
        y_true (array-like): observed values.
        y_pred (array-like): predicted values, same length as ``y_true``.

    Returns:
        float: bias in percent; nan or +/-inf when the predictions sum to zero.
    """
    observed = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # Vectorised numpy reductions instead of Python's builtin sum, which would
    # iterate over the arrays element by element.
    return float((predicted - observed).sum() / predicted.sum() * 100)
    
# Percentage bias of the CatBoost predictions against the actual demand.
bias_metric(
    baseline_forecasts['Order_Demand'].tolist(),
    baseline_forecasts['predict_xgb'].tolist(),
)

-152.68375511599072