# Задание 1: Модель

In [1]:
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings("ignore")

import logging

logging.basicConfig()
logger = logging.getLogger("model")
logger.setLevel(logging.INFO)

## Вспомогательные функции

In [2]:
# для подсчета качества метрики
def calculate_mape(
    df: pd.DataFrame,
    true: str = "orders_num",
    pred: str = "pred_orders_num",
) -> float:
    """Return the mean absolute percentage error of ``pred`` against ``true``.

    Only rows whose ``true`` value is strictly positive take part in the
    average, which keeps the relative error well defined. The input frame is
    left untouched.
    """
    has_demand = df[true] > 0
    actual = df.loc[has_demand, true]
    forecast = df.loc[has_demand, pred]
    relative_error = (actual - forecast).abs() / actual
    return relative_error.mean()

In [3]:
from typing import List, Tuple

# Добавляем новые фичи (окна, сезонные фичи)


# def calc_rolling_window(
#     df: pd.DataFrame,
#     window: int,
#     col: str,
#     lvl: str,
#     shift: int,
# ) -> pd.DataFrame:
#     df = df.sort_values(["ds", lvl])
#     df["ts"] = pd.to_datetime(df["ds"].astype(str))
#     df["col"] = df[col]
#     rolling_df = df.set_index("ts")
#     rolling_df = (
#         rolling_df.groupby([lvl], group_keys=True)["col"]
#         # используем shift, так как потом нам нужно предсказывать на N дней вперед:
#         # для однородности датасета используем сдвиг на кол-во дат в предсказании
#         .apply(
#             lambda x: x.asfreq("1D")
#             .rolling(window=window, closed="left", min_periods=0)
#             .mean()
#             .shift(shift)
#         )
#         .reset_index()
#         .rename(columns={"col": f"rolling_{col}_w_{window}"})
#     )
#     df = df.merge(rolling_df, how="left", on=[lvl, "ts"])
#     df = df.drop(columns=["ts", "col"])
#     return df


def calc_rolling_window(
    df: pd.DataFrame,
    window: int,
    col: str,
    lvl: str,
    shift: int,
) -> pd.DataFrame:
    """Add a lagged rolling-mean feature of ``col`` computed per ``lvl`` group.

    Steps:
      1. ``ds`` is parsed into a datetime helper column ``ts``.
      2. Duplicate rows per ``(lvl, ts)`` are collapsed: ``orders_num`` is
         summed, every other column is averaged.
      3. Inside each ``lvl`` group the series is re-indexed to daily frequency,
         averaged over the ``window`` rows preceding the current one
         (``closed="left"`` excludes the current day) and then moved ``shift``
         rows forward, so a row only sees values at least ``shift + 1`` days old.
      4. The result is merged back as ``rolling_{col}_w_{window}``.

    Args:
        df: input frame; must contain ``ds``, ``lvl`` and ``col``.
        window: rolling window length, in daily rows.
        col: column to average.
        lvl: grouping column (e.g. the SKU identifier).
        shift: number of daily rows the rolled series is lagged by.

    Returns:
        A copy of ``df`` with the new feature column appended; the input frame
        is not modified.
    """
    out = df.copy()
    out["ts"] = pd.to_datetime(out["ds"].astype(str))

    # Collapse duplicates per (lvl, ts) so that asfreq() sees a unique index.
    agg_map = {"orders_num": "sum", "discount": "mean"}
    agg_func = agg_map.get(col, "mean")
    per_day = out.groupby([lvl, "ts"])[col]
    if agg_func == "sum":
        # min_count=1 keeps a day whose values are all missing as NaN. A plain
        # sum would turn it into 0 and drag the rolling mean down, e.g. for
        # forecast rows whose target is not known yet.
        daily = per_day.sum(min_count=1)
    else:
        daily = per_day.agg(agg_func)

    feature = f"rolling_{col}_w_{window}"

    def _lagged_mean(series: pd.Series) -> pd.Series:
        # Daily grid -> mean of the previous `window` days -> lag by `shift`.
        return (
            series.asfreq("1D")
            .rolling(window=window, closed="left", min_periods=0)
            .mean()
            .shift(shift)
        )

    rolled = (
        daily.reset_index(level=lvl)
        # group_keys=True is spelled out: the merge below needs `lvl` in the
        # result index regardless of the pandas version's default.
        .groupby(lvl, group_keys=True)[col]
        .apply(_lagged_mean)
        .reset_index()
        .rename(columns={col: feature})
    )

    return out.merge(rolled, how="left", on=[lvl, "ts"]).drop(columns=["ts"])


def postprocess_transform(
    df: pd.DataFrame,
    norms: List[Tuple[str, str]],
    roll_cols: List[str],
    windows: List[int],
    dropna_cols: List[str],
    lvl: str,
    shift: int,
) -> pd.DataFrame:
    """Add rolling features, normalise columns and drop warm-up / incomplete rows.

    Args:
        df: input frame with a ``ds`` column, ``lvl`` and every column named
            in ``roll_cols``.
        norms: ``(numerator, denominator)`` column pairs; each numerator is
            overwritten with ``numerator / (denominator + 1e-10)``. Pairs are
            applied in order, so a denominator must not appear earlier in the
            list as a numerator.
        roll_cols: columns for which rolling means are built.
        windows: rolling window lengths in days (may be empty).
        dropna_cols: rows with NaN in any of these columns are removed.
        lvl: grouping column passed to ``calc_rolling_window``.
        shift: lag passed to ``calc_rolling_window``.

    Returns:
        A new frame, rounded to 2 decimals and sorted by ``["ds", lvl]``.
        The input frame is not modified.
    """
    # Work on a copy: the normalisation below assigns columns in place and must
    # not leak into the caller's frame when no rolling step created a copy.
    df = df.copy()

    # New features
    for window in windows:
        for col in roll_cols:
            logger.info(f"Rolling window={window} days for col `{col}`")
            df = calc_rolling_window(
                df=df, window=window, col=col, lvl=lvl, shift=shift
            )

    # Normalisation: dividing by a longer-term rolling level tells the model
    # how a relative change of a feature (e.g. the price) relates to a relative
    # change of demand.
    for col1, col2 in norms:
        logger.info(f"Normalizing `{col1}` / `{col2}`")
        # epsilon avoids division by zero
        df[col1] = df[col1] / (df[col2] + 1e-10)

    # Post-processing: drop the earliest dates, whose windows were built on
    # incomplete history. `[shift, *windows]` also works for empty `windows`.
    # NOTE(review): a window of length w lagged by `shift` is only complete
    # after `shift + w` dates; only max(shift, max(windows)) dates are dropped
    # here. Kept as is so the training set does not change. Confirm intent.
    n_warmup_dates = max([shift, *windows])
    warmup_dates = sorted(df["ds"].unique())[:n_warmup_dates]
    df = df[~df["ds"].isin(warmup_dates)]

    df = df.dropna(subset=dropna_cols)
    df = df.round(2)
    return df.sort_values(["ds", lvl])


def create_features(df: pd.DataFrame, dropna_cols: List[str], W: int) -> pd.DataFrame:
    """Build the model features: day of week plus normalised rolling means.

    Rolling means of ``orders_num`` and ``discount`` are computed per
    ``sku_id`` with a lag of 14 days for windows of 1, 7, 14, 21 and 28 days
    (plus ``W`` if it is not among them). The raw columns and all shorter/other
    rolling means are then divided by the ``W``-day rolling mean of the same
    column; the ``W``-day rolling mean itself stays in absolute units so that
    predictions can be scaled back.

    Args:
        df: frame with ``ds``, ``sku_id``, ``orders_num`` and ``discount``.
        dropna_cols: columns in which NaN rows are dropped after the transform.
        W: window length (days) of the baseline rolling mean used as the
            normalisation denominator.

    Returns:
        A new feature frame; the input frame is not modified.
    """
    roll_cols = ["orders_num", "discount"]
    # Guarantee that the denominator window exists even for a non-default W.
    windows = sorted({1, 7, 14, 21, 28, W})

    # For W=28 this reproduces the original hand-written list, in the same
    # order. The baseline column itself is never a numerator, otherwise later
    # pairs would be divided by an already normalised denominator.
    norms: List[Tuple[str, str]] = []
    for col in roll_cols:
        baseline = f"rolling_{col}_w_{W}"
        norms.append((col, baseline))
        norms.extend(
            (f"rolling_{col}_w_{window}", baseline)
            for window in windows
            if window != W
        )

    # assign() returns a new frame, so the caller's frame gets no extra column.
    df = df.assign(day_of_week=pd.DatetimeIndex(df["ds"]).day_of_week)
    return postprocess_transform(
        df=df,
        norms=norms,
        roll_cols=roll_cols,
        dropna_cols=dropna_cols,
        windows=windows,
        lvl="sku_id",
        shift=14,
    )

In [4]:
# Функция для разделения датасета для кросс-валидации

from sklearn.model_selection import TimeSeriesSplit
from typing import Tuple


def get_dfs(
    df: pd.DataFrame,
    n_splits: int,
    test_size: int,
) -> Iterator[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
    """Yield ``(train, validation, test)`` frames for every time-series split.

    The unique ``ds`` values are split with ``TimeSeriesSplit``. For each fold
    the last ``test_size`` dates of the training part become the validation
    set and the remaining earlier dates stay in the training set.

    Args:
        df: frame with a ``ds`` column.
        n_splits: number of folds passed to ``TimeSeriesSplit``.
        test_size: number of dates in the test set and in the validation set.

    Yields:
        One ``(train_df, val_df, test_df)`` tuple per fold, each a row subset
        of ``df`` selected by date.
    """
    # Unique dates of the dataset, in chronological (sorted) order
    dates = pd.Series(sorted(df["ds"].unique()))
    tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)

    for split_id, (train_idxs, test_idxs) in enumerate(tscv.split(dates)):
        fit_dates = dates.iloc[train_idxs]
        test_dates = dates.iloc[test_idxs]
        # Explicit positional slicing: the tail of the fit period is held out
        # for validation (early stopping), the head is used for training.
        val_dates = fit_dates.iloc[-test_size:]
        train_dates = fit_dates.iloc[:-test_size]

        logger.info(f"===== SPLIT {split_id} =====")
        logger.info(
            f"Train: {train_dates.min()} - {train_dates.max()}, size: {train_dates.shape[0]}"
        )
        logger.info(
            f"Validation: {val_dates.min()} - {val_dates.max()}, size: {val_dates.shape[0]}"
        )
        logger.info(
            f"Test: {test_dates.min()} - {test_dates.max()}, size: {test_dates.shape[0]}"
        )

        train_df = df[df["ds"].isin(train_dates)]
        val_df = df[df["ds"].isin(val_dates)]
        test_df = df[df["ds"].isin(test_dates)]
        yield train_df, val_df, test_df

In [5]:
from catboost import CatBoost, Pool
from typing import List, Dict, Any


# Функция для преобразования данных на вход Catboost
def prepare_pool(
    df: pd.DataFrame,
    flt_features: List[str],
    cat_features: List[str],
    target: Optional[str] = None,
) -> Pool:
    """Wrap the selected feature columns of ``df`` into a CatBoost ``Pool``.

    Float features come first, categorical features after them. When
    ``target`` is given, that column is attached as the label; otherwise the
    pool is unlabeled.
    """
    feature_frame = df[flt_features + cat_features]
    labels = None if target is None else df[target]
    return Pool(data=feature_frame, label=labels, cat_features=cat_features)


# Функция для тренировки модели спроса
def train_model(
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    flt_features: List[str],
    cat_features: List[str],
    target: str,
    train_params: Dict[str, Any],
) -> CatBoost:
    """Fit a CatBoost demand model on ``train_df`` with ``val_df`` as eval set.

    Args:
        train_df: training rows.
        val_df: validation rows, passed to ``fit`` as ``eval_set``.
        flt_features: names of the float feature columns.
        cat_features: names of the categorical feature columns.
        target: name of the label column.
        train_params: parameters forwarded to the ``CatBoost`` constructor.

    Returns:
        The fitted ``CatBoost`` model.
    """
    # The original also built an unused copy of the feature frame here; it is
    # dropped because the pools below already select the same columns.
    train_pool = prepare_pool(
        df=train_df,
        flt_features=flt_features,
        cat_features=cat_features,
        target=target,
    )
    val_pool = prepare_pool(
        df=val_df,
        flt_features=flt_features,
        cat_features=cat_features,
        target=target,
    )
    return CatBoost(params=train_params).fit(X=train_pool, eval_set=val_pool)


# Функция для предсказания спроса с обученной моделью
def predict(
    df: pd.DataFrame,
    model: CatBoost,
    flt_features: List[str],
    cat_features: List[str],
    target: str,
) -> List[float]:
    """Score ``df`` with a fitted model and return its predictions.

    An unlabeled pool is built from the float and categorical feature columns
    and handed to ``model.predict``; whatever that call returns is passed
    through unchanged. ``target`` is accepted for signature symmetry with
    ``train_model`` and is not used.
    """
    unlabeled_pool = prepare_pool(
        df=df,
        flt_features=flt_features,
        cat_features=cat_features,
    )
    return model.predict(data=unlabeled_pool)

In [6]:
# Функция для обработки выбросов
def drop_outliers(
    df: pd.DataFrame,
    lvl: str,
    low_quantile: float,
    high_quantile: float,
) -> pd.DataFrame:
    """Remove ``orders_num`` outliers per ``lvl`` group with quantile-range fences.

    For each group the ``low_quantile`` and ``high_quantile`` of ``orders_num``
    are computed; a row is kept when its value lies within
    ``[q_low - 1.5 * range, q_high + 1.5 * range]`` where
    ``range = q_high - q_low``.

    The fences are inclusive. With strict comparisons a group whose quantile
    range is zero (e.g. constant demand) would lose every row, including the
    perfectly typical ones.

    Args:
        df: frame with ``lvl`` and ``orders_num`` columns.
        lvl: grouping column.
        low_quantile: lower quantile, e.g. 0.25.
        high_quantile: upper quantile, e.g. 0.75.

    Returns:
        The filtered frame with two helper columns, ``q25`` and ``q75``,
        holding the group's lower and upper quantile (the names are fixed
        regardless of the quantile levels requested).
    """
    per_group = df.groupby(lvl)["orders_num"]
    bounds = pd.DataFrame(
        {
            "q25": per_group.quantile(low_quantile),
            "q75": per_group.quantile(high_quantile),
        }
    ).reset_index()
    df = df.merge(bounds, how="left", on=lvl)

    spread = df["q75"] - df["q25"]
    within_fences = (df["orders_num"] <= df["q75"] + 1.5 * spread) & (
        df["orders_num"] >= df["q25"] - 1.5 * spread
    )
    return df[within_fences]

In [7]:
from typing import List, Dict, Any, Callable, Optional


# Пайплайн для валидации модели
def run_pipeline(
    df: pd.DataFrame,
    flt_features: List[str],
    cat_features: List[str],
    train_params: Dict[str, Any],
    target: str,
    n_splits: int,
    test_size: int,
    preprocess_train: Optional[Callable] = None,
    preprocess_train_params: Optional[Dict[str, Any]] = None,
) -> Tuple[float, "CatBoost", pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Cross-validate the demand model over time-series splits.

    For every split produced by ``get_dfs`` the train part is optionally
    preprocessed, a model is trained with the validation part as eval set, the
    test part is scored and its MAPE is recorded.

    Args:
        df: full feature frame.
        flt_features: float feature column names.
        cat_features: categorical feature column names.
        train_params: CatBoost parameters.
        target: label column name.
        n_splits: number of time-series folds.
        test_size: number of dates in each test / validation set.
        preprocess_train: optional callable applied to the train frame as
            ``preprocess_train(df=train_df, **preprocess_train_params)``.
        preprocess_train_params: keyword arguments for ``preprocess_train``;
            preprocessing is applied only when both are provided.

    Returns:
        ``(mean_mape, model, train_df, val_df, test_df)`` where the model and
        the three frames are those of the LAST split; ``test_df`` carries the
        extra ``pred_orders_num`` column.

    Raises:
        ValueError: if ``get_dfs`` yields no split.
    """
    mapes = []
    for train_df, val_df, test_df in get_dfs(
        df=df,
        n_splits=n_splits,
        test_size=test_size,
    ):
        if preprocess_train is not None and preprocess_train_params is not None:
            train_df = preprocess_train(df=train_df, **preprocess_train_params)
        # Train the model
        model = train_model(
            train_df=train_df,
            val_df=val_df,
            flt_features=flt_features,
            cat_features=cat_features,
            target=target,
            train_params=train_params,
        )
        # Inspect feature importances
        logger.info(model.get_feature_importance(prettified=True).head(10))
        # test_df is a filtered selection of `df`; copy it before adding a
        # column so the assignment never targets a view (chained assignment).
        test_df = test_df.copy()
        test_df["pred_orders_num"] = predict(
            df=test_df,
            model=model,
            flt_features=flt_features,
            cat_features=cat_features,
            target=target,
        )
        # Quality is measured on raw predictions; ideally it would include the
        # post-processing (e.g. clipping), which is neglected here.
        mape = calculate_mape(
            df=test_df,
            true="orders_num",
            pred="pred_orders_num",
        )
        mapes.append(mape)
        logger.info(f"MAPE: {mape}")

    if not mapes:
        raise ValueError("get_dfs produced no splits; nothing to evaluate")

    mean_mape = np.mean(mapes)
    logger.info(f"Mean MAPE: {mean_mape}")
    return mean_mape, model, train_df, val_df, test_df

## Данные

In [8]:
# Load the sales history; `ds` is cast to str so that it has the same type as
# promo_df["ds"], which it is merged with later on.
sales_df = pd.read_parquet("../hm/sales.parquet")
sales_df["ds"] = sales_df["ds"].astype(str)
sales_df.head()

Unnamed: 0,sku_id,orders_num,discount,ds
282,283,169.0,0.0,20230601
392,393,159.0,0.01,20230601
234,235,1061.0,0.03,20230601
214,215,630.0,0.0,20230601
42,43,198.0,0.01,20230601


In [9]:
# Load the SKU -> category mapping (columns sku_id, group_1, group_2).
categories_df = pd.read_parquet("../hm/categories.parquet")
categories_df.head()

Unnamed: 0,sku_id,group_1,group_2
33881,199,100004,100004
63640,293,100010,100018
20250,41,100010,100018
45018,325,100003,100032
33182,248,100001,100001


In [10]:
# Load the promo calendar (one promo label per date); `ds` is cast to str to
# match sales_df["ds"] for the merge on `ds`.
promo_df = pd.read_parquet("../hm/promo.parquet")
promo_df["ds"] = promo_df["ds"].astype(str)
promo_df.head()

Unnamed: 0,promo,ds
0,jun,20230612
1,no,20240108
2,jun,20230610
3,jun,20230611
4,jun,20230614


## Сборка данных

In [11]:
# Join all the data into a single dataset.
# Both merges use the pandas default (inner join), so sales rows without a
# matching category or promo date are dropped.
orig_df = sales_df.copy(deep=True)
orig_df = orig_df.merge(categories_df, on=["sku_id"])
orig_df = orig_df.merge(promo_df, on=["ds"])
orig_df.head()

Unnamed: 0,sku_id,orders_num,discount,ds,group_1,group_2,promo
0,283,169.0,0.0,20230601,100000,100006,no
1,393,159.0,0.01,20230601,100009,100021,no
2,235,1061.0,0.03,20230601,100003,100032,no
3,215,630.0,0.0,20230601,100011,100020,no
4,43,198.0,0.01,20230601,100006,100023,no


## Преобразование данных

In [12]:
# W is the length (in days) of the baseline rolling window: create_features
# divides orders_num, discount and their shorter rolling means by the W-day
# rolling mean of the same column.
W = 28
df = create_features(df=orig_df, dropna_cols=["orders_num", "discount"], W=W)
df.head()

INFO:model:Rolling window=1 days for col `orders_num`
INFO:model:Rolling window=1 days for col `discount`
INFO:model:Rolling window=7 days for col `orders_num`
INFO:model:Rolling window=7 days for col `discount`
INFO:model:Rolling window=14 days for col `orders_num`
INFO:model:Rolling window=14 days for col `discount`
INFO:model:Rolling window=21 days for col `orders_num`
INFO:model:Rolling window=21 days for col `discount`
INFO:model:Rolling window=28 days for col `orders_num`
INFO:model:Rolling window=28 days for col `discount`
INFO:model:Normalizing `orders_num` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_1` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_7` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_14` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_21` / `rolling_orders_num_w_28`
INFO:model:Normalizing `discount` / `rolling_discount_w_28`
INFO:model:Normalizing `rolling_di

Unnamed: 0,sku_id,orders_num,discount,ds,group_1,group_2,promo,day_of_week,rolling_orders_num_w_1,rolling_discount_w_1,rolling_orders_num_w_7,rolling_discount_w_7,rolling_orders_num_w_14,rolling_discount_w_14,rolling_orders_num_w_21,rolling_discount_w_21,rolling_orders_num_w_28,rolling_discount_w_28
43001,2,0.86,0.6,20230629,100001,100001,no,3,0.85,1.2,0.88,1.14,1.0,1.0,1.0,1.0,190.33,0.03
42961,3,0.96,3.6,20230629,100002,100002,no,3,0.98,0.0,1.01,0.72,1.0,1.0,1.0,1.0,163.83,0.01
42990,4,0.47,1.16,20230629,100003,100003,no,3,0.91,1.16,0.97,1.01,1.0,1.0,1.0,1.0,1730.31,0.03
42911,5,0.68,0.0,20230629,100004,100004,no,3,0.59,1.0,0.88,1.0,1.0,1.0,1.0,1.0,595.5,0.01
43071,8,2.73,0.0,20230629,100006,100007,no,3,,,1.03,0.0,1.0,0.0,1.0,0.0,1075.73,0.0


## Имплементация подхода: градиентный бустинг

In [13]:
df

Unnamed: 0,sku_id,orders_num,discount,ds,group_1,group_2,promo,day_of_week,rolling_orders_num_w_1,rolling_discount_w_1,rolling_orders_num_w_7,rolling_discount_w_7,rolling_orders_num_w_14,rolling_discount_w_14,rolling_orders_num_w_21,rolling_discount_w_21,rolling_orders_num_w_28,rolling_discount_w_28
43001,2,0.86,0.60,20230629,100001,100001,no,3,0.85,1.20,0.88,1.14,1.00,1.00,1.00,1.00,190.33,0.03
42961,3,0.96,3.60,20230629,100002,100002,no,3,0.98,0.00,1.01,0.72,1.00,1.00,1.00,1.00,163.83,0.01
42990,4,0.47,1.16,20230629,100003,100003,no,3,0.91,1.16,0.97,1.01,1.00,1.00,1.00,1.00,1730.31,0.03
42911,5,0.68,0.00,20230629,100004,100004,no,3,0.59,1.00,0.88,1.00,1.00,1.00,1.00,1.00,595.50,0.01
43071,8,2.73,0.00,20230629,100006,100007,no,3,,,1.03,0.00,1.00,0.00,1.00,0.00,1075.73,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42029,393,0.90,0.00,20231231,100009,100021,no,6,,,1.01,4.00,1.01,2.00,1.00,1.41,174.17,0.00
42196,397,0.97,52.00,20231231,100004,100022,no,6,0.96,0.00,1.00,2.17,1.01,1.00,1.00,0.65,162.46,0.00
42109,398,0.87,1.71,20231231,100000,100000,no,6,0.77,1.71,0.87,0.57,1.11,0.71,1.01,0.91,227.88,0.01
42168,400,0.79,0.96,20231231,100001,100024,no,6,,,0.85,0.96,0.98,0.96,0.98,0.96,199.86,0.01


In [14]:
df["promo"].unique()

array(['no', 'sep', 'oct', 'nov', 'dec'], dtype=object)

In [15]:
# Declare the features and the training parameters
FLT_FEATURES = [
    "discount",
    "rolling_orders_num_w_1",
    "rolling_orders_num_w_7",
    "rolling_orders_num_w_14",
    "rolling_orders_num_w_21",
    "rolling_discount_w_1",
    "rolling_discount_w_7",
    "rolling_discount_w_14",
    "rolling_discount_w_21",
]
CAT_FEATURES = [
    "sku_id",
    "group_1",
    "group_2",
    "promo",
    "day_of_week",
]
TARGET = "orders_num"
TRAIN_PARAMS = {
    "task_type": "CPU",
    "verbose": 100,
    "random_seed": 0,
    "iterations": 1000,
    "loss_function": "Quantile:alpha=0.5",
    # Monotonicity constraints meant to enforce the law of demand: the higher
    # the price, the lower the demand. The list is aligned with FLT_FEATURES
    # and puts -1 on every feature whose name contains "discount", i.e. also
    # on the lagged rolling_discount_w_* features.
    # NOTE(review): -1 makes the prediction non-increasing in the feature. If
    # a larger `discount` means a lower price, the law of demand would call
    # for +1 here - confirm the sign convention of `discount`.
    "monotone_constraints": [
        -1 if "discount" in feature else 0 for feature in FLT_FEATURES
    ],
    # We want demand to depend more strongly on the price, so the `discount`
    # feature gets a larger weight.
    "feature_weights": "discount:2.0",
    "metric_period": 100,
    "early_stopping_rounds": 100,
    "eval_metric": "MAPE",
}

# Number of dates in each test/validation set and number of CV folds
TEST_SIZE = 14
N_SPLITS = 3

In [16]:
# Cross-validate the model. run_pipeline returns its loop variables after the
# final split, so `model`, `train_df`, `val_df` and `test_df` below belong to
# the LAST fold only; `mape` is the mean over all folds.
# Outliers are removed from the training part of every fold with per-SKU
# quantile-range fences (drop_outliers).
mape, model, train_df, val_df, test_df = run_pipeline(
    df=df,
    flt_features=FLT_FEATURES,
    cat_features=CAT_FEATURES,
    train_params=TRAIN_PARAMS,
    target=TARGET,
    n_splits=N_SPLITS,
    test_size=TEST_SIZE,
    preprocess_train=drop_outliers,
    preprocess_train_params={
        "lvl": "sku_id",
        "low_quantile": 0.25,
        "high_quantile": 0.75,
    },
)

INFO:model:===== SPLIT 0 =====
INFO:model:Train: 20230629 - 20231105, size: 130
INFO:model:Validation: 20231106 - 20231119, size: 14
INFO:model:Test: 20231120 - 20231203, size: 14


0:	learn: 0.1650081	test: 0.2668826	best: 0.2668826 (0)	total: 168ms	remaining: 2m 48s
100:	learn: 0.1314374	test: 0.2651974	best: 0.2628709 (15)	total: 2.45s	remaining: 21.8s


INFO:model:                Feature Id  Importances
0                   sku_id    54.258772
1   rolling_orders_num_w_7    20.547966
2   rolling_orders_num_w_1    10.370890
3  rolling_orders_num_w_14     5.941729
4                 discount     3.957614
5  rolling_orders_num_w_21     1.875666
6                    promo     1.540883
7     rolling_discount_w_1     0.690916
8                  group_2     0.587474
9                  group_1     0.189870
INFO:model:MAPE: 0.35709179378925976
INFO:model:===== SPLIT 1 =====
INFO:model:Train: 20230629 - 20231119, size: 144
INFO:model:Validation: 20231120 - 20231203, size: 14
INFO:model:Test: 20231204 - 20231217, size: 14


Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2628708689
bestIteration = 15

Shrink model to first 16 iterations.
0:	learn: 0.1690138	test: 0.2313026	best: 0.2313026 (0)	total: 25.2ms	remaining: 25.2s




100:	learn: 0.1382775	test: 0.2033877	best: 0.2033877 (100)	total: 2.11s	remaining: 18.8s
200:	learn: 0.1364132	test: 0.2032558	best: 0.2030665 (182)	total: 4.67s	remaining: 18.5s
300:	learn: 0.1354019	test: 0.2026543	best: 0.2025419 (288)	total: 7.27s	remaining: 16.9s
400:	learn: 0.1346050	test: 0.2026367	best: 0.2021838 (364)	total: 9.81s	remaining: 14.7s


INFO:model:                Feature Id  Importances
0                   sku_id    21.833902
1   rolling_orders_num_w_7    20.795882
2   rolling_orders_num_w_1    16.396645
3  rolling_orders_num_w_14     9.180558
4                 discount     7.925160
5  rolling_orders_num_w_21     7.307511
6                  group_2     4.620149
7                  group_1     4.539768
8                    promo     2.769484
9              day_of_week     2.065264
INFO:model:MAPE: 0.37725418488930434
INFO:model:===== SPLIT 2 =====
INFO:model:Train: 20230629 - 20231203, size: 158
INFO:model:Validation: 20231204 - 20231217, size: 14
INFO:model:Test: 20231218 - 20231231, size: 14


Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.202183836
bestIteration = 364

Shrink model to first 365 iterations.
0:	learn: 0.1720524	test: 0.2658701	best: 0.2658701 (0)	total: 30.7ms	remaining: 30.7s




100:	learn: 0.1409018	test: 0.1928054	best: 0.1928054 (100)	total: 2.5s	remaining: 22.3s
200:	learn: 0.1390453	test: 0.1889875	best: 0.1886472 (183)	total: 4.81s	remaining: 19.1s
300:	learn: 0.1381710	test: 0.1881697	best: 0.1873918 (269)	total: 9.16s	remaining: 21.3s
400:	learn: 0.1372612	test: 0.1859882	best: 0.1853550 (371)	total: 14.1s	remaining: 21s


INFO:model:                Feature Id  Importances
0                   sku_id    25.974188
1   rolling_orders_num_w_7    18.659276
2   rolling_orders_num_w_1    14.945999
3  rolling_orders_num_w_14    10.149261
4  rolling_orders_num_w_21     9.043544
5                 discount     6.875881
6                  group_2     4.193079
7                  group_1     3.628186
8                    promo     2.572731
9              day_of_week     1.819153
INFO:model:MAPE: 0.31892674064307297
INFO:model:Mean MAPE: 0.3510909064405457


Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.1853550438
bestIteration = 371

Shrink model to first 372 iterations.


In [17]:
# Save the model (the one returned by run_pipeline above)
model.save_model("demand")

## Сборка датасета и предсказание

In [18]:
# Load the rows to predict and remember the prediction date range
pred_df = pd.read_csv("../hm/homework_3_1.csv")
pred_df["ds"] = pred_df["ds"].astype(str)
START_DS, END_DS = pred_df["ds"].min(), pred_df["ds"].max()

# No promo takes place on the prediction dates
pred_df["promo"] = "no"

# Join the categories (only categories_df is merged here)
pred_df = pred_df.merge(categories_df, how="inner", on=["sku_id"])

# Concatenate with the historical data so that window features can be computed
cols = pred_df.columns.tolist()
pred_df = pd.concat([orig_df.copy(deep=True)[cols], pred_df[cols]])

# Compute the features; the raw discount is kept aside because
# create_features overwrites `discount` with its normalised value
pred_df["orig_discount"] = pred_df["discount"]
pred_df = create_features(df=pred_df, dropna_cols=["discount"], W=W)
pred_df = pred_df[pred_df["ds"].between(START_DS, END_DS)]

# Predict demand with the model
pred_df["orders_num"] = predict(
    df=pred_df,
    model=model,
    flt_features=FLT_FEATURES,
    cat_features=CAT_FEATURES,
    target=TARGET,
)
# Restore absolute orders_num: the model was trained on orders_num divided by
# the W-day rolling mean (W = 28), so multiply back by that column
pred_df["orders_num"] = pred_df["orders_num"] * pred_df["rolling_orders_num_w_28"]
# orders_num cannot be negative
pred_df["orders_num"] = np.clip(pred_df["orders_num"], a_min=0.0, a_max=None)

pred_df = pred_df[["sku_id", "orig_discount", "orders_num", "ds"]]
pred_df = pred_df.rename(columns={"orig_discount": "discount"})
pred_df.head()

INFO:model:Rolling window=1 days for col `orders_num`
INFO:model:Rolling window=1 days for col `discount`
INFO:model:Rolling window=7 days for col `orders_num`
INFO:model:Rolling window=7 days for col `discount`
INFO:model:Rolling window=14 days for col `orders_num`
INFO:model:Rolling window=14 days for col `discount`
INFO:model:Rolling window=21 days for col `orders_num`
INFO:model:Rolling window=21 days for col `discount`
INFO:model:Rolling window=28 days for col `orders_num`
INFO:model:Rolling window=28 days for col `discount`
INFO:model:Normalizing `orders_num` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_1` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_7` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_14` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_21` / `rolling_orders_num_w_28`
INFO:model:Normalizing `discount` / `rolling_discount_w_28`
INFO:model:Normalizing `rolling_di

Unnamed: 0,sku_id,discount,orders_num,ds
47831,2,0.02,178.25929,20240101
47272,3,0.01,164.798662,20240101
47690,4,0.03,1754.71531,20240101
46613,5,0.01,498.479606,20240101
47961,12,0.0,186.781659,20240101


In [19]:
pred_df.to_csv("../hm/homework_3_1_solution.csv", index=False)

<br><br>
## TASK 2

In [20]:
# Candidate discount values evaluated for every (sku_id, ds) pair in Task 2
discounts_df = pd.DataFrame({"discount": [0.05, 0.04, 0.03, 0.02, 0.01, 0.00, -0.01, -0.02, -0.03, -0.04, -0.05]})

In [21]:
# Build the scoring grid: every prediction date x every sku_id present in
# sales_df x every candidate discount. orders_num is unknown and left empty.
pred_df_02 = pred_df \
    .filter(["ds"]) \
    .drop_duplicates() \
    .assign(orders_num = None,
            ds = lambda x: x["ds"].astype("str")) \
    .join(sales_df.filter(["sku_id"]).drop_duplicates(),
          how="cross") \
    .join(discounts_df, how="cross")

START_DS, END_DS = pred_df_02["ds"].min(), pred_df_02["ds"].max()

# No promo takes place on the prediction dates
pred_df_02["promo"] = "no"

# Join the categories (only categories_df is merged here)
pred_df_02 = pred_df_02.merge(categories_df, how="inner", on=["sku_id"])

# Concatenate with the historical data so that window features can be computed
cols = pred_df_02.columns.tolist()
pred_df_02 = pd.concat([orig_df.copy(deep=True)[cols], pred_df_02[cols]])

# Compute the features; the raw discount is kept aside because
# create_features overwrites `discount` with its normalised value
pred_df_02["orig_discount"] = pred_df_02["discount"]
pred_df_02 = create_features(df=pred_df_02, dropna_cols=["discount"], W=W)
pred_df_02 = pred_df_02[pred_df_02["ds"].between(START_DS, END_DS)]

# Predict demand with the model
pred_df_02["orders_num"] = predict(
    df=pred_df_02,
    model=model,
    flt_features=FLT_FEATURES,
    cat_features=CAT_FEATURES,
    target=TARGET,
)

# Restore absolute orders_num: the model was trained on orders_num divided by
# the W-day rolling mean (W = 28), so multiply back by that column
pred_df_02["orders_num"] = pred_df_02["orders_num"] * pred_df_02["rolling_orders_num_w_28"]
# orders_num cannot be negative
pred_df_02["orders_num"] = np.clip(pred_df_02["orders_num"], a_min=0.0, a_max=None)

pred_df_02 = pred_df_02[["sku_id", "orig_discount", "ds", "orders_num"]]
pred_df_02 = pred_df_02.rename(columns={"orig_discount": "discount"})
pred_df_02.head()

INFO:model:Rolling window=1 days for col `orders_num`
INFO:model:Rolling window=1 days for col `discount`
INFO:model:Rolling window=7 days for col `orders_num`
INFO:model:Rolling window=7 days for col `discount`
INFO:model:Rolling window=14 days for col `orders_num`
INFO:model:Rolling window=14 days for col `discount`
INFO:model:Rolling window=21 days for col `orders_num`
INFO:model:Rolling window=21 days for col `discount`
INFO:model:Rolling window=28 days for col `orders_num`
INFO:model:Rolling window=28 days for col `discount`
INFO:model:Normalizing `orders_num` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_1` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_7` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_14` / `rolling_orders_num_w_28`
INFO:model:Normalizing `rolling_orders_num_w_21` / `rolling_orders_num_w_28`
INFO:model:Normalizing `discount` / `rolling_discount_w_28`
INFO:model:Normalizing `rolling_di

Unnamed: 0,sku_id,discount,ds,orders_num
64086,2,0.05,20240101,174.902539
64087,2,0.04,20240101,176.634461
64088,2,0.03,20240101,176.312158
64089,2,0.02,20240101,178.25929
64090,2,0.01,20240101,183.81498


In [22]:
# Save the results
pred_df_02.to_csv("../hm/homework_3_2_solution.csv", index=False)

<br><br>
## TASK 3

In [23]:
# First attempt: for every (sku_id, ds) keep the row with the highest
# predicted orders_num.
# NOTE: after the first head(1) only one row per (sku_id, ds) is left, so the
# second sort/groupby/head(1) cannot break ties by discount any more. The
# result of this cell is overwritten by the next cell, which handles ties.
pred_df_03 = pred_df_02 \
    .sort_values("orders_num", ascending=False) \
    .groupby(["sku_id", "ds"], as_index=False) \
    .head(1) \
    .sort_values("discount", ascending=False) \
    .groupby(["sku_id", "ds"], as_index=False) \
    .head(1) \
    .filter(["sku_id", "ds", "discount"])

In [24]:
# For every (ds, sku_id): find the maximum predicted orders_num, join back all
# candidate rows that reach this maximum, and among them keep the one with the
# largest discount. The output has one row per (sku_id, ds) with the chosen
# discount, ordered by discount descending.
pred_df_03 = pred_df_02 \
    .groupby(["ds", "sku_id"], as_index=False) \
    .agg(max_orders_num = ("orders_num", "max")) \
    .merge(pred_df_02,
           how="left",
           left_on=["ds", "sku_id", "max_orders_num"],
           right_on=["ds", "sku_id", "orders_num"]) \
    .sort_values("discount", ascending=False) \
    .groupby(["sku_id", "ds"], as_index=False) \
    .head(1) \
    .filter(["sku_id", "ds", "discount"])

In [25]:
# Save the results
pred_df_03.to_csv("../hm/homework_3_3_solution.csv", index=False)

In [26]:
pred_df_03

Unnamed: 0,sku_id,ds,discount
3395,92,20240104,0.05
1989,377,20240102,0.05
4091,377,20240104,0.05
5784,189,20240106,0.05
13062,100,20240113,0.05
...,...,...,...
1954,362,20240102,-0.02
4189,13,20240105,-0.02
4846,248,20240105,-0.02
2091,13,20240103,-0.02
