## Черновик

In [2]:
df_train = pd.read_csv('./demand-forecasting-kernels-only/train.csv')

In [3]:
df_train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [4]:
df_train.shape

(913000, 4)

In [5]:
df_test = pd.read_csv('./demand-forecasting-kernels-only/test.csv')

In [6]:
df_test.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [7]:
df_test.shape

(45000, 4)

In [8]:
df_sample_submission = pd.read_csv('./demand-forecasting-kernels-only/sample_submission.csv')

In [9]:
df_sample_submission.head()

Unnamed: 0,id,sales
0,0,52
1,1,52
2,2,52
3,3,52
4,4,52


In [10]:
df_sample_submission.shape

(45000, 2)

In [11]:
from contest.file import extract_hybrid_strategy_features,read_timeseries,build_datasets,extract_advanced_features

In [12]:
train_ts, val_ts = read_timeseries('./demand-forecasting-kernels-only/train.csv')

In [13]:
train_ts

date
2013-01-01    13
2013-01-02    11
2013-01-03    14
2013-01-04    13
2013-01-05    10
              ..
2016-12-27    10
2016-12-28    16
2016-12-29    21
2016-12-30    24
2016-12-31    14
Name: sales, Length: 1461, dtype: int64

In [14]:
val_ts

date
2017-01-01    19
2017-01-02    15
2017-01-03    10
2017-01-04    16
2017-01-05    14
              ..
2017-12-27    14
2017-12-28    19
2017-12-29    15
2017-12-30    27
2017-12-31    23
Name: sales, Length: 365, dtype: int64

In [15]:
extract_hybrid_strategy_features(train_ts,5,5)

array([20, 21, 13, 20, 16, 10, 16, 21, 24, 14])

In [16]:
data1 = build_datasets(train_ts[:100],extract_hybrid_strategy_features,5,3)

In [17]:
train_ts[:11]

date
2013-01-01    13
2013-01-02    11
2013-01-03    14
2013-01-04    13
2013-01-05    10
2013-01-06    12
2013-01-07    10
2013-01-08     9
2013-01-09    12
2013-01-10     9
2013-01-11     9
Name: sales, dtype: int64

In [18]:
# data1

In [19]:
# Print the shape of the first item of every entry in `data1`
# (presumably the feature matrix of each per-model dataset returned by
# build_datasets — TODO confirm against its implementation).
for x in data1:
    print(x[0].shape)

(95, 5)
(94, 6)
(93, 7)


In [20]:
arr = extract_hybrid_strategy_features(train_ts,5,5)

In [22]:
# np.append returns a new array with the value 1 added at the end.
# NOTE(review): `arr` is rebound to its own extended copy, so re-running this
# cell appends another element each time — the cell is not idempotent.
arr = np.append(arr,1)
arr

array([20, 21, 13, 20, 16, 10, 16, 21, 24, 14,  1])

## Подготовка данных

In [3]:
import datetime
import sklearn
import typing as tp
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge

# Type aliases used in the annotations of the feature-extraction helpers.
# NewType requires a class as its supertype: `np.array` is a factory function,
# so the target vector alias is declared on `np.ndarray` like the other arrays.
X_type = tp.NewType("X_type", np.ndarray)          # feature matrix
X_row_type = tp.NewType("X_row_type", np.ndarray)  # single feature vector
Y_type = tp.NewType("Y_type", np.ndarray)          # target vector
TS_type = tp.NewType("TS_type", pd.Series)         # time series
Model_type = tp.TypeVar("Model_type")              # generic placeholder for a model object

def _calendar_features(date) -> list:
    """Return [day of week, month, day of month, quarter, weekend flag] for one timestamp."""
    return [
        date.dayofweek,
        date.month,
        date.day,
        date.quarter,
        # pandas counts Monday as 0, so 5 and 6 are Saturday and Sunday.
        1 if date.dayofweek >= 5 else 0,
    ]


def extract_advanced_features(
        timeseries: TS_type,
        model_idx: int,
        window_size: int = 28
) -> X_row_type:
    """
    Build an extended feature vector: base features plus calendar and summary statistics.

    Args:
        timeseries: series of observations up to (not including) time T.
            Calendar features are read from its index, so the index entries
            must expose ``dayofweek``, ``month``, ``day`` and ``quarter``
            (e.g. a ``DatetimeIndex``).
        model_idx: index of the model the vector is built for.
        window_size: number of trailing observations used for the forecast.

    Returns:
        If the series has fewer than ``window_size + model_idx`` points, only
        the output of ``extract_hybrid_strategy_features`` is returned.
        Otherwise a 1-D array made of, in order:
          * the base features;
          * five calendar features for each of the last
            ``window_size + model_idx`` dates and for the following day;
          * mean, std, min, max and median of those last values.

    Raises:
        TypeError: if the series is long enough for extended features but is
            not a ``pd.Series`` (there is no index to read dates from).
    """
    base_features = extract_hybrid_strategy_features(timeseries, model_idx, window_size)

    feature_window = window_size + model_idx

    # Not enough history: fall back to the base features only.
    if len(timeseries) < feature_window:
        return base_features

    # The previous fallback substituted the current wall-clock time as a single
    # Timestamp, which cannot be iterated and would make features depend on
    # when the code runs. Fail with an explicit message instead.
    if not isinstance(timeseries, pd.Series):
        raise TypeError(
            "extract_advanced_features expects a pandas Series with a "
            f"datetime-like index, got {type(timeseries).__name__}"
        )

    # .iloc makes the trailing-window slice explicitly positional.
    recent = timeseries.iloc[-feature_window:]
    dates = recent.index

    # Calendar features for every date in the window ...
    date_features = []
    for date in dates:
        date_features.extend(_calendar_features(date))
    # ... and for the day right after the last observation.
    next_date = dates[-1] + pd.Timedelta(days=1)
    date_features.extend(_calendar_features(next_date))

    # Summary statistics over the same trailing window.
    recent_data = recent.to_numpy()
    stat_features = [
        np.mean(recent_data),
        np.std(recent_data),
        np.min(recent_data),
        np.max(recent_data),
        np.median(recent_data),
    ]

    return np.concatenate([base_features, date_features, stat_features])


In [1]:
import pandas as pd
import numpy as np
import kagglehub
from tqdm import tqdm 
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [2]:
from contest.file import extract_hybrid_strategy_features,read_timeseries,build_datasets,extract_advanced_features, predict,train_models,score_models

In [3]:
train_ts, val_ts = read_timeseries('./demand-forecasting-kernels-only/train.csv')

In [4]:
models = train_models(train_ts,30)

  y.append(timeseries[j])


In [5]:
models

[Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge(),
 Ridge()]

In [6]:
predict(val_ts,models,extract_advanced_features)

array([12.91469161, 16.89536072, 15.00118602, 16.5921341 , 15.41928255,
       19.85281022, 21.70962541, 13.14866428, 15.86720088, 14.02758238,
       15.62125075, 17.14600305, 20.70528175, 21.72591683, 14.2388538 ,
       17.65908784, 15.82408062, 15.19880789, 17.08890806, 21.15330992,
       24.3027826 , 14.55716634, 14.64782594, 15.17241039, 17.31681101,
       14.75402941, 18.27870271, 22.02488729, 14.60533226, 13.18535589])

In [7]:
score_models(train_ts,val_ts,models,predict)

28.583571854332735

In [15]:
train_ts.index[0]

Timestamp('2013-01-01 00:00:00')

In [18]:
def is_retail_important_day(date):
    """Encode retail-relevant calendar events for a date as 0/1 flags.

    Args:
        date: object exposing ``month`` and ``day`` attributes
            (for example ``pd.Timestamp`` or ``datetime.date``).

    Returns:
        List of seven ints (each 0 or 1) in this fixed order:
        new_year, christmas, valentines, halloween,
        back_to_school, summer_start, winter_sale.
        Thanksgiving, Black Friday and Cyber Monday are not encoded.
    """
    month, day = date.month, date.day

    flags = (
        # Fixed-date holidays
        month == 1 and day == 1,      # new_year
        month == 12 and day == 25,    # christmas
        month == 2 and day == 14,     # valentines
        month == 10 and day == 31,    # halloween
        # Seasonal periods
        (month == 8 and day >= 15) or (month == 9 and day <= 15),  # back_to_school
        month == 6 and day <= 10,     # summer_start
        month == 1 and 5 <= day <= 15,  # winter_sale
    )

    return [1 if flag else 0 for flag in flags]

# Sanity check on the first date of the training series: it is 2013-01-01
# (see the `train_ts.index[0]` output), so only the first flag (new_year)
# is expected to be set.
retail_features = is_retail_important_day(train_ts.index[0])
retail_features

[1, 0, 0, 0, 0, 0, 0]