<a href="https://colab.research.google.com/github/ivander16052002-droid/randomname/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from dateutil.relativedelta import relativedelta
import logging
import os

# Configure the root logger once at import time: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')

# Name of the column used as the prediction target throughout the pipeline.
TARGET = 'Продажи'
# Default forecast horizon, in months, for create_full_grid() and forecast().
FORECAST_MONTHS = 18


# ==== 1. Загрузка данных ====
def load_data(sales_path, stock_path, ms_path, dist_index_path=None, weather_path=None,
              discounts_path=None, longterm_avg_path=None):
    """Read the pipeline's input CSV files (UTF-8).

    The sales, stock and ``ms`` files are required: if any of them is
    missing, the error is logged and a tuple of seven ``None`` values is
    returned. The remaining four files are optional; each one is read only
    when a path is given and the file exists, otherwise ``None`` takes its
    place.

    Returns:
        Tuple ``(sales, stock, ms, dist, weather, discounts, longterm)``.
    """
    def _read_optional(path):
        # Optional sources are skipped silently when absent.
        if not path or not os.path.exists(path):
            return None
        return pd.read_csv(path, encoding='utf-8')

    logging.info("Шаг 1: Загружаем данные...")
    try:
        sales, stock, ms = (
            pd.read_csv(required, encoding='utf-8')
            for required in (sales_path, stock_path, ms_path)
        )
    except FileNotFoundError as e:
        logging.error(f"Не найден файл: {e}")
        return (None,) * 7

    dist = _read_optional(dist_index_path)
    weather = _read_optional(weather_path)
    discounts = _read_optional(discounts_path)
    longterm = _read_optional(longterm_avg_path)

    logging.info("Данные успешно загружены.")
    return sales, stock, ms, dist, weather, discounts, longterm


# ==== 2. Предобработка ====
def preprocess_dates(df, year_col='Год', month_col='Месяц'):
    """Cast the year/month columns to int and add a first-of-month 'Дата' column.

    The frame is modified in place and the same object is returned.
    """
    years = df[year_col].astype(int)
    months = df[month_col].astype(int)
    df[year_col] = years
    df[month_col] = months
    # Every row is anchored to day 1 of its (year, month).
    date_parts = pd.DataFrame({'year': years, 'month': months, 'day': 1})
    df['Дата'] = pd.to_datetime(date_parts)
    return df


def parse_city_index(df, col='Город/Индекс'):
    """Split a combined city/postal-index column into 'Город' and 'Индекс'.

    A value made up only of digits (after stripping whitespace) goes to
    'Индекс' as a string; any other non-missing value goes to 'Город'.
    Missing values leave both new columns empty.

    Args:
        df: Frame containing ``col``. It is not modified.
        col: Name of the combined column; it is dropped from the result.

    Returns:
        A new frame without ``col`` and with 'Город' and 'Индекс' appended.
        Both columns are always present, even when ``df`` has no rows.
    """
    def _split(value):
        if pd.isna(value):
            return (None, None)
        # A column that pandas read as float64 yields 101000.0; drop the
        # fractional part so the code is still recognised as an index.
        if isinstance(value, (float, np.floating)) and float(value).is_integer():
            value = int(value)
        text = str(value).strip()
        return (None, text) if text.isdigit() else (text, None)

    pairs = [_split(value) for value in df[col]]
    # Build the two columns explicitly: a row-wise apply returning pd.Series
    # is slow and yields no 'Город'/'Индекс' columns at all for an empty frame.
    parsed = pd.DataFrame(
        {
            'Город': [city for city, _ in pairs],
            'Индекс': [index for _, index in pairs],
        },
        index=df.index,
        dtype=object,
    )
    return pd.concat([df.drop(columns=[col]), parsed], axis=1)


# ==== 3. Объединение ====
def merge_all(sales, stock, ms, dist=None, weather=None, discounts=None):
    """Left-join all source tables onto the sales table.

    Args:
        sales: Base table; every row of it is kept.
        stock: Joined on 'Сеть', 'ID_Контрагента', 'Год', 'Месяц'.
        ms: Joined on 'Сеть', 'Год', 'Месяц'.
        dist: Optional; joined on 'ID_Контрагента', 'Год', 'Месяц'.
        weather: Optional; joined on 'Год', 'Месяц'.
        discounts: Optional; joined on 'Сеть', 'ID_Контрагента', 'Год', 'Месяц'.

    Returns:
        The merged frame with a single 'Дата' column built from 'Год'/'Месяц'.
    """
    logging.info("Шаг 2: Объединяем таблицы...")

    def _without_date(table):
        # 'Дата' is not a join key, so when the inputs already carry it the
        # merges would emit 'Дата_x'/'Дата_y' columns (and, on a later merge,
        # colliding suffixes that pandas 2.x rejects). Drop it here; it is
        # rebuilt once after all merges.
        return table.drop(columns=['Дата'], errors='ignore')

    df = _without_date(sales).merge(
        _without_date(stock), on=['Сеть', 'ID_Контрагента', 'Год', 'Месяц'], how='left'
    )
    df = df.merge(_without_date(ms), on=['Сеть', 'Год', 'Месяц'], how='left')

    optional_tables = (
        (dist, ['ID_Контрагента', 'Год', 'Месяц']),
        (weather, ['Год', 'Месяц']),
        (discounts, ['Сеть', 'ID_Контрагента', 'Год', 'Месяц']),
    )
    for table, keys in optional_tables:
        if table is not None:
            df = df.merge(_without_date(table), on=keys, how='left')

    df = preprocess_dates(df)
    logging.info(f"Объединённый датафрейм: {df.shape}")
    return df


# ==== 4. Сетка ====
def create_full_grid(df, forecast_months=FORECAST_MONTHS):
    """Expand ``df`` to one row per (pharmacy combination, month).

    The months run from the earliest 'Дата' in ``df`` to ``forecast_months``
    months after the latest one. Combinations are the distinct
    ('Сеть', 'ID_Контрагента', 'Город', 'Индекс') tuples. Existing data is
    left-joined onto the grid, so cells without data hold NaN. 'Год' and
    'Месяц' are recomputed from 'Дата' for every row.
    """
    logging.info("Шаг 3: Создаём полную временную сетку...")
    key_cols = ['Сеть','ID_Контрагента','Город','Индекс']

    first_month = df['Дата'].min()
    horizon_end = (df['Дата'].max() + relativedelta(months=forecast_months)).replace(day=1)
    month_starts = pd.date_range(start=first_month, end=horizon_end, freq='MS')

    combos = df[key_cols].drop_duplicates().reset_index(drop=True)

    # Cartesian product of combination row numbers and months.
    product_index = pd.MultiIndex.from_product(
        [combos.index, month_starts], names=['comb_idx','Дата']
    )
    grid = product_index.to_frame(index=False)

    with_keys = grid.merge(combos, left_on='comb_idx', right_index=True)
    with_keys = with_keys.drop(columns=['comb_idx'])
    full = with_keys.merge(df, on=key_cols + ['Дата'], how='left')

    # Rows that exist only in the grid have no year/month from the join.
    full['Год'] = full['Дата'].dt.year
    full['Месяц'] = full['Дата'].dt.month
    logging.info(f"Сетка готова: {full.shape}")
    return full


# ==== 5. Чистка и фичи ====
def clean_and_features(df):
    """Fill gaps in ``df`` in place and add the 'is_active' flag.

    Missing target values and missing values in the known numeric columns
    (those that are present) become 0; missing 'Сеть', 'Город' and 'Индекс'
    get placeholder labels. 'is_active' is 1 where the target or
    'Остатки в рынке' is positive, otherwise 0. Returns the same frame.
    """
    logging.info("Шаг 4: Очистка и добавление фич...")

    optional_numeric = ['Остатки в рынке','УСТМ','НТЗ','ТМА','discount_pct']
    zero_filled = [TARGET] + [name for name in optional_numeric if name in df.columns]
    for name in zero_filled:
        df[name] = df[name].fillna(0)

    placeholders = (('Сеть', 'Без сети'), ('Город', 'UNKNOWN'), ('Индекс', 'UNKNOWN'))
    for name, label in placeholders:
        df[name] = df[name].fillna(label)

    has_sales = df[TARGET] > 0
    has_stock = df['Остатки в рынке'] > 0
    df['is_active'] = (has_sales | has_stock).astype(int)
    return df


# ==== 6. Лаги ====
def add_lags(df, lags=(1, 2, 3), rollings=(3, 6, 12)):
    """Add per-pharmacy lag and rolling-mean features of the target.

    Rows are sorted by 'ID_Контрагента' and 'Дата'. For every ``lag`` a
    column ``f'{TARGET}_lag_{lag}'`` holds the target shifted by that many
    rows within the same 'ID_Контрагента'. For every window ``w`` a column
    ``f'{TARGET}_rollmean_{w}'`` holds the mean of the previous ``w`` target
    values of the same 'ID_Контрагента' (the current row is excluded).

    Args:
        df: Input frame; a sorted copy is returned, the input is not changed.
        lags: Row offsets for the lag features.
        rollings: Window sizes for the rolling means.

    Returns:
        The sorted frame with the new columns and all NaN replaced by 0.
    """
    logging.info("Шаг 5: Добавляем лаги и скользящие средние...")
    df = df.sort_values(['ID_Контрагента','Дата'])
    ids = df['ID_Контрагента']
    by_id = df[TARGET].groupby(ids)

    for lag in lags:
        df[f'{TARGET}_lag_{lag}'] = by_id.shift(lag)

    # Shift first so the window never contains the row being predicted.
    previous = by_id.shift(1)
    for w in rollings:
        # The rolling window must be grouped as well: rolling over the plain
        # shifted series would mix values of neighbouring pharmacies.
        # groupby().rolling() returns a (group, original label) MultiIndex;
        # dropping level 0 restores the original labels, so the assignment
        # aligns each value with its own row (assumes unique index labels).
        rolled = previous.groupby(ids).rolling(w).mean()
        df[f'{TARGET}_rollmean_{w}'] = rolled.reset_index(level=0, drop=True)
    return df.fillna(0)


# ==== 7. Обучение модели ====
def train_model(df, features):
    """Cross-validate by time and fit the final XGBoost regressor.

    The distinct values of 'Дата' are split with ``TimeSeriesSplit`` (up to
    3 folds), so every validation fold lies strictly after its training
    months. The RMSE of each fold and their mean are logged. The returned
    model is then fitted on all rows of ``df``.

    Args:
        df: Training rows containing 'Дата', ``TARGET`` and ``features``.
        features: Names of the feature columns.

    Returns:
        The ``xgb.XGBRegressor`` fitted on the whole of ``df``.
    """
    logging.info("Шаг 6: Обучаем модель...")
    df = df.sort_values('Дата')
    unique_dates = df['Дата'].unique()
    # One place for the hyper-parameters shared by the CV and final models.
    params = dict(n_estimators=300, max_depth=6, learning_rate=0.05, n_jobs=4)

    # TimeSeriesSplit needs at least n_splits + 1 samples and n_splits >= 2.
    n_splits = min(3, len(unique_dates) - 1)
    val_scores = []
    if n_splits >= 2:
        tss = TimeSeriesSplit(n_splits=n_splits)
        for fold, (train_idx, val_idx) in enumerate(tss.split(unique_dates)):
            train_mask = df['Дата'].isin(unique_dates[train_idx])
            val_mask = df['Дата'].isin(unique_dates[val_idx])

            X_train, y_train = df.loc[train_mask, features], df.loc[train_mask, TARGET]
            X_val, y_val = df.loc[val_mask, features], df.loc[val_mask, TARGET]

            model = xgb.XGBRegressor(**params)
            model.fit(X_train, y_train)
            preds = model.predict(X_val)
            # Take the root explicitly: the `squared=False` argument of
            # mean_squared_error no longer exists in current scikit-learn.
            rmse = float(np.sqrt(mean_squared_error(y_val, preds)))
            val_scores.append(rmse)
            logging.info(f"Fold {fold} RMSE = {rmse:.4f}")
    else:
        logging.warning("Слишком мало месяцев для кросс-валидации, CV пропущена.")

    final_model = xgb.XGBRegressor(**params)
    final_model.fit(df[features], df[TARGET])
    if val_scores:
        logging.info(f"Средний RMSE CV: {np.mean(val_scores):.4f}")
    return final_model


# ==== 8. Прогноз ====
def forecast(df, model, features, start_date, months=FORECAST_MONTHS):
    """Forecast the target month by month, feeding predictions forward.

    For each month the rows with that 'Дата' are predicted and the result is
    written into ``df[TARGET]`` (``df`` is modified in place). Before each
    prediction, the features named ``f'{TARGET}_lag_<k>'`` and
    ``f'{TARGET}_rollmean_<w>'`` are recomputed for that month from the
    current target column, so later months see the predictions made for
    earlier ones instead of the placeholder values of the future rows.

    Args:
        df: Full grid with history and future rows. Index labels are
            assumed to be unique.
        model: Fitted estimator exposing ``predict``.
        features: Feature column names, in the order used for training.
        start_date: First month to forecast.
        months: Number of consecutive months to forecast.

    Returns:
        Frame with 'Сеть', 'ID_Контрагента', 'Дата' and the predicted target
        for all forecast months; empty when nothing was predicted.
    """
    logging.info("Шаг 7: Прогнозируем...")
    out_cols = ['Сеть','ID_Контрагента','Дата', TARGET]

    def _suffix_by_feature(prefix):
        # Map '<prefix><int>' feature names to their integer suffix.
        return {
            name: int(name[len(prefix):])
            for name in features
            if name.startswith(prefix) and name[len(prefix):].isdigit()
        }

    lag_cols = _suffix_by_feature(f'{TARGET}_lag_')
    roll_cols = _suffix_by_feature(f'{TARGET}_rollmean_')

    forecasts = []
    for step in range(months):
        cur_month = (start_date + relativedelta(months=step)).replace(day=1)
        logging.info(f"Прогноз для месяца: {cur_month.strftime('%Y-%m')}")
        idx = df['Дата'] == cur_month
        if not idx.any():
            logging.warning(f"Нет строк для месяца {cur_month.strftime('%Y-%m')}, пропускаем.")
            continue

        if lag_cols or roll_cols:
            # Rebuild the history-based features from the target as it is
            # now, i.e. including the predictions of the previous steps.
            ordered = df[['ID_Контрагента', 'Дата', TARGET]].sort_values(['ID_Контрагента', 'Дата'])
            ids = ordered['ID_Контрагента']
            by_id = ordered[TARGET].groupby(ids)
            for col, lag in lag_cols.items():
                # Assigning a Series aligns on index labels, so each row of
                # the current month receives its own value.
                df.loc[idx, col] = by_id.shift(lag).fillna(0)
            if roll_cols:
                previous = by_id.shift(1)
                for col, window in roll_cols.items():
                    rolled = previous.groupby(ids).rolling(window).mean()
                    df.loc[idx, col] = rolled.reset_index(level=0, drop=True).fillna(0)

        X = df.loc[idx, features].fillna(0)
        preds = model.predict(X)
        df.loc[idx, TARGET] = preds
        forecasts.append(df.loc[idx, out_cols])

    if not forecasts:
        # pd.concat raises on an empty list; return an empty frame instead.
        return df.iloc[0:0][out_cols]
    return pd.concat(forecasts)


# ==== 9. Основной пайплайн ====
def pipeline(sales_path, stock_path, ms_path,
             dist_path=None, weather_path=None, discounts_path=None, longterm_path=None):
    """Run the whole flow: load, merge, build features, train, forecast, save.

    Writes 'forecast_by_pharmacy.csv', 'forecast_agg_product.csv' and
    'xgb_final_model.joblib' to the current working directory.

    Args:
        sales_path, stock_path, ms_path: Required input CSV files.
        dist_path, weather_path, discounts_path, longterm_path: Optional
            input CSV files, passed through to ``load_data``.

    Returns:
        ``(model, forecast_df, agg_forecast)``, or ``None`` when a required
        input file could not be loaded.
    """
    # NOTE: `longterm` is loaded but not used anywhere below.
    sales, stock, ms, dist, weather, discounts, longterm = load_data(
        sales_path, stock_path, ms_path, dist_path, weather_path, discounts_path, longterm_path
    )

    if sales is None:
        return

    sales = preprocess_dates(parse_city_index(sales))
    stock = preprocess_dates(stock)
    ms = preprocess_dates(ms)
    if dist is not None:
        dist = preprocess_dates(dist)
    if weather is not None:
        weather = preprocess_dates(weather)
    if discounts is not None:
        discounts = preprocess_dates(discounts)

    df = merge_all(sales, stock, ms, dist, weather, discounts)
    df = create_full_grid(df)
    df = clean_and_features(df)
    df = add_lags(df)

    # 'is_active' is derived from the target of the same row: as a feature it
    # would leak the answer during training and is always 0 for future rows.
    non_features = {'Сеть', 'Город', 'Индекс', 'Дата', TARGET, 'ID_Контрагента', 'is_active'}
    # Keep numeric columns only; text or datetime columns coming from the
    # source tables cannot be consumed by the regressor.
    features = [
        c for c in df.columns
        if c not in non_features and pd.api.types.is_numeric_dtype(df[c])
    ]
    last_hist_date = sales['Дата'].max()

    train_df = df[df['Дата'] <= last_hist_date]
    model = train_model(train_df, features)

    forecast_df = forecast(df, model, features, start_date=last_hist_date + relativedelta(months=1))
    agg_forecast = forecast_df.groupby('Дата')[TARGET].sum().reset_index()

    forecast_df.to_csv('forecast_by_pharmacy.csv', index=False)
    agg_forecast.to_csv('forecast_agg_product.csv', index=False)
    joblib.dump(model, 'xgb_final_model.joblib')

    logging.info("Готово! Прогноз и модель сохранены.")
    return model, forecast_df, agg_forecast


if __name__ == '__main__':
    # Input files for a script run, given as relative paths.
    input_paths = {
        'sales_path': 'Продажи.csv',
        'stock_path': 'Остатки.csv',
        'ms_path': 'МС.csv',
        'dist_path': 'index_by_pharmacy.csv',
        'weather_path': 'weather.csv',
        'discounts_path': 'discounts.csv',
        'longterm_path': '10y_monthly_avg.csv',
    }
    pipeline(**input_paths)
