## Импорты библиотек

In [1]:
import os
import json
import pandas as pd
import numpy as np
from datetime import datetime

import optuna
import optuna.logging
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error

from utils.utils import connection, data_from_ticker, data_from_tpulse, data_from_macrofactors
import utils.config_ml as config_ml

## Импорты глобальных переменных

In [2]:
ticker = config_ml.TICKER
left_date = config_ml.LEFT_DATE
right_date = config_ml.RIGHT_DATE
train_period = config_ml.TRAIN_PERIOD
val_period = config_ml.VAL_PERIOD
test_period = config_ml.TEST_PERIOD
step = config_ml.STEP
models = config_ml.MODELS
n_trials = config_ml.N_TRIALS
metric_optuna = config_ml.METRIC_OPTUNA
top_n_features = config_ml.TOP_N_FEATURES

## Сбор фичей по тикеру по дням

In [4]:
def pack_all_data_for_ml_models(ticker: str, left_date: str, right_date: str, conn):


    tpulse_data = data_from_tpulse(ticker, left_date, right_date, conn)
    ticker_data = data_from_ticker(ticker, left_date, right_date, conn)
    macrofactor_data = data_from_macrofactors(ticker, left_date, right_date, conn)

    data = tpulse_data.merge(macrofactor_data, how='left', on=['dt', 'ticker']).merge(ticker_data, how='left', on=['dt', 'ticker'])
    data = data[~data['target'].isnull()]

    return data

data = pack_all_data_for_ml_models(ticker, left_date, right_date, connection())
data.tail(3)

Unnamed: 0,dt,ticker,commentscount_sum,commentscount_mean,commentscount_max,reactioncount_sum,reactioncount_mean,reactioncount_max,num_words_sum,num_words_mean,...,high_lag_22_low_lag_22_diff_pct,high_low_diff,high_low_diff_pct,open_lag_22_close_lag_22_diff,open_lag_22_close_lag_22_diff_pct,macd_lag_22,rsi_14_lag_22,boll_ub_lag_22,boll_lb_lag_22,target
60,2025-08-30,SBER,482,10.255319,73,784,16.680851,120,7551,160.659574,...,3.175,0.83,0.268,2.82,0.917,-1.836,50.577,321.56,298.593,309.92
61,2025-08-31,SBER,553,10.843137,62,902,17.686275,61,8114,159.098039,...,2.507,1.32,0.426,3.21,1.035,-1.265,55.439,319.949,299.389,310.99
62,2025-09-01,SBER,1078,7.0,119,1688,10.961039,163,16145,104.837662,...,1.721,3.68,1.192,3.29,1.046,-0.446,61.245,316.68,301.75,309.2


## Цикл ML

In [5]:
# Сохраните датафрейм сверху в .csv для удобства

df = data.copy()
df.to_csv(fr'{os.getcwd()}/all_features.csv', index=False)

df = pd.read_csv(fr'{os.getcwd()}/all_features.csv')
df = df[~df['target'].isnull()]
df.head()

Unnamed: 0,dt,ticker,commentscount_sum,commentscount_mean,commentscount_max,reactioncount_sum,reactioncount_mean,reactioncount_max,num_words_sum,num_words_mean,...,high_lag_22_low_lag_22_diff_pct,high_low_diff,high_low_diff_pct,open_lag_22_close_lag_22_diff,open_lag_22_close_lag_22_diff_pct,macd_lag_22,rsi_14_lag_22,boll_ub_lag_22,boll_lb_lag_22,target
0,2025-07-01,SBER,467,5.494118,49,1382,16.258824,117,12947,152.317647,...,,3.3,1.043,,,,,,,318.42
1,2025-07-02,SBER,570,6.86747,47,1318,15.879518,114,10629,128.060241,...,,2.44,0.769,,,,,,,319.43
2,2025-07-03,SBER,547,8.287879,70,863,13.075758,50,8282,125.484848,...,,2.76,0.866,,,,,,,319.17
3,2025-07-04,SBER,528,9.777778,45,1269,23.5,114,8157,151.055556,...,1.777,3.91,1.231,-3.52,-1.119,0.0,,,,319.44
4,2025-07-05,SBER,437,15.607143,186,517,18.464286,81,4818,172.071429,...,1.87,1.1,0.344,-2.53,-0.813,-0.056,0.0,313.446,306.404,319.55


In [6]:
# Кросс-валидация скользящим окном
def sliding_windows_cross_validatin(df, train_days, val_days, test_days, step):
    windows = []
    n = len(df)
    current_test_end = n
    while True:
        test_start = current_test_end - test_days
        if test_start < 0:
            break
        val_start = test_start - val_days
        if val_start < 0:
            break
        train_start = val_start - train_days
        if train_start < 0:
            break
        windows.append({'train': df.iloc[train_start:val_start],
                        'val': df.iloc[val_start:test_start],
                        'test': df.iloc[test_start:current_test_end],
                        'id': current_test_end,
                        'dates': {
                                    'train': (df['dt'].iloc[train_start], df['dt'].iloc[val_start-1]),
                                    'val': (df['dt'].iloc[val_start], df['dt'].iloc[test_start-1]),
                                    'test': (df['dt'].iloc[test_start], df['dt'].iloc[current_test_end-1])
                                }
                        })
        current_test_end -= step
        if current_test_end < train_days + val_days + test_days:
            break
    return windows[::-1]

# Отбор топ фичей
def select_top_features(model, feature_names, top_n):
    if hasattr(model, 'coef_') and len(model.coef_) > 0:
        coefs = np.abs(model.coef_)
        top_indices = np.argsort(coefs)[-top_n:]
    elif hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        top_indices = np.argsort(importances)[-top_n:]
    else:
        return feature_names[:top_n]
    return [feature_names[i] for i in top_indices]

# Подбор гиперпараметров
def make_objective(model_name, X_train, y_train, X_val, y_val):
    def objective(trial):
        model_cfg = models[model_name]
        params = {}
        if 'optuna_objective' in model_cfg and callable(model_cfg['optuna_objective']):
            params = model_cfg['optuna_objective'](trial)
        if model_name == 'CatBoost':
            model = CatBoostRegressor(**params, verbose=False)
        else:
            model = model_cfg['model']
            if params:
                model.set_params(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        if metric_optuna == 'MAPE':
            return mean_absolute_percentage_error(y_val, y_pred)
        elif metric_optuna == 'MAE':
            return mean_absolute_error(y_val, y_pred)
    return objective


In [7]:
# Цикл ML
def train_val_test_ml_models(df, windows, models):
    print(f'Запущен цикл разработки ML моделей для прогнозирования стоимости акций "{ticker}"\n')
    df_ml_db = pd.DataFrame(columns=['dt', 'ticker', 'left_date', 'right_date', 'model_name', 'model_value', 'train_period', 'val_period', 'test_period', 'step', 'mape_mean', 'mae_mean', 'best_features'])
    df_ml = pd.DataFrame(columns=['test_period', 'model_name', 'mape', 'mae'])
    for model in models:
        ml_info = [] # model, features, mape,
        for idx, window in enumerate(windows):
            train, val, test = window['train'], window['val'], window['test']
            feature_cols = [col for col in train.columns if col not in ['dt', 'ticker', 'target']]

            print(f'Окно {idx+1} / модель {model}')
            print(f"Обучение {len(window['train'])} дн. {window['dates']['train'][0]} - {window['dates']['train'][1]} / Валидация {len(window['val'])} дн. {window['dates']['val'][0]} - {window['dates']['val'][1]} / Тестирование {len(window['test'])} дн. {window['dates']['test'][0]} - {window['dates']['test'][1]}")

            if model in ['LinearRegression', 'DecisionTree', 'GradientBoosting']:
                train = train.fillna(0)
                val = val.fillna(0)
                test = test.fillna(0)

            # Деление на train/test/val
            X_train, y_train = train[feature_cols], train['target']
            X_val, y_val = val[feature_cols], val['target']
            X_test, y_test = test[feature_cols], test['target']

            # Для линейных моделей важно стандартизировать данные
            if model == 'LinearRegression':
                best_params = {}
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)
            else:
                # Для нелинейных моделей поиск ГП с помощью Optuna
                objective = make_objective(model, X_train, y_train, X_val, y_val)
                study = optuna.create_study(direction='minimize')
                study.optimize(objective, n_trials=4)
                best_params = study.best_params
                print(f"Лучшие ГП модели: {best_params} c {metric_optuna}={study.best_value:.3f}")

            if model == 'CatBoost':
                best_optuna_model = CatBoostRegressor(**best_params, verbose=False)
            else:
                best_optuna_model = models[model]['model']
                best_optuna_model.set_params(**best_params)
            if model == 'LinearRegression':
                best_optuna_model.fit(X_train_scaled, y_train)
            else:
                best_optuna_model.fit(X_train, y_train)

            # Отбор топ фичей
            selected_features = select_top_features(best_optuna_model, feature_cols, top_n_features)
            print(f'Отобрано топ-{top_n_features} фичей, 5 лучших из них: {selected_features[:5]}')
            selected_idx = [feature_cols.index(feature) for feature in selected_features]

            # Тестирование модели с лучшими ГП только на топ фичах
            if model == 'LinearRegression':
                best_optuna_model.fit(X_train_scaled[:, selected_idx], y_train)
                y_test_pred = best_optuna_model.predict(X_test_scaled[:, selected_idx])
            else:
                best_optuna_model.fit(X_train[selected_features], y_train)
                y_test_pred = best_optuna_model.predict(X_test[selected_features])
            test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
            test_mae = mean_absolute_error(y_test, y_test_pred)
            print(f'Метрики лучшей модели в данном окне: MAPE={test_mape:.3f}, MAE={test_mae:.2f}\n')
            ml_info.append([best_optuna_model, selected_features, test_mape])

        # Определение лучшей модели - ГП из модели из окна с минимальным mape, лучшие фичи - те, которые есть во всех окнах
        best_model = sorted(ml_info, key=lambda x: x[-1])[0][0]
        best_features = set(ml_info[0][1])
        for feature_list in ml_info[1:]:
            best_features.intersection_update(feature_list[1])
        best_features = list(best_features)
        print('Формируется лучшая модель из построенных ранее...')
        print(f'Лучшая {model} модель - {best_model} с фичами в количестве {len(best_features)} шт.')
        mape_mean, mae_mean = [], []

        # Расчет статистик
        for idx, window in enumerate(windows):
            train, val, test = window['train'], window['val'], window['test']
            X_train, y_train = train[best_features], train['target']
            X_test, y_test = test[best_features], test['target']
            best_model.fit(X_train, y_train)
            best_model_pred = best_model.predict(X_test)
            test_mape = mean_absolute_percentage_error(y_test, best_model_pred)
            test_mae = mean_absolute_error(y_test, best_model_pred)
            mape_mean.append(test_mape)
            mae_mean.append(test_mae)
            new_row = pd.Series([f"{window['dates']['test'][0]} - {window['dates']['test'][1]}", model, round(test_mape,3), round(test_mae,3)], index=df_ml.columns)
            df_ml = pd.concat([df_ml, new_row.to_frame().T], ignore_index=True)
            print(f'Окно {idx+1}: лучшая {model} модель имеет MAPE={test_mape:.3f} ({round(100*(ml_info[idx][-1] - test_mape)/ml_info[idx][-1])}% к модели на окне {idx+1} ранее), MAE={test_mae:.2f}')

        new_row_db = pd.Series([datetime.now().strftime('%Y-%m-%d %H:%M:%S'), ticker, left_date, right_date, model, str(best_model), train_period, val_period, test_period, step, float(round(np.mean(mape_mean),3)), float(round(np.mean(mae_mean),3)), json.dumps(best_features)], index=df_ml_db.columns)
        df_ml_db = pd.concat([df_ml_db, new_row_db.to_frame().T], ignore_index=True)
        print('------------------------------------------------------------------------------------------------------------')

    # Отправить в бд созданные лучшие модели
    df_ml_db.to_sql('ml_models_data', con=connection(), if_exists='append', index=False)

    return df_ml.sort_values(by=['test_period', 'mae']), df_ml_db

windows = sliding_windows_cross_validatin(df, train_period, val_period, test_period, step)
df_ml, df_ml_db = train_val_test_ml_models(df, windows, models)
df_ml

Запущен цикл разработки ML моделей для прогнозирования стоимости акций "SBER"

Окно 1 / модель DecisionTree
Обучение 21 дн. 2025-07-04 - 2025-07-24 / Валидация 14 дн. 2025-07-25 - 2025-08-11 / Тестирование 7 дн. 2025-08-12 - 2025-08-18
Лучшие ГП модели: {'max_depth': 15, 'min_samples_split': 19, 'min_samples_leaf': 13, 'max_features': None} c MAPE=0.021
Отобрано топ-500 фичей, 5 лучших из них: ['RTSI_open_lag_10', 'RTSI_open_rm_9', 'RTSI_open_lag_9', 'RTSI_open_rm_8', 'RTSI_open_lag_8']
Метрики лучшей модели в данном окне: MAPE=0.010, MAE=3.13

Окно 2 / модель DecisionTree
Обучение 21 дн. 2025-07-11 - 2025-07-31 / Валидация 14 дн. 2025-08-01 - 2025-08-18 / Тестирование 7 дн. 2025-08-19 - 2025-08-25
Лучшие ГП модели: {'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 11, 'max_features': 'sqrt'} c MAPE=0.017
Отобрано топ-500 фичей, 5 лучших из них: ['RTSI_open_lag_10', 'RTSI_open_rm_9', 'RTSI_open_lag_9', 'RTSI_open_rm_8', 'RTSI_open_lag_8']
Метрики лучшей модели в данном окне:

Unnamed: 0,test_period,model_name,mape,mae
3,2025-08-12 - 2025-08-18,RandomForest,0.004,1.175
12,2025-08-12 - 2025-08-18,CatBoost,0.005,1.604
9,2025-08-12 - 2025-08-18,LightGBM,0.01,3.13
6,2025-08-12 - 2025-08-18,XGBoost,0.015,4.895
0,2025-08-12 - 2025-08-18,DecisionTree,0.016,5.063
13,2025-08-19 - 2025-08-25,CatBoost,0.004,1.28
10,2025-08-19 - 2025-08-25,LightGBM,0.005,1.62
1,2025-08-19 - 2025-08-25,DecisionTree,0.009,2.819
4,2025-08-19 - 2025-08-25,RandomForest,0.01,3.03
7,2025-08-19 - 2025-08-25,XGBoost,0.016,4.98


In [8]:
df_ml_db

Unnamed: 0,dt,ticker,left_date,right_date,model_name,model_value,train_period,val_period,test_period,step,mape_mean,mae_mean,best_features
0,2025-12-02 22:15:13,SBER,2025-07-01,2025-09-01,DecisionTree,"DecisionTreeRegressor(max_depth=11, max_featur...",21,14,7,7,0.012,3.744,"[""RUMBTRNS_open_rm_13"", ""RUPAI_open_lag_14"", ""..."
1,2025-12-02 22:15:16,SBER,2025-07-01,2025-09-01,RandomForest,"RandomForestRegressor(max_depth=9, max_feature...",21,14,7,7,0.007,2.104,"[""RTSTN_open_lag_21"", ""RUCBTRNS_open_lag_10"", ..."
2,2025-12-02 22:15:45,SBER,2025-07-01,2025-09-01,XGBoost,"XGBRegressor(base_score=None, booster=None, ca...",21,14,7,7,0.014,4.402,"[""RUMBTRNS_open_rm_13"", ""num_words_sum"", ""comm..."
3,2025-12-02 22:15:45,SBER,2025-07-01,2025-09-01,LightGBM,LGBMRegressor(colsample_bytree=0.9598356966412...,21,14,7,7,0.006,2.012,"[""RUMBTRNS_open_rm_13"", ""RUPAI_open_lag_14"", ""..."
4,2025-12-02 22:16:50,SBER,2025-07-01,2025-09-01,CatBoost,<catboost.core.CatBoostRegressor object at 0x3...,21,14,7,7,0.004,1.41,"[""open"", ""open_lag_21"", ""MOEXBMI_close_rm_3"", ..."
