# Гиперпараметры для выборки

In [37]:
train_size1 = 0.9
time_step = 28

metrics = 'mean_absolute_error'


# Загрузка датасета и подготовка данных

In [6]:
import pandas as pd
import plotly.express as px

In [15]:
# возьмем свой датасет с finam с котировками за 3 месяца, соответствующие имеющимся данным по новостям
df = pd.read_csv('/content/drive/MyDrive/SBER_240315_240610 (1).csv', sep = ';')
df.columns = ['TICKER', "PER",	'DATE',	'TIME',	'OPEN',	'HIGH',	'LOW',	'CLOSE',	'VOL']
df['DATE'] = df['DATE'].astype(str)
df['TIME'] = df['TIME'].astype(str).str.zfill(6)
df['datetime'] = pd.to_datetime(df['DATE'] + ' ' + df['TIME'], format='%Y%m%d %H%M%S')
px.line(df, y = 'CLOSE', x = 'datetime', template = 'presentation', title = 'Котировки акций Сбера', labels = {'datetime':'Дата', 'CLOSE':'Цена закрытия'},
        color_discrete_sequence=['#777777'])

In [14]:
news = pd.read_csv('/content/drive/MyDrive/news.csv')
px.histogram(news, x = 'created_at', template = 'presentation', title = 'Распределение новостей по времени', labels = {'created_at':'Дата', 'count':'количество'},
             color_discrete_sequence=['#777777'], nbins = 70)

# Новостной анализ

Возьмем модель с Hugging Face. Модель анализирует сентимент финансовых новостей и выдает значения positive, neutral, negative

In [18]:
!pip install deep-translator
from transformers import pipeline
from tqdm import tqdm
from deep_translator import GoogleTranslator
model_hf = pipeline("text2text-generation", model="cometrain/moexT5")

Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deep-translator
Successfully installed deep-translator-1.11.4




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Сделаем просто анализ сентимента для каждой новости:

In [19]:
# запишем в один список - дату новости, в другой - сентимент
list_of_dates = []
list_of_values = []
for i in tqdm(range(len(news))):
  date = news.iloc[i, 1]
  content = news.iloc[i, 0]
  # для большего качества модели переведем на английский (эмпирическим путем было выявлено, что так лучще)
  content = GoogleTranslator(source='ru', target='en').translate(content)
  list_of_dates.append(date)
  value = model_hf(content)[0]['generated_text']
  list_of_values.append(value)

# запишем в отдельный dataframe для удобства
news_analyse_all = pd.DataFrame()
news_analyse_sber = pd.DataFrame()

news_analyse_all['values'] = list_of_values
news_analyse_all['datetime'] = list_of_dates
news_analyse_all['datetime'] = pd.to_datetime(news_analyse_all['datetime'])
news_analyse_all['values'] = news_analyse_all['values'].replace({'positive': 1, 'neutral': 0, 'negative': -1}).astype(int)


Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.

100%|██████████| 4151/4151 [1:15:49<00:00,  1.10s/it]


In [20]:
# сделаем точно также, но для новостей о Сбере
filtered_news = news[news['content'].str.contains(r'\bсбер\w*', case=False, regex=True)]

list_of_dates2 = []
list_of_values2 = []
for i in tqdm(range(len(filtered_news))):
  date = filtered_news.iloc[i, 1]
  content = filtered_news.iloc[i, 0]
  content = GoogleTranslator(source='ru', target='en').translate(content)
  list_of_dates2.append(date)
  value = model_hf(content)[0]['generated_text']
  list_of_values2.append(value)

news_analyse_sber['values'] = list_of_values2
news_analyse_sber['datetime'] = list_of_dates2
news_analyse_sber['datetime'] = pd.to_datetime(news_analyse_sber['datetime'])
news_analyse_sber['values'] = news_analyse_sber['values'].replace({'positive': 1, 'neutral': 0, 'negative': -1}).astype(int)


Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.

100%|██████████| 241/241 [02:00<00:00,  2.01it/s]


Для каждого часа в df проведем следующие операции:
- найдем все новости за последний X промежуток
- усредним сентимент (ранее выразили его в числах)
- повторим для различных периодов времени

In [22]:
def last_hour(dt, news_analyse):
  dt = pd.to_datetime(dt).tz_localize(None)
  dt2 = dt - pd.to_timedelta(1, unit='h')
  dt1 = dt
  mean_news = news_analyse[(news_analyse['datetime'].dt.tz_localize(None) >= dt2) &
                          (news_analyse['datetime'].dt.tz_localize(None) <= dt1)]
  if len(mean_news) > 0 :
    return mean_news['values'].mean()
  else:
    return 0

def last_day(dt, news_analyse):
  dt = pd.to_datetime(dt).tz_localize(None)
  dt2 = dt - pd.to_timedelta(24, unit='h')
  dt1 = dt
  mean_news = news_analyse[(news_analyse['datetime'].dt.tz_localize(None) >= dt2) &
                          (news_analyse['datetime'].dt.tz_localize(None) <= dt1)]
  if len(mean_news) > 0 :
    return mean_news['values'].mean()
  else:
    return 0

def last_3days(dt, news_analyse):
  dt = pd.to_datetime(dt).tz_localize(None)
  dt2 = dt - pd.to_timedelta(72, unit='h')
  dt1 = dt
  mean_news = news_analyse[(news_analyse['datetime'].dt.tz_localize(None) >= dt2) &
                          (news_analyse['datetime'].dt.tz_localize(None) <= dt1)]
  if len(mean_news) > 0 :
    return mean_news['values'].mean()
  else:
    return 0

def last_week(dt, news_analyse):
  dt = pd.to_datetime(dt).tz_localize(None)
  dt2 = dt - pd.to_timedelta(168, unit='h')
  dt1 = dt
  mean_news = news_analyse[(news_analyse['datetime'].dt.tz_localize(None) >= dt2) &
                          (news_analyse['datetime'].dt.tz_localize(None) <= dt1)]
  if len(mean_news) > 0 :
    return mean_news['values'].mean()
  else:
    return 0

def last_month(dt, news_analyse):
  dt = pd.to_datetime(dt).tz_localize(None)
  dt2 = dt - pd.to_timedelta(5040, unit='h')
  dt1 = dt
  mean_news = news_analyse[(news_analyse['datetime'].dt.tz_localize(None) >= dt2) &
                          (news_analyse['datetime'].dt.tz_localize(None) <= dt1)]
  if len(mean_news) > 0 :
    return mean_news['values'].mean()
  else:
    return 0

df['hour_news_stat'] = df['datetime'].apply(last_hour, args=(news_analyse_all,))
df['day_news_stat'] = df['datetime'].apply(last_day, args=(news_analyse_all,))
df['3days_news_stat'] = df['datetime'].apply(last_3days, args=(news_analyse_all,))
df['week_news_stat'] = df['datetime'].apply(last_week, args=(news_analyse_all,))
df['month_news_stat'] = df['datetime'].apply(last_month, args=(news_analyse_all,))

df['hour_news_stat'] = df['datetime'].apply(last_hour, args=(news_analyse_sber,))
df['day_news_stat'] = df['datetime'].apply(last_day, args=(news_analyse_sber,))
df['3days_news_stat'] = df['datetime'].apply(last_3days, args=(news_analyse_sber,))
df['week_news_stat'] = df['datetime'].apply(last_week, args=(news_analyse_sber,))
df['month_news_stat'] = df['datetime'].apply(last_month, args=(news_analyse_sber,))

# Создадим еще признаки на основе распределения котировок в прошлом
Например: средняя, максимальная, минимальная цена за X предыдущих дней, дисперсия и тд.
Часть данных будет учитывать другие, особенно при загрузке этого в модель с небольшим окном, но бустингу не страшны скоррелированные признаки.


In [25]:
df['datetime'] = pd.to_datetime(df['datetime'])

df['day'] = df['datetime'].dt.day
df['day_of_week'] = df['datetime'].dt.dayofweek

df['Mean_1d'] = df['CLOSE'].rolling(window=24).mean()
df['Max_1d'] = df['CLOSE'].rolling(window=24).max()
df['Min_1d'] = df['CLOSE'].rolling(window=24).min()
df['Std_1d'] = df['CLOSE'].rolling(window=24).std()
df['Range_1d'] = df['Max_1d'] - df['Min_1d']

df['Mean_3d'] = df['CLOSE'].rolling(window=72).mean()
df['Max_3d'] = df['CLOSE'].rolling(window=72).max()
df['Min_3d'] = df['CLOSE'].rolling(window=72).min()
df['Std_3d'] = df['CLOSE'].rolling(window=72).std()
df['Range_3d'] = df['Max_3d'] - df['Min_3d']

df['Mean_7d'] = df['CLOSE'].rolling(window=168).mean()
df['Max_7d'] = df['CLOSE'].rolling(window=168).max()
df['Min_7d'] = df['CLOSE'].rolling(window=168).min()
df['Std_7d'] = df['CLOSE'].rolling(window=168).std()
df['Range_7d'] = df['Max_7d'] - df['Min_7d']

df.fillna(method='bfill', inplace=True)



# Введем метрику
Метрика - money. Иммитирует торговлю, оптимальную при предикте. Выдает % изменения баланса. Нынешняя реализация упрощеная, есть резон усложнять.

In [53]:
def money(y_pred, y_test, comission):
  counter = 100
  num_of_deals = 0
  num_of_profits = 0
  num_of_losses = 0
  skip = 0
  #comission = 0.0003
  state = 0
  cur_state = 0
  fc = 0

  for i in range(len(y_pred)-1):
    future_real = y_test[i+1]
    real_real = y_test[i]
    predicted = y_pred[i+1]

    if real_real < predicted - comission * real_real - predicted * comission:

   #   print(f'сделка лонг, покупаем за {real_real}, продаем за {future_real}')
      num_of_deals += 1
      cur_state = 1
      if state == cur_state:
        fc = 0

      counter *= ((future_real - real_real - comission * real_real * fc - comission * future_real * fc)/real_real + 1)

      if (future_real - real_real) - fc * comission * real_real - fc * comission * future_real > 1:
        num_of_profits += 1
      else:
        num_of_losses += 1


    elif real_real > predicted + comission * real_real + predicted * comission:
 #     print(f'сделка шорт, продаем за {real_real}, покупаем за {future_real}')
      num_of_deals += 1
      cur_state = -1
      if state == cur_state:
        fc = 0

      counter *= ((real_real - future_real - fc * comission * real_real - fc * comission * future_real)/real_real + 1)

      if (real_real - future_real) - fc * comission * real_real - fc * comission * future_real > 0:
        num_of_profits += 1
      else:
        num_of_losses += 1

    else:
      skip += 1
      cur_state = 0

    fc = 1

    state = cur_state

  if counter > 100:
    print(f'Вы заработали {counter-100} %')
  elif counter < 100:
    print(f'Вы потеряли {100 - counter} %')
  #print(f'Вы сделали всего {num_of_deals} сделок, из которых {num_of_profits} прибыльных, {num_of_losses} убыточных, {skip} пропустили, при комиссии {comission}')
  return counter

# Загрузка уже обработанного датасета

In [40]:
df.to_csv('/content/drive/My Drive/final.csv', index=False)



In [41]:
df = pd.read_csv('/content/final.csv')

# Подготовка данных к XGBoost

In [42]:
import numpy as np
import plotly.express as px
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
!pip install optuna
import optuna
from optuna.samplers import TPESampler



Для отдельного часа будут данные по всем котировкам, распределениям, новостям за {time_step} предыдущих периодов

In [43]:
def create_dataset_no_scaling(data, time_step):
    X, y = [], []
    for i in range(len(data) - time_step):
        X.append(data.iloc[i:(i + time_step), :].values)  # Все признаки
        y.append(data.iloc[i + time_step]['CLOSE'])  # цена закрытия - таргет
    return np.array(X), np.array(y)

X, y = create_dataset_no_scaling(df.drop(columns=['TICKER', 'PER', 'DATE', 'TIME', 'datetime']), time_step)

train_size = int(len(X) * train_size1)
X_train, X_test = X[0:train_size], X[train_size:len(X)]
y_train, y_test = y[0:train_size], y[train_size:len(X)]

X_train.shape

(743, 28, 28)

(743, 28, 28) - Величина выборки **X** размер окна **X** число признаков

Вытягиваем в двумерные данные:

In [44]:
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# Обучение и Валидация XGBoost

In [45]:
XGB = XGBRegressor(
    learning_rate = 0.002405307703021195,
    n_estimators = 1575,
    max_depth = 50,
    colsample_bytree = 0.5601447664996351,
    subsample = 0.9429936444399803,
    min_child_weight = 75,
    reg_alpha = 0.3152860879659751,
    reg_lambda = 0.7807431270227388,
    gamma = 0.9610139844801846,
    tree_method = 'hist',
    device = "cuda"
)

XGB.fit(X_train, y_train, verbose=200)





In [46]:
predictions = XGB.predict(X_test)

y_test_original = y_test
predictions_original = predictions

mse = mean_squared_error(y_test_original, predictions_original)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_original, predictions_original)
mape = np.mean(np.abs((y_test_original - predictions_original) / y_test_original)) * 100

correct_direction = np.sign(np.diff(y_test_original)) == np.sign(np.diff(predictions_original))
da = np.mean(correct_direction) * 100

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MAPE: {mape}")
print(f"DA: {da} \n")

money_metric = money(predictions_original, y_test_original, 0.00015)

MSE: 1.753587060140319
RMSE: 1.324230742786286
MAE: 1.0510735275084706
MAPE: 0.3342633668070373
DA: 46.34146341463415 

Вы заработали 1.4636362642155234 %
Вы сделали всего 76 сделок, из которых 12 прибыльных, 64 убыточных, 6 пропустили, при комиссии 0.00015


In [52]:
df_plot = pd.DataFrame({
    'Период в часах': range(len(predictions_original)),
    'Предикт': predictions_original.flatten(),
    'Тест': y_test_original.flatten()
})

px.line(df_plot, x='Период в часах', y=['Тест', 'Предикт'], template='presentation', title=f'Котировки и предсказания на тест выборке ({int(money_metric.round(0))-100}% прибыли, за {len(y_test)} часа)',
color_discrete_sequence = ['#159f86', '#ea5707']
        )

Как видно по метрикам и графику - модель вполне справляется. Тем более стоит учитывать маленькую выборку для обучения. В перспективе можно сделать постоянный парсинг новостей и котировок, что явно улучшит качество модели.

# Подбор Гиперпараметров с Optuna
В целом ничего интересного

In [27]:
sampler = optuna.samplers.TPESampler()

def objective(trial):
    params = {
        "n_estimators":trial.suggest_int("n_estimators", 10, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.5, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "subsample": trial.suggest_float("subsample", 0.01, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.01, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 80),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0),
    }

    model = XGBRegressor(**params, tree_method='hist',device="cuda")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, predictions)
    return mape

study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(),sampler=sampler)
study.optimize(objective, n_trials=1000)

print('Best hyperparameters:', study.best_params)
print('Best MAPE:', study.best_value)

[I 2024-07-08 12:21:59,762] A new study created in memory with name: no-name-651c1692-13df-4611-9ad8-4d139d6367c2
[I 2024-07-08 12:22:02,062] Trial 0 finished with value: 0.00546545649924631 and parameters: {'n_estimators': 301, 'learning_rate': 0.020379345296616287, 'max_depth': 90, 'subsample': 0.8739542467171181, 'colsample_bytree': 0.20475355141015836, 'min_child_weight': 12, 'reg_alpha': 0.04377295812758207, 'reg_lambda': 0.057206144031013206, 'gamma': 0.0657602171261395}. Best is trial 0 with value: 0.00546545649924631.
[I 2024-07-08 12:22:03,384] Trial 1 finished with value: 0.009556111439369581 and parameters: {'n_estimators': 1051, 'learning_rate': 0.0007111652458025426, 'max_depth': 77, 'subsample': 0.2745953212009124, 'colsample_bytree': 0.13652256664359308, 'min_child_weight': 73, 'reg_alpha': 0.7171492150802885, 'reg_lambda': 0.8392270876315939, 'gamma': 0.3783513687979537}. Best is trial 0 with value: 0.00546545649924631.
[I 2024-07-08 12:22:06,700] Trial 2 finished with 

Best hyperparameters: {'n_estimators': 1179, 'learning_rate': 0.006486882759273946, 'max_depth': 1, 'subsample': 0.8704583868457367, 'colsample_bytree': 0.6921542793223385, 'min_child_weight': 52, 'reg_alpha': 0.10489364439466552, 'reg_lambda': 0.5389276853616853, 'gamma': 0.344372122858664}
Best MAPE: 0.002317719830732119


In [32]:
best_params = {'n_estimators': 1179, 'learning_rate': 0.006486882759273946, 'max_depth': 1, 'subsample': 0.8704583868457367, 'colsample_bytree': 0.6921542793223385, 'min_child_weight': 52, 'reg_alpha': 0.10489364439466552, 'reg_lambda': 0.5389276853616853, 'gamma': 0.344372122858664}

In [33]:
XGB = XGBRegressor(**best_params)
XGB.fit(X_train, y_train, verbose=200)

# Предсказания
predictions = XGB.predict(X_test)

In [34]:
predictions = XGB.predict(X_test)

y_test_original = y_test
predictions_original = predictions

mse = mean_squared_error(y_test_original, predictions_original)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_original, predictions_original)
mape = np.mean(np.abs((y_test_original - predictions_original) / y_test_original)) * 100

correct_direction = np.sign(np.diff(y_test_original)) == np.sign(np.diff(predictions_original))
da = np.mean(correct_direction) * 100

print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"MAPE: {mape}")
print(f"DA: {da} \n")

money_metric = money(predictions_original, y_test_original, 0.00015)

MSE: 1.2380076559754503
RMSE: 1.1126579240608725
MAE: 0.7867924186982294
MAPE: 0.25021615772816763
DA: 43.90243902439025 

Вы заработали 0.6674211541743631 %
Вы сделали всего 72 сделок, из которых 26 прибыльных, 46 убыточных, 10 пропустили, при комиссии 0.00015


In [35]:
df_plot = pd.DataFrame({
    'Время': range(len(predictions_original)),
    'Предикт': predictions_original.flatten(),
    'Тест': y_test_original.flatten()
})

px.line(df_plot, x='Время', y=['Предикт', 'Тест'], template='plotly_dark', title=f'пов как зарабатывать деньги (так то {int(money_metric.round(0))-100}% прибыли)')