В этом блоке обучаю модели для предсказания цены закрытия, максимальной и минимальной цены акций SBER.


# Загрузка библиотек

In [None]:
# данные
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import math

# подготовка данных
from sklearn.model_selection import train_test_split

# метрики
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import max_error as me
from sklearn.metrics import r2_score

# регрессия
from sklearn.linear_model import LinearRegression

# сохранение/загрузка модели
from joblib import dump, load

# Загрузка данных

In [None]:
# Read the CSV with historical data, parsing '<DATE>' as datetimes, and
# drop the '<PER>' and '<TIME>' columns in the same expression.
parse_dates = ['<DATE>']
data = pd.read_csv(
    'SBER_100101_211231.csv',
    parse_dates=parse_dates,
).drop(columns=['<PER>', '<TIME>'])
# Preview the first two rows.
data.head(2)

Unnamed: 0,<TICKER>,<DATE>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>
0,SBER,2010-01-11,86.56,88.17,85.51,86.69,148661237
1,SBER,2010-01-12,86.55,86.77,84.9,85.0,130276079


In [None]:
# Rename the columns to more convenient names: '<NAME>' becomes 'NAME'.
data = data.rename(columns={
    '<' + name + '>': name
    for name in ('TICKER', 'DATE', 'OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOL')
})

In [None]:
# Print the frame summary: column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3016 entries, 0 to 3015
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   TICKER  3016 non-null   object        
 1   DATE    3016 non-null   datetime64[ns]
 2   OPEN    3016 non-null   float64       
 3   HIGH    3016 non-null   float64       
 4   LOW     3016 non-null   float64       
 5   CLOSE   3016 non-null   float64       
 6   VOL     3016 non-null   int64         
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 165.1+ KB


# Feature engineering

Функция для feature engineering.

In [None]:
def features(data):
    """Build the model's feature columns from a frame of price/volume rows.

    The input frame is copied first, so the caller's object is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime ``DATE`` column and ``OPEN``, ``HIGH``,
        ``LOW``, ``CLOSE`` and ``VOL`` columns. The rolling and shifted
        features are positional, so rows are presumably expected in
        chronological order (confirm against the data source).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus:

        * ``DIF_O_C`` / ``DIF_H_L`` - open minus close, high minus low;
        * ``MEAN_n`` / ``HIGH_n`` / ``LOW_n`` for n = 2..5 - rolling mean of
          ``CLOSE``, rolling max of ``HIGH``, rolling min of ``LOW``;
        * ``<COL>-k`` for k = 1..3 - prices and volume from k rows earlier;
        * ``OPEN_TODAY`` - the ``OPEN`` of the following row;
        * one-hot ``DAY_OF_WEEK_0..6`` and ``MONTH_1..12`` columns.

        Rows at either end contain NaN where a window or shift runs past
        the available data.
    """
    # Work on a copy: the column assignments below must not leak into the
    # frame the caller passed in.
    data = data.copy()

    # Calendar features. Fixed category lists make get_dummies always emit
    # all 7 weekday and 12 month columns, so the feature schema does not
    # depend on which dates happen to be present in the input.
    data['DAY_OF_WEEK'] = pd.Categorical(
        data['DATE'].dt.dayofweek, categories=list(range(7))
    )
    data['MONTH'] = pd.Categorical(
        data['DATE'].dt.month, categories=list(range(1, 13))
    )

    # Difference between the open and close price.
    data['DIF_O_C'] = data['OPEN'] - data['CLOSE']

    # Difference between the high and low price.
    data['DIF_H_L'] = data['HIGH'] - data['LOW']

    # Rolling statistics over the last 2..5 rows (current row included):
    # mean close, max high, min low. Column order matches the model inputs.
    rolling_specs = (
        ('MEAN', 'CLOSE', 'mean'),
        ('HIGH', 'HIGH', 'max'),
        ('LOW', 'LOW', 'min'),
    )
    for prefix, column, how in rolling_specs:
        for window in range(2, 6):
            name = '{}_{}'.format(prefix, window)
            data[name] = data[column].rolling(window=window, center=False).agg(how)

    # Prices and volume from 1, 2 and 3 rows earlier.
    lagged = ['OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOL']
    for lag in range(1, 4):
        for column in lagged:
            data['{}-{}'.format(column, lag)] = data[column].shift(lag)

    # Open price of the following row, i.e. of the day being predicted.
    data['OPEN_TODAY'] = data['OPEN'].shift(-1)

    return pd.get_dummies(data, columns=['DAY_OF_WEEK', 'MONTH'], prefix=['DAY_OF_WEEK', 'MONTH'])

In [None]:
# Replace the frame with the feature-engineered, one-hot encoded version.
data = features(data)
# Preview the first rows.
data.head()

Unnamed: 0,TICKER,DATE,OPEN,HIGH,LOW,CLOSE,VOL,DIF_O_C,DIF_H_L,MEAN_2,...,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12
0,SBER,2010-01-11,86.56,88.17,85.51,86.69,148661237,-0.13,2.66,,...,0,0,0,0,0,0,0,0,0,0
1,SBER,2010-01-12,86.55,86.77,84.9,85.0,130276079,1.55,1.87,85.845,...,0,0,0,0,0,0,0,0,0,0
2,SBER,2010-01-13,84.4,87.23,84.1,86.61,128684773,-2.21,3.13,85.805,...,0,0,0,0,0,0,0,0,0,0
3,SBER,2010-01-14,87.5,87.87,86.75,87.55,111263614,-0.05,1.12,87.08,...,0,0,0,0,0,0,0,0,0,0
4,SBER,2010-01-15,87.47,88.67,87.16,88.15,142060148,-0.68,1.51,87.85,...,0,0,0,0,0,0,0,0,0,0


# Целевые признаки

In [None]:
# Targets are the following row's CLOSE / HIGH / LOW: shift(-1) moves each
# value one row up, so every row is paired with the next row's prices.
for _price in ('CLOSE', 'HIGH', 'LOW'):
    data['TARGET_' + _price] = data[_price].shift(-1)

# Show the current prices next to OPEN_TODAY and the targets.
_preview_cols = ['OPEN', 'HIGH', 'LOW', 'CLOSE',
                 'OPEN_TODAY', 'TARGET_HIGH', 'TARGET_LOW', 'TARGET_CLOSE']
data[_preview_cols].head(5)

Unnamed: 0,OPEN,HIGH,LOW,CLOSE,OPEN_TODAY,TARGET_HIGH,TARGET_LOW,TARGET_CLOSE
0,86.56,88.17,85.51,86.69,86.55,86.77,84.9,85.0
1,86.55,86.77,84.9,85.0,84.4,87.23,84.1,86.61
2,84.4,87.23,84.1,86.61,87.5,87.87,86.75,87.55
3,87.5,87.87,86.75,87.55,87.47,88.67,87.16,88.15
4,87.47,88.67,87.16,88.15,87.86,90.65,87.79,90.49


In [None]:
# Drop every row that contains a NaN (the rolling windows and shifts leave
# NaNs at the start and end of the frame).
data = data.dropna()

# Обучение

In [None]:
# Linear model for TARGET_CLOSE.
# Predictors: everything except the identifier columns and the three targets.
X_close = data.drop(columns=['TICKER', 'DATE', 'TARGET_CLOSE', 'TARGET_HIGH', 'TARGET_LOW'])
# Target.
y_close = data['TARGET_CLOSE']

# Split without shuffling: train on the earlier rows, test on the later ones.
X_close_train, X_close_test, y_close_train, y_close_test = train_test_split(
    X_close, y_close, test_size=0.3, shuffle=False
)

model_close = LinearRegression().fit(X_close_train, y_close_train)

y_close_pred = model_close.predict(X_close_test)
# NOTE(review): the metrics skip the last test row via [:-1]; rows with NaN
# targets were presumably already removed by dropna() - confirm it is needed.
for _label, _metric in (('mae', mae), ('me', me), ('r2', r2_score)):
    print(_label + ' =', _metric(y_close_test[:-1], y_close_pred[:-1]))

# Model coefficients, one row per predictor, largest first.
coef_table = pd.DataFrame({0: list(X_close_train.columns), 'Coefs': model_close.coef_})
coef_table.sort_values(by='Coefs', ascending=False)

mae = 2.9101155823037947
me = 28.28146036543734
r2 = 0.9928796204203083


Unnamed: 0,0,Coefs
34,OPEN_TODAY,1.129
40,DAY_OF_WEEK_5,0.557
42,MONTH_1,0.33
21,LOW-1,0.294
0,OPEN,0.258
52,MONTH_11,0.252
31,LOW-3,0.236
3,CLOSE,0.202
17,LOW_4,0.145
48,MONTH_7,0.113


In [None]:
# Linear model for TARGET_HIGH.
# Predictors: everything except the identifier columns and the three targets.
X_high = data.drop(columns=['TICKER', 'DATE', 'TARGET_HIGH', 'TARGET_CLOSE', 'TARGET_LOW'])
# Target.
y_high = data['TARGET_HIGH']

# Split without shuffling: train on the earlier rows, test on the later ones.
X_high_train, X_high_test, y_high_train, y_high_test = train_test_split(
    X_high, y_high, test_size=0.3, shuffle=False
)

model_high = LinearRegression().fit(X_high_train, y_high_train)

y_high_pred = model_high.predict(X_high_test)
# NOTE(review): the metrics skip the last test row via [:-1]; rows with NaN
# targets were presumably already removed by dropna() - confirm it is needed.
for _label, _metric in (('mae', mae), ('me', me), ('r2', r2_score)):
    print(_label + ' =', _metric(y_high_test[:-1], y_high_pred[:-1]))

# Model coefficients, one row per predictor, largest first.
coef_table = pd.DataFrame({0: list(X_high_train.columns), 'Coefs': model_high.coef_})
coef_table.sort_values(by='Coefs', ascending=False)

mae = 1.786373892179456
me = 19.425079224232462
r2 = 0.9972747574786175


Unnamed: 0,0,Coefs
34,OPEN_TODAY,0.929
40,DAY_OF_WEEK_5,0.568
41,DAY_OF_WEEK_6,0.354
52,MONTH_11,0.193
42,MONTH_1,0.189
14,HIGH_5,0.174
20,HIGH-1,0.168
3,CLOSE,0.166
29,OPEN-3,0.122
0,OPEN,0.11


In [None]:
# Linear model for TARGET_LOW.
# Predictors: everything except the identifier columns and the three targets.
X_low = data.drop(columns=['TICKER', 'DATE', 'TARGET_HIGH', 'TARGET_CLOSE', 'TARGET_LOW'])
# Target.
y_low = data['TARGET_LOW']

# Split without shuffling: train on the earlier rows, test on the later ones.
X_low_train, X_low_test, y_low_train, y_low_test = train_test_split(
    X_low, y_low, test_size=0.3, shuffle=False
)

model_low = LinearRegression().fit(X_low_train, y_low_train)

y_low_pred = model_low.predict(X_low_test)
# NOTE(review): the metrics skip the last test row via [:-1]; rows with NaN
# targets were presumably already removed by dropna() - confirm it is needed.
for _label, _metric in (('mae', mae), ('me', me), ('r2', r2_score)):
    print(_label + ' =', _metric(y_low_test[:-1], y_low_pred[:-1]))

# Model coefficients, one row per predictor, largest first.
coef_table = pd.DataFrame({0: list(X_low_train.columns), 'Coefs': model_low.coef_})
coef_table.sort_values(by='Coefs', ascending=False)

mae = 1.815044316405406
me = 15.723205855792742
r2 = 0.9970230602445588


Unnamed: 0,0,Coefs
34,OPEN_TODAY,1.19
41,DAY_OF_WEEK_6,0.233
16,LOW_3,0.204
42,MONTH_1,0.169
31,LOW-3,0.167
50,MONTH_9,0.141
51,MONTH_10,0.141
0,OPEN,0.099
48,MONTH_7,0.095
19,OPEN-1,0.074


# Сохранение моделей

In [None]:
# Persist the fitted CLOSE model with joblib. The target directory is
# created first so that a missing './saved_models' does not make dump() fail.
import os
os.makedirs('./saved_models', exist_ok=True)
dump(model_close, './saved_models/sber_model_close.joblib')

['./saved_models/sber_model_close.joblib']

In [None]:
# Persist the fitted HIGH model to disk with joblib.
dump(model_high, './saved_models/sber_model_high.joblib')

['./saved_models/sber_model_high.joblib']

In [None]:
# Persist the fitted LOW model to disk with joblib.
dump(model_low, './saved_models/sber_model_low.joblib')

['./saved_models/sber_model_low.joblib']