In [1]:
import pandas as pd
import numpy as np

# Загрузка и подготовка данных

In [2]:
instruments = ['BTCUSDT', 'ETHUSDT', 'XRPUSDT', 'LTCUSDT', 'EOSUSDT']
files = ['close.csv', 'open.csv', 'low.csv', 'high.csv', 'volume.csv']
columns = ['close', 'open', 'low', 'high', 'volume']

In [3]:
series_per_ccys = {k: [] for k in instruments}

for file in files:
    frame = pd.read_csv('data/' + file,parse_dates=True,index_col=0)
    for instrument in instruments:
        series_per_ccys[instrument].append(frame[instrument])
df_per_ccys = {k: pd.DataFrame(series_per_ccys[k], columns).transpose() for k in instruments}
df_per_ccys['BTCUSDT'].head()

Unnamed: 0,close,open,low,high,volume
2020-01-01 00:00:00,7186.68,7184.42,7183.14,7196.25,51.642812
2020-01-01 00:01:00,7184.03,7186.68,7182.2,7188.06,7.248148
2020-01-01 00:02:00,7182.43,7184.03,7180.26,7184.71,11.681677
2020-01-01 00:03:00,7185.94,7182.43,7182.49,7188.94,10.025391
2020-01-01 00:04:00,7179.78,7185.94,7178.64,7185.54,14.911105


Пока что будем работать только с `BTCUSDT`:

In [4]:
df = df_per_ccys['BTCUSDT']

Добавляем доходности:

In [5]:
df['return'] = (df['open'] - df['close']) / (df['open'])
df.head()

Unnamed: 0,close,open,low,high,volume,return
2020-01-01 00:00:00,7186.68,7184.42,7183.14,7196.25,51.642812,-0.000315
2020-01-01 00:01:00,7184.03,7186.68,7182.2,7188.06,7.248148,0.000369
2020-01-01 00:02:00,7182.43,7184.03,7180.26,7184.71,11.681677,0.000223
2020-01-01 00:03:00,7185.94,7182.43,7182.49,7188.94,10.025391,-0.000489
2020-01-01 00:04:00,7179.78,7185.94,7178.64,7185.54,14.911105,0.000857


Объявляем функцию для ресеплирования:

In [6]:
def resample(freq, dataframe: pd.DataFrame) -> pd.DataFrame:
  return dataframe.groupby(pd.Grouper(freq=freq)).agg({'open':'first', 'close':'last', 'high':'max', 'low':'min', 'volume': 'sum', 'return': 'std'}).rename(columns={'return': 'vol^2'})

Пока что будем работать только с часовыми данными, выбрасываем точки, где не было торгов:

In [7]:
df_1h = resample('1h', df)
df_1h['return'] = (df_1h['open'] - df_1h['close']) / (df_1h['open'])
df_1h = df_1h.dropna()
df_1h = df_1h[df_1h['volume'] != 0]
df_1h.head()

Unnamed: 0,open,close,high,low,volume,vol^2,return
2020-01-01 00:00:00,7184.42,7177.02,7196.25,7175.46,511.814901,0.000359,0.00103
2020-01-01 01:00:00,7177.02,7216.27,7230.0,7175.71,883.052603,0.000552,-0.005469
2020-01-01 02:00:00,7216.27,7242.85,7244.87,7211.41,655.156809,0.000354,-0.003683
2020-01-01 03:00:00,7242.85,7225.01,7245.0,7220.0,783.724867,0.000394,0.002463
2020-01-01 04:00:00,7225.01,7217.27,7230.0,7215.03,467.812578,0.000248,0.001071


Объявляем метрику `RMSPE`:

In [8]:
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

# Historical Average

In [9]:
def calculate_HA(realized_vol, l_history=1):
  data = np.full(realized_vol.size, np.nan)

  for i in range(l_history, len(realized_vol)):
    history = realized_vol[i - l_history: i]
    data[i] = np.mean(history)

  return pd.Series(data=data, index=realized_vol.index, name="historical_average")

In [10]:
l_min = None
value_min = None
for l in range(1, 30):
    value = rmspe(df_1h['vol^2'], calculate_HA(df_1h['vol^2'], l))
    if value_min is None or value_min > value:
        value_min = value
        l_min = l
print("Optimal l: ", l_min, " value:", value_min)

Optimal l:  2  value: 0.4161037400438944


# Exponentially Weighted Moving Average

In [11]:
def ewma(data, alpha):
  sum = 0.0
  for x in data:
    sum = sum * alpha + x * (1-alpha)
  return sum

In [12]:
def calculate_EWMA(realized_vol, l_history=3, alpha=0.33):
  data = np.full(realized_vol.size, np.nan)

  for i in range(l_history, len(realized_vol)):
    history = realized_vol[i - l_history: i]
    data[i] = ewma(history, alpha)

  return pd.Series(data=data, index=realized_vol.index, name="ewma_average")


In [13]:
alpha_min = None
l_min = None
value_min = None
for l in range (2, 10):
  for alpha in np.arange(0.0, 1.0, 0.05):
    value = rmspe(df_1h['vol^2'], calculate_EWMA(df_1h['vol^2'], l, alpha))
    if value_min is None or value_min > value:
        value_min = value
        l_min = l
        alpha_min = alpha
print("Optimal l: ", l_min, " alpha,", alpha_min, " value:", value_min)

Optimal l:  6  alpha, 0.75  value: 0.32251579278333714


# GARCH

In [14]:
!pip install arch

Defaulting to user installation because normal site-packages is not writeable


In [21]:
from arch import arch_model
from arch.__future__ import reindexing
from math import sqrt

def garch(returns):

  scaling_const = 100.0 / returns.std()

  am = arch_model(scaling_const * returns,
                  mean='Constant',
                  vol='Garch', p=1, o=0, q=1,
                  dist='skewstudent')
  
  res = am.fit(options={'ftol' : 1e-2}, update_freq=0, disp='off')

  forecasts = res.forecast(horizon=1)

  return sqrt(float(forecasts.variance.iloc[-1])) / scaling_const**2

In [22]:
def calculate_GARCH(returns, l_history=3):
  data = np.full(returns.size, np.nan)

  for i in range(l_history, len(returns)):
    history = returns[i - l_history: i]
    data[i] = garch(history)

  return pd.Series(data=data, index=returns.index, name="historical_average")


In [23]:
rmspe(df_1h.dropna()['vol^2'], calculate_GARCH(df_1h.dropna()['return']))

0.9995649779629676