In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv("btc_1h_data.csv")
df = df.set_index('Unnamed: 0')
df.index = pd.to_datetime(df.index)
df = df.drop(['close_time', 'qav', 'taker_base_vol', 'taker_quote_vol', 'ignore', 'open_time'], axis=1)

def generate_features(df):
    df_n = df.copy()
    df_n[f"target"] = df_n["open"].shift(-2)
    #for n in range(1, n_lags):
        #df_n[f"lag{n}"] = df_n["open"].shift(n)
        #df_n[f"diff_lag{n}"] = df_n["open"] - df_n[f"lag{n}"]
    df_n = df_n.iloc[:-2]

    df_n['price_diff1'] = df_n['high'] - df_n['open']
    df_n['price_diff2'] = df_n['high'] - df_n['low']
    df_n['price_diff3'] = df_n['low'] - df_n['open']
    df_n['price_diff4'] = df_n['close'] - df_n['open']

    df_n['avg_vol'] = df_n['volume']/df_n['num_trades']
    return df_n
    
df = generate_features(df)

df = (
    df
    .assign(minute = df.index.minute)
    .assign(hour = df.index.hour)
    .assign(day = df.index.day)
    .assign(month = df.index.month)
    .assign(week_of_year = df.index.week)
    .assign(year = df.index.year)
    )
df.drop(columns=["week_of_year"], inplace=True)
df['year'] -= 2022
df = df.iloc[0:10000]
df.head(2)

Unnamed: 0_level_0,open,high,low,close,volume,num_trades,target,price_diff1,price_diff2,price_diff3,price_diff4,avg_vol,hour,day,month,day_of_week,year
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-01-01 00:00:00,46216.93,46731.39,46208.37,46656.13,1503.33095,38608.0,46778.14,514.46,523.02,-8.56,439.2,0.038938,0,1,1,5,0
2022-01-01 01:00:00,46656.14,46949.99,46574.06,46778.14,943.81539,31872.0,46811.77,293.85,375.93,-82.08,122.0,0.029613,1,1,1,5,0


In [2]:
target = ['target']
prices_features = ['open', 'high', 'low', 'close']
time_features = ['hour', 'day', 'day_of_week', 'month', 'year']
other_features = ['volume', 'num_trades', 'avg_vol']
diff_features = ['price_diff1', 'price_diff2', 'price_diff3', 'price_diff4']
all_but_prices_and_time = other_features + diff_features

In [3]:
from sklearn.preprocessing import StandardScaler
scaler_price = StandardScaler()
scaler_everything_but_price = StandardScaler()
scaler_price.fit(df[target])
for feature in prices_features + target:
    df.loc[:, [feature]] = scaler_price.transform(df.loc[:, [feature]])
df[all_but_prices_and_time] = scaler_everything_but_price.fit_transform(df[all_but_prices_and_time])

In [4]:
df[prices_features].head(2)

Unnamed: 0_level_0,open,high,low,close
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-01 00:00:00,2.367626,2.432548,2.366545,2.423051
2022-01-01 01:00:00,2.423052,2.460135,2.412694,2.438448


In [5]:
df[diff_features].head(2)

Unnamed: 0_level_0,price_diff1,price_diff2,price_diff3,price_diff4
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-01-01 00:00:00,2.7561,1.467071,0.673547,2.566082
2022-01-01 01:00:00,1.264705,0.764807,0.180366,0.715912


In [6]:
df[other_features].head(2)

Unnamed: 0_level_0,volume,num_trades,avg_vol
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01 00:00:00,-0.58995,-0.642011,-0.167248
2022-01-01 01:00:00,-0.672438,-0.689971,-0.907511


In [7]:
df[time_features].head(2)

Unnamed: 0_level_0,hour,day,day_of_week,month,year
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-01-01 00:00:00,0,1,5,1,0
2022-01-01 01:00:00,1,1,5,1,0


In [8]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.optim.lr_scheduler import LinearLR
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
dtype = torch.float32

In [9]:
X_train_other = torch.Tensor(df[other_features].to_numpy()).to(dtype)
X_train_price = torch.Tensor(df[prices_features].to_numpy()).to(dtype)
X_train_diff = torch.Tensor(df[diff_features].to_numpy()).to(dtype)
X_train_time = torch.Tensor(df[time_features].to_numpy()).to(torch.int)
y = torch.Tensor(df[target].to_numpy()).to(dtype)
len_history = 512
data = {'y': y, 'price': X_train_price, 'diff': X_train_diff, 'other': X_train_other, 'time': X_train_time}

In [10]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, hidden_dim, head_size):
        super().__init__()
        self.key = nn.Linear(hidden_dim, head_size, bias=False)
        self.query = nn.Linear(hidden_dim, head_size, bias=False)
        self.value = nn.Linear(hidden_dim, head_size, bias=False)
        #self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * (k.shape[-1]**-0.5) # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        #wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        #wei = self.dropout(wei) # randomly prevent some of the nodes to communicate
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


class MultiHeadAttention(nn.Module):
    # masked MultiHeadAttention in the article
        #self.sa_heads = MultiHeadAttention(4, hidden_dim // 4) # i.e. 4 heads of 8-dim self-attention; 
        # similar to conv, instead of having 1 big conv, we have bigger number of smaller ones;
    def __init__(self, num_heads, head_size, hidden_dim):
        super().__init__()
        self.heads = nn.ModuleList(Head(hidden_dim, head_size) for _ in range(num_heads))
        self.proj = nn.Linear(hidden_dim, hidden_dim) # just a linear transformation of the outcome
        # it's the projection back to the residual pathway
        #self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.proj(out)#self.dropout(self.proj(out))
        return out
    

class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    # Feed Forward in the article)
    def __init__(self, hidden_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(hidden_dim, 4  * hidden_dim),
            nn.ReLU(),
            nn.Linear(4 * hidden_dim, hidden_dim), # it's the projection back to the residual pathway
            #nn.Dropout(dropout), # before res con right before the res goes back to the res path way
        )
    
    def forward(self, x):
        return self.net(x)
    

class Block(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self, hidden_dim, n_head):
        # hidden_dim: embedding dim, n_head: number of heads we would like
        # self attention is the communication
        # all tokens do that independently, so multihead att gathered all the data 
        # now ffwd will think on that data individually - ffrd

        """
        about the res con:
        you can do some computation and then add it to the original feature
        so you are going from the inputs to the targets only via plus and plus and plus
        it's good, because + distributes gradients equally to both of it's brances
        it's super usefull for you optimization
        """
        super().__init__()
        head_size = hidden_dim // n_head # will become 8
        self.sa = MultiHeadAttention(n_head, head_size, hidden_dim)
        self.ffwd = FeedForward(hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim) # makes features unit gausiian in initialisation
        self.ln2 = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # skip connections: we fork off, doing some computations, and come back
        x = x + self.ffwd(self.ln2(x)) # difference with paper, before ffwd, not after in current 5 years
        return x
    

class TransformerEncoder(nn.Module):
    def __init__(self, hidden_dim, n_head, n_layer):
        super().__init__()
        self.blocks = nn.Sequential(*[Block(hidden_dim, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(hidden_dim)
        self.lm_head = nn.Linear(hidden_dim, 1) # from token emds to the logits; (B, T, vocab_size)

    def forward(self, x):
        # integers from 0 to T-1 = torch.arange(T)
        x = self.blocks(x)
        #print(x.shape) 128, 512, 64
        x = self.ln_f(x)
        #print(x.shape) # 128, 512, 64
        x = x.mean(1)
        #print(x.shape) # 128, 64
        x = self.lm_head(x)
        #print(x.shape) # 128, 1
        return x

In [11]:
class Projector(nn.Module):
    def __init__(self, hidden_dim, len_history, device=device, layer_norm=True):
        super().__init__()
        # Fully connected layer
        self._len_history = len_history
        self._hour_emb = nn.Embedding(24, hidden_dim)
        self._day_emb = nn.Embedding(32, hidden_dim)
        self._day_week = nn.Embedding(7, hidden_dim)
        self._month = nn.Embedding(13, hidden_dim)
        self._year = nn.Embedding(3, hidden_dim)
        self._position_emb = nn.Embedding(len_history, hidden_dim)
        self._price_linear = nn.Linear(4, hidden_dim)
        self._diff_linear = nn.Linear(4, hidden_dim)
        self._other_linear = nn.Linear(3, hidden_dim)
        self._ln = nn.Identity()
        if layer_norm:
            self._ln = nn.LayerNorm(hidden_dim)
        
        self._init_weights()
        self._device = device

    def _init_weights(self):
        nn.init.trunc_normal_(self._hour_emb.weight)
        nn.init.trunc_normal_(self._day_emb.weight)
        nn.init.trunc_normal_(self._day_week.weight)
        nn.init.trunc_normal_(self._month.weight)
        nn.init.trunc_normal_(self._year.weight)
        nn.init.trunc_normal_(self._position_emb.weight)

        nn.init.trunc_normal_(self._price_linear.weight)
        nn.init.trunc_normal_(self._diff_linear.weight)
        nn.init.trunc_normal_(self._other_linear.weight)
    
    def _forward_time(self, x):
        hours = self._hour_emb(x[:, :, 0])
        days = self._day_emb(x[:, :, 1])
        week_day = self._day_week(x[:, :, 2])
        month = self._month(x[:, :, 3])
        year = self._year(x[:, :, 4])
        out = year + hours + days + week_day + month
        return out
    
    def _forward_position(self, x):
        out = self._position_emb(x)
        return out
    
    def _forward_price(self, x):
        #print(x.shape) 128, 512, 4
        x = self._price_linear(x)
        return x

    def _forward_diff(self, x):
        #print(x.shape) 128, 512, 4
        x = self._diff_linear(x)
        return x

    def _forward_other(self, x):
        #print(x.shape) 128, 512, 3
        x = self._other_linear(x)
        return x
    
    def forward(self, x):
        #x_pos = torch.tensor(range(self._len_history-1, -1, -1)).to(self._device)
        #x_pos = x_pos.repeat(x['price'].shape[0]).view(x['price'].shape[0], self._len_history)
        #out_pos = self._forward_position(x_pos)
        #out_time = self._forward_time(x['time'])
        out_price = self._forward_price(x['price'])
        out_diff = self._forward_diff(x['diff'])
        out_other = self._forward_other(x['other'])
        #out = out_pos + out_time + 
        out = out_price + out_diff + out_other
        return out
    
class Encoder(nn.Module):
    def __init__(self, hidden_dim, len_history, device=device, layer_norm=True):
        super().__init__()
        # Fully connected layer
        self._len_history = len_history
        self._lin1 = nn.Linear(hidden_dim*len_history, hidden_dim)
        self._lin2 = nn.Linear(hidden_dim, 1)
        self._tanh = nn.Tanh()
        self._init_weights()
        self._device = device

    def _init_weights(self):
        nn.init.trunc_normal_(self._lin1.weight)
        nn.init.trunc_normal_(self._lin2.weight)
    
    def forward(self, x):
        out = self._lin1(x.view(x.shape[0], -1))
        out = self._tanh(out)
        out = self._lin2(out)
        return out
    
class TraiderModel(nn.Module):
    def __init__(self, len_history, hidden_dim, device):
        super().__init__()
        self._projector_model = Projector(hidden_dim, len_history, device)
        self._encoder_model = TransformerEncoder(hidden_dim, n_head=4, n_layer=2)
        #self._encoder_model = Encoder(hidden_dim+11, len_history=len_history, device = device)

    def forward(self, x):
        out = self._projector_model(x) #128,512,64
        out = self._encoder_model(out) #128,1
        #out = x['price'][:, -1, 0].view(-1, 1) + out
        return out

In [12]:
class Trainer:
    def __init__(self, 
                 data, 
                 n_epochs, 
                 len_history, 
                 hidden_dim, 
                 device, 
                 batch_size,
                 target_scaller,
                 n_eval=1440,
                 lr=1e-3, 
                 weight_decay=1e-6, 
                 need_to_print_loss=False,
                 max_iterations_per_epoch=None,
                 ):
        
        self._n_epochs = n_epochs
        self._max_iterations = max_iterations_per_epoch
        self._batch_size = batch_size
        self._n_eval = n_eval
        self._len_history = len_history
        self._device = device
        self._model = TraiderModel(len_history, hidden_dim, self._device).to(self._device)
        self._loss = nn.MSELoss(reduction="mean").to(self._device)
        self._optimizer = torch.optim.Adam(self._model.parameters(), lr=lr)
        self._lr_scheduller = LinearLR(self._optimizer, start_factor=0.8, total_iters=self._n_epochs)
        self._target_scaller = target_scaller
        self._need_to_print_loss = need_to_print_loss

        self.data_train = {key: data[key][:-self._n_eval] for key in data.keys()}
        self.data_eval = {key: data[key][-self._n_eval - self._len_history:] for key in data.keys()}
        self.feature_names = [key for key in data.keys() if key != 'y']

    def train(self):
        for epoch in range(1, self._n_epochs+1):
            print(f'epoch {epoch}')
            cur_epoch_losses = self._train_one_epoch()
            print(f'epoch {epoch} done, loss is {round(sum(cur_epoch_losses), 2)}')
            eval_loss = self._eval_batch(batch_size=self._n_eval)
            print(f'epoch {epoch} done, validation loss is {round(eval_loss, 2)}')
            self._lr_scheduller.step()

    def _train_one_epoch(self):
        self._model.train()
        i = 0
        num_iterations = 0
        losses = list()
        while self.data_train['y'].shape[0] - i - len_history > self._batch_size:
            loss_value = self._train_batch(i, self._batch_size)
            losses.append(loss_value)
            i += self._batch_size
            num_iterations += 1
            if self._need_to_print_loss and num_iterations%1000==0:
                print(f'{round(i/self.data_train["y"].shape[0]*100)}% are done', f'loss={loss_value}')
            if self._max_iterations:
                if num_iterations > self._max_iterations:
                    return losses
        #last batch training
        loss_value = self._train_batch(i, self.data_train['y'].shape[0] - i - len_history)
        losses.append(loss_value)
        return losses

    def _train_batch(self, i, batch_size):
        targets = self.data_train['y'][i+self._len_history-1:i+self._len_history+batch_size-1].to(self._device)
        features_batch = {}
        for key in self.feature_names:
            features = self.data_train[key][i:i+self._len_history+batch_size]
            features_batch[key] = torch.zeros((batch_size, self._len_history, features.shape[-1]), dtype=features.dtype)
            for j in range(batch_size):
                features_batch[key][j] = features[j:j+self._len_history]
            features_batch[key] = features_batch[key].to(self._device)
        prices_predicted = self._model(features_batch).squeeze(1)
        self._optimizer.zero_grad()
        loss = self._loss(targets, prices_predicted)
        loss.backward()
        self._optimizer.step()
        return loss.item()

    def _eval_batch(self, batch_size):
        features_batch = {}
        for key in self.feature_names:
            features = self.data_eval[key][:self._len_history+batch_size]
            features_batch[key] = torch.zeros((batch_size, self._len_history, features.shape[-1]), dtype=features.dtype)
            for j in range(batch_size):
                features_batch[key][j] = features[j:j+self._len_history]
            features_batch[key] = features_batch[key].to(self._device)
        
        prices_predicted = self._model(features_batch).squeeze(1).detach().cpu()
        targets_for_eval = self.data_eval['y'][self._len_history-1:self._len_history+batch_size-1]
        return np.abs(prices_predicted - targets_for_eval).mean().item()

In [13]:
trainer = Trainer(
                  data=data,
                  n_epochs=1,
                  len_history=len_history, 
                  hidden_dim=64,
                  device=device,
                  batch_size=32,
                  target_scaller=scaler_price,
                  n_eval=24,
                  lr=3e-4,
                  max_iterations_per_epoch=None,
                  need_to_print_loss=True
                  )
trainer.train()

print(f'old_price_pred: {np.abs(trainer.data_eval["price"][len_history:, 0] - trainer.data_eval["y"][len_history:, 0]).mean().item()}')

epoch 1
epoch 1 done, loss is nan
epoch 1 done, validation loss is nan
old_price_pred: 0.020168164744973183


In [14]:
plt.figure(figsize=(12,8))
plt.plot(dates, preds, color='blue', marker='o', linestyle='dashed', linewidth=2, markersize=1)
plt.plot(dates, targets, color='black', linewidth=1, markersize=1)
plt.plot(dates, prev_targets, color='red', marker='o', linestyle='dashed', linewidth=0.2, markersize=1)
plt.title(f'loss = {np.abs(preds - targets).mean()}')
plt.ylim(targets.min()-300, targets.max()+300)
plt.show()

NameError: name 'dates' is not defined

<Figure size 864x576 with 0 Axes>

In [None]:
epoch 53 done, loss is 100.42
epoch 53 done, validation loss is 0.29
epoch 54
epoch 54 done, loss is 91.77
epoch 54 done, validation loss is 0.09
epoch 55
epoch 55 done, loss is 103.77
epoch 55 done, validation loss is 0.36
epoch 56
epoch 56 done, loss is 82.57
epoch 56 done, validation loss is 0.08
epoch 57
epoch 57 done, loss is 100.27
epoch 57 done, validation loss is 0.09