# ENV

In [1]:
%%capture
# !pip install datasets
!pip install accelerate

# Architecture

In [1]:
from types import SimpleNamespace

# Default hyper-parameters consumed by the architecture classes defined below.
# Every module reads the fields it needs from this namespace in its constructor.
config = SimpleNamespace(
    emb_dim=512,              # model / embedding width
    n_q_heads=8,              # number of query heads
    n_kv_heads=4,             # number of key/value heads (repeated to match the query heads)
    RoPE_theta=500000,        # base of the rotary-embedding frequency schedule
    norm_eps=1e-8,            # epsilon added inside RMS_norm before the reciprocal square root
    ff_dim=128,               # hidden width of each feed-forward expert
    drop_rate=0.1,            # dropout probability used in Decoder_layer
    num_layer=6,              # number of stacked decoder layers
    max_length=100,           # number of positions with precomputed rotary factors
    input_dim=3,              # size of the last dimension of the model input
    num_expert=4,             # experts per mixture-of-experts block
    num_experts_per_tok=2,    # top-k experts selected for each token
    capacity_factor=1.25,     # multiplier applied to the per-expert token budget
)

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

class BaseAttention(nn.Module):
    """Grouped-query multi-head attention with rotary position embeddings.

    Queries use ``n_q_heads`` heads while keys/values use ``n_kv_heads`` heads;
    each key/value head is repeated ``n_q_heads // n_kv_heads`` times so that
    it lines up with the query heads.

    Args:
        config: object exposing ``emb_dim``, ``n_q_heads`` and ``n_kv_heads``.

    Raises:
        ValueError: if ``n_q_heads`` is not a multiple of ``n_kv_heads`` or if
            the per-head width ``emb_dim // n_q_heads`` is not a positive even
            number (rotary embeddings rotate pairs of channels).
    """

    def __init__(self, config):
        super().__init__()
        self.emb_dim = config.emb_dim
        self.n_q_heads = config.n_q_heads
        self.n_kv_heads = config.n_kv_heads
        self.head_dim = config.emb_dim // config.n_q_heads

        # Fail early with a readable message instead of an opaque shape error
        # deep inside the forward pass.
        if self.n_kv_heads <= 0 or self.n_q_heads % self.n_kv_heads != 0:
            raise ValueError(
                f"n_q_heads ({self.n_q_heads}) must be a positive multiple of "
                f"n_kv_heads ({self.n_kv_heads})"
            )
        if self.head_dim < 2 or self.head_dim % 2 != 0:
            raise ValueError(
                f"head_dim ({self.head_dim}) must be a positive even number for rotary embeddings"
            )

        self.q_proj = nn.Linear(self.emb_dim, self.n_q_heads * self.head_dim)
        self.k_proj = nn.Linear(self.emb_dim, self.n_kv_heads * self.head_dim)
        self.v_proj = nn.Linear(self.emb_dim, self.n_kv_heads * self.head_dim)
        self.out_proj = nn.Linear(self.n_q_heads * self.head_dim, self.emb_dim)

    def RoPE(self, q, k, freq_cis):
        """Rotate ``q`` and ``k`` by the complex factors in ``freq_cis``.

        Consecutive pairs along the last dimension are viewed as complex
        numbers and multiplied by ``freq_cis``, which therefore has to
        broadcast against ``(..., seq_len, head_dim // 2)``.

        Returns:
            The rotated ``(q, k)`` with their original shapes and dtypes.
        """
        q_complex = torch.view_as_complex(q.float().reshape(*q.shape[:-1], -1, 2))
        k_complex = torch.view_as_complex(k.float().reshape(*k.shape[:-1], -1, 2))

        # flatten(-2) merges the (pair, real/imag) axes back into head_dim.
        new_q = torch.view_as_real(q_complex * freq_cis).flatten(-2).to(q.dtype)
        new_k = torch.view_as_real(k_complex * freq_cis).flatten(-2).to(k.dtype)

        return new_q, new_k

    def Scaled_dot_product_attention(self, q, k, v, mask=None):
        """Return ``softmax(q @ k^T / sqrt(head_dim)) @ v``.

        Positions where ``mask == 0`` are set to ``-inf`` before the softmax,
        i.e. they receive zero attention weight.
        """
        # A Python float avoids allocating a throw-away CPU tensor on every call.
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, v)

    def MultiHeadAttention(self, x, freq_cis, mask=None):
        """Project ``x`` to heads, apply RoPE, attend and project back.

        Args:
            x: tensor of shape ``(batch, seq_len, emb_dim)``.
            freq_cis: complex rotary factors, see :meth:`RoPE`.
            mask: optional attention mask, see
                :meth:`Scaled_dot_product_attention`.

        Returns:
            Tensor of shape ``(batch, seq_len, emb_dim)``.
        """
        batch_size, seq_len, _ = x.shape
        q = self.q_proj(x).view(batch_size, seq_len, self.n_q_heads, self.head_dim).permute(0, 2, 1, 3)
        k = self.k_proj(x).view(batch_size, seq_len, self.n_kv_heads, self.head_dim).permute(0, 2, 1, 3)
        v = self.v_proj(x).view(batch_size, seq_len, self.n_kv_heads, self.head_dim).permute(0, 2, 1, 3)

        # Rotate before repeating: the rotation is element-wise, so the result
        # is the same but the work is done on n_kv_heads instead of n_q_heads.
        q, k = self.RoPE(q, k, freq_cis)

        group_size = self.n_q_heads // self.n_kv_heads
        k = torch.repeat_interleave(k, dim=1, repeats=group_size)
        v = torch.repeat_interleave(v, dim=1, repeats=group_size)

        o = self.Scaled_dot_product_attention(q, k, v, mask)
        o = o.permute(0, 2, 1, 3).reshape(batch_size, seq_len, self.n_q_heads * self.head_dim)
        return self.out_proj(o)

# x=torch.randn(1,17,config.emb_dim)
# mask=torch.tril(torch.ones(x.shape[-2],x.shape[-2]).float())
# freq_cis=torch.randn(1,17,config.emb_dim//config.n_q_heads//2)
# tmp=BaseAttention(config)
# tmp.MultiHeadAttention(x,freq_cis,mask)

In [3]:
class RMS_norm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learned gain.

    Args:
        config: object exposing ``norm_eps`` (added to the mean square before
            the reciprocal square root) and ``emb_dim`` (size of the gain
            vector, initialised to ones).
    """

    def __init__(self, config):
        super().__init__()
        self.norm_eps = config.norm_eps
        self.norm_weight = nn.Parameter(torch.ones(config.emb_dim))

    def _norm(self, x):
        """Scale ``x`` by the reciprocal RMS of its last dimension."""
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_square + self.norm_eps)
        return x * inv_rms

    def forward(self, x):
        """Return the normalised input multiplied by the learned gain."""
        normalised = self._norm(x)
        return normalised * self.norm_weight

# x=torch.randn(1,17,config.emb_dim)
# tmp=RMS_norm(config)
# tmp(x).shape

In [4]:
class SelfAttention(BaseAttention):
    """Pre-norm self-attention: RMS-normalise the input, then run the inherited attention."""

    def __init__(self, config):
        super().__init__(config)
        self.attn_norm = RMS_norm(config)

    def forward(self, x, freq, mask=None):
        """Normalise ``x`` and attend over it.

        ``freq`` and ``mask`` are forwarded unchanged to ``MultiHeadAttention``.
        """
        normed = self.attn_norm(x)
        attended = self.MultiHeadAttention(normed, freq, mask)
        return attended

# x=torch.randn(1,17,config.emb_dim)
# mask=torch.tril(torch.ones(x.shape[-2],x.shape[-2]).float())
# freq_cis=torch.randn(1,17,config.emb_dim//config.n_q_heads//2)
# tmp=SelfAttention(config)
# tmp(x,freq_cis,mask).shape

In [5]:
class ffn(nn.Module):
    """Gated feed-forward block preceded by its own RMS normalisation.

    Computes ``w3(w1(h) * SiLU(w2(h)))`` where ``h`` is the normalised input.

    Args:
        config: object exposing ``emb_dim`` and ``ff_dim`` (plus whatever
            ``RMS_norm`` reads from it).
    """

    def __init__(self, config):
        super().__init__()
        emb_dim, hidden_dim = config.emb_dim, config.ff_dim
        self.w1 = nn.Linear(emb_dim, hidden_dim)
        self.w2 = nn.Linear(emb_dim, hidden_dim)
        self.w3 = nn.Linear(hidden_dim, emb_dim)
        self.SiLU = nn.SiLU(inplace=True)

        self.ffn_norm = RMS_norm(config)

    def forward(self, x):
        """Apply normalisation followed by the gated projection."""
        normed = self.ffn_norm(x)
        # The in-place SiLU overwrites the freshly computed w2 output only.
        gate = self.SiLU(self.w2(normed))
        value = self.w1(normed)
        return self.w3(value * gate)

# x=torch.randn(1,17,config.emb_dim)
# tmp=ffn(config)
# tmp(x).shape

In [6]:
class Router(nn.Module):
    """Noisy top-k gate that selects experts for every token.

    Args:
        config: object exposing ``emb_dim``, ``num_expert`` and
            ``num_experts_per_tok`` (the ``k`` of the top-k selection).
    """

    def __init__(self, config):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.route_linear = nn.Linear(config.emb_dim, config.num_expert)
        self.noise_linear = nn.Linear(config.emb_dim, config.num_expert)

    def forward(self, x):
        """Score the experts for each token and keep the ``top_k`` best.

        While the module is in training mode, Gaussian noise scaled by
        ``softplus(noise_linear(x))`` is added to the routing logits. In eval
        mode the noise is skipped so that routing is deterministic.

        Returns:
            A pair ``(sparse_logits, indices)``: ``sparse_logits`` has the
            shape of the routing logits with the selected entries kept and
            every other entry set to ``-inf``; ``indices`` holds the selected
            expert ids along the last dimension.
        """
        logits = self.route_linear(x)

        if self.training:
            noise_scale = F.softplus(self.noise_linear(x))
            logits = logits + torch.randn_like(noise_scale) * noise_scale

        top_logits, indices = torch.topk(logits, self.top_k)

        # -inf everywhere except the selected experts, so a later softmax
        # assigns zero weight to the experts that were not chosen.
        sparse_logits = torch.full_like(logits, float('-inf')).scatter(-1, indices, top_logits)

        return sparse_logits, indices

# x=torch.randn(1,17,config.emb_dim)
# tmp=Router(config)
# a,b=tmp(x)
# print(a.shape)
# print(b.shape)

In [7]:
class Mixtral_of_Expert(nn.Module):
    """Sparse mixture-of-experts layer with a per-expert token capacity.

    Every token is routed to ``num_experts_per_tok`` experts by ``self.gate``;
    each expert processes at most ``expert_capacity`` tokens per call and the
    expert outputs are summed, weighted by the softmax of the sparse routing
    logits. Tokens beyond an expert's capacity contribute nothing for that
    expert.

    Args:
        config: object exposing ``num_expert``, ``num_experts_per_tok`` and
            ``capacity_factor`` (plus whatever ``ffn`` and ``Router`` read).
    """

    def __init__(self, config):
        super().__init__()
        self.num_experts_per_tok = config.num_experts_per_tok
        self.num_expert = config.num_expert
        self.capacity_factor = config.capacity_factor
        self.experts = nn.ModuleList([ffn(config) for _ in range(config.num_expert)])
        self.gate = Router(config)

    def forward(self, x):
        """Route the tokens of ``x`` (``batch, seq_len, emb_dim``) through the experts.

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch_size, seq_len, emb_dim = x.shape
        sparse_logits, indices = self.gate(x)
        weights = sparse_logits.softmax(dim=-1).to(x.dtype)

        # reshape (not view) so that non-contiguous inputs are accepted too.
        flat_x = x.reshape(-1, emb_dim)
        flat_weights = weights.reshape(-1, weights.shape[-1])
        flat_indices = indices.reshape(-1, indices.shape[-1])

        combined = torch.zeros_like(flat_x)

        routed_slots = batch_size * seq_len * self.num_experts_per_tok
        # At least one slot per expert: plain truncation gives 0 for very small
        # inputs (e.g. a single token), which would drop every token.
        expert_capacity = max(1, int((routed_slots / self.num_expert) * self.capacity_factor))

        for expert_id, expert in enumerate(self.experts):
            chosen = (flat_indices == expert_id).any(dim=-1)
            # Tokens are kept in flattened order; the overflow is dropped.
            token_ids = chosen.nonzero().squeeze(-1)[:expert_capacity]
            if token_ids.numel() == 0:
                continue

            expert_out = expert(flat_x[token_ids])
            gate_weight = flat_weights[token_ids, expert_id, None]
            weighted = (expert_out * gate_weight).to(combined.dtype)
            combined.index_add_(0, token_ids, weighted)

        return combined.reshape(x.shape)

# x=torch.randn(1,17,config.emb_dim)
# tmp=Mixtral_of_Expert(config)
# tmp(x).shape

In [8]:
class Decoder_layer(nn.Module):
    """One decoder block: self-attention then mixture-of-experts, each with a residual.

    Both sub-layers normalise their own input, so the block computes
    ``x = x + dropout(self_attn(x))`` followed by ``x = x + dropout(MoE(x))``.

    Args:
        config: object exposing ``drop_rate`` (plus whatever the sub-layers read).
    """

    def __init__(self, config):
        super().__init__()
        self.self_attn = SelfAttention(config)
        self.MoE = Mixtral_of_Expert(config)
        self.dropout = nn.Dropout(config.drop_rate)

    def forward(self, x, freq, mask=None):
        """Apply the block to ``x``; ``freq`` and ``mask`` go to the attention sub-layer."""
        # The skip connection must add the *input* of each sub-layer.
        # Overwriting x with the sub-layer output first would lose it.
        x = x + self.dropout(self.self_attn(x, freq, mask))
        x = x + self.dropout(self.MoE(x))
        return x

# x=torch.rand(1,17,config.emb_dim)
# mask=torch.tril(torch.ones(x.shape[-2],x.shape[-2]).float())
# freq_cis=torch.rand(1,17,config.emb_dim//config.n_q_heads//2)
# tmp=Decoder_layer(config)
# tmp(x,freq_cis,mask).shape

In [9]:
class Decoder(nn.Module):
    """Stack of ``config.num_layer`` decoder layers applied one after another."""

    def __init__(self, config):
        super().__init__()
        stack = [Decoder_layer(config) for _ in range(config.num_layer)]
        self.layers = nn.ModuleList(stack)

    def forward(self, x, freq, mask=None):
        """Feed ``x`` through every layer; ``freq`` and ``mask`` are shared by all layers."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, freq, mask)
        return hidden

# x=torch.rand(1,17,config.emb_dim)
# mask=torch.tril(torch.ones(x.shape[-2],x.shape[-2]).float())
# freq_cis=torch.rand(1,17,config.emb_dim//config.n_q_heads//2)
# tmp=Decoder(config)
# tmp(x,freq_cis,mask).shape

In [10]:
class Transformer(nn.Module):
    """Causal decoder-only model producing one scalar per time step.

    Pipeline: ``LayerNorm(input_dim) -> Linear(input_dim, emb_dim) -> Decoder
    -> LayerNorm(emb_dim) -> Linear(emb_dim, 1)``.

    Args:
        config: object exposing ``input_dim``, ``emb_dim``, ``n_q_heads``,
            ``max_length`` and ``RoPE_theta`` (plus whatever ``Decoder`` reads).
    """

    def __init__(self, config):
        super().__init__()
        self.norm_1 = nn.LayerNorm(config.input_dim)
        self.norm_2 = nn.LayerNorm(config.emb_dim)
        self.w0 = nn.Linear(config.input_dim, config.emb_dim)
        head_dim = config.emb_dim // config.n_q_heads
        # Kept as a plain tensor attribute (not a registered buffer): it is not
        # part of the state dict and Module.to()/half() leave it untouched, so
        # forward() moves it to the input's device when necessary.
        self.freq_cis = self._build_freq_cis(head_dim, config.max_length, config.RoPE_theta)
        self.decoder = Decoder(config)
        self.final = nn.Linear(config.emb_dim, 1)

    @staticmethod
    def _build_freq_cis(head_dim, max_length, theta):
        """Precompute the rotary factors ``exp(i * position * theta ** (-j / n_pairs))``.

        Returns:
            Complex tensor of shape ``(max_length, head_dim // 2)`` with unit modulus.
        """
        n_pairs = head_dim // 2
        exponents = torch.arange(n_pairs) / n_pairs
        inv_freq = 1.0 / (theta ** exponents)
        positions = torch.arange(max_length, dtype=inv_freq.dtype)
        angles = torch.outer(positions, inv_freq)
        return torch.polar(torch.ones_like(angles), angles)

    def forward(self, x):
        """Run the model on ``x`` of shape ``(..., seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(..., seq_len, 1)``.

        Raises:
            ValueError: if ``seq_len`` exceeds the number of precomputed
                rotary positions (``config.max_length``).
        """
        seq_len = x.shape[-2]
        if seq_len > self.freq_cis.shape[0]:
            raise ValueError(
                f"sequence length {seq_len} exceeds the {self.freq_cis.shape[0]} "
                "positions precomputed for the rotary embedding (config.max_length)"
            )

        x = self.w0(self.norm_1(x))

        # Lower-triangular mask built directly on the input device.
        causal_mask = torch.tril(
            torch.ones(seq_len, seq_len, dtype=torch.float32, device=x.device)
        )

        # Cache the rotary table on the input device; no copy when it is already there.
        if self.freq_cis.device != x.device:
            self.freq_cis = self.freq_cis.to(x.device)

        x = self.decoder(x, self.freq_cis[:seq_len], causal_mask)
        x = self.norm_2(x)

        return self.final(x)

# x=torch.rand(1,17,3)
# tmp=Transformer(config)
# tmp(x).shape

In [11]:
# def init_parameter(layer):
#     if isinstance(layer,nn.Linear):
#         nn.init.xavier_uniform_(layer.weight)

# model=Transformer(config)
# model.apply(init_parameter)
# # model=model.to(torch.float16)
# model.half()

# Preprocess data

In [12]:
import numpy as np
import pandas as pd

def process_data(df, code):
    """Turn one raw ticker frame into a float32 feature/label matrix.

    The input frame is left untouched. It must contain the columns
    ``'Date/Time'`` (formatted as ``%m/%d/%Y %H:%M``), ``'Open Interest'``,
    ``'Close'``, ``'Low'`` and ``'Volume'``; every column other than
    ``'Date/Time'`` and ``'Open Interest'`` is kept, so the remaining ones
    must be convertible to float32 (``'Ticker'`` is overwritten with ``code``).

    Derived columns, appended in this order unless already present:

    * ``Ticker``    - the numeric ``code``.
    * ``Time_diff`` - minutes until the next row.
    * ``Income``    - ``(Close + Low) * Volume / 2``.
      NOTE(review): ``High`` may have been intended instead of ``Close``; confirm.
    * ``Diff``      - ``Close`` of this row minus ``Close`` of the next row.

    The last row has no successor and is dropped.

    Args:
        df: raw price frame for one ticker.
        code: numeric identifier stored in the ``Ticker`` column.

    Returns:
        ``numpy.ndarray`` of dtype float32 with one row per input row except the last.
    """
    # Parse into a separate Series instead of overwriting df['Date/Time'], so
    # the caller's frame is not modified.
    timestamps = pd.to_datetime(df['Date/Time'], format='%m/%d/%Y %H:%M')
    minutes_to_next = (timestamps.shift(-1) - timestamps).dt.total_seconds() / 60

    features = (
        df.drop(columns=['Open Interest', 'Date/Time'])
        .assign(
            Ticker=code,
            Time_diff=minutes_to_next,
            Income=0.0,
            Diff=0.0,
        )
        .astype(np.float32)
    )

    features['Income'] = (features['Close'] + features['Low']) * features['Volume'] / 2
    features['Diff'] = features['Close'].diff(-1).fillna(0)

    # The final row has no "next" row, hence no valid Time_diff / Diff.
    return features.iloc[:-1].to_numpy()

# Read the raw CSV of each ticker from the current working directory.
PNJ=pd.read_csv("PNJ.csv")
FPT=pd.read_csv("FPT.csv")
MSN=pd.read_csv("MSN.csv")
VIC=pd.read_csv("VIC.csv")

# Replace each raw frame with the array returned by process_data; the second
# argument is the numeric code written to the Ticker column.
# NOTE: the names are rebound, so this cell has to be re-run from the top
# (including read_csv) to process the data again.
PNJ=process_data(PNJ,1)
FPT=process_data(FPT,2)
MSN=process_data(MSN,3)
VIC=process_data(VIC,4)

# Total number of processed rows across the four tickers.
len(FPT)+len(PNJ)+len(MSN)+len(VIC)

459331

Mỗi điểm dữ liệu là một tập liên tiếp các dòng để tận dụng thông tin về thời gian

In [13]:
import numpy as np
from tqdm.auto import tqdm
from torch.utils.data import DataLoader,TensorDataset

def create_data_loader(length, step, batch_size, train_size):
    """Cut the four ticker arrays into windows and wrap them in data loaders.

    Reads the module-level arrays ``PNJ``, ``FPT``, ``MSN`` and ``VIC``. For
    each of them a window of ``length`` consecutive rows is taken every
    ``step`` rows; all columns but the last form the input and the last column
    is the per-row target. The windows of all tickers are shuffled with
    ``np.random.permutation`` and split into a training and a validation part.

    NOTE(review): windows overlap whenever ``step < length``, so a random
    split places near-identical windows in both parts; confirm this is
    acceptable for the validation metric.

    Args:
        length: number of rows per window.
        step: distance in rows between the starts of consecutive windows.
        batch_size: batch size of both loaders.
        train_size: fraction (0..1) of the windows assigned to training.

    Returns:
        ``(train_loader, val_loader)``, both shuffling and dropping the last
        incomplete batch.

    Raises:
        ValueError: if no ticker array has at least ``length`` rows.
    """
    windows, targets = [], []
    for series in (PNJ, FPT, MSN, VIC):
        # "+ 1": the last full window starts at len(series) - length and must
        # be included (a series of exactly `length` rows yields one window).
        for start in tqdm(range(0, len(series) - length + 1, step)):
            chunk = series[start:start + length]
            windows.append(chunk[:, :-1])
            targets.append(chunk[:, -1])

    if not windows:
        raise ValueError(f"no ticker series has at least {length} rows")

    windows = np.array(windows)
    targets = np.array(targets)

    order = np.random.permutation(len(windows))
    n_train = int(len(order) * train_size)
    train_idx, val_idx = order[:n_train], order[n_train:]

    def make_loader(idx):
        # TensorDataset pairs each input window with its target sequence.
        dataset = TensorDataset(
            torch.from_numpy(windows[idx]),
            torch.from_numpy(targets[idx]),
        )
        return DataLoader(dataset, batch_size=batch_size, drop_last=True, shuffle=True)

    return make_loader(train_idx), make_loader(val_idx)

# train_loader,val_loader=create_data_loader(512,8,128,0.8)
# print(len(train_loader)*128)
# for i in train_loader:
#     print(type(i))
#     print(i[0].dtype)
#     print(i[0].shape)
#     print(i[1].dtype)
#     print(i[1].shape)
#     break

# Single device training

In [60]:
def init_parameter(layer):
    """Re-initialise the weight of ``nn.Linear`` modules with Kaiming-uniform.

    Intended for ``model.apply``: modules of any other type, and the bias of
    linear layers, are left as they are.
    """
    if not isinstance(layer, nn.Linear):
        return
    nn.init.kaiming_uniform_(layer.weight)

# Hyper-parameters for the actual training run. This rebinds the module-level
# name `config`, replacing the namespace defined in the Architecture section.
config=SimpleNamespace(
    emb_dim=96,
    n_q_heads=12,
    n_kv_heads=4,
    RoPE_theta=500000,
    norm_eps=1e-8,
    ff_dim=144,
    drop_rate=0.1,
    num_layer=2,
    max_length=1024,  # must be >= the window length passed to create_data_loader below
    input_dim=8,  # presumably the number of feature columns produced by process_data - verify against the CSV schema
    num_expert=2,
    num_experts_per_tok=1,
    capacity_factor=1.0
)

# Build the model and re-initialise every nn.Linear weight via init_parameter.
model=Transformer(config)
model.apply(init_parameter)
# model=model.to(torch.float16)
# model.half() # vanishing gradient in this task
batch_size=128
# Windows of 512 rows taken every 32 rows; 90% of them are used for training.
train_loader,val_loader=create_data_loader(512,32,batch_size,0.9)
# 128/32

  0%|          | 0/3900 [00:00<?, ?it/s]

  0%|          | 0/3028 [00:00<?, ?it/s]

  0%|          | 0/4214 [00:00<?, ?it/s]

  0%|          | 0/3149 [00:00<?, ?it/s]

In [61]:
import gc
from tqdm.auto import tqdm
from accelerate import Accelerator

# Single-device training loop. Uses the module-level `model`, `train_loader`
# and `val_loader` created in the previous cell.
accelerator = Accelerator()

num_epoch = 5
gc.collect()
loss_fn = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
# optimizer=torch.optim.Adam(model.parameters(),lr=1e-5)
# The scheduler is stepped once per batch, so the learning rate is multiplied
# by `gamma` every `step_size` batches.
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.3)

# Let Accelerate place the model, optimizer, scheduler and loaders.
model, optimizer, lr_scheduler, train_loader, val_loader = accelerator.prepare(
    model, optimizer, lr_scheduler, train_loader, val_loader
)

for epoch in range(num_epoch):
    model.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch: {epoch+1}/{num_epoch}")
    for batch, label in progress_bar:
        output = model(batch)

        # The model emits (batch, seq, 1); drop the last axis to match the labels.
        loss = loss_fn(output.squeeze(-1), label)
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.set_postfix(train_loss=loss.item())

    model.eval()
    val_loss = 0.0
    progress_bar = tqdm(val_loader, desc=f"Epoch: {epoch+1}/{num_epoch}")
    with torch.no_grad():
        # Count batches explicitly: tqdm only refreshes its `.n` attribute when
        # it redraws, so it cannot be used as the number of batches seen.
        for n_val_batches, (batch, label) in enumerate(progress_bar, start=1):
            output = model(batch)
            loss = loss_fn(output.squeeze(-1), label)
            val_loss = val_loss + loss.item()
            # Every batch is full (drop_last=True), so the mean of the batch
            # losses equals the sample-weighted mean.
            progress_bar.set_postfix(val_loss=val_loss / n_val_batches)

Epoch: 1/5:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 1/5:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 2/5:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 2/5:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 3/5:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 3/5:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 4/5:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 4/5:   0%|          | 0/11 [00:00<?, ?it/s]

Epoch: 5/5:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 5/5:   0%|          | 0/11 [00:00<?, ?it/s]

# Multiple GPU training

In [None]:
import gc
from tqdm.auto import tqdm
from accelerate import Accelerator

def ddp_training(seed=0):
    """Train the model inside one worker process of a multi-process launch.

    Meant to be passed to ``notebook_launcher``. Each process builds the model
    and the data loaders itself and hands them to ``Accelerator.prepare``.

    Args:
        seed: seed applied to NumPy and PyTorch before the data split and the
            model initialisation, so that every process derives the same
            train/validation split regardless of how it was started.

    NOTE(review): the reported validation loss is the running mean over the
    batches seen by the local process only; it is not gathered across processes.
    """
    accelerator = Accelerator()

    # create_data_loader splits with np.random.permutation; without a common
    # seed the split would depend on each process's inherited RNG state.
    np.random.seed(seed)
    torch.manual_seed(seed)

    num_epoch = 10

    model = Transformer(config)
    model.apply(init_parameter)
    batch_size = 256
    train_loader, val_loader = create_data_loader(512, 4, batch_size, 0.9)

    gc.collect()
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2, weight_decay=0.01)
    # optimizer=torch.optim.Adam(model.parameters(),lr=1e-5)
    # Stepped once per batch: lr is multiplied by gamma every `step_size` batches.
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.7)

    model, optimizer, lr_scheduler, train_loader, val_loader = accelerator.prepare(
        model, optimizer, lr_scheduler, train_loader, val_loader
    )

    # Only one process per machine draws progress bars.
    hide_progress = not accelerator.is_local_main_process

    for epoch in range(num_epoch):
        model.train()
        progress_bar = tqdm(
            train_loader,
            desc=f"Epoch: {epoch+1}/{num_epoch}",
            disable=hide_progress,
        )
        for batch, label in progress_bar:
            output = model(batch)

            # The model emits (batch, seq, 1); drop the last axis to match the labels.
            loss = loss_fn(output.squeeze(-1), label)
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.set_postfix(train_loss=loss.item())

        model.eval()
        val_loss = 0.0
        progress_bar = tqdm(
            val_loader,
            desc=f"Epoch: {epoch+1}/{num_epoch}",
            disable=hide_progress,
        )
        with torch.no_grad():
            # Explicit batch counter: tqdm's `.n` is refreshed only on redraw
            # (and never when the bar is disabled), so it is not a reliable count.
            for n_val_batches, (batch, label) in enumerate(progress_bar, start=1):
                output = model(batch)
                loss = loss_fn(output.squeeze(-1), label)
                val_loss = val_loss + loss.item()
                progress_bar.set_postfix(val_loss=val_loss / n_val_batches)

In [None]:
from accelerate import notebook_launcher

# Run ddp_training through Accelerate's notebook launcher with two processes
# and no positional arguments.
notebook_launcher(ddp_training,args=(),num_processes=2)

In [None]:
# import os

# os._exit(00)