In [6]:
import yfinance as yf
import numpy
import torch
import pdb

TICKERS = [
    "SMCI",
]

class DataFrame(object):
    def __init__(self, device):
        self.device = device
        self.data = None
        for ticker in TICKERS:
            hist = yf.Ticker(ticker).history(period="60d", interval="2m", actions=False)
            if self.data is None:
                    self.data = hist
            else:
                self.data = numpy.concatenate((self.data, hist), axis=1)
        self.data = self.data.to_numpy().astype(numpy.float32)
        self.data = torch.from_numpy(self.data)

    def getBatch(self, batch_size : int, block_size: int, split='training'):
        n = int(0.9 * len(self.data))
        data = self.data[:n]
        eval = self.data[n:]
        if split == "training":
            training_data = data
        else:
            training_data = eval
        ix = torch.randint(len(training_data) - block_size - 1, (batch_size,))
        x = torch.stack([ training_data[i:i+block_size] for i in ix])
        y = torch.stack([ training_data[i+1:i+block_size+1] for i in ix])
        x, y = x.to(self.device), y.to(self.device)
        return x, y

class Head(torch.nn.Module):
    def __init__(self, head_size, n_embed):
        super().__init__()
        self.q = torch.nn.Linear(n_embed, head_size, bias=False)
        self.k = torch.nn.Linear(n_embed, head_size, bias=False)
        self.v = torch.nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = torch.nn.Dropout(0.2)

    def forward(self, index):
        B,T,C = index.shape
        q = self.q(index) # B, T, head_size
        k = self.k(index) # B, T, head_size
        w = q @ k.transpose(-1, -2) # [B, T, head_size] @ [B, head_size, T] = [B, T, T]
        w = w.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # B, T, T
        w = torch.nn.functional.softmax(w, dim=-1) # (B, T, T)
        w = self.dropout(w)
        v = self.v(index) # B, T, head_size
        out = w @ v
        return out

class MultiHeadAttention(torch.nn.Module):
    def __init__(self, n_head, n_embed, head_size):
        super().__init__()
        self.heads = torch.nn.ModuleList([ Head(head_size, n_embed) for _ in range(n_head) ])
        self.proj = torch.nn.Linear(head_size * n_head, n_embed)
        self.dropout = torch.nn.Dropout(0.14)

    def forward(self, index):
        x = torch.cat([h(index) for h in self.heads], dim=-1)
        x = self.proj(x)
        x = self.dropout(x)
        return x

class FeedFoward(torch.nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        hiddenshape = 8192
        self.net = torch.nn.Sequential(
            torch.nn.Linear(n_embed, hiddenshape),
            torch.nn.ReLU(),
            torch.nn.Linear(hiddenshape, n_embed),
            torch.nn.Dropout(0.1),
        )

    def forward(self, x):
        return self.net(x)

class Block(torch.nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, n_embed, head_size)
        self.ffwd = FeedFoward(n_embed)
        self.ln1 = torch.nn.LayerNorm(n_embed, eps=1e-6)
        self.ln2 = torch.nn.LayerNorm(n_embed, eps=1e-6)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class LLM(torch.nn.Module):
    # data = DataFrame
    # B, T, C = data.BTC()
    # LLM(T, C, n_head, n_layer)
    def __init__(self, block_size, n_embed, n_head, n_layer, device="cpu"):
        super().__init__()
        self.device = device
        self.position_embedding_table = torch.nn.Embedding(block_size, n_embed)
        self.blocks = torch.nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = torch.nn.LayerNorm(n_embed, eps=1e-6) # final layer norm
        self.final_linear = torch.nn.Linear(n_embed, n_embed)
        self.loss = torch.nn.MSELoss()

    @torch.no_grad() 
    def estimate_loss(self, get_batch, batch_size, block_size, eval_iters=200):
        out = {}
        self.eval()
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(batch_size, block_size, split)
                logits, loss = self(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
        self.train()
        return out

    def forward(self, index, targets):
        # B, T
        B, T, C = index.shape
        pos_emb = self.position_embedding_table(torch.arange(T, device=self.device)) # (T,C)
        x = index + pos_emb # (B,T,C)
        x = self.blocks(index)
        x = self.ln_f(x)
        logits = self.final_linear(x)
        if targets is None:
            return logits, None
        B,T,C = logits.size()
        loss = self.loss(logits, targets)
        return logits, loss

    def generate(self, index, max_gen_token=500):
        for i in range(max_gen_token):
            logits, loss = self.forward(index, None)
            logits = logits[:, -1, :]
            prob = torch.nn.functional.softmax(logits, dim=-1)
            index_next = torch.multinomial(prob, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index

    def train_and_update(self, get_batch, batch_size, block_size, lr, epoch, eval_interval=1e2):
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        for i in range(int(epoch)):
            if i % eval_interval == 0:
                losses = self.estimate_loss(get_batch, batch_size, block_size)
                print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
            x, y = get_batch(batch_size, block_size)
            logits, loss = self.forward(x, y)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

device = 'cuda'
data = DataFrame(device)
batch_size = 2
block_size = 2048
x, y = data.getBatch(batch_size, block_size)
B, T, C = x.shape
n_embed = C
n_head = 1
n_layer = 128
llm = LLM(block_size, n_embed, n_head, n_layer, device=device)
llm.to(device)
# llm.forward(x, y)
llm.train_and_update(data.getBatch, block_size, batch_size, 1e-4, 1e4)

step 0: train loss 600640256.0000, val loss 604272128.0000


OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 0 has a total capacity of 10.00 GiB of which 0 bytes is free. Of the allocated memory 16.48 GiB is allocated by PyTorch, and 13.65 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)