In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import matplotlib.pyplot as plt

In [2]:
with open(os.getcwd()+'\\data\\input.txt', 'r', encoding='utf8') as f:
    text = f.read()

In [3]:
torch.cuda.is_available()

True

In [4]:
chars = sorted(list(set(text)))

vocab_size = len(chars)

print(''.join(chars))
print(vocab_size)

#strategy to tokenize text

#character level mapping

s_to_i = {ch:i for i,ch in enumerate(chars)}
i_to_s = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [s_to_i[c] for c in s]
decode = lambda l: ''.join([i_to_s[c] for c in l])

print(encode("hii there"))
print(decode(encode("hii there")))

#take all of the text and encode them into pytorch tensors

data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65
[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there
torch.Size([1115394]) torch.int64


In [5]:
#splitting into a train and validation split

n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [6]:
#input the text sequences into the transformer

#we only work with chunks of the dataset. when we train the model, we sample chunks of the dataset and train on just chunks at a time. chunks have a maximum length: called block_size

block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [7]:
x = train_data[:block_size]

#shift to the right by 1 to get the target
y = train_data[1:block_size+1]

In [8]:
for t in range(block_size):
    context = x[:t+1]
    target = y[t]

    print(f"when the input is {context}, the target will be {target}")

#we want the model to be robust when seeing contexts of multiple lengths, we set it as 8 so that the model has the capability of looking back at least 'block_size' characters of contexts of. 
# When we need to predict the 9th or 10th character, we start to truncate the context to continue onwards

when the input is tensor([18]), the target will be 47
when the input is tensor([18, 47]), the target will be 56
when the input is tensor([18, 47, 56]), the target will be 57
when the input is tensor([18, 47, 56, 57]), the target will be 58
when the input is tensor([18, 47, 56, 57, 58]), the target will be 1
when the input is tensor([18, 47, 56, 57, 58,  1]), the target will be 15
when the input is tensor([18, 47, 56, 57, 58,  1, 15]), the target will be 47
when the input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target will be 58


In [40]:
#GLOBALS

batch_size = 64 #This is the value of B
block_size = 256 #This is the value of T
n_embd = 512 #This is the value of C
n_head = 8
dropout = 0.25
n_layers = 8 #number of decoder blocks we will initialize
max_new_tokens = 1000
device = 'cuda' if torch.cuda.is_available() else 'cpu'
learning_rate = 3e-4

eval_interval = 500
eval_iters = 200
max_iters = 5000

In [41]:
#getting the batches
torch.manual_seed(1337)
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y, = x.to(device), y.to(device)
    return x, y

xb, yb = get_batch('train')

# print('inputs:')
# print(xb.shape)
# print(xb)
# print('targets:')
# print(yb.shape)
# print(yb)

#xb, yb, of shape (B, T) where B = batch_size, and T = block_size (time sequence)

transformer time!

head -> multi-head self attention -> feedforward -> block -> Decoder

special layers:
dropout
layernorm
linear
embedding
pos_embedding

In [42]:
#Pytorch's positional encoding https://pytorch.org/tutorials/beginner/transformer_tutorial.html
import math

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
            x: (T, B, C)
            We have to change our shape dimensions in to (T, B, C) and then change it back to (B, T, C) when done
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [43]:
class Head(nn.Module):
    """
    One Head of self-attention 
    """
    def __init__(self, head_size):
        super().__init__()

        self.head_size = head_size
        #initialize the key, query, and value matrices
        self.Wk = nn.Linear(n_embd, head_size, bias=False)
        self.Wq = nn.Linear(n_embd, head_size, bias=False)
        self.Wv = nn.Linear(n_embd, head_size, bias=False)

        #since this is a decoder, we need to initialize the mask as well

        #we register this as a buffer, which still exists as a 'matrix' to use, but we don't compute gradients on this or use in the backward pass
        #model parameters are objects that we use during the forward pass and we update using gradient descent
        #model buffers are objects that we use during computation but do not update

        #both parameters and buffers are saved to the right device when calling .to_device
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        #assume x is of shape (B, T, C)
        B, T, C = x.shape
        K = self.Wk(x) #(B, T, head_size)
        Q = self.Wq(x) #(B, T, head_size)
        V = self.Wv(x) #(B, T, head_size)

        #K.T needs to be of shape (B, C, T), so we swap the -2 and -1 positions
        scores = Q @ K.transpose(-2, -1) * 1/(self.head_size)**2 #(B, T, T)
        masked_scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #(B, T, T)
        attention_scores = F.softmax(masked_scores, dim = -1) #applying softmax along the rows (B, T, T)
        attention_scores = self.dropout(attention_scores) #(B, T, T)
        out = attention_scores @ V #(B, T, head_size)
        
        return out #(B, T, head_size)


In [44]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)]) #(B, T, n_heads*head_size)
        self.proj = nn.Linear(n_heads * head_size, n_embd) #paper specifies a final linear layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        #assume input x is of size (B, T, C)

        #Each Head returns a output of size (B, T, head_size), we concatenate along the final dimension so that our variable 'out' is now (B, T, n_heads*head_size)
        out = torch.cat([h(x) for h in self.heads], dim = -1) #(B, T, n_heads*head_size)
        out = self.proj(out) #(B, T, C)
        out = self.dropout(out) #(B, T, C)

        return out

In [45]:
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        #input and output will be of size (B, T, C)
        self.ff1 = nn.Linear(n_embd, 4*n_embd)
        self.ff2 = nn.Linear(4*n_embd, n_embd)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        #assume x is of shape (B, T, C)
        x = self.ff1(x)
        x = self.relu(x)
        x = self.ff2(x)
        x = self.dropout(x)

        return x

In [46]:
class Block(nn.Module):
    #implementaion of one transformer block
    def __init__(self, n_head):
        super().__init__()
        self.head_size = n_embd // n_head

        self.sa = MultiHeadAttention(n_head, self.head_size)
        self.ffw = FeedForward()
        self.layernorm1 = nn.LayerNorm(n_embd)
        self.layernorm2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        #assume input of x is size (B, T, C) where x is the sum of the embedded input + positional_encoding
        x = self.layernorm1(x) #(B, T, C)
        x = x + self.sa(x) #(B, T, C)
        x = self.layernorm2(x) #(B, T, C)
        x = x + self.ffw(x) #(B, T, C)

        return x


In [47]:
class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.tok_embedding_matrix = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = PositionalEncoding(n_embd)

        # need '*' before list comprehension otherwise we get TypeError: list is not a Module subclass
        self.blocks = nn.Sequential(*[Block(n_head) for _ in range(n_layers)])

        self.final_layer_norm = nn.LayerNorm(n_embd)
        self.final_linear = nn.Linear(n_embd, vocab_size)
        
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            
    def forward(self, x, y = None):
        B, T = x.shape
        C = n_embd
        #assume our inputs x are of size (B, T)
        #assume our targets y are of size (B)

        token_embed = self.tok_embedding_matrix(x) #(B, T, C)
        pos_embed = self.pos_embedding(token_embed.view(T,B,C)).view(B, T, C) #(B, T, C)

        input = token_embed + pos_embed #(B, T, C)
        input = self.blocks(input) #(B, T, C)

        input = self.final_layer_norm(input) #(B, T, C)
        logits = self.final_linear(input) #(B, T, C)

        if y is not None:
            logits = logits.view(B*T, -1) #(B*T, C)
            y = y.view(B*T)
            loss = F.cross_entropy(logits, y)
        else:
            loss = None

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
    #generating some stuff
    #idx is (B, T) array of indices in our current context <-- current context of some list of characters in some batch
    #we keep extending (B, T) to (B, T+1), (B, T+2) and so on.. continuing until we reach max new tokens

        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            #get the predictions
            logits, loss = self(idx_cond) #<-- output of this is (B, T, C)
            #print(f"new dim of logits: {logits.shape}")
            #focus only on the last time step because the last time step is the prediction on what comes next
            logits = logits[:, -1, :] #becomes (B, C)
            #apply softmax to get the probabilities
            probs = F.softmax(logits, dim =-1) # (B, C)
            #sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)


            idx = torch.cat((idx, idx_next), dim = 1) #(B, T+1)


        return idx

In [48]:
#initializing the stuff

model = Decoder()
m = model.to(device)

print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

25.274433 M parameters


In [49]:
print(model)

Decoder(
  (tok_embedding_matrix): Embedding(65, 512)
  (pos_embedding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-7): 8 x Head(
            (Wk): Linear(in_features=512, out_features=64, bias=False)
            (Wq): Linear(in_features=512, out_features=64, bias=False)
            (Wv): Linear(in_features=512, out_features=64, bias=False)
            (dropout): Dropout(p=0.25, inplace=False)
          )
        )
        (proj): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.25, inplace=False)
      )
      (ffw): FeedForward(
        (ff1): Linear(in_features=512, out_features=2048, bias=True)
        (ff2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
        (dropout): Dropout(p=0.25, inplace=False)
      )
      (layernorm1): LayerNorm((512,), eps=1e-05, elementwise_affine=T

In [50]:
#use this function to estimate the loss every once in a while
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [51]:
#training loop
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))
#open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))

step 0: train loss 4.3642, val loss 4.3649


In [None]:
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))


OXETELY:
Come;
En caleshe neter, I'll betas Myon, witasharthine s yooncurtupit ithalflly s; he onis.
Weth myonorenoue heably ds.'s winy let thalt th wit ine harkert-redy wing
Withieat. Ifler ps y betho's burhou thes ond, ge movis ant thad toeyof th. usthis no I hour talold bonea unt tofo lt tag a sandend manigigives lot ves
Myofe mby ay hitha ishim; s nelim uthis thene waty,
ofethe ble plin hosome carifuks an. lits, of: inors I makestellve thor s thigremy thincof winey ieee t 'sonde chis of on haf wonsabe man the d, may athin y
Ditoflpallo mit
I'dreck rays,
Be on My totheuneinont out!'s mof st this amermor t burdstucinous. cf humps, wrenden,
Whis, ber flulowomblle men ding tof too uckempe foralt wit at weve, oforser in; our the thef,
Yonevo Rearosthulthinrwhe bus tey t rexkinonoreceaby,
Aroncrs iveng tuces tellongad an ofe at bus, withenely, trit han Leseg tupuponde s Talm this yonount! thon avigs wishires;
Th'sout-pe
Shan flinurin thoolve, honowie t atr w Ous seff t I fituswaltonchin

rough work

In [None]:
tokens = nn.Embedding(vocab_size, n_embd)

xb = tokens(xb)

xb.shape

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
pos = PositionalEncoding(32)
B, T, C = xb.shape
pos(xb.view(T,B,C)).view(B, T, C)

tensor([[[-0.0000, -0.3046, -0.7895,  ..., -0.0000, -0.1914,  2.7128],
         [-2.1587,  1.6099,  0.3175,  ..., -0.7726, -0.1732, -0.7560],
         [-0.0228,  1.3423,  0.0000,  ...,  0.3276,  1.2187,  0.0000],
         ...,
         [ 1.1110,  1.8604, -0.6897,  ...,  2.7487,  0.0767,  2.5919],
         [ 1.7454,  2.0296,  1.7677,  ...,  2.2489,  0.3629, -0.2505],
         [-1.2237,  0.0000,  0.9099,  ..., -0.7726, -0.1730, -0.7560]],

        [[ 1.5315, -1.4319,  2.7456,  ...,  0.3968,  1.0700,  0.2157],
         [ 2.4284, -1.2822, -1.4963,  ...,  1.0317,  0.3879,  2.1038],
         [ 1.6417,  1.3021,  0.1874,  ...,  3.0464, -0.9241,  2.8748],
         ...,
         [ 0.9672,  0.3293,  2.2789,  ...,  2.2489,  0.3633, -0.0000],
         [-2.0836, -0.1690,  2.3959,  ...,  1.0396, -2.2987, -1.1044],
         [ 0.0000, -0.8688,  1.8949,  ...,  0.3276,  1.2193,  2.0859]],

        [[-2.0830, -1.2652,  2.4538,  ..., -0.0343,  0.4258,  2.5545],
         [-0.8637, -0.4950,  1.6562,  ...,  0

In [None]:
#H = MultiHeadAttention(2, 16)
#H = Head(16)
H = Block(2)
H(xb)

tensor([[[ 1.3696,  1.5744,  0.2509,  ..., -0.5898,  0.3345, -2.2403],
         [ 0.7096, -2.3177, -0.5724,  ..., -0.5082, -0.8088,  0.2871],
         [ 0.8979, -2.1066, -0.8966,  ..., -1.2316, -0.1825,  2.1817],
         ...,
         [-1.1446,  0.0414,  1.8208,  ..., -0.9161, -0.3360, -0.3574],
         [ 0.4984, -1.2605, -0.7518,  ..., -0.9832, -1.1752,  0.1661],
         [ 0.5135, -1.7658, -0.4623,  ..., -0.7278, -1.0931, -0.3946]],

        [[ 0.1684, -1.6184,  1.0792,  ...,  0.9328, -0.1259,  1.9468],
         [-0.1390,  2.1019,  1.9862,  ...,  1.2079,  0.1201,  1.2189],
         [ 1.4138, -0.7045, -0.3073,  ..., -1.7178,  1.8023,  1.1076],
         ...,
         [ 0.2745, -1.1965, -1.2149,  ..., -0.8091, -0.5446,  0.0062],
         [-0.5786,  1.1474, -0.6361,  ..., -0.4063,  0.2948,  0.0563],
         [ 0.2589, -1.9941, -1.6683,  ..., -1.1319, -0.1778,  1.6774]],

        [[ 1.0505,  1.2797,  0.4887,  ..., -1.5235, -0.5249, -0.4292],
         [ 0.4774, -1.9272, -1.6452,  ..., -1