This is going to be a character-level language model,
which means the tokens are the individual characters that exist in the dataset.

In [39]:
# Read the whole training corpus into memory as a single string.
# The encoding is pinned so decoding does not depend on the platform's
# locale default (which is what open() falls back to when it is omitted).
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(f"dataset length: {len(text)}")

dataset length: 1115394


In [40]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# first line: the characters themselves, second line: how many there are
print("".join(chars), vocab_size, sep="\n")


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [41]:
"""
each element of the encoded array is an integer in the range
0 to len(chars) - 1 (inclusive)

the encoded sequence is produced with:
a simple encoding function that converts a string into an array of integers
a simple decoder that converts an array of integers back to the original string

other popular tokenizers include

tiktoken - by OpenAI, used by GPT
sentencepiece - by Google
"""
stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}  # integer id -> character


def encode(string):
    """Return the list of integer ids for the characters of `string`.

    Raises:
        KeyError: if `string` contains a character that is not in `stoi`.
    """
    return [stoi[char] for char in string]


def decode(array):
    """Return the string spelled out by the integer ids in `array`.

    Raises:
        KeyError: if `array` contains an id that is not in `itos`.
    """
    return "".join(itos[i] for i in array)


# round trip: decoding an encoded string gives the string back
print(encode("hello world"))
print(decode(encode("hello world")))

[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
hello world


In [42]:
import torch

# Encode the entire corpus once and keep the ids as an int64 tensor.
data = torch.tensor(encode(text), dtype=torch.long)
print(f"{data.shape} {data.dtype}")
# only peek at the first 1000 ids; the full tensor is far too long to print
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [43]:
# Index at which the token stream is cut: the first 90% is used for training,
# the remaining tail is held out as test data.
train_test_split = int(0.9 * len(data))
train_data, test_data = data[:train_test_split], data[train_test_split:]

print(f"train length: {len(train_data)}")
print(f"test length: {len(test_data)}")

train length: 1003854
test length: 111540


In [44]:
"""
    hyperparameter that determines the size of the sequences that are fed into the transformer
    this is also called the time dimension, or simply the "length" of the input sequence

"""
block_size = 8

# why + 1 ? -> a window of block_size inputs needs one extra token so that
# the last input position still has a "next token" to be compared against
sample_block = train_data[0:block_size + 1]
print(sample_block)


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])


In [45]:
"""
    + 1 is added because
    in the above tensor there are 8 different samples
    18
    18, 47
    18, 47, 56
    ...
    18, 47, 56, 57, 58,  1, 15, 47
    
    for each sample, the transformer predicts the next token and validates with the training sample
    that is,
    
    for first sample 18, the next token will be 47
    for second sample 18, 47 the next token will be 56
    ...
    for 8th sample 18, 47, 56, 57, 58,  1, 15, 47 the next token will be 58
    
    to account for the next token for the last sample, we add the + 1
"""
x = train_data[:block_size]         # inputs
y = train_data[1:block_size + 1]    # targets: the same window shifted one token ahead
print(x, y)

# position t sees the first t + 1 inputs and has to predict y[t]
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input in: {context} the target: {target}")

tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor([47, 56, 57, 58,  1, 15, 47, 58])
when input in: tensor([18]) the target: 47
when input in: tensor([18, 47]) the target: 56
when input in: tensor([18, 47, 56]) the target: 57
when input in: tensor([18, 47, 56, 57]) the target: 58
when input in: tensor([18, 47, 56, 57, 58]) the target: 1
when input in: tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input in: tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input in: tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [46]:
"""
    now usually training is done in batches to train multiple chunks (input samples) at a time
    to improve parallel processing
    so we use the above logic and then split the data into batches
"""

batch_size = 4  # batch dimension: number of independent sequences processed in parallel
block_size = 8  # time dimension: maximum context length for predictions (max length of an input sequence)
torch.manual_seed(1337)  # fixed seed so the randomly sampled batch is reproducible

def get_batch(split: str):
    """Sample a random batch of input windows and their next-token targets.

    Reads the notebook-level `train_data`, `test_data`, `batch_size` and
    `block_size` at call time.

    Args:
        split: 'train' selects `train_data`; any other value selects `test_data`.

    Returns:
        (x, y): two stacked tensors with `batch_size` rows of `block_size`
        tokens each; row r of y is row r of x moved one token forward in the
        source data.
    """
    source = test_data if split != 'train' else train_data
    # a start offset must leave room for block_size inputs plus one more target token
    starts = torch.randint(len(source) - block_size, (batch_size, ))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs, targets

xb, yb = get_batch('train')

# show the raw batch first ...
print("inputs: ")
print(xb.shape)
print(xb)

print("targets: ")
print(yb.shape)
print(yb)
print('-' * 10)

# ... then spell out every (context, target) training example it contains
for b in range(batch_size):  # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"when input in: {context} the target: {target}")
    print('-' * 10)
    

inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------
when input in: tensor([24]) the target: 43
when input in: tensor([24, 43]) the target: 58
when input in: tensor([24, 43, 58]) the target: 5
when input in: tensor([24, 43, 58,  5]) the target: 57
when input in: tensor([24, 43, 58,  5, 57]) the target: 1
when input in: tensor([24, 43, 58,  5, 57,  1]) the target: 46
when input in: tensor([24, 43, 58,  5, 57,  1, 46]) the target: 43
when input in: tensor([24, 43, 58,  5, 57,  1, 46, 43]) the target: 39
----------
when input in: tensor([44]) the target: 53
when input in: tensor([44, 53]) the target: 56
when input in: tensor([44, 53, 56]) the

In [47]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

"""
    A bigram language model is a type of statistical language model used in natural language processing (NLP)
    to predict the probability of a word based on the previous word in a sequence.
    Specifically, it focuses on pairs of consecutive words, known as bigrams.

    In our case the tokens are individual characters, so a bigram is a pair of consecutive characters.
    The embedding table is a tensor of shape (vocabulary size, embedding dimension);
    the embedding dimension determines how "big" each token's vector is.
    In our case both the vocab size and the embedding dim are 65 (there are 65 unique characters in the corpus),
    so the row looked up for a token is used directly as the logits for the next token.
    
"""

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The whole model is one (vocab_size, vocab_size) embedding table. Looking up
    a token id returns a row of unnormalised scores (logits), one per
    vocabulary entry, for the token that follows it.
    """

    def __init__(self, vocab_size):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            target: optional integer tensor of next-token ids, shape (B, T).

        Returns:
            (logits, loss). Without `target`, logits has shape (B, T, C) and
            loss is None (inference). With `target`, logits is returned
            flattened to (B * T, C) and loss is the cross-entropy over all
            B * T positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)

        if target is None:
            # inference only, not training
            loss = None
        else:
            # F.cross_entropy expects the class dimension second, i.e. (N, C)
            # logits with (N,) targets, so batch and time are folded into one axis.
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            target = target.view(B * T)
            loss = F.cross_entropy(logits, target)

        return logits, loss

    @torch.no_grad()  # sampling is never back-propagated through, so skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend `idx` by sampling `max_new_tokens` tokens, one at a time.

        Args:
            idx: integer tensor of shape (B, T) holding the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            # self(idx) calls forward(); without targets the logits are (B, T, C)
            logits, _ = self(idx)
            # only the prediction at the last time step is needed
            logits = logits[:, -1, :]  # (B, C)

            # probabilities over the vocabulary (the C axis)
            probs = F.softmax(logits, dim=-1)
            # sample one next token per row from that distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append the sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T + 1)

        print("generation", idx.shape)
        return idx

        
model = BigramLanguageModel(vocab_size)

# one forward pass on the sample batch, with targets, so a loss comes back too
logits, loss = model(xb, yb)
print(f"logits = {logits}")
print(loss)

# sample 100 tokens from the untrained model, starting from a single token with id 0
start_idx = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(idx=start_idx, max_new_tokens=100)
print(decode(generated[0].tolist()))

logits = tensor([[-1.5101, -0.0948,  1.0927,  ..., -0.6126, -0.6597,  0.7624],
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
        ...,
        [-2.1910, -0.7574,  1.9656,  ..., -0.3580,  0.8585, -0.6161],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [-0.6787,  0.8662, -1.6433,  ...,  2.3671, -0.7775, -0.2586]],
       grad_fn=<ViewBackward0>)
tensor(4.8786, grad_fn=<NllLossBackward0>)
generation torch.Size([1, 101])

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [48]:
"""
    NOTE: 
    embedding table is callable, accepting a parameter of type tensor
    as you can see from the example, we can get the corresponding embedding vector for each character
    the string "he" produces a tensor of shape (2, 65) 
    where first row is the embedding vector for "h"
    and the second row is the embedding vector for "e"
"""
# short alias for the lookup table; calling it with a tensor of ids returns one row per id
embed = model.token_embedding_table

h = embed(torch.tensor(encode("h")))
e = embed(torch.tensor(encode("e")))
he = embed(torch.tensor(encode("he")))

for vectors in (h, e, he):
    print(vectors, vectors.shape)

tensor([[ 1.0901,  0.2170, -2.9996,  1.4690, -0.1948, -0.1507,  0.2601, -0.9647,
          0.1162, -0.8295, -0.2266,  0.0219, -0.2785, -0.4851, -1.8023, -0.7330,
         -1.2828,  0.8863,  1.0515, -0.9823, -1.6369, -1.3499,  0.1830,  0.0532,
         -1.1438, -0.2829, -0.5979,  1.4757,  0.4655, -3.0346,  0.5516,  1.3107,
          0.1240, -1.8046,  0.2700, -0.4322,  0.2784, -0.5599,  1.2502,  0.7051,
         -1.0169,  0.4854, -1.0808, -0.3128, -0.4189, -0.5718,  0.8215,  1.7384,
          0.5578,  0.6167,  1.5260, -0.3508, -1.5615,  0.4548, -0.8935,  0.3642,
          0.5714,  2.7072, -1.5443,  1.1288, -1.1217, -1.7328, -0.5472, -0.8017,
          0.7761]], grad_fn=<EmbeddingBackward0>) torch.Size([1, 65])
tensor([[ 0.3323, -0.0872, -0.7470, -0.6074,  0.3418,  0.5343,  0.3957, -0.4919,
         -0.0894, -1.3886,  1.2835, -0.3975,  2.0152,  1.6773, -0.3833,  1.5728,
          1.9458,  0.7247, -0.4834, -0.3263,  0.3193, -0.4198, -0.6435, -0.3311,
          0.7554, -1.2385,  0.4067,  0.

In [None]:
batch_size = 32  # get_batch reads this notebook-level value, so batches are larger from here on
max_iters = 10000
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

# The loop variable is named `step`, not `iter`: at notebook scope a variable
# called `iter` would replace the builtin iter() for every later cell.
for step in range(max_iters):
    if step % 100 == 0:
        print(f"{step}/{max_iters}")

    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # clear gradients left over from the previous step
    loss.backward()
    optimizer.step()

# loss of the last mini-batch only, not an average over the dataset
print("final", loss.item())

0/10000
100/10000
200/10000
300/10000
400/10000
500/10000
600/10000
700/10000
800/10000
900/10000
1000/10000
1100/10000
1200/10000
1300/10000
1400/10000
1500/10000
1600/10000
1700/10000
1800/10000
1900/10000
2000/10000
2100/10000
2200/10000
2300/10000
2400/10000
2500/10000
2600/10000
2700/10000
2800/10000
2900/10000
3000/10000
3100/10000
3200/10000
3300/10000
3400/10000
3500/10000
3600/10000
3700/10000
3800/10000
3900/10000
4000/10000
4100/10000
4200/10000
4300/10000
4400/10000
4500/10000
4600/10000
4700/10000
4800/10000
4900/10000
5000/10000
5100/10000
5200/10000
5300/10000
5400/10000
5500/10000
5600/10000
5700/10000
5800/10000
5900/10000
6000/10000
6100/10000
6200/10000
6300/10000
6400/10000
6500/10000
6600/10000
6700/10000
6800/10000
6900/10000
7000/10000
7100/10000
7200/10000
7300/10000
7400/10000
7500/10000
7600/10000
7700/10000
7800/10000
7900/10000
8000/10000
8100/10000
8200/10000
8300/10000
8400/10000
8500/10000
8600/10000
8700/10000
8800/10000
8900/10000
9000/10000
9100/10000


In [52]:
# sample 500 new tokens, starting from a single token with id 0
start_idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(model.generate(idx=start_idx, max_new_tokens=500)[0].tolist()))

generation torch.Size([1, 501])

Tror?
Bu ne-ingof acat nd l,
Fothind aty y:
ARDUTA llllld!
AMQUThes med thestw cos wand herf s hafold mZWirus je ney biPoeronngabsestouMOLAUCES: ONDohery ththe tonmy th, fourf thatys ng dd pp qur ace Einowhemy azer:
I,
Ishit tinghast ha tteredef seariomams.
Makine,

Than ts hientr?
3nd woft re y l, uCABe codauseabertierr,
her tr fed?
NCK: p?
S:
Awo!
I'l,
Atyxevee ugiARIn telo,
ANG cousk, thoa hroro s lly ndst on meave S:

QUSTAg? therecr.
ULofre,
We, sthablddff; chof fome ureswir anqur t h sele 
