# References 
- Attention is All You Need paper: https://arxiv.org/abs/1706.03762
- OpenAI GPT-3 paper: https://arxiv.org/abs/2005.14165
- GPU: https://lambdalabs.com/ 
- Data: https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
- Video: https://www.youtube.com/watch?v=kCc8FmEb1nY [26:39]

In [3]:
# download the data set 
!curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt > input.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1089k  100 1089k    0     0  6069k      0 --:--:-- --:--:-- --:--:-- 6119k


In [6]:
# read the data set 
with open('input.txt', 'r') as file:
    text = file.read()

print(len(text)) # len of the text
print(text[:100]) # print the first 100 characters

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [21]:
# get all unique characters in the text
char = sorted(set(text))
vocab_size = len(char)
print(vocab_size)
print(''.join(char))

65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [13]:
# building a character level encoding by a mapping from unique characters to integers 
""" 
stoi = {}  # Dictionary to store the index of each character in char list

for i in range(len(char)):
    ch = char[i]
    stoi[ch] = i
"""
stoi = {ch:i for i,ch in enumerate(char)} # stores the index of each character in char
itos = {i:ch for i,ch in enumerate(char)} # stores the character of each integer in char

# string to integer list
def encode(s:str) -> list:
    """
    result = []
    for c in s:
        result.append(stoi[c])
    return result
    """
    return [stoi[c] for c in s] 

def decode(s:list) -> str:
    """
    result = []
    for i in s:
        result.append(itos[i])
    return ''.join(result)
    """
    return ''.join([itos[i] for i in s])

print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello


In [15]:
# encode the entire dataset and store it in toruch.Tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [17]:
# split the data to train and validation sets
n = int(len(data) * 0.9)
train_data = data[:n] # training data
val_data = data[n:] # validation data; wil be used to get a sense of the overfitting

In [18]:
block_size = 8  # length of one block of data; what is the maximum context length for predictions
# understanding the block_size
x = train_data[0:block_size]
y = train_data[1:block_size+1]
for i in range(block_size):
    context = x[:i+1]
    target = y[i]
    print(f"When input is {context} then target is {target}")
# helps the model predict everything up to block_size 
# as the transformer will never receive more than block_size at a time
# keeps gpu busy as we can process multiple chunks of data in parallel
# those chunks are completely independent of each other 

When input is tensor([18]) then target is 47
When input is tensor([18, 47]) then target is 56
When input is tensor([18, 47, 56]) then target is 57
When input is tensor([18, 47, 56, 57]) then target is 58
When input is tensor([18, 47, 56, 57, 58]) then target is 1
When input is tensor([18, 47, 56, 57, 58,  1]) then target is 15
When input is tensor([18, 47, 56, 57, 58,  1, 15]) then target is 47
When input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) then target is 58


In [20]:
batch_size = 4 # how many independent sequences we want to process in parallel
torch.manual_seed(1337) # will be used to sample random chunks of data

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # random starting indices for each chunk of data
    x = torch.stack([data[i:i+block_size] for i in ix]) # stack the 1d chunks as rows, 4 by 8 tensor
    # each row is a chunk of the training set 
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # stack the chunks shifted by 1
    return x, y

xb, yb = get_batch('train')
print(f"input shape: {xb.shape}\n", xb) # contains 32 examples (4*8) which are completely idenepndent
# as each row has 8 examples  
print(f"target shape: {yb.shape}\n", yb)

input shape: torch.Size([4, 8])
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
target shape: torch.Size([4, 8])
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])


In [23]:
# we are gonna start with the simplest possible model which is the bigram language model
import torch 
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLangModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        # each token directly reads off the logits of the next token from a lookup table 
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets):
        # when we pass idx every integer in our input is going to refer to the embedding table
        # and its going to pluck out a row of that embedding table corresponding to that index
        #  
        # idx and targets are both (B, T) tesors of integers 
        logits = self.token_embedding_table(idx) # (B, T, C) 
        # pytorch will arrange it to batch by time by channel 
        # batch is 4, time is 8 and C is vocab size which is 65
        # logits is the score for the next charecter 
        # we are predicting what comes next based on the indivisual identity of a single token 
        # we evaluate the loss
        loss = F.cross_entropy(logits, targets) 
        # loss is the cross entropy between predictions and targets 
        # measures the quality of the logits with respect to the targets  
        # we have the identity of the next character so how well we are predicting the next character 
        # the correct dimension of logits depeending on whatever the target is should have 
        # a very high number and all other dimensions should have a very low number
        # cross_entropy needs a (B, C, T)

        return logits, loss

m = BigramLangModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss) # (B, T, C) = (4, 8, 65)

RuntimeError: Expected target size [4, 65], got [4, 8]