In [1]:
from pathlib import Path
import torch


## Data Preparation

In [2]:

input_file_path = Path('../data/tinyshakespeare.txt')

with open(input_file_path, 'r') as f:
    text = f.read()
print(f"length of dataset in characters: {len(text):,}")


length of dataset in characters: 1,115,393


#### get all the unique characters that occur in this text


In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65


#### create a mapping from characters to integers


In [4]:
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(f"{encode("khoa") = }")
print(f"{decode(encode("khoa")) = }")

encode("khoa") = [49, 46, 53, 39]
decode(encode("khoa")) = 'khoa'


In [5]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
print(data[:100])

torch.Size([1115393]) <built-in method type of Tensor object at 0x711785575540>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


So, we have a very small code book of 65 characters, very simple `encode` and `decode` functions, but we get very long sequences as a result

#### create the train and validation splits

In [6]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [7]:
print(f"{train_data.shape = }")
print(f"{val_data.shape = }")

train_data.shape = torch.Size([1003853])
val_data.shape = torch.Size([111540])


In [8]:
block_size = 8
train_data[:block_size]

tensor([18, 47, 56, 57, 58,  1, 15, 47])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context}, target = {target}")

when input is tensor([18]), target = 47
when input is tensor([18, 47]), target = 56
when input is tensor([18, 47, 56]), target = 57
when input is tensor([18, 47, 56, 57]), target = 58
when input is tensor([18, 47, 56, 57, 58]), target = 1
when input is tensor([18, 47, 56, 57, 58,  1]), target = 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), target = 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target = 58


> **Training Notes**:  
>
> We will train on all 8 input examples with the context of 1 character upto 8 characters. This is not just because of computational reasons, but also to make our Transformer get used to see inputs of different sizes (upto block size characters).  
>
> We will stack many batches of multiple chunks of text in a single torch `Tensor`, so we can keep the GPU busy since it is very good at parallel processing of data. These chunks will be processed independently in a parallel manner

In [19]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,)) # randomizing the training data 
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print()
print('targets:')
print(yb.shape)
print(yb)
print()

print(f'---- there are {batch_size*block_size} training examples here ----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])

targets:
torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])

---- there are 32 training examples here ----
when input is [53] the target: 59
when input is [53, 59] the target: 6
when input is [53, 59, 6] the target: 1
when input is [53, 59, 6, 1] the target: 58
when input is [53, 59, 6, 1, 58] the target: 56
when input is [53, 59, 6, 1, 58, 56] the target: 47
when input is [53, 59, 6, 1, 58, 56, 47] the target: 40
when input is [53, 59, 6, 1, 58, 56, 47, 40] the target: 59
when input is [49] the target: 43
when input is [49, 43] the target: 43
when input is [49, 43, 43] the target: 54
when input is [49, 43, 43, 54] the target: 1
when input is [49, 43, 43, 54