In [1]:
import torch

In [2]:
with open('data/Two sailor lads.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars, len(chars))

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '3', '5', '7', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '£', '§', 'Ñ', 'æ', 'é', 'ê', 'ñ', 'ö', '†', '‡', '\ufeff'] 86


In [3]:
#tokenizer
#character level tokenization- char to int, #smalll vocabulary
char_to_int = {char: idx for idx, char in enumerate(chars)}
int_to_chars = {idx: char for idx, char in enumerate(chars)}

In [4]:
encode = lambda text: [char_to_int[char] for char in text]
decode = lambda encoded_text: ''.join(int_to_chars[encoded_int] for encoded_int in encoded_text) 

In [5]:
encoded = encode('hello')
encoded

[56, 53, 60, 60, 63]

In [6]:
decode(encoded)

'hello'

In [7]:
data = torch.tensor(encode(text), dtype = torch.long) #long seq of integers

In [8]:
print(data[:100])

tensor([85,  1,  1, 48, 39, 71, 63,  1, 38, 49, 57, 60, 63, 66,  1, 31, 49, 52,
        67, 11, 48,  0,  0,  1,  1, 20,  1, 38, 39, 34, 37, 44,  1, 34, 25,  0,
         0,  1,  1, 38, 39, 28, 37, 37, 28, 33, 26,  1, 20, 23, 41, 24, 33, 39,
        40, 37, 24, 38,  1, 34, 33,  1, 38, 24, 20,  1, 20, 33, 23,  1, 31, 20,
        33, 23,  0,  0,  0,  1,  1, 21, 44,  0,  0,  1,  1, 26, 34, 37, 23, 34,
        33,  1, 38, 39, 20, 21, 31, 24, 38,  9])


In [9]:
#validation and training split
split_index = int(0.8 * len(text))

In [10]:
train = data[:split_index]
test = data[split_index:]

In [11]:
print(len(train), len(test))

380708 95178


In [12]:
#block size

block_size = 8

x = train[:block_size]
y = train[1: block_size+1]

print(x)
print(y)

#x[:i+1] -> x upto i, including i

for i in range(block_size):
    print(f"input tensor is: {x[:i+1]}, context: {y[i]}")

tensor([85,  1,  1, 48, 39, 71, 63,  1])
tensor([ 1,  1, 48, 39, 71, 63,  1, 38])
input tensor is: tensor([85]), context: 1
input tensor is: tensor([85,  1]), context: 1
input tensor is: tensor([85,  1,  1]), context: 48
input tensor is: tensor([85,  1,  1, 48]), context: 39
input tensor is: tensor([85,  1,  1, 48, 39]), context: 71
input tensor is: tensor([85,  1,  1, 48, 39, 71]), context: 63
input tensor is: tensor([85,  1,  1, 48, 39, 71, 63]), context: 1
input tensor is: tensor([85,  1,  1, 48, 39, 71, 63,  1]), context: 38


In [13]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [15]:
block_size = 8
batch_size = 4
def get_batched_data(split):
    data = train if 'train' else test
    random_indices = torch.randint((len(data)-block_size), (batch_size,)) #random indices of batch size inside the data
    print(random_indices)
    x = torch.stack([data[i:i+block_size]for i in random_indices]) #stacking a batch of 4 input block each of block size = 8 
    y = torch.stack([data[i+1:i+block_size+1]for i in random_indices]) #stacking a batch of 4 target block each of block size = 8 
    return x.to(device), y.to(device)

X,y = get_batched_data('train')
print(f'inputs: {X}, outputs: {y}')

tensor([ 97411, 338564, 211851,  47758])
inputs: tensor([[68,  1, 55, 63, 18,  1, 50, 69],
        [56, 53,  1, 51, 60, 53, 49, 66],
        [57, 51, 69, 60, 49, 66,  1, 62],
        [68, 53, 66, 68, 49, 57, 62, 53]], device='cuda:0'), outputs: tensor([[ 1, 55, 63, 18,  1, 50, 69, 68],
        [53,  1, 51, 60, 53, 49, 66,  1],
        [51, 69, 60, 49, 66,  1, 62, 57],
        [53, 66, 68, 49, 57, 62, 53, 52]], device='cuda:0')


In [18]:
print(X, x.shape)

tensor([[68,  1, 55, 63, 18,  1, 50, 69],
        [56, 53,  1, 51, 60, 53, 49, 66],
        [57, 51, 69, 60, 49, 66,  1, 62],
        [68, 53, 66, 68, 49, 57, 62, 53]], device='cuda:0') torch.Size([8])


### Forward pass experiment

In [19]:
import torch.nn as nn

In [26]:
vocab_size = len(chars)
print(vocab_size)
embedding_matrix = nn.Embedding(vocab_size, vocab_size).to(device)
print(list(embedding_matrix.parameters())[0].shape)

86
torch.Size([86, 86])


In [30]:
X.shape

torch.Size([4, 8])

In [27]:
logits = embedding_matrix(X)

In [29]:
print(logits.shape)
B, T, C = logits.shape

torch.Size([4, 8, 86])


In [31]:
logits = logits.view(B*T, C)

In [32]:
logits.shape

torch.Size([32, 86])

In [33]:
print(y.shape)

torch.Size([4, 8])


In [35]:
targets = y.view(B*T)

In [36]:
targets.shape

torch.Size([32])

In [37]:
targets

tensor([ 1, 55, 63, 18,  1, 50, 69, 68, 53,  1, 51, 60, 53, 49, 66,  1, 51, 69,
        60, 49, 66,  1, 62, 57, 53, 66, 68, 49, 57, 62, 53, 52],
       device='cuda:0')

In [39]:
import torch.nn.functional as F
loss = F.cross_entropy(logits, targets)

In [40]:
loss

tensor(4.7035, device='cuda:0', grad_fn=<NllLossBackward0>)

In [41]:
logits.shape

torch.Size([32, 86])

In [42]:
logits = logits[:,-1,:]

IndexError: too many indices for tensor of dimension 2

In [50]:
context = torch.zeros((1,2), dtype = torch.long, device = device)

In [51]:
context.shape

torch.Size([1, 2])

In [52]:
logits = embedding_matrix(context)
print(logits.shape)
B, T, C = logits.shape
logits = logits[:,-1,:]
print(logits.shape)

torch.Size([1, 2, 86])
torch.Size([1, 86])
