In [16]:
import torch
import torch.nn as nn

# Preparing the Dataset

In [1]:
# Read the whole corpus into memory as a single string.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
# Sanity check: report the corpus size in characters.
print("length of dataset in characters:", len(text))

length of dataset in characters: 1115393


In [7]:
# Vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

In [14]:
# Character <-> integer lookup tables; each character's id is its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encode a string into a list of integer ids, one per character.

    Raises KeyError if `s` contains a character that is not in `stoi`.
    """
    return [stoi[ch] for ch in s]


def decode(l):
    """Decode an iterable of integer ids back into a string.

    Raises KeyError if an id is not in `itos`.
    """
    return ''.join(itos[i] for i in l)

In [18]:
# Encode the full corpus and keep it as a tensor of int64 character ids.
data = torch.tensor(encode(text), dtype=torch.int64)

In [21]:
# Train on the first 90% of the tokens; hold out the remaining 10% for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [22]:
# Maximum context length: each input fed to the model
# is at most 8 characters long.
block_size = 8

In [26]:
# Show how one window of block_size + 1 tokens yields block_size training
# examples: every prefix of x is a context whose target is the next token.
x, y = train_data[:block_size], train_data[1:block_size + 1]
for i, target in enumerate(y):
    context = x[:i + 1]
    print(context, '-->', target)

tensor([18]) --> tensor(47)
tensor([18, 47]) --> tensor(56)
tensor([18, 47, 56]) --> tensor(57)
tensor([18, 47, 56, 57]) --> tensor(58)
tensor([18, 47, 56, 57, 58]) --> tensor(1)
tensor([18, 47, 56, 57, 58,  1]) --> tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) --> tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) --> tensor(58)


In [27]:
# number of sequences sampled per batch
batch_size = 4
# context length; repeats the value already set in the earlier cell
block_size = 8

In [None]:
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors, each stacking ``batch_size`` windows of
        ``block_size`` tokens. ``y`` is ``x`` shifted one position to the
        right, so ``y[b, t]`` is the token that follows ``x[b, :t + 1]``.
    """
    # Use a distinct local name so the module-level `data` tensor is not shadowed.
    source = train_data if split == 'train' else val_data
    # Each start index needs block_size tokens after it plus one more for the
    # last target. randint's upper bound is exclusive, so the largest start is
    # len(source) - block_size - 1 and the target slice never runs off the end.
    ix = torch.randint(len(source) - block_size, (batch_size,))
    # stack joins the equally sized windows along a new leading batch dimension
    x = torch.stack([source[i:i + block_size] for i in ix])
    # Targets use the same one-token shift as the x/y example earlier in the notebook.
    y = torch.stack([source[i + 1:i + block_size + 1] for i in ix])
    return x, y
