In [1]:
import torch

In [2]:
# Read the entire corpus into memory as a single string.
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, sorted.
chars = list(set(text))
chars.sort()
vocab_size = len(chars)
print(chars)

['\n', ' ', '!', '#', '$', '%', '&', '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '•', '™']


In [3]:
# Number of distinct characters in the corpus (vocab_size == len(chars)).
print(vocab_size)

88


In [4]:
# Lookup tables between each vocabulary character and its integer id
# (the id is the character's position in the sorted `chars` list).
string_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_string = dict(enumerate(chars))


def encode(s):
    """Convert a string into a list of integer token ids, one per character.

    Raises KeyError if `s` contains a character that is not in the vocabulary.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Convert an iterable of integer token ids back into a string.

    Raises KeyError if an id is not in the vocabulary.
    """
    return ''.join(int_to_string[i] for i in l)


# Encode the whole corpus as a 1-D tensor of token ids (dtype torch.long).
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([46, 62, 59,  1, 42, 72, 69, 64, 59, 57, 74,  1, 33, 75, 74, 59, 68, 56,
        59, 72, 61,  1, 59, 28, 69, 69, 65,  1, 69, 60,  1, 46, 62, 59,  1, 49,
        69, 68, 58, 59, 72, 60, 75, 66,  1, 49, 63, 80, 55, 72, 58,  1, 69, 60,
         1, 41, 80,  0,  0,  0, 46, 63, 74, 66, 59, 24,  1, 46, 62, 59,  1, 49,
        69, 68, 58, 59, 72, 60, 75, 66,  1, 49, 63, 80, 55, 72, 58,  1, 69, 60,
         1, 41, 80,  0,  0,  0, 27, 75, 74, 62])


In [5]:
# Index separating the two splits: the first 80% of the tokens go to training.
n = int(len(data) * 0.8)
# Training set is everything before index n; validation set is the remainder.
train_data, val_data = data[:n], data[n:]

In [6]:
# Context length (window size): the longest run of tokens used as input
# for a single prediction in the demonstration below.
block_size = 8

# First training window: x holds the first block_size tokens and y holds the
# same window shifted one position to the right, so y[t] is the token that
# follows x[t] in the text.
x, y = train_data[:block_size], train_data[1:block_size+1]

# One window yields block_size (context, target) pairs: the context grows
# from a single token up to the full window.
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print('when input is ', context, 'target is ', target)
    

when input is  tensor([46]) target is  tensor(62)
when input is  tensor([46, 62]) target is  tensor(59)
when input is  tensor([46, 62, 59]) target is  tensor(1)
when input is  tensor([46, 62, 59,  1]) target is  tensor(42)
when input is  tensor([46, 62, 59,  1, 42]) target is  tensor(72)
when input is  tensor([46, 62, 59,  1, 42, 72]) target is  tensor(69)
when input is  tensor([46, 62, 59,  1, 42, 72, 69]) target is  tensor(64)
when input is  tensor([46, 62, 59,  1, 42, 72, 69, 64]) target is  tensor(59)
