In [2]:
from pathlib import Path
import torch


## Data Preparation

In [11]:

input_file_path = Path('../data/tinyshakespeare.txt')

with open(input_file_path, 'r') as f:
    text = f.read()
print(f"length of dataset in characters: {len(text):,}")


length of dataset in characters: 1,115,393


#### get all the unique characters that occur in this text


In [12]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("all the unique characters:", ''.join(chars))
print(f"vocab size: {vocab_size:,}")

all the unique characters: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 65


#### create a mapping from characters to integers


In [10]:
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
def encode(s):
    return [stoi[c] for c in s] # encoder: take a string, output a list of integers
def decode(l):
    return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(f"{encode("khoa") = }")
print(f"{decode(encode("khoa")) = }")

encode("khoa") = [49, 46, 53, 39]
decode(encode("khoa")) = 'khoa'


In [16]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
print(data[:100])

torch.Size([1115393]) <built-in method type of Tensor object at 0x757d97f44140>
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


So, we have a very small code book of 65 characters, very simple `encode` and `decode` functions, but we get very long sequences as a result

#### create the train and validation splits

In [17]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [20]:
print(f"{train_data.shape = }")
print(f"{val_data.shape = }")

train_data.shape = torch.Size([1003853])
val_data.shape = torch.Size([111540])
