# nanoGPT Development

In [11]:
import torch

In [1]:
# Load the raw training corpus and derive the character-level vocabulary.

# Pin the encoding explicitly: without it, open() decodes with the platform
# locale (e.g. cp1252 on Windows), which can change the characters read and
# therefore the vocabulary. Assumes input.txt is UTF-8 (or plain ASCII).
with open("input.txt", "r", encoding="utf-8") as infile:
    text = infile.read()
print(f"Dataset length: {len(text)}")

# Sorting the distinct characters gives the vocabulary a deterministic order.
chars = sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print(f"Vocab Size: {vocab_size}")

Dataset length: 1115394

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab Size: 65


In [6]:
# Character-level tokenizer tables: every distinct character is assigned the
# integer equal to its position in `chars`.

STOI = {symbol: index for index, symbol in enumerate(chars)}  # character -> integer
ITOS = dict(enumerate(chars))  # integer -> character


def encode(input: str) -> "list[int]":
    """Translate a string into integers, one per character, via ``STOI``.

    Raises ``KeyError`` if a character has no entry in ``STOI``.
    """
    token_ids = []
    for symbol in input:
        token_ids.append(STOI[symbol])
    return token_ids


def decode(input: "list[int]") -> str:
    """Map each integer back to its character via ``ITOS`` and concatenate.

    Raises ``KeyError`` if an integer has no entry in ``ITOS``.
    """
    return "".join(ITOS[token_id] for token_id in input)


# Round-trip a sample string through the tokenizer as a sanity check.
message = "Hello, there!"
print(
    f"Encode: {message}: {encode(message)}",
    f"Decode: Encode({message}): {decode(encode(message))}",
    sep="\n",
)

Encode: Hello, there!: [20, 43, 50, 50, 53, 6, 1, 58, 46, 43, 56, 43, 2]
Decode: Encode(Hello, there!): Hello, there!


In [10]:
# Contrast the character-level tokenizer with tiktoken's "gpt2" encoding,
# whose vocabulary is far larger and so needs fewer tokens for the same message.
import tiktoken

encoding = tiktoken.get_encoding("gpt2")

print(
    "\n".join(
        [
            f"tiktoken Vocab Size: {encoding.n_vocab}",
            f"tiktoken Encode: {encoding.encode(message)}",
            f"tiktoken Decode: {encoding.decode(encoding.encode(message))}",
        ]
    )
)

tiktoken Vocab Size: 50257
tiktoken Encode: [15496, 11, 612, 0]
tiktoken Decode: Hello, there!


In [12]:
# Tokenize the full corpus and hold it as a 1-D tensor of 64-bit integers.

data = torch.tensor(encode(text), dtype=torch.int64)
print(data.size(), data.dtype)

torch.Size([1115394]) torch.int64


In [13]:
# Hold out the final 10% of the tokens as the test set; the first 90% is the
# training set. The split is positional (no shuffling).
n = int(len(data) * 0.9)
train_data, test_data = data[:n], data[n:]

print(f"Num train examples: {train_data.shape}")
print(f"Num test examples: {test_data.shape}")

Num train examples: torch.Size([1003854])
Num test examples: torch.Size([111540])
