In [1]:
import torch
import tiktoken
import numpy as np

from network import GPTLanguageModel
import constants

In [2]:
# Copy the hyperparameters defined in constants.py into notebook-level names.
# Grouped only for readability; the attributes are read in the same order as before.
batch_size, block_size = constants.BATCH_SIZE, constants.BLOCK_SIZE
num_epochs, eval_interval = constants.NUM_EPOCHS, constants.EVAL_INTERVAL
learning_rate, eval_iters = constants.LEARNING_RATE, constants.EVAL_ITERS
n_embd, n_head, n_layer = constants.N_EMBD, constants.N_HEAD, constants.N_LAYER
dropout = constants.DROPOUT
# ------------

In [None]:
# Runtime config: pick the compute device in order of preference --
# CUDA first, then the MPS backend, and finally the CPU as a fallback.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [None]:
# TODO: Choose what embeddings to use
# Accepted values are 'gpt' and 'character'. A later cell compares this value
# against 'character' to decide which encoded data and vocab size to train on.
embeddings = None
# Raises AssertionError while `embeddings` is still None (or any other value),
# so the notebook stops here until one of the two options has been selected.
assert embeddings == 'gpt' or embeddings == 'character'

In [None]:
# Directories for input data and generated artifacts, relative to this notebook.
# Keep the trailing slash: file names are appended by plain string formatting
# (e.g. f'{output_dir}stoi.npy'), not joined as path components.
data_input_dir = "../data/inputs/"
output_dir = "../data/outputs/"

In [None]:
# Set a seed so that results can be replicated.
# torch.manual_seed only covers torch's random number generators; NumPy keeps
# its own global RNG, so it is seeded too in case the data split or batch
# sampling below is written with np.random.
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# TODO: Read the entire input file in as a single string
# (presumably a file under `data_input_dir` -- confirm the file name with the assignment)
text_data = None

### TODO: Find the number of unique characters in the input text
# NOTE: keep the order of `vocab` deterministic (e.g. sorted). If the
# character <-> integer mappings in the next cell are built from it, a stable
# order keeps the saved mappings consistent from one run to the next.
vocab = [] # obtain vocab from text_data
char_vocab_size = len(vocab)
# Reports 0 and an empty list until `vocab` has been filled in.
print("There are %d unique characters in the data." % char_vocab_size)
print(vocab)
###

In [None]:
"""
Now that we have the unique characters in the text, we can create a mapping from characters to integers and a reverse mapping from integers back to characters. This will allow us to encode the characters for processing in the network. Using these two mappings, create two functions: one that encodes a text of arbitrary length into integers, and one that decodes previously encoded integers back into text. This is the process of tokenization, in this case a simple mapping of each character of the text to an integer.
"""

stoi = None # TODO: create a mapping (dictionary) from string (character) to integer
itos = None # TODO: create a reverse mapping from integer to string (character)

# save your mappings to retrieve them later for encoding/decoding
# NOTE: np.save stores a plain Python object such as a dict as a pickled 0-d
# object array, so it has to be read back with
# np.load(path, allow_pickle=True).item(). `output_dir` must already exist.
np.save(f'{output_dir}stoi.npy', stoi)
np.save(f'{output_dir}itos.npy', itos)

encode = None # TODO: create a function that uses 'stoi' dictionary to encode a text of arbitrary length
decode = None # TODO: create a function that uses 'itos' to decode text previously encoded

# Encode the text_data using the stoi mapping
# (this line fails until `encode` and `text_data` above have been filled in)
basic_encoded_data = torch.tensor(encode(text_data), dtype=torch.long)
print("Length of encoding:", len(basic_encoded_data), "Encoding:", basic_encoded_data)

In [None]:
"""
However, there are also other, more complex forms of tokenization, which we encourage you to explore and experiment with on your own.
Below, we present one example: tiktoken, a sub-word tokenizer package developed by OpenAI (full documentation at https://github.com/openai/tiktoken). If you have implemented the above cells correctly, you will see that the character-level encoding of text_data is much longer than its tiktoken (sub-word) encoding, which is the encoding used by GPT-2. On the other hand, GPT-2 has a much larger vocab size (the number of distinct tokens), so we will need a larger token embedding table within the model architecture.
"""
# GPT embedding
# Look up the tokenizer that tiktoken associates with the 'gpt2' model name.
encoder = tiktoken.encoding_for_model('gpt2')
# Tokenize the same text_data as the character-level cell so that the two
# encoding lengths can be compared directly.
gpt_encoded_data = torch.tensor(encoder.encode(text_data), dtype=torch.long)
print("Length of encoding:", len(gpt_encoded_data), "Encoding:", gpt_encoded_data)

# default vocab size of GPT-2, taken from constants.py rather than from `encoder`
# NOTE(review): presumably equal to encoder.n_vocab -- confirm
gpt_vocab_size = constants.GPT_VOCAB_SIZE

In [None]:
# Pick the (data, vocab size) pair that matches the chosen embedding option.
# Anything other than 'character' selects the GPT-2 pair.
encoded_data, vocab_size = (
    (basic_encoded_data, char_vocab_size)
    if embeddings == 'character'
    else (gpt_encoded_data, gpt_vocab_size)
)

In [None]:
# TODO: Perform train-test split on encoded_data
# All four values below are placeholders to be replaced.
train_ratio = 0.0 # fraction of encoded_data to use for training
train_cutoff = None # presumably the index in encoded_data where the training portion ends -- confirm
train_data = None # training portion of encoded_data (see the data-selection TODO in load_batch)
test_data = None # testing portion of encoded_data (see the data-selection TODO in load_batch)

In [None]:
# Build the model for the vocab size selected above; only vocab_size is passed
# in explicitly.
model = GPTLanguageModel(vocab_size)
# Move the model to the selected device. As the last expression of the cell,
# the return value of .to() is also displayed as the cell output.
# NOTE(review): presumably GPTLanguageModel is a torch.nn.Module -- confirm in network.py
model.to(device)

In [None]:
def load_batch(split):
    """Sample one random batch of inputs and targets from the requested split.

    The body is still a template: `data`, `ix`, `x` and `y` are placeholders,
    so calling this raises AttributeError on `x.to(...)` until they are filled in.

    Args:
        split: selects training or testing data; estimate_loss() passes
            'train' and 'test'.

    Returns:
        Tuple (x, y) of inputs and targets, both moved to `device`.
    """
    ### TODO: generate a small batch of data of inputs x and targets y
    data = None # based on the value of split, load either training data or testing data
    ix = None # generate a tensor of length batch_size that contains randomly generated valid indices within the data
    x = None # for each index in ix, extract the corresponding chunk of text. Stack chunks using torch.stack()
    # NOTE(review): presumably y is x shifted by one token (next-token targets),
    # so valid start indices must leave room for one extra token -- confirm
    y = None # stack targets (y) corresponding to inputs (x) from the previous line
    x, y = x.to(device), y.to(device) # move data to the selected device (cuda, mps or cpu)
    ###
    return x, y

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'test' splits.

    For each split, the loss is meant to be collected over `eval_iters`
    batches and averaged; the per-batch work is still a TODO, so this raises
    AttributeError on `losses.mean()` until `losses` is initialized.

    Runs with gradient tracking disabled (via the decorator), puts the model
    in eval mode while measuring, and switches it back to train mode before
    returning.

    Returns:
        dict with keys 'train' and 'test', each holding that split's mean loss.
    """
    out = {}
    # eval mode changes the behavior of layers such as dropout; it does NOT
    # disable gradient computation -- the @torch.no_grad() decorator does that.
    model.eval()
    for split in ['train', 'test']:
        losses = None # initialize a tensor of length eval_iters
        for k in range(eval_iters):
            # 1. Load a batch of data
            # 2. Forward pass through your model to get logits and loss
            # 3. Save your loss at a corresponding index in losses
            pass
        out[split] = losses.mean() # record mean loss for training and test set separately
    model.train() # switch back to training mode
    return out

In [None]:
# TODO: choose a PyTorch optimizer for your model
# (`learning_rate` from the hyperparameter cell is available for it)
optimizer = None

# NOTE: each iteration samples a single batch, so `epoch` counts individual
# training steps rather than full passes over the data -- hence "step" in the log line.
for epoch in range(num_epochs):

    # every eval_interval steps, evaluate the loss on the train and test sets, and save model parameters at this checkpoint
    if epoch % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {epoch}: train loss {losses['train']:.4f}, test loss {losses['test']:.4f}")
        # TODO: save model parameters

    # TODO: sample a batch of training data
    xb, yb = None, None

    # TODO: Evaluate the loss and perform gradient step