<h1>Import Dependencies</h1>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import regex as re # for the regex pattern that the tokenizer will use
import pandas as pd
import numpy as np

<h1>Load the data into each component(validation and the train)</h1>

In [2]:
val_text = pd.read_csv("/kaggle/input/tinystories-narrative-classification/validation.csv").text # strip the text for each story

In [3]:
train_text = pd.read_csv("/kaggle/input/tinystories-narrative-classification/train.csv").text

In [4]:
# checking how many stories there are in each split
# i tried to use them all but the datasets took an extraordinary amount of time to compile for training
# you will later see that I only use 4000 stories from the training split and 1000 stories from the testing split
len(val_text), len(train_text)

(21990, 2119719)

<h1>Class Definitions</h1>
<ul>
    <li>
        Tokenizer
    </li>
    <li>
        GPT
        <ul>
            <li>
                Feed Forward
            </li>
            <li>
                Multihead Attention
            </li>
            <li>
                Decoder Block
            </li>
            <li>
                Full Definition
            </li>
        </ul>
    </li>
</ul>

In [33]:
class Tokenizer():
    """
    Trains a tokenizer for a small GPT specifically for the Tiny Stories dataset.
    Uses the GPT4 regex pattern to break the text into chunks to prevent the merging of inconvenient tokens.
    Adapted from https://github.com/karpathy/minbpe/blob/master/minbpe/regex.py.
    Made to handle special tokens in a more simple manner.
    """
    
    
    def __init__(self, pattern):
        
        # initialize vocab size to include the standard 256 vocab set of characters
        self.vocab = {idx: bytes([idx]) for idx in range(256)}
        self.vocab_size = 256
        
        # merges table for use when encoding and decoding after training the tokenizer
        self.merges = {}
        
        # stores the pattern that we will use (this is mostly useless because it only gets used to compile pattern)
        self.pattern = pattern
        
        # compiles the regex pattern that will be used to break the given text into chunks based on some rules that are used for GPT4
        self.compiled_pattern = re.compile(pattern)
        
        # stores the special tokens that we register
        self.special_tokens = {}
    
    
    def register_special_tokens(self, special_tokens):
        
        # for each token in the given list of special_tokens to register
        for token in special_tokens:
            
            # if this is not a token we have already registered
            if token not in self.special_tokens:
                
                # add an entry into our special_tokens dictionary and map it to the corresponding index that it will be represented by
                self.special_tokens[token] = self.vocab_size
                
                # add it to our vocabulary under its index representation and map it to the raw bytes of its encoding
                self.vocab[self.vocab_size] = bytes(token.encode("utf-8"))
                
                # reflect in our vocab_size that a new vocab word was added
                self.vocab_size += 1
    
    
    def get_stats(self, ids, counts = None):
        """
        Given a list of integers (ids in our context), this function computes the frequencies of every pair of ids that occur
        If we are given a list of counts to use as our base for frequencies we use it, if not we start from scratch
        """
        
        # here is the logic to implement the optional counts passed to the function
        counts = {} if counts is None else counts
        
        # for each pair in the zipped version of ids and ids shifted over by 1 so that consecutive ids get zipped together
        for pair in zip(ids, ids[1:]):
            # we increment the frequency of that pair if it is already in the table, otherwise we set it to 1 if its the first instance
            counts[pair] = counts.get(pair, 0) + 1
            
        return counts
    
    
    def merge(self, ids, pair, idx):
        """
        Given a list of ids, a pair of ids, and an id to replace that pair with, merges every pair in the list of ids into idx
        """
        
        # our new list of merged ids
        newids = []
        # a counter variable to help us traverse through the list of ids
        i = 0
        
        # while we have not gone through all of the ids
        while i < len(ids):
            
            # if the current id is the very last id, it cannot be merged so we ignore this if branch and just append it
            # however, if the first element of the pair is equal to the current id
            #          and if the second element of the pair is equal to the following id
            #              then we should merge this pair into our new ids by appending the idx to replace the pair and going forward in ids by 2
            if i < len(ids) - 1 and pair[0] == ids[i] and pair[1] == ids[i + 1]:
                newids.append(idx)
                i += 2
                
            # otherwise we should just add the id we are on as it is without merging because it is not the pair we want to merge
            else:
                newids.append(ids[i])
                i += 1
                
        return newids
    
    
    def train(self, text, vocab_size, verbose = False):
        """
        Trains the tokenizer on a given text for a given number of iterations (vocab_size) and prints the merges if requested
        """
        
        # uses the regex pattern to break the text into chunks
        chunks = re.findall(self.compiled_pattern, text)
        
        # then we assemble our list of ids by encoding each chunk in chunks and making a mini list of ids for each chunk
        ids = [list(chunk.encode("utf-8")) for chunk in chunks]
        
        # while we have not reached the target vocab_size
        while self.vocab_size < vocab_size:
            
            # we want to calculate the current stats on the ids to find which pair we should turn into a token next
            stats = {}
            # for each chunk of ids in ids
            for chunk in ids:
                # we want to use the stats we are building up for the entire list of ids
                # we want to add on the stats for each individual chunk to get the stats across all chunks
                self.get_stats(chunk, stats)
            
            # we use stats.get as our key which allows us to compare each pairing option in stats by their frequencies and returns to us the pair with the highest frequency
            pair = max(stats, key = stats.get)
            
            # once we have the pair we want, we go through each chunk in ids and merge the pairs into our new token
            ids = [self.merge(chunk, pair, self.vocab_size) for chunk in ids]
            # then we add this pair into our merges dictionary under its id
            self.merges[pair] = self.vocab_size
            # then we add this id under our vocabulary as the combination of the two tokens in the pair
            self.vocab[self.vocab_size] = self.vocab[pair[0]] + self.vocab[pair[1]]
            
            # if verbose
            if verbose is True:
                # print what tokens we are merging into what id
                print(f"merging {self.vocab[pair[0]], self.vocab[pair[1]]} -> {self.vocab_size}")
                
            # increase our vocab size by 1
            self.vocab_size += 1
                
    
    def encode(self, text):
        """
        Given text, encodes it into tokens based on what our tokenizer has learned from its training
        """
        
        # first we encode the text through utf-8 and turn it into a list of integers
        ids = list(text.encode("utf-8"))
        
        # if the length of the text is less than 2, it is already full merged so we should not do anything
        while len(text) >= 2:
            
            # get the stats for this list of ids
            stats = self.get_stats(ids)
            
            # then we choose the pair that occurs the least because in order to build up our larger tokens (e.g. "Hello"),
            # we first need to build up our smaller tokens (e.g "He", "ll", "o") that make up this larger token
            # we key into stats through a lambda expression that gets the frequencies recorded in stats, making sure to set
            # pairs that are not in our merges list should be set as infinity so that we do not try to merge pairs that we
            # do not actually have an id for
            pair = min(stats, key = lambda p: self.merges.get(p, float("inf")))
            
            # if we run out of possible merges, we will by default eventually choose a pair of infinite frequency so we have
            # to add this extra check to break out of the loop if we have no more merges left to make
            if pair not in self.merges:
                break
                
            # if we have selected a pair we can merge, then we will merge those pairs in our list of ids under their tokenized id
            ids = self.merge(ids, pair, self.merges[pair])
            
        return ids
    
    
    def decode(self, ids):
        """
        Given a list of ids, decodes them back into plain text
        """
        
        # takes each index in the list of ids and gets their bytes in our vocabulary dictionary then joins all the bytes together
        text_bytes = b"".join(self.vocab[idx] for idx in ids)
        
        # then we decode these bytes back into text via utf-8, making sure to replace an errors along the way with the ? character
        text = text_bytes.decode("utf-8", errors = "replace")
        
        return text

In [6]:
class FeedForward(nn.Module):
    """
    Feed forward layer used after each MultiHeadAttention layer
    """
    
    
    def __init__(self):
        super(FeedForward, self).__init__()
        self.c_fc = nn.Linear(n_hidden, 4 * n_hidden)
        self.gelu = nn.GELU() # apparently GELU is better than RELU here (at least thats what nanoGPT does)
        self.c_proj = nn.Linear(4 * n_hidden, n_hidden)
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x

In [7]:
class MultiHeadAttention(nn.Module):
    """
    MultiHeadAttention at the beginning of each block in each decoder layer
    """
    
    
    def __init__(self):
        
        super(MultiHeadAttention, self).__init__()
        
        # combines qkv into one linear layer that will get split into three for efficiency
        self.attn = nn.Linear(n_hidden, 3 * n_hidden)
        
        # our projection after attention
        self.proj = nn.Linear(n_hidden, n_hidden)
        
        # dropout for each stage
        self.attn_dropout = nn.Dropout(dropout)
        self.proj_dropout = nn.Dropout(dropout)
        
        # the F.scaled_dot_product_attention is apparently faster and more efficient so use this if available in the PyTorch version
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        
        # if not available
        if not self.flash:
            # just use the regular system where a mask must be registered to block off attention on future tokens
            self.register_buffer("bias", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size))
            
            
    def forward(self, x):
        # Batches by Time steps by Channels
        B, T, C = x.shape
        
        # split self.attn into thirds along basically the last dimension of channels
        q, k, v = self.attn(x).split(n_hidden, dim = 2) # B, T, C
        
        # reorganize q, k, v
        q = q.view(B, T, n_heads, C // n_heads).transpose(2, 1) # B, H, T, C//H
        k = k.view(B, T, n_heads, C // n_heads).transpose(2, 1) # B, H, T, C//H
        v = v.view(B, T, n_heads, C // n_heads).transpose(2, 1) # B, H, T, C//H
        
        # if the speed up function is available
        if self.flash:
            # simply compute the scaled dot product attention using q, k, v, and if we are not in training mode we should not use dropout
            x = F.scaled_dot_product_attention(q, k, v, attn_mask = None, dropout_p = dropout if self.training else 0, is_causal = True)
        else:
            # queries inner product with k, the reason for the transpose of k is being without the transpose, a row in q corresponds to a row in k
            # when we do an inner product, rows in the first get dot producted with cols in the second, so if we want to pair the row in q with the
            # row in k we have to transpose k so that a row in q now corresponds to a column in k so that the inner product actually does what is intended
            x = (q @ k.transpose(-2, -1)) * k.shape[-1] ** -0.5 # B, H, T, C//H  @  B, H, C//H, T  ->  B, H, T, T
            x = x.masked_fill(self.bias[:, :, :T, :T] == 0, float("-inf")) # B, H, T, T
            x = F.softmax(x, dim = -1) # B, H, T, T
            x = self.attn_dropout(x) # B, H, T, T
            x = x @ v # B, H, T, T  @  B, H, T, C//H  ->  B, H, T, C//H
        
        x = x.transpose(1, 2).contiguous().view(B, T, C) # B, H, T, C//H  ->  B, T, H, C//H  ->  B, T, C
        x = self.proj_dropout(self.proj(x)) # B, T, C
        
        return x

In [8]:
class Block(nn.Module):
    """
    The block of MultiHeadAttentions and FeedForwards that make up each layer in n_layers
    """
    
    
    def __init__(self):
        super(Block, self).__init__()
        
        self.ln1 = nn.LayerNorm(n_hidden)
        self.attn = MultiHeadAttention()
        self.ln2 = nn.LayerNorm(n_hidden)
        self.ffwd = FeedForward()
        
        
    def forward(self, x):
        # layer norm the inputs from the previous stage
        # use residual connections so that the gradients flow through with more branches resulting in less chance for exploding/vanishing gradients
        x = x + self.attn(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [9]:
class GPT(nn.Module):
    """
    The full definition of the GPT will all components except the Encoder layer formally included
    """
    
    
    def __init__(self):
        super(GPT, self).__init__()
        
        self.token_embedding = nn.Embedding(tokenizer.vocab_size, n_hidden)
        self.position_embedding = nn.Embedding(block_size, n_hidden) # simple position embeddings instead of the cosine sine thing they use in the original paper
        self.dropout = nn.Dropout(dropout)
        self.heads = nn.ModuleList([Block() for _ in range(n_layers)]) # this should really be called blocks instead of heads but its ok
        self.ln_f = nn.LayerNorm(n_hidden)
        self.lm_head = nn.Linear(n_hidden, tokenizer.vocab_size)
        
        
        # weight tying i think for regularization although im not sure because i havent read the paper on it yet
        self.token_embedding.weight = self.lm_head.weight
        
        # apply weight initializations
        self.apply(self._init_weights)
        
        
        for pn, p in self.named_parameters():
            if pn.endswith("c_proj.weight"):
                # not too sure why, maybe its to help with normalization and preventing exploding/vanishing gradients
                torch.nn.init.normal_(p, mean = 0.0, std = 0.01 * (2 * n_layers) ** -0.5)
           
        # visualize how large the model is based on number of parameters
        print(sum(p.numel() for p in self.parameters())/1e6, "million parameters")
        
        
    def _init_weights(self, module):
        """
        Weight initialization for best results
        """
        
        # if nn.Linear
        if isinstance(module, nn.Linear):
            # initialize weights by drawing from normal distribution centered at 0 with std 0.02
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            # if there bias is enabled
            if module.bias is not None:
                # initialize the bias to zeroes
                torch.nn.init.zeros_(module.bias)
        # if nn.Embedding
        elif isinstance(module, nn.Embedding):
            # initialize the weights in the same way as nn.Linear
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
        
        
    def forward(self, idx, targets = None):
        """
        Given a list of indexes, returns the logits probability distribution to draw the next token from.
        Optionally argument for targets.
        If targets is given, it will calculate the loss and return it.
        Otherwise, only the logits will be calculated and the loss will be returned as None.
        """
        # Batches by Time step
        B, T = idx.shape
        
        # get token embedding
        tok_emb = self.token_embedding(idx)
        
        # get position embedding by drawing the embedding vector for each position in the list of indexes given
        pos_emb = self.position_embedding(torch.arange(T, device = device))
        
        # add the token embeddings and the position embeddings and use dropout
        x = self.dropout(tok_emb + pos_emb)
        
        # for each block in the network
        for block in self.heads:
            # pass the inputs into the block
            x = block(x)
            
        # once past all layers a layer norm will be used
        x = self.ln_f(x)
        
        # if targets are provided
        if targets is not None:
            # get logits by passing the inputs into the final linear layer
            logits = self.lm_head(x)
            # then get the loss using cross_entropy and reshaping the logits into B * T, C and the targets into B * T
            loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1), ignore_index = -1)
        # if no targets then no loss can be calculated
        else:
            # take only the last time step because it is likely we will be generating tokens if no targets are given so we only care about the last time step
            # because this is the model's prediction for what token comes next
            logits = self.lm_head(x[:, [-1], :])
            loss = None
            
        return logits, loss
        
        
    def generate(self):
        """
        No parameters to give.
        A random story is generated from scratch by using the <|startofstory|> token and stops generating when the model predicts a <|endofstory|> token
        """
        
        # switch to eval mode so that dropout isnt used
        model.eval()
        
        # create our list of ids containing only the special start token and reshape into B x T (1 x 1)
        ids = [tokenizer.special_tokens["<|startofstory|>"]]
        ids = torch.tensor(ids).view(1, 1).to(device)
    
        # we want to just keep generating until the end of story is predicted by the model
        while True:
            # pass the last block_size tokens into the model
            logits, loss = self(ids[:, -block_size:] if ids.shape[-1] > block_size else ids)
        
            # I think this is actually an unnecessary step because we already get the last time step by not passing any targets to the model
            logits = logits[:, -1, :]
        
            # apply the softmax to get the probability distribution
            logits = F.softmax(logits, dim = -1)
        
            # randomly sample a token from the probability distribution
            prediction = torch.multinomial(logits, num_samples = 1, replacement = True)
            
            # if the sampled token is the special end token, then that means the story is predicted to end here and we break out of the loop
            if prediction == (tokenizer.special_tokens["<|endofstory|>"]):
                break
                
            # otherwise we append the newly predicted token to the ids and go to the next iteration of generation
            ids = torch.cat((ids, prediction), dim = 1)
        
        # finally we take our ids and crop the first special start token out, then we move to cpu, then we convert to np array for efficiency, then we decode using our tokenizer into text
        return tokenizer.decode(np.array(ids[0][1:].to("cpu")))

<h1>Training the tokenizer on a chunk of the validation text</h1>

In [10]:
# initializing the tokenizer using the GPT4 regex pattern
tokenizer = Tokenizer(r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""")

In [11]:
tokenizer_text = "" # text for training the tokenizer

# for each story in the validation set
for story in val_text:
    # add the story to the tokenizer text
    tokenizer_text += story
    
# take only 1/32 th of the tokenizer text because the amount of text in this dataset is just too much and this small chunk is representative enough of the entire dataset
tokenizer_text = tokenizer_text[:len(tokenizer_text) // 32]

In [12]:
# this small of a chunk is still 600k characters
len(tokenizer_text)

599697

In [13]:
# train the tokenizer to a vocab size of 1000 and verbose is on so you can see what merges are made by the tokenizer.
# I think I set the output cell to optional so maybe you are able to toggle it on and off or maybe not I will have to see when I download this as .ipynb
tokenizer.train(tokenizer_text, 1000, verbose = True)

merging (b'h', b'e') -> 256
merging (b' ', b't') -> 257
merging (b' ', b'a') -> 258
merging (b' ', b's') -> 259
merging (b' ', b'w') -> 260
merging (b'n', b'd') -> 261
merging (b' t', b'he') -> 262
merging (b'e', b'd') -> 263
merging (b' t', b'o') -> 264
merging (b' a', b'nd') -> 265
merging (b' ', b'b') -> 266
merging (b'i', b'n') -> 267
merging (b' ', b'h') -> 268
merging (b' w', b'a') -> 269
merging (b'i', b't') -> 270
merging (b'r', b'e') -> 271
merging (b' ', b'f') -> 272
merging (b'o', b'u') -> 273
merging (b' ', b'he') -> 274
merging (b' ', b'l') -> 275
merging (b' ', b'c') -> 276
merging (b' ', b'd') -> 277
merging (b'e', b'r') -> 278
merging (b' wa', b's') -> 279
merging (b' ', b'm') -> 280
merging (b' ', b'p') -> 281
merging (b'o', b'n') -> 282
merging (b'a', b'y') -> 283
merging (b'o', b'm') -> 284
merging (b' ', b'T') -> 285
merging (b'a', b'r') -> 286
merging (b'\n', b'\n') -> 287
merging (b'i', b'l') -> 288
merging (b'a', b't') -> 289
merging (b'in', b'g') -> 290
merging 

<h1>Adding special tokens for the start of a story and the end of a story</h1>

In [14]:
# add our special start and end tokens to our tokenizer
tokenizer.register_special_tokens(["<|startofstory|>", "<|endofstory|>"])

In [15]:
# make sure that our special tokens were added to the vocabulary
tokenizer.vocab[tokenizer.vocab_size - 2], tokenizer.vocab[tokenizer.vocab_size - 1]

(b'<|startofstory|>', b'<|endofstory|>')

<h1>Helper functions to ease the training process</h1>

In [22]:
def build_dataset():
    """
    Builds the training and testing dataset based on the training and validation text pulled from the tinystories dataset
    """
    
    
    train_data = []
    test_data = []
    i = 0
    
    # for each story in train_text
    for story in train_text:
        
        # this is just verbose stuff so I now how fast its going (its excrutiatingly slow)
        if (i + 1) % 1000 == 0:
            print(i + 1)
            print(len(train_data))
            
        # honestly 4000 stories takes like 10 minutes to compile so this was enough for now
        if i > 4000:
            break
            
        # every story starts with the special start token
        train_data.append(tokenizer.special_tokens["<|startofstory|>"])
        # we extend the list of ids acquired from encoding the story text into our tokens via the tokenizer we trained
        train_data.extend(tokenizer.encode(story))
        # we add the special end token to end every story
        train_data.append(tokenizer.special_tokens["<|endofstory|>"])
        
        i += 1
        
    i = 0
    
    # same deal as the above loop
    for story in val_text:
        
        # 1000 stories gives us an 80/20 split which is pretty standard
        if i > 1000:
            break
            
        test_data.append(tokenizer.special_tokens["<|startofstory|>"])
        test_data.extend(tokenizer.encode(story))
        test_data.append(tokenizer.special_tokens["<|endofstory|>"])
        i += 1
    
    return train_data, test_data

In [25]:
def get_batch(split):
    """
    Gets a batch of batch_size from the given split
    """
    
    # get sthe data by indexing into this dictionary using the given split name
    x = {
        "train": train_data,
        "test": test_data,
    }[split]
    
    # gets batch_size indices to get a block_sized chunk of training examples
    ix = torch.randint(0, len(x) - block_size - 1, (batch_size,))
    
    Xb, Yb = [], []
    
    # for each index in ix
    for idx in ix:
        # the training example of size block_size
        Xb.append(train_data[idx: idx + block_size])
        # the targets for each example will always be shifted one time step forward
        Yb.append(train_data[idx + 1: idx + block_size + 1])
        
    # some technicalities for getting the batches on the device that the model is on
    # using np.array because I once got a warning message saying it was faster to convert a bare list to a tensor by converting to an np array first
    # maybe this is wrong but I am not sure and it works the same either way
    Xb = torch.tensor(np.array(Xb)).to(device) 
    Yb = torch.tensor(np.array(Yb)).to(device)
    
    return Xb, Yb

In [26]:
def train():
    """
    Training loop for a model
    """
    
    # make sure in training mode
    model.train()
    
    # initialize the optimizer using the model parameters and the given learning rate
    optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)
    
    # for each training step
    for i in range(total_steps):
        
        # get the training batch
        Xb, Yb = get_batch("train")
        
        # get the logits and the loss
        logits, loss = model(Xb, Yb)
        
        # reset the gradients
        optimizer.zero_grad(set_to_none = True)
        # calculate new gradients
        loss.backward()
        # take a step in the direction of the gradient
        optimizer.step()
        
        # every 100 steps
        if (i + 1) % 100 == 0:
            # print the current step and current loss
            print("Step[{}/{}], Loss: {:.4f}".format(i + 1, total_steps, loss.item()))
            
        # every eval_interval steps
        if (i + 1) % eval_interval == 0:
            # estimate the model's current loss on each data split ("train" and "test")
            get_loss()
        

In [28]:
# decorator so PyTorch doesn't waste compute on calculating unused gradients
@torch.no_grad()
def get_loss():
    """
    Estimates the loss on each split of data
    """
    
    # make sure in eval mode
    model.eval()
    
    # for each split
    for split in ["train", "test"]:
        # placeholder losses tensor of size eval_iters
        losses = torch.zeros(eval_iters)
        
        # for each iteration in eval_iters
        for i in range(eval_iters):
            # get a batch from the split
            Xb, Yb = get_batch(split)
            # get the logit sand loss
            logits, loss = model(Xb, Yb)
            # add an entry into our losses tensor for the calculated loss
            losses[i] = loss.item()
            
        # print mean of the losses and the split we just estimated the loss for
        print(split, losses.mean().item())

<h1>Initiate the dataset and the model, then train the model on the dataset</h1>

In [29]:
# builds the dataset
train_data, test_data = build_dataset()

1000
308779
2000
556040
3000
853034
4000
1106866


In [30]:
# model hyperparameters

total_steps = 2500 # training iterations
batch_size = 64 # batch sizes
learning_rate = 3e-4 # learning rate
n_hidden = 768 # number of dimensions used for embeddings and the hidden layers
block_size = 256 # the context length
dropout = 0.1 # the percentage of neurons set to 0 when dropout is applied to a tensor
eval_interval = total_steps // 4 # how often we will estimate the model's loss
eval_iters = 25 # how many iterations we will use to estiamte the model's loss
n_layers = 6 # number of blocks that the tokens will pass through
n_heads = 6 # number of heads that will pay attention to the tokens (head_size here is n_hidden // n_heads = 128)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # the device to run on (preferrably gpu/cuda, probably the same thing)

# initialize the model and immediately move to the device
model = GPT().to(device)
device # check which device we are working on

43.495914 million parameters


device(type='cuda')

In [31]:
# trains the model using the given hyperparameters above
train()

Step[100/2500], Loss: 5.1399
Step[200/2500], Loss: 4.0775
Step[300/2500], Loss: 3.5853
Step[400/2500], Loss: 3.1255
Step[500/2500], Loss: 2.7549
Step[600/2500], Loss: 2.4643
train 2.274078369140625
test 2.2731637954711914
Step[700/2500], Loss: 2.0863
Step[800/2500], Loss: 1.8307
Step[900/2500], Loss: 1.6626
Step[1000/2500], Loss: 1.3787
Step[1100/2500], Loss: 1.2529
Step[1200/2500], Loss: 1.0215
train 0.9156764149665833
test 0.8992904424667358
Step[1300/2500], Loss: 0.7875
Step[1400/2500], Loss: 0.6209
Step[1500/2500], Loss: 0.5452
Step[1600/2500], Loss: 0.4297
Step[1700/2500], Loss: 0.3501
Step[1800/2500], Loss: 0.2811
train 0.2585066854953766
test 0.25711798667907715
Step[1900/2500], Loss: 0.2660
Step[2000/2500], Loss: 0.2347
Step[2100/2500], Loss: 0.2203
Step[2200/2500], Loss: 0.2015
Step[2300/2500], Loss: 0.1934
Step[2400/2500], Loss: 0.1846
Step[2500/2500], Loss: 0.1629
train 0.17298263311386108
test 0.17800240218639374


In [34]:
# generates 5 random stories from the model to test its quality
for i in range(5):
    print(f"Story {i + 1}:")
    print(model.generate())
    print("------------------------------------------------------------------------------------------------------------------")

Story 1:
Once there was a little girl named Liz. She had a teddy bear who lived in her room. 

One day, Liz was feeling very sad. She wanted to make her dad feel better, but she was bored. She asked her mom, "Can I have a surprise for you?" 

Her mom smiled and said, "Yes, of course I can! Let's explore some competition to use safety."

Lizzy gathered up a bubbloves and carefully brought them inside. She showed them how to make cut them into different pieces. Finally after a lot of hard work, she found them. 

The boy found a big choiceber and things was very happy. He delicious treats had made his friend happy. 

He sang, They hugged each other and celebrated him. From then on, everyone ate the special crane and the silly poppy.
------------------------------------------------------------------------------------------------------------------
Story 2:
One day, two children were walking in the woods. The children called out, â€œLook! There is so much ice over there!â€ But the other chil

<h1>The previous model was half of the size of this one and its output was not too bad but it still made no sense so I will now try this larger model below to see how their outputs differ</h1>
<p>Some comments I want to make:</p>
<ul>
    <li>
        I cut the block size in half. The previous block size allowed the model to see an entire story in its context length since most stories were just about under 256 tokens so I thought this may help the model with knowing how to end a story that it started. However, I think the model learned to rely on having the full 256 tokens of context in order to make its predictions so when I gave it only one token of context, it was not able to generalize well enough to produce stories in the way that it was trained. Although this may not actually be true because the model trains on every time step of context so it trains on examples from varying context from 1 token to the full 256 tokens. In any case, below I am experimenting with a smaller block size and more resourceful model of double the size based on parameters.
    </li>
    <li>
        After training the larger model I have found that it isnt really any better (at least for the same number of training steps). I tried to train the larger model for another 1000 training steps but my notebook crashed after 1 hour and 45 minutes of run time so I think I will stop here for now.
    </li>
</ul>

In [35]:
total_steps = 2500
batch_size = 64
learning_rate = 3e-4
n_hidden = 768
block_size = 128
dropout = 0.1
eval_interval = total_steps // 4
eval_iters = 25
n_layers = 12
n_heads = 12
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = GPT().to(device)
device

85.924842 million parameters


device(type='cuda')

In [36]:
train()

Step[100/2500], Loss: 5.5434
Step[200/2500], Loss: 4.2697
Step[300/2500], Loss: 3.7927
Step[400/2500], Loss: 3.3661
Step[500/2500], Loss: 3.0356
Step[600/2500], Loss: 2.7804
train 2.652148485183716
test 2.636672258377075
Step[700/2500], Loss: 2.4743
Step[800/2500], Loss: 2.2839
Step[900/2500], Loss: 2.0526
Step[1000/2500], Loss: 1.9181
Step[1100/2500], Loss: 1.8679
Step[1200/2500], Loss: 1.7339
train 1.6238445043563843
test 1.623026728630066
Step[1300/2500], Loss: 1.6036
Step[1400/2500], Loss: 1.4009
Step[1500/2500], Loss: 1.3160
Step[1600/2500], Loss: 1.1951
Step[1700/2500], Loss: 1.0070
Step[1800/2500], Loss: 0.8998
train 0.8285203576087952
test 0.8397030830383301
Step[1900/2500], Loss: 0.7673
Step[2000/2500], Loss: 0.6686
Step[2100/2500], Loss: 0.5903
Step[2200/2500], Loss: 0.5510
Step[2300/2500], Loss: 0.4953
Step[2400/2500], Loss: 0.4288
Step[2500/2500], Loss: 0.3737
train 0.4065004587173462
test 0.4059840738773346


In [37]:
for i in range(5):
    print(f"Story {i + 1}:")
    print(model.generate())
    print("------------------------------------------------------------------------------------------------------------------")

Story 1:
The square shone in the park. He saw a big tree and wanted to share it. It was very fast and it walked very fast. The sun was warm and the cat could not heat. The dog started to sort the rock and splash the water in the sun.

The sun was shining on the other person. He thought it was very happy, so he jumped and splashed. Then he heard his mom calling him and tried told them. The staff got in his friends to hear him and@. The dog did not like the staff, but he got tired and carefully. Thepot felt safe to be upset, and the smelly dog was not sad. He wanted to be free again and they would never play together again.
------------------------------------------------------------------------------------------------------------------
Story 2:
One day, Lily's dad and dad took her to a park. There was a big fountain with a ink. He said, "Lily, you can wash my pet friend. She, you must become nice, but can we play if you do better outside because you believe your dad's dad's problems hel