In [81]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/shakespeare/input.txt


<h1>First, we will explore the bigram model for this and test its capabilities. We expect it to be ineffective in generating reasonable results because it is only given the previous character for context which is not nearly enough to be coherent</h1>

In [82]:
# import libraries
import torch
import torch.nn as nn
from torch.nn import functional as F

# seed the random generators for reproducible results
torch.manual_seed(1337)

<torch._C.Generator at 0x7d6166b574d0>

In [83]:
# hyperparameters of model

batch_size = 32 # independent sequences of the data to process in parallel
block_size = 8 # maximum context length for predictions (in bigram model, it only uses the previous character and does not take any thing from more previous characters so this does not do anything)
max_iters = 3000 # iterations of training
eval_interval = 300 # how often to report the loss of the model in training
learning_rate = 1e-2 # how large of a correction the model will make to itself after a training example
device = 'cuda' if torch.cuda.is_available() else 'cpu' # train on gpu if available otherwise cpu
eval_iters = 200 # idk yet LOL!!!!!

print(device)

cuda


In [84]:
# first open the input file in read mode and encode it using utf-8 and read in all the text into variable, text
with open('/kaggle/input/shakespeare/input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()
    
# first turn text into a set to remove duplicate characters, then into a list, then sort the list
chars = sorted(list(set(text))) # this tells us the vocabulary needed by the network to reproduce shakespeare according to our input training text

# as mentioned chars is the vocabulary needed to reproduce and its length is our vocab_size
vocab_size = len(chars)

# first enumerate chars so that it is numbers, but this also returns the original version as well so we take numbers and chars, and we make a dictionary of mappings from chars to numbers
stoi = {ch: i for i, ch in enumerate(chars)}

# same as above but reverse
itos = {i: ch for i, ch in enumerate(chars)}

# now we need an encoder and decoder to convert between strings and integers for us

# lambda means this is a function, encode is a function that takes s (string) as an input and returns a list of integers that are mapped from each character in the string
encode = lambda s: [stoi[c] for c in s]

# function that takes a list of integers and converts to a list of the corresponding character mappings, then joins them into one string
decode = lambda l: ''.join([itos[i] for i in l])

In [85]:
# data preparation

# the data will be a tensor of the encoded text (a long list of integer mappings of the original text), as the datatype of longs because all are integers
data = torch.tensor(encode(text), dtype = torch.long)

# we want a train/val split of 90/10

# this is a marker at the 90% of the dataset
n = int(0.9 * len(data))

# first 90%
train_data = data[:n]

# last 10%
val_data = data[n:]

In [86]:
# data loading
def get_batch(split):
    
    # if we want training split give training data otherwise val data
    data = train_data if split == 'train' else val_data
    
    # generate 32 random integers between 0 and length of data - block_size since we take a consecutive chunk of block_size for the batch
    ix = torch.randint(0, len(data) - block_size, (batch_size,))
    
    # make a vertical stack (2D array basically). for each index in the 32 random indices, we want to pull from the data all the data at the index to the index + block_size to get block_size amount of data
    x = torch.stack([data[i: i + block_size] for i in ix])
    
    # for each index in the 32 random indices, we want to pull from the data all the data at the index + 1 (because we are always predicting the next character) to index + block_size + 1
    y = torch.stack([data[i + 1: i + block_size + 1] for i in ix])
    
    # we want to put these batches on the device that we wish to train on
    x, y = x.to(device), y.to(device)
    
    # we give the data batches back
    return x, y

In [87]:
@torch.no_grad() # decorator to disable gradient tracking

def estimate_loss():
    
    out = {}
    
    # set the model to evaluation mode
    model.eval() # model is a global variable in this example so this works, but better practice to pass in as a function parameter
    
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        
        out[split] = losses.mean()
        
    # set the model to training mode
    model.train()
    
    return out

In [88]:
# simplest bigram model of language model

class BigramLanguageModel(nn.Module):
    
    # on initialization
    def __init__(self, vocab_size):
        
        # call the initialization of the parent class
        super().__init__()
        
        # the token embedding table of this model will be an embedding lookup table of vocab_size by vocab_size because for each previous character (vocab_size possibilities), we want to generate a probability distribution of the next character (vocab_size possibilities)
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    # on feed forward data
    def forward(self, idx, targets = None):
        
        # we pull out the probability distribution created by the model at the indexes we are interested in making predictions on
        logits = self.token_embedding_table(idx)
        
        # if no targets are passed we can't really do any loss calculation of the model's efficacy because we have nothing to validate its results
        if targets is None:
            
            # so the loss is none
            loss = None
            
        # however if we do have targets to compare to
        else:
            
            # we want to get the three dimensions of our probability distribution logits (Bytes (# of batches), Time (location within the batch, we go from 1 character to block_size characters and we take this to be traveling through time), Channels (each instance in time has a probability distribution that determines what the next instance of time will look like (what the next character will be)))
            B, T, C = logits.shape
            
            # in order to comply with the requirements of the cross entropy loss function we need to compress our logits and targets
            
            # we will merge the batches and time dimensions together but maintain the channels
            logits = logits.view(B * T, C)
            
            # we will correspondingly merge the batches and time dimensions of the target
            targets = targets.view(B * T)
            
            # then we can get the loss of the logits based on what the labels are
            loss = F.cross_entropy(logits, targets)
            
        # then we return the logits and loss generated by the model
        return logits, loss
    
    # generating predictions from the model
    def generate(self, idx, max_new_tokens):
        
        # each token is a character prediction by the model so we want to produce as much content as requested
        for _ in range(max_new_tokens):
            
            # idx is the input given to the model so we pass this starting input into the model and it generates the next character
            logits, loss = self(idx)
            
            # then we are only interested in the last time step in the model's sequence of logits production so we index into the last instance in time in the generation process
            logits = logits[:, -1, :] # becomes (B, C)
            
            # then we apply a softmax on the logits by exponentiating them and normalizing to get a probability distribution and we want to do this for the last dimension of logits because this will always be the final result of model in which we are interested in
            probs = F.softmax(logits, dim = - 1)
            
            # then using this probability distribution generated by the model, we want to randomly sample the next character in the sequence
            idx_next = torch.multinomial(probs, num_samples = 1)
            
            # then we will add this new character index to our input and restart the process of adding more characters and producing a max_new_tokens sized output
            idx = torch.cat((idx, idx_next), dim = 1)
            
        # then finally we return the production of the model
        return idx

In [89]:
# model initialization

# bigram model with its vocab size
model = BigramLanguageModel(vocab_size)

# move the model to the device we wish to train on
model = model.to(device)

# then we choose which optimizers we want to use for training and passing in the model parameters to train and the learning rate of training
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

In [90]:
# training time

# for each iteration of training
for iter in range(max_iters):
    
    # if the iteration is one that falls on the evaluation of training interval
    if iter % eval_interval == 0:
        
        # we want to get the current loss on the validation and training set
        losses = estimate_loss()
        
        # then we want to display the current training and validation loss at this iteration
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
    # we get a batch to train on from the training dataset
    xb, yb = get_batch('train')
    
    # we get logits from the model on this batch
    logits, loss = model(xb, yb)
    
    # we reset the gradients on the optimizer and set them to None because this is most efficient
    optimizer.zero_grad(set_to_none = True)
    
    # then we back propagate and calculate gradients for the parameters to descend the loss function
    loss.backward()
    
    # then we take a step in the right direction for the model parameters
    optimizer.step()

step 0: train loss 4.7305, val loss 4.7241
step 300: train loss 2.8110, val loss 2.8249
step 600: train loss 2.5434, val loss 2.5682
step 900: train loss 2.4932, val loss 2.5088
step 1200: train loss 2.4863, val loss 2.5035
step 1500: train loss 2.4665, val loss 2.4921
step 1800: train loss 2.4683, val loss 2.4936
step 2100: train loss 2.4696, val loss 2.4846
step 2400: train loss 2.4638, val loss 2.4879
step 2700: train loss 2.4738, val loss 2.4911


In [91]:
# the input we want to give to the model is just a 1 by 1 matrix that is initially just a 0
# i.e.
#   C0
# R0 0

# this context will be dataype long and want to move it to the device we wish to work on
context = torch.zeros((1, 1), dtype = torch.long, device = device)

# then we want to generate 500 characters given the context using the model specifically on the device we want to work on and then we want to pull out the indexes generated, convert them to a list of integers, and then give them to the decoder to decode so we can print them
print(decode(m.generate(context, max_new_tokens = 500)[0].tolist()))




CEThik brid owindakis b, bth

HAPet bobe d e.
S:
O:3 my d?
LUCous:
Wanthar u qur, t.
War dXENDoate awice my.

Hastarom oroup
Yowhthetof isth ble mil ndill, ath iree sengmin lat Heriliovets, and Win nghir.
Swanousel lind me l.
HAshe ce hiry:
Supr aisspllw y.
Hentofu n Boopetelaves
MPOLI s, d mothakleo Windo whth eisbyo the m dourive we higend t so mower; te

AN ad nterupt f s ar igr t m:

Thin maleronth,
Mad
RD:

WISo myrangoube!
KENob&y, wardsal thes ghesthinin couk ay aney IOUSts I&fr y ce.
J


<h1>Now we will investiage the much more effective transformer model approach to llm which thrives on self-attention and allowing the characters to work with each other to generate predictions rather than being limited to only the previous character to be the only thing the network can use to predict the next character</h1>

In [92]:
# hyperparameters of the transformer model
batch_size = 64 # how many independent sequences to train on in parallel
block_size = 256 # maximum size of input context and previous characters the network can use to predict the next character
max_iters = 5000 # how many iterations of training
eval_interval = 500 # how often to display the current loss in training
learning_rate = 3e-4 # the rate of parameter updating
device = 'cuda' if torch.cuda.is_available() else 'cpu' # deciding which device to train on
eval_iters = 200
n_embed = 384 # how many dimensions the network has to work with for embedding the vocabulary
n_head = 6 # the number of heads to use to extract features from the input data in order to better be able to understand what came before in order to more effectively predict what should come next
n_layer = 6 # the number of linear layers applied throughout the model to help the model not only transform the input into something it can understand, but actually be able to let the information marinate and go through some internal processing before the output
dropout = 0.2 # the proportion of neurons to shut off randomly during each iteration in order to prevent overtraining and overfitting to the dataset to make sure that the network is actually able to perform and predict well and not just simply memorize the data

In [93]:
# definition of a head to extract features from the input using self-attention

class Head(nn.Module):
    """ one head of self-attention """
    
    # on initialization
    def __init__(self, head_size):
        
        # call the parent function initialization
        super().__init__()
        
        # the key attribute will be a linear layer of input n_embed and output head_size with no bias
        self.key = nn.Linear(n_embed, head_size, bias = False)
        
        # the query attribute will be a linear layer of input n_embed and output head_size with no bias
        self.query = nn.Linear(n_embed, head_size, bias = False)
        
        # the value attribute will be a linear layer of input n_embed and output head_size with no bias
        self.value = nn.Linear(n_embed, head_size, bias = False)
        
        # we will register a buffer called 'tril' which will be a tril applied to a block_size by block_size tensor of ones
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        # i.e.
        # 1 1 1     1 0 0
        # 1 1 1 --> 1 1 0
        # 1 1 1     1 1 1
        
        # this is because when doing matrix multiplication like this and normalizing each row, you will be able to get a cumulative average as you progress through the rows which is what we want
        # the idea is that the prediction for the next character will take into account the previous characters
        # a primitive way of doing this is to simply just take the average of the previous characters and use this to predict the next character
        # however, a more effective and precise way, is to allow the model to assign its own weights and values to the previous characters based on what they are and their effect on what the next character might be
        
        # on the level of words, lets say that we are writing a sentence and all the previous words in the sentence where we want to predict the next word are all nouns and adjectives.
        # well in order for this to be a sentence we must have a verb, so the model is very likely to predict a verb as the next word in the sequence if it looks at the previous words and sees that they are all nouns and adjectives.
        # it will choose which verb based on its own feelings about which words that have come before are most interesting and their weights on what the predicted word will be
        
        # a simple example:
        # context a disabled man ...
        # the model will look at the past and see oh, there are no verbs so the next one could be a verb, and then it takes special note of the adjective describing the man as disabled and then reduces the likelihood that the verb has to do with physical action
        # in the primitive averaging the past words approach, the model may have missed out on this key detail that the man is disabled because an average is a lossy conversion and this important information may have been lost in the average of the past words
        # this is especially true when there are many past words in the hundreds and it becomes much better if the model is able to pick and choose which words stand out the most to it and be able to predict the next word based on its own accord
        
        # then the dropout attribute will be a dropout layer that drops 20% of the weights
        self.dropout = nn.Dropout(dropout)
        
    # then we feed forward the input
    def forward(self, x):
        
        # we get the B, T, C dimensions of the input
        B, T, C = x.shape
        
        # we get the keys for the given input
        k = self.key(x) # B, T, head_size
        
        # we get the queries for the given input
        q = self.query(x) # B, T, head_size
        
        # then we get our weights by matrix multiplying the queries by the transpose of the keys along the first and second dimension, and then we kaiming init/normalize by dividing by the square root of the head_size
        weights = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, head_size) @ (B, head_size, T )-> (B, T, T)
        
        # then we apply the tril mask by setting anything in the weights that would be 0 on the tril matrix to -infinity
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        
        # then we apply a softmax along the last dimension of the weights so that we can exponentiate the logits and normalize them to get some type of probability distribution 
        weights = F.softmax(weights, dim = -1) # (B, T, T)
        
        # then we randomly dropout some of the weights
        weights = self.dropout(weights)
        
        # then we apply the values onto the input 
        v = self.value(x) # (B, T, head_size)
        
        # then depending on how interested the model is in the values at certain locations, it will extract more of those values and take them into more consideration when making the prediction
        out = weights @ v # (B, T, T) @ (B, T, head_size) 
        
        # the keys and queries go hand in hand.
        # the model makes a query and gets the resulting keys for the queries
        # for characters that the model finds to stand out and be more interesting, it will extract more value out of that character
        # for characters that the model does not care about, it will extract little value out of the character
        
        # in the "a disbaled man" example at the word levels
        
        # 0.2 0 0
        # 0.3 0.7 0      -> multiplying this by another matrix will pull out the most from disabled meaning that the model has taken most interest in this adjective and will use a good portion of this word in order to make its prediction
        # 0.1 0.7 0.2
        
        # then the model will return its output from the transforming head
        return out

In [94]:
# multiple heads used in parallel

class MultiHeadAttention(nn.Module):
    """ multiple heads of self attention in parallel"""
    
    # on initialization
    def __init__(self, num_heads, head_size):
        
        # call the parent function intialization
        super().__init__()
        
        # the heads attribute will be a list of modules. this list of modules will be num_heads of heads with each Head having head_size
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        
        # the proj attribute will be for when the model will project and convert the head_size * num_heads input into n_embed output
        self.proj = nn.Linear(head_size * num_heads, n_embed)
        
        # then the dropout attribute will be a dropoutlayer that drops out 20% of the weights
        self.dropout = nn.Dropout(dropout)
        
    # when feeding forward an input
    def forward(self, x):
        # for each head in the heads of the model, we pass the input to each head, then we concatenate the results of heads on the last dimension
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        
        # then we apply the last linear layer of projection and then dropout 20% of the neurons
        out = self.dropout(self.proj(out))
        
        # then return the output
        return out

In [95]:
# feeding forward input

class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    
    # on initialization
    def __init__(self, n_embed):
        
        # call the parent function intialization
        super().__init__()
        
        # the net attribute will init the network as a sequential
        self.net = nn.Sequential(
            
            # linear layer that takes n_embed and gives 4 * n_embed
            nn.Linear(n_embed, 4 * n_embed),
            
            # relu non linearity
            nn.ReLU(),
            
            # linear layer that takes 4 * n_embed and gives n_embed
            nn.Linear(4 * n_embed, n_embed),
            
            # dropout that drops 20% of neurons
            nn.Dropout(dropout),
        )
        
    # when feeding input forward
    def forward(self, x):
        
        # pass the input into the network
        return self.net(x)

In [96]:
# a block within the network

class Block(nn.Module):
    """ Transformer block: communication followed by computation """
    
    # on intialization
    def __init__(self, n_embed, n_head):
        
        # call the parent function initialization
        super().__init__()
        
        # the head size will be the number of embeddings divided by the number of heads, it will be split among each head
        head_size = n_embed // n_head
        
        # the self attention attribute will be the the multi head attention part which will take in the number of heads and the size of each head
        self.sa = MultiHeadAttention(n_head, head_size)
        
        # the feed forward part will feed forward the input
        self.ffwd = FeedForward(n_embed)
        
        # the first layer norm applied (layer norm is just batch norm excpet instead of normalizing along the dimension of each batch, we normalize along the dimension of the layer)
        self.ln1 = nn.LayerNorm(n_embed)
        
        # the second layer norm applied
        self.ln2 = nn.LayerNorm(n_embed)
        
    # when feeding forward input to the block
    def forward(self, x):
        
        # residual connections are like a tree where the model goes from the input to the output along a big tall tree trunk but the residual connections are where the model branches out performs some computation and then rergroups with the trunk and branches back into it
        
        # we add residual connections where the model is able to normalize the layer, and then perform the multiheaded self attention and based on this computation add it back into the calculations
        x = x + self.sa(self.ln1(x))
        
        # there is a second residual connection where the model normalizes the layer, then feeds forward some linear layers, relu, and dropout layer, then adds back its calculations to the main branch
        x = x + self.ffwd(self.ln2(x))
        
        # return the ouput of computation
        return x

In [97]:
# the full gpt language model with all its parts 

class GPTLanguageModel(nn.Module):
    
    # on initialization
    def __init__(self):
        
        # call parent initialization function
        super().__init__()
        
        # make the embedding table for each token (vocabulary word) with the number of embeddings requested
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        
        # make an embedding table for each position in the block_size with the number of embeddings for each position
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        
        # we want to make n_layer number of blocks with a number of embeddings and number of heads, and we want to contain this list within a sequential module
        self.blocks = nn.Sequential(*[Block(n_embed, n_head = n_head) for _ in range(n_layer)])
        
        # the ln_f attribute will be our layer norm
        self.ln_f = nn.LayerNorm(n_embed)
        
        # the lm_head attribute will be our final linear layer that takes in n_embed as input and generates logits of vocab_size as expected
        self.lm_head = nn.Linear(n_embed, vocab_size)
        
        # then we apply to the model the specific intialization of weights
        self.apply(self._init_weights)
        
    # intialization of weights
    def _init_weights(self, module):
        
        # if the module we are looking at is a linear layer
        if isinstance(module, nn.Linear):
            
            # we want to initialize its weights as a normal distribution of mean 0 and standard deviation 0.02 (this is probably because the network is so large and senstitive that we want very small values as these will blow up inevitably as they work their way through the network)
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            
            # if this linear layer has bias
            if module.bias is not None:
                
                # we want to initialize the bias as all zeroes
                torch.nn.init.zeros_(module.bias)
                
        # however if this is an embedding lookup table
        elif isinstance(module, nn.Embedding):
            # we want the weights of the lookup table to be drawn from a normal distribute with mean 0 and standard deviation 0.02
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            
    # when feeding input forward into the model
    def forward(self, idx, targets = None):
        
        # we expect to receive our input in a 2D form of bytes by time
        B, T = idx.shape
        
        # we will get the token embeddings from the token embedding table given these indices
        tok_emb = self.token_embedding_table(idx)
        
        # we will get the position embeddings from the position embedding table by indexing into the table by position in the sequence
        pos_emb = self.position_embedding_table(torch.arange(T, device = device))
        
        # we will start off as a combination of token embeddings and position embeddings
        x = tok_emb + pos_emb
        
        # then we will apply the multiple blocks onto this initial input
        x = self.blocks(x)
        
        # then we will apply a layer norm before we make our prediction
        x = self.ln_f(x)
        
        # then we will pull logits out of our final linear layer's conversion of this input into a vocab_size probability distribution
        logits = self.lm_head(x)
        
        # if there are no targets given to compare to
        if targets is None:
            # then the loss will be none
            loss = None
            
        # however, if we are given targets to measure our accuracy relative to
        else:
            
            # we take the three dimensions of the logits
            B, T, C = logits.shape
            
            # merge the bytes (batches) and time (positions) dimension of the logits
            logits = logits.view(B * T, C)
            
            # merge the bytes and time dimensions of the targets
            targets = targets.view(B * T)
            
            # and then push them through the cross_entropy loss function to get the loss
            loss = F.cross_entropy(logits, targets)
            
        # then we return the logits and loss generated whether there were targets or not
        return logits, loss
    
    # when making generations
    def generate(self, idx, max_new_tokens):
        
        # we want to make max_new_tokens character generations
        for _ in range(max_new_tokens):
            
            # crop idx to be the last block_size tokens of input
            idx_cond = idx[:, -block_size:]
            
            # then we pass this input into the model which outputs logits and loss (in this case loss is 0 since we have no targets to compare to)
            logits, loss = self(idx_cond)
            
            # then we are only interested in the last instance of time of generation so logits goes from (B, T, C) to (B, C) as we pulled out the last instance of time
            logits = logits[:, -1, :]
            
            # then we take the logits and exponetiate them and normalize them into a probability distribution along the last dimension C (the channels)
            probs = F.softmax(logits, dim = -1)
            
            # then we get the index of the next character predicted by the model by taking the probability distribution generated by the model and using it to sample one character
            idx_next = torch.multinomial(probs, num_samples = 1)
            
            # then we add the new index to our list of indices for us to decode the generation of the model into string text
            idx = torch.cat((idx, idx_next), dim = 1)
            
        # we return the generation of the model
        return idx

In [98]:
# model time!!

# initialize the GPTLanguageModel
model = GPTLanguageModel()

# move the model to the device we wish to train on
model = model.to(device)

# get the number of parameters by looping through each parameter in the model parameters and getting the number of elements in those parameters and taking the sum of those, dividing by 1e-6 to get in units of millions
print(sum(p.numel() for p in model.parameters())/1e6, 'million parameters')

# init the optimizer as AdamW and passing in the model parameters to be updated along with the learning rate of updating them
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

10.788929 million parameters


In [99]:
# training time!!!

# for each iteration of training
for iter in range(max_iters):
    
    # if the iteration is one that falls on the evaluation of training interval
    if iter % eval_interval == 0 or iter == max_iters - 1:
        
        # we want to get the current loss on the validation and training set
        losses = estimate_loss()
        
        # then we want to display the current training and validation loss at this iteration
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
    # we get a batch to train on from the training dataset
    xb, yb = get_batch('train')
    
    # we get logits from the model on this batch
    logits, loss = m(xb, yb)
    
    # we reset the gradients on the optimizer and set them to None because this is most efficient
    optimizer.zero_grad(set_to_none = True)
    
    # then we back propagate and calculate gradients for the parameters to descend the loss function
    loss.backward()
    
    # then we take a step in the right direction for the model parameters
    optimizer.step()

step 0: train loss 4.2669, val loss 4.2666
step 500: train loss 1.7948, val loss 1.9358
step 1000: train loss 1.4007, val loss 1.6216
step 1500: train loss 1.2764, val loss 1.5282
step 2000: train loss 1.1932, val loss 1.4974
step 2500: train loss 1.1275, val loss 1.4875
step 3000: train loss 1.0683, val loss 1.4856
step 3500: train loss 1.0195, val loss 1.5105
step 4000: train loss 0.9635, val loss 1.5149
step 4500: train loss 0.9110, val loss 1.5492
step 4999: train loss 0.8566, val loss 1.5787


In [100]:
# this context will be dataype long and want to move it to the device we wish to work on
context = torch.zeros((1, 1), dtype = torch.long, device = device)

# then we want to generate 500 characters given the context using the model specifically on the device we want to work on and then we want to pull out the indexes generated, convert them to a list of integers, and then give them to the decoder to decode so we can print them
print(decode(m.generate(context, max_new_tokens = 500)[0].tolist()))


To where I live saway? O do I swell thou wilt,
To show a tosse strawing become of mine hours: I
What doth nothing would said it were, be not sworn well
From which proter off, and justice comes.

SICINIUS:
Have we a sweary wife
That whereof? we have said here comes
With our fruit-pilegr wrection laid,
And by the weeds of whoxes her from them
The father's friends, rends to Time, and we two
The grates are on it.

First Soldier:
Have you slipted with the war fair?

ARTCUS:
Upon the tyrannous and Rom


In [101]:
# i would like to compare this model trained on 5000 steps with loss 0.8566 and val 1.5787 to the model trained on 3000 steps with loss 1.0683 and val 1.4856. The question I want to answer here is whether it is more important to have a lower training loss for text generation or a lower val loss

# hyperparameters of the transformer model
batch_size = 64 # how many independent sequences to train on in parallel
block_size = 256 # maximum size of input context and previous characters the network can use to predict the next character
max_iters = 3000 # how many iterations of training
eval_interval = 500 # how often to display the current loss in training
learning_rate = 3e-4 # the rate of parameter updating
device = 'cuda' if torch.cuda.is_available() else 'cpu' # deciding which device to train on
eval_iters = 200
n_embed = 384 # how many dimensions the network has to work with for embedding the vocabulary
n_head = 6 # the number of heads to use to extract features from the input data in order to better be able to understand what came before in order to more effectively predict what should come next
n_layer = 6 # the number of linear layers applied throughout the model to help the model not only transform the input into something it can understand, but actually be able to let the information marinate and go through some internal processing before the output
dropout = 0.2 # the proportion of neurons to shut off randomly during each iteration in order to prevent overtraining and overfitting to the dataset to make sure that the network is actually able to perform and predict well and not just simply memorize the data

# initialize the GPTLanguageModel
model = GPTLanguageModel()

# move the model to the device we wish to train on
m = model.to(device)

# get the number of parameters by looping through each parameter in the model parameters and getting the number of elements in those parameters and taking the sum of those, dividing by 1e-6 to get in units of millions
print(sum(p.numel() for p in m.parameters())/1e6, 'million parameters')

# init the optimizer as AdamW and passing in the model parameters to be updated along with the learning rate of updating them
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

10.788929 million parameters


In [102]:
# for each iteration of training
for iter in range(max_iters):
    
    # if the iteration is one that falls on the evaluation of training interval
    if iter % eval_interval == 0 or iter == max_iters - 1:
        
        # we want to get the current loss on the validation and training set
        losses = estimate_loss()
        
        # then we want to display the current training and validation loss at this iteration
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
    # we get a batch to train on from the training dataset
    xb, yb = get_batch('train')
    
    # we get logits from the model on this batch
    logits, loss = m(xb, yb)
    
    # we reset the gradients on the optimizer and set them to None because this is most efficient
    optimizer.zero_grad(set_to_none = True)
    
    # then we back propagate and calculate gradients for the parameters to descend the loss function
    loss.backward()
    
    # then we take a step in the right direction for the model parameters
    optimizer.step()

step 0: train loss 4.1869, val loss 4.1959
step 500: train loss 1.7744, val loss 1.9227
step 1000: train loss 1.4043, val loss 1.6082
step 1500: train loss 1.2732, val loss 1.5263
step 2000: train loss 1.1967, val loss 1.5059
step 2500: train loss 1.1352, val loss 1.4893
step 2999: train loss 1.0780, val loss 1.4891


In [103]:
# this context will be dataype long and want to move it to the device we wish to work on
context = torch.zeros((1, 1), dtype = torch.long, device = device)

# then we want to generate 500 characters given the context using the model specifically on the device we want to work on and then we want to pull out the indexes generated, convert them to a list of integers, and then give them to the decoder to decode so we can print them
print(decode(m.generate(context, max_new_tokens = 500)[0].tolist()))


TAO, but thy brother gautes; 'tis fast thou did'st:
Thy heavensings! while the time in thy face, like trum!
Wroum like to have an head, apotIter,
So rudly, quart true! Who, senses! the wormst be
Might the book, some ready, to walk, and put I dark thee,
More-than chased with thee one:
Good with the son of station twen times of the low,
Show thy breath and these triner of time,
But in thy place kingdom out on't.

JULIET:
And were the namedples that ames-day in my chief:
What, you deserts doth sple


**5000 steps, train loss 0.8566, val loss 1.5787 model generation:**

*To where I live saway? O do I swell thou wilt,
To show a tosse strawing become of mine hours: I
What doth nothing would said it were, be not sworn well
From which proter off, and justice comes.*

*SICINIUS:
Have we a sweary wife
That whereof? we have said here comes
With our fruit-pilegr wrection laid,
And by the weeds of whoxes her from them
The father's friends, rends to Time, and we two
The grates are on it.*

*First Soldier:
Have you slipted with the war fair?*

*ARTCUS:
Upon the tyrannous and Rom*


**3000 steps, train loss 1.0780, val loss 1.4891 model generation:**

*TAO, but thy brother gautes; 'tis fast thou did'st:
Thy heavensings! while the time in thy face, like trum!
Wroum like to have an head, apotIter,
So rudly, quart true! Who, senses! the wormst be
Might the book, some ready, to walk, and put I dark thee,
More-than chased with thee one:
Good with the son of station twen times of the low,
Show thy breath and these triner of time,
But in thy place kingdom out on't.*

*JULIET:
And were the namedples that ames-day in my chief:
What, you deserts doth sple*


Personally, I would say the first one with the lower train loss and slightly higher val loss did better, so perhaps in the case of random text generation, the lower train loss seems to do better
