## Import statements and dependencies

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

block_size = 8
batch_size = 4
max_iters = 50000
eval_interval = 2500
learning_rate = 5e-2
eval_iters = 250

# dropp out random neurons to prevent overfitting
dropout = 0.2

cuda


## Exploration of dataset

In [27]:
with open('wizard_of_oz.txt', "r", encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)
print(vocab_size)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


## Defining encoder and decoder

using character level tokeniser which takes each character and converts it into an integer equivalent. small amount of tokens but a lot to decode and encode.

vs

word level tokeniser, which takes each word and converts it into a integer equivalent. large number of tokens but less to decode and encode.

vs

subword tokeniser which is somewhere in the moddle of character level and word level tokeniser.

In [28]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }

# populating hash maps for decoding and encoding

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: "".join([int_to_string[i] for i in l])

print("encoder example:")
print(encode("Hello"))
print("\ndecoder example:")
print(decode([32, 58, 65, 65, 68]))

print("\nencoding wizard of oz text:")
print(encode(text))

encoder example:
[32, 58, 65, 65, 68]

decoder example:
Hello

encoding wizard of oz text:
[80, 28, 39, 42, 39, 44, 32, 49, 1, 25, 38, 28, 1, 44, 32, 29, 1, 47, 33, 50, 25, 42, 28, 1, 33, 38, 1, 39, 50, 0, 0, 26, 49, 0, 0, 36, 11, 1, 30, 42, 25, 38, 35, 1, 26, 25, 45, 37, 0, 0, 25, 45, 44, 32, 39, 42, 1, 39, 30, 1, 44, 32, 29, 1, 47, 33, 50, 25, 42, 28, 1, 39, 30, 1, 39, 50, 9, 1, 44, 32, 29, 1, 36, 25, 38, 28, 1, 39, 30, 1, 39, 50, 9, 1, 39, 50, 37, 25, 1, 39, 30, 1, 39, 50, 9, 1, 29, 44, 27, 11, 0, 0, 33, 36, 36, 45, 43, 44, 42, 25, 44, 29, 28, 1, 26, 49, 1, 34, 39, 32, 38, 1, 42, 11, 1, 38, 29, 33, 36, 36, 0, 0, 26, 39, 39, 35, 43, 1, 39, 30, 1, 47, 39, 38, 28, 29, 42, 1, 47, 33, 36, 36, 33, 25, 37, 1, 37, 39, 42, 42, 39, 47, 1, 4, 1, 27, 39, 11, 9, 1, 33, 38, 27, 11, 1, 38, 29, 47, 1, 49, 39, 42, 35, 0, 0, 0, 51, 33, 65, 65, 74, 72, 73, 71, 54, 73, 62, 68, 67, 52, 0, 0, 0, 27, 39, 40, 49, 42, 33, 31, 32, 44, 1, 13, 21, 12, 20, 1, 26, 49, 1, 36, 11, 1, 30, 42, 25, 38, 35, 1, 26, 25,

## Using torch tensors

In [29]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data)

tensor([80, 28, 39,  ..., 29, 67, 57])


## Train & validation split

In [30]:
n = int(0.8*len(data))
train_data=data[:n]
validation_data=data[n:]

def get_batch(split):
    data = train_data if split == 'train' else validation_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

x, y = get_batch('train')
print('inputs: ')
print(x)
print('targets:')
print(y)


inputs: 
tensor([[62, 73,  9,  0, 54, 67, 57,  1],
        [61, 58, 71,  1, 60, 68, 68, 57],
        [ 1, 60, 71, 68, 74, 69, 72,  1],
        [69, 71, 58, 72, 58, 67, 73, 11]], device='cuda:0')
targets:
tensor([[73,  9,  0, 54, 67, 57,  1, 28],
        [58, 71,  1, 60, 68, 68, 57,  1],
        [60, 71, 68, 74, 69, 72,  1, 68],
        [71, 58, 72, 58, 67, 73, 11,  0]], device='cuda:0')


In [31]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print("When input is", context, "the target is ", target)

When input is tensor([80]) the target is  tensor(28)
When input is tensor([80, 28]) the target is  tensor(39)
When input is tensor([80, 28, 39]) the target is  tensor(42)
When input is tensor([80, 28, 39, 42]) the target is  tensor(39)
When input is tensor([80, 28, 39, 42, 39]) the target is  tensor(44)
When input is tensor([80, 28, 39, 42, 39, 44]) the target is  tensor(32)
When input is tensor([80, 28, 39, 42, 39, 44, 32]) the target is  tensor(49)
When input is tensor([80, 28, 39, 42, 39, 44, 32, 49]) the target is  tensor(1)


## Language Model Class

In [32]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        # a probability distribution table of what is going to come next
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None

        else:
            # batch size, sequence length, vocabulary size
            # elements reflect the probability of that token being the next output
            B, T, C = logits.shape

            # needs to reshape logits and targets, because the cross_entropy expects a B by C by T instead of a B by T by C as seen above.
            # we combine B and T into one parameter (B*T = N). note: need to know how torch expects the inputs to be based on the documentation.
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array on indices in the current context
        for _ in range(max_new_tokens):
            # get the probability distribution based on the input index
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C). only select the probabilities that correspond to the last token
            # apply softmax to convert them to probabilities
            probs = F.softmax(logits, dim=-1) # (B, C), only want the last dimension
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)

        return index
    
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# torch decorator to make sure torch library doesnt use graidents in this function. we only want to calculate the loss since this method is meant for evaluation rather than for improving model performance.
@torch.no_grad()
def estimate_loss():
    out = {}
     
    # model is being tested. disables certain behavior like dropout or batch normalisation which is not required.
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()

    # puts the model back in training mode. 
    model.train()
    return out


context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


QTT5??! 9?K?JwH'9cuX8xFnm_7Or P4aZODWdhYv!"O[Y74li_g?zqUZdrp[nIVY""hf2JW"XJzgL5B3Noo]Jo﻿cjm](F3uqU(T.Qu]ZzV6fDNO'7)0J;BK2beuDZ
ndY2XxfqLiD3125zv-;piIV -KvzRXXLws'StePgK,1iN741ZkY.R0y]&Mj[oQcm
-PSmPWKwHZ3j)zQXFu-a8NGzayS_!b
-f9'*I3Mlb8nuY((fu2baG[X4ncvza5w﻿l4S3n415TvO0Ij(﻿7Sjb8TeBoo*Bj0s;AqoK?u!eFA)cKmpuN;Y(hg*.]J._9.PBh]HaDm2Vs4Hjh
DoMC-eSnKS1s?.Pm5Z6VbqenAcKXirkITyuW8sQx"_NSBo,.Uwqv
P2;l:rs7h)i,qGLGG&MjdYSjXa-JCB-eyoNu﻿B﻿_bMtm1it58&lzTy*N2iK﻿84T﻿74R3V878"gshL﻿.i(cBpEz",WSCLiauq.Pz"]HyD1 31 ﻿3MC


## Optimiser

In [33]:
# create torch optimiser
optimiser = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimiser.zero_grad(set_to_none=True)
    loss.backward()
    optimiser.step()

print(loss.item())

step: 0, train loss: 4.841, val loss: 4.823
step: 250, train loss: 2.604, val loss: 2.690
step: 500, train loss: 2.519, val loss: 2.525
step: 750, train loss: 2.487, val loss: 2.547
step: 1000, train loss: 2.495, val loss: 2.509
step: 1250, train loss: 2.507, val loss: 2.549
step: 1500, train loss: 2.522, val loss: 2.530
step: 1750, train loss: 2.452, val loss: 2.543
step: 2000, train loss: 2.478, val loss: 2.508
step: 2250, train loss: 2.477, val loss: 2.507
step: 2500, train loss: 2.472, val loss: 2.519
step: 2750, train loss: 2.455, val loss: 2.498
step: 3000, train loss: 2.444, val loss: 2.491
step: 3250, train loss: 2.469, val loss: 2.494
step: 3500, train loss: 2.449, val loss: 2.496
step: 3750, train loss: 2.471, val loss: 2.512
step: 4000, train loss: 2.475, val loss: 2.507
step: 4250, train loss: 2.471, val loss: 2.495
step: 4500, train loss: 2.464, val loss: 2.530
step: 4750, train loss: 2.473, val loss: 2.484
step: 5000, train loss: 2.453, val loss: 2.524
step: 5250, train l

In [34]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


CHed f Wichtoid dned e Sothe

an ist Lan, wantellk thed to ise wn."Ozekend tol fand d thencr.
thangr al of bis thed me war ullionldelyin way tchess T io refry."ACharinct, tin, I us rars
imad porstrkiellkealky ppisencln Sisind an sintheris

"A gy indissl tr atheven ize otheved ndvenalk gamed t e aklke se acume med oy omad Raintan quthesouid d tary, nedet to pue he saginearere hecke ay inl Win hense ay.

"I se; herf and mavend
"Nofatime rl fonithyean-thy winca ue s anthitem, THopenthioors t; qugon


## Concepts

### Gradient descent

to calculate loss, take the negative log of the likelihood of a correct prediction. the aim is to reduce the loss to increase the chance of correctly predicting the target. We can do this by using gradient descent. find the derivative of the negative log likelihood and find a way to get to either a global or a local minimum.

### Types of optimisers

1. Mean Squared Error (MSE): MSE is a common loss function used in regression problems, where the goal is to predict a continuous output (non-discrete). It measures the average squared difference between the predicted and actual values, and is often used to train neural networks for regression tasks.

2. Gradient Descent (GD): an optimization algorithm used to minimuse the loss function of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. The idea of GD is to iteratively adjust the model parameters in the direction of the steepest descent of the loss function.

3. Momentum: Momentum is an extension of SGD that adds a "momentum" term to the parameter updates. This term helps smooth out the updates and allows the optimiser to continue moving in the right direction, even if the gradient changes direction or varies in magnitude. Momentum is particularly useful for training deep neural networks. This is done by allowing the momentum term in optimization algorithms like SGD with momentum adjusts the weights based on both the magnitude and the direction of the gradient, and it uses the past gradients to influence future updates.

4. RMSprop: RMSprop is another extension of SGD that adds a moving average of the squared gradient to adapt the learning rate of each parameter. This helps to avoid oscillations in the parameter updates and can improve convergence in some cases.

5. Adam: Adam is a popular optimisation algorithm that combines the ideas of momentum and RMSprop. It uses a moving average of both the gradient and its squared value to adapt the learning rate of each parameter. Adam is often used as a default optimiser for deep learning models.

6. AdamW: AdamW is a modification of the Adam optimiser that adds weight decay to the paramter updates, This helps to regularise the model and can improve generalisation performance.