In [20]:
# importing libraries
import torch
import tiktoken
import torch.nn as nn
from torch.nn import functional as F
# prefer the GPU when torch can see one, otherwise run on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [21]:
# read the text file
def read_text_file(file_path):
    """Read the file at ``file_path`` as UTF-8 and return its whole contents as one string."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

In [22]:
# load the raw training text, drop any byte-order-mark characters at either
# end, and flatten it onto a single line by turning newlines into spaces
file_path = 'training_text.txt'
text = read_text_file(file_path).strip("\ufeff").replace("\n", " ")

In [23]:
# number of characters in the text, followed by its first 100 characters
print(len(text), text[:100], sep="\n")

270151
In my younger and more vulnerable years my father gave me some advice that I’ve been turning over in


In [24]:
# every distinct character in the text, in sorted order; this is the vocabulary
char = sorted(set(text))
vocab_size = len(char)
print(''.join(char), vocab_size, sep="\n")

 !$()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXY[]abcdefghijklmnopqrstuvwxyzçéêô —‘’“”…
86


In [25]:
# bringing in tiktoken's tokenizer

# THIS WILL NOT WORK for this notebook: the GPT-2 vocabulary is much larger
# than the vocab_size derived from the characters of the training text

enc = tiktoken.get_encoding("gpt2")
# number of tokens in the GPT-2 vocabulary
enc.n_vocab

50257

In [26]:
# testing with encoding and decoding; the round trip works, but note that our
# own vocab_size is only 86 while tiktoken draws ids from a much larger vocabulary
encode = enc.encode("hello")
print(encode)
enc.decode(encode)

# same round trip on the first 100 characters of the training text
encode1 = enc.encode(text[:100])
print(encode1)
enc.decode(encode1)

[31373]
[818, 616, 7099, 290, 517, 8826, 812, 616, 2988, 2921, 502, 617, 5608, 326, 314, 447, 247, 303, 587, 6225, 625, 287]


'In my younger and more vulnerable years my father gave me some advice that I’ve been turning over in'

In [27]:
# hence we need to tokenize the vocab ourselves:
# stoi maps each character to its integer id, itos maps each id back to its character
stoi = {symbol: index for index, symbol in enumerate(char)}
itos = dict(enumerate(char))

def encode(str):
    """Convert a string into a list of integer ids, one per character, via ``stoi``.

    Raises KeyError if a character is not a key of ``stoi``.
    """
    ids = []
    for symbol in str:
        ids.append(stoi[symbol])
    return ids

def decode(data):
    """Convert an iterable of integer ids back into a string via ``itos``.

    Raises KeyError if an id is not a key of ``itos``.
    """
    symbols = [itos[token] for token in data]
    return "".join(symbols)

# round-trip a short sample through the character-level encode/decode pair
foo = encode("foo")
print(foo, decode(foo), sep="\n")

[54, 63, 63]
foo


In [28]:
# testing encode with my own functions
# notice how the ids differ from tiktoken's output;
# since we cannot use tiktoken, this will do
test_encode = encode(text[:100])
print(test_encode, decode(test_encode), sep="\n")

[30, 62, 0, 61, 73, 0, 73, 63, 69, 62, 55, 53, 66, 0, 49, 62, 52, 0, 61, 63, 66, 53, 0, 70, 69, 60, 62, 53, 66, 49, 50, 60, 53, 0, 73, 53, 49, 66, 67, 0, 61, 73, 0, 54, 49, 68, 56, 53, 66, 0, 55, 49, 70, 53, 0, 61, 53, 0, 67, 63, 61, 53, 0, 49, 52, 70, 57, 51, 53, 0, 68, 56, 49, 68, 0, 30, 82, 70, 53, 0, 50, 53, 53, 62, 0, 68, 69, 66, 62, 57, 62, 55, 0, 63, 70, 53, 66, 0, 57, 62]
In my younger and more vulnerable years my father gave me some advice that I’ve been turning over in


In [29]:
# now encoding the entire text into one tensor of 64-bit integer ids
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.size(), data.dtype)
print(data[0:500])

torch.Size([270151]) torch.int64
tensor([30, 62,  0, 61, 73,  0, 73, 63, 69, 62, 55, 53, 66,  0, 49, 62, 52,  0,
        61, 63, 66, 53,  0, 70, 69, 60, 62, 53, 66, 49, 50, 60, 53,  0, 73, 53,
        49, 66, 67,  0, 61, 73,  0, 54, 49, 68, 56, 53, 66,  0, 55, 49, 70, 53,
         0, 61, 53,  0, 67, 63, 61, 53,  0, 49, 52, 70, 57, 51, 53,  0, 68, 56,
        49, 68,  0, 30, 82, 70, 53,  0, 50, 53, 53, 62,  0, 68, 69, 66, 62, 57,
        62, 55,  0, 63, 70, 53, 66,  0, 57, 62,  0, 61, 73,  0, 61, 57, 62, 52,
         0, 53, 70, 53, 66,  0, 67, 57, 62, 51, 53,  8,  0,  0, 83, 44, 56, 53,
        62, 53, 70, 53, 66,  0, 73, 63, 69,  0, 54, 53, 53, 60,  0, 60, 57, 59,
        53,  0, 51, 66, 57, 68, 57, 51, 57, 74, 57, 62, 55,  0, 49, 62, 73, 63,
        62, 53,  6, 84,  0, 56, 53,  0, 68, 63, 60, 52,  0, 61, 53,  6,  0, 83,
        58, 69, 67, 68,  0, 66, 53, 61, 53, 61, 50, 53, 66,  0, 68, 56, 49, 68,
         0, 49, 60, 60,  0, 68, 56, 53,  0, 64, 53, 63, 64, 60, 53,  0, 57, 62,
       

In [30]:
# first 90% of the tokens are used for training, the remaining 10% for validation
n = int(0.9*len(data))
train, val = data[:n], data[n:]

In [31]:
# block_size is the length of each input window; one window plus its final
# target spans block_size + 1 consecutive tokens
block_size = 8
train[0:block_size + 1]

tensor([30, 62,  0, 61, 73,  0, 73, 63, 69])

In [32]:
# setting batch size
batch_size = 4

def get_batch(split):
    """Sample a random batch of input/target windows from one of the data splits.

    Reads the module-level ``train``/``val`` tensors and the current
    ``batch_size`` and ``block_size`` values at call time.

    Args:
        split: "train" selects the training data; any other value selects
            the validation data.

    Returns:
        A tuple ``(x, y)`` of stacked windows, each with ``batch_size`` rows
        of ``block_size`` tokens. ``y`` holds the same windows as ``x``
        shifted one position to the right, i.e. the next-token target for
        every position in ``x``.
    """
    source = train if split == "train" else val

    # A window starting at i reads source[i : i + block_size + 1], so the last
    # valid start is len(source) - block_size - 1. The bound therefore has to
    # depend on block_size (not batch_size); otherwise windows near the end of
    # the split come out short and torch.stack fails on unequal lengths.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i + block_size] for i in starts])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in starts])

    return x, y

In [33]:
# draw one batch from the training data: xb holds the inputs and yb the
# targets that the loss function is computed against later on
xb, yb = get_batch("train")

print(xb, yb, sep="\n")

tensor([[ 8, 84,  0,  0, 30,  0, 71, 49],
        [69, 66,  0, 71, 57, 54, 53,  6],
        [63,  0, 68, 56, 53,  0, 54, 66],
        [30,  0, 61, 49, 52, 53,  0, 68]])
tensor([[84,  0,  0, 30,  0, 71, 49, 67],
        [66,  0, 71, 57, 54, 53,  6, 84],
        [ 0, 68, 56, 53,  0, 54, 66, 63],
        [ 0, 61, 49, 52, 53,  0, 68, 56]])


In [34]:
# class for the language model
class BigramLM(nn.Module):
    """Bigram language model: each token id looks up the logits for the next token."""

    def __init__(self, vocab_size):
        super().__init__()
        # row i of the table holds the next-token logits for token id i
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, inputs, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Returns ``(logits, loss)``. Without targets the logits keep their
        (batch, time, channel) shape and loss is None. With targets the
        returned logits are flattened to (batch*time, channel).
        """
        scores = self.token_embedding_table(inputs)  # batch, time, channel

        if targets is None:
            return scores, None

        # flatten (B, T, C) logits to (B*T, C) and (B, T) targets to (B*T,)
        # before handing them to cross_entropy
        batch, time, channels = scores.shape
        flat_scores = scores.view(batch * time, channels)
        flat_targets = targets.view(batch * time)

        return flat_scores, F.cross_entropy(flat_scores, flat_targets)

    def generate(self, inputs, number):
        """Sample ``number`` new tokens, appending each to ``inputs`` along dim 1.

        Returns the extended tensor; only the logits of the last time step
        are used for each sampling step.
        """
        sequence = inputs
        for _ in range(number):
            scores, _loss = self(sequence)
            last_step = scores[:, -1, :]
            probabilities = F.softmax(last_step, dim=1)
            sampled = torch.multinomial(probabilities, num_samples=1)
            sequence = torch.cat((sequence, sampled), dim=1)

        return sequence
    

For a model that guesses uniformly over the 86 characters we would expect a loss of $-\ln(1/86) = \ln 86$, which is approximately **4.45**, but we are getting about 5.1 right now. This means the initial predictions are not perfectly diffuse (uniform) yet, so there is some extra entropy.

In [35]:
# this was where i found that tiktoken wouldn't work
model = BigramLM(vocab_size)

# one forward pass with targets, to look at the logits shape and the untrained loss
logits, loss = model(xb, yb)

print(logits.shape, loss, sep="\n")

torch.Size([32, 86])
tensor(5.0984, grad_fn=<NllLossBackward0>)


This looks silly now because history is not used; we only examine the last character in time: **logits = logits[:, -1, :]**

In [36]:
# start from a 1 by 1 tensor of zeros (a single token with id 0),
# sample 100 more tokens from the model and decode them
input = torch.zeros(1, 1, dtype=torch.long)
print(decode(model.generate(input, number=100)[0].tolist()))

 Ixgé6a2Bbp8)8]vKEOW—;…7::*[’1WExQ:r,……OA…;gh:C9N3Gs”—E êRF.9çWô.‘8WRh78V‘Jw’‘2)bTGfL“5“6a;ISmHm:EaqE


In [37]:
# making a PyTorch optimizer object: AdamW over all model parameters, learning rate 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

**Training the model.**
We can see the loss going down every time we train it.

In [38]:
# increasing batch size and running the optimisation loop, reporting the loss as it goes
batch_size = 32

for i in range(10000): # my computer almost blew up
    # clear gradients left over from the previous step before the new backward pass
    optimizer.zero_grad()

    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()

    # report the loss of the current batch every 100 steps
    if i % 100 == 0:
        print(f"Step {i}: Loss = {loss.item()}")


# loss of the last batch that was processed
print(loss.item())

KeyboardInterrupt: 

As we can see, the generated text has improved drastically compared to before. It went from


"—;84c7.nd]ontxe$4neHLKyéxUa—fW]’g0IySCPjoM3j,4R 0h—$é ],Oé4x$érBXJHP12MGay8?W’O[ ?[3xIV9é?*NC “va1G" 

to


" d I wnt harsurthed is owemeveg wayouio id aly a this wicy’se  d Minghe Shouad yoofezzive rnan y dineng o thiniss mecegs wayth inn r  “Antoke!” s ge, motthof, ay wes blut cemomos. astr I wa dad Wes dyecakneitlastat ceche. “Ale nidosef My I roufol amed cousy ay?”  s, od. son’tanonghat athad Routheed d".

In [None]:
# testing the generate function: start from a single token with id 0
# and sample 300 more tokens
input = torch.zeros(1, 1, dtype=torch.long)
print(decode(model.generate(input, number=300)[0].tolist()))

 oum I tsperd and, herd h f t g I “o hanon I h bey, bomoonugol athe. ain oe t. “Dop fow ooane an an bofe te m be. hakn. ainy Plloryor dedinthe thon’sp. y y in cerachthe drdaly, ar ived sshedst s sil lye ce dr wadrld sinjurie oper g ancedio iexe ubringr “Tot, “I aitheler. Itre. Thag. thafo … Wed a knd
