In [1]:
# importing libraries
import torch
import torch.nn as nn
from torch.nn import functional as F
# Use the GPU when one is available; the model and the training batches are
# moved to `device` further down.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's random number generator so that batch sampling, weight
# initialisation and text sampling give the same results on every
# Restart Kernel -> Run All.
SEED = 1337
torch.manual_seed(SEED)

print(device)

cpu


In [2]:
# read the text file
def read_text_file(file_path):
    """Return the whole content of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

In [3]:
# load the training corpus, strip any byte-order mark from its ends and
# turn every newline into a space so the text is one continuous line
file_path = 'training_text.txt'
text = read_text_file(file_path).strip("\ufeff").replace("\n", " ")

In [4]:
# corpus length in characters, followed by a preview of the first 100 characters
print(len(text), text[:100], sep="\n")

270151
In my younger and more vulnerable years my father gave me some advice that I’ve been turning over in


In [5]:
# vocabulary: every distinct character that occurs in the text, in sorted order
char = sorted(set(text))
vocab_size = len(char)
print(''.join(char), vocab_size, sep="\n")

# hyperparameters used by the cells below
eval_iters = 200   # number of batches averaged per split in estimate()
max_iters = 10000  # number of optimisation steps in the training loop
num_emb = 32       # width of the token / position embeddings

 !$()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXY[]abcdefghijklmnopqrstuvwxyzçéêô —‘’“”…
86


In [6]:
# the text is tokenized per character, so build the two lookup tables ourselves:
# stoi maps a character to its integer id, itos maps the id back to the character
stoi = {ch: idx for idx, ch in enumerate(char)}
itos = dict(enumerate(char))

def encode(str):
    """Return the list of integer ids (looked up in ``stoi``) for the characters of ``str``.

    Raises ``KeyError`` for a character that is not a key of ``stoi``.
    """
    ids = []
    for ch in str:
        ids.append(stoi[ch])
    return ids

def decode(data):
    """Return the string spelled by the integer ids in ``data`` (looked up in ``itos``).

    Raises ``KeyError`` for an id that is not a key of ``itos``.
    """
    chars = [itos[token] for token in data]
    return "".join(chars)

# round-trip check: encode a short string, then decode the ids back
foo = encode("foo")
print(foo, decode(foo), sep="\n")

[54, 63, 63]
foo


In [7]:
# encode the first 100 characters of the text and decode them back
test_encode = encode(text[:100])
print(test_encode, decode(test_encode), sep="\n")

[30, 62, 0, 61, 73, 0, 73, 63, 69, 62, 55, 53, 66, 0, 49, 62, 52, 0, 61, 63, 66, 53, 0, 70, 69, 60, 62, 53, 66, 49, 50, 60, 53, 0, 73, 53, 49, 66, 67, 0, 61, 73, 0, 54, 49, 68, 56, 53, 66, 0, 55, 49, 70, 53, 0, 61, 53, 0, 67, 63, 61, 53, 0, 49, 52, 70, 57, 51, 53, 0, 68, 56, 49, 68, 0, 30, 82, 70, 53, 0, 50, 53, 53, 62, 0, 68, 69, 66, 62, 57, 62, 55, 0, 63, 70, 53, 66, 0, 57, 62]
In my younger and more vulnerable years my father gave me some advice that I’ve been turning over in


In [8]:
# encode the entire text into one 1-D tensor of int64 token ids
data = torch.tensor(encode(text), dtype=torch.int64)
print(f"{data.shape} {data.dtype}")
print(data[:500])

torch.Size([270151]) torch.int64
tensor([30, 62,  0, 61, 73,  0, 73, 63, 69, 62, 55, 53, 66,  0, 49, 62, 52,  0,
        61, 63, 66, 53,  0, 70, 69, 60, 62, 53, 66, 49, 50, 60, 53,  0, 73, 53,
        49, 66, 67,  0, 61, 73,  0, 54, 49, 68, 56, 53, 66,  0, 55, 49, 70, 53,
         0, 61, 53,  0, 67, 63, 61, 53,  0, 49, 52, 70, 57, 51, 53,  0, 68, 56,
        49, 68,  0, 30, 82, 70, 53,  0, 50, 53, 53, 62,  0, 68, 69, 66, 62, 57,
        62, 55,  0, 63, 70, 53, 66,  0, 57, 62,  0, 61, 73,  0, 61, 57, 62, 52,
         0, 53, 70, 53, 66,  0, 67, 57, 62, 51, 53,  8,  0,  0, 83, 44, 56, 53,
        62, 53, 70, 53, 66,  0, 73, 63, 69,  0, 54, 53, 53, 60,  0, 60, 57, 59,
        53,  0, 51, 66, 57, 68, 57, 51, 57, 74, 57, 62, 55,  0, 49, 62, 73, 63,
        62, 53,  6, 84,  0, 56, 53,  0, 68, 63, 60, 52,  0, 61, 53,  6,  0, 83,
        58, 69, 67, 68,  0, 66, 53, 61, 53, 61, 50, 53, 66,  0, 68, 56, 49, 68,
         0, 49, 60, 60,  0, 68, 56, 53,  0, 64, 53, 63, 64, 60, 53,  0, 57, 62,
       

In [9]:
# the first 90% of the tokens are used for training, the remaining 10% for validation
n = int(0.9*len(data))
train, val = data[:n], data[n:]

In [10]:
# block size: the number of consecutive tokens in one input sequence
# (get_batch slices blocks of this length and the model crops its context to it)
block_size = 8
# the first block of the training data plus one extra token, i.e. the span
# that an input block and its shifted-by-one targets are cut from
train[:block_size + 1]

tensor([30, 62,  0, 61, 73,  0, 73, 63, 69])

In [11]:
# batch size: how many blocks get_batch stacks into one batch
# (kept small here; it is reassigned to 32 in the training cell)
batch_size = 4

# function for getting a batch of random blocks from the data, sized by batch_size
def get_batch(split):
    """Sample a random batch of input blocks and their next-token targets.

    Args:
        split: ``"train"`` selects the ``train`` tensor; any other value
            selects ``val``.

    Returns:
        ``(x, y)``, each of shape ``(batch_size, block_size)``. ``y`` holds the
        same windows as ``x`` shifted one position to the right, so ``y[b, t]``
        is the token that follows ``x[b, t]`` in the source tensor.
    """
    source = train if split == "train" else val

    # random start offsets; the upper bound leaves room for the shifted target window
    starts = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + block_size + 1] for s in starts])

    return x, y

# estimate function that estimates the average loss in splits
def estimate():
    """Estimate the mean loss of ``model`` on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch``
    and their losses are averaged, which is far less noisy than the loss of a
    single batch.

    The model is switched to eval mode for the measurement and back to train
    mode afterwards (also when an exception occurs). Gradients are disabled,
    so no autograd graph is built for these forward passes.

    Returns:
        dict mapping ``"train"`` and ``"val"`` to a 0-d tensor with the mean loss.
    """
    out = {}
    # get_batch returns tensors cut from the data tensors; move them to
    # wherever the model's parameters live before the forward pass
    model_device = next(model.parameters()).device
    model.eval()
    try:
        with torch.no_grad():
            for split in ["train", "val"]:
                losses = torch.zeros(eval_iters)
                for i in range(eval_iters):
                    X, Y = get_batch(split)
                    _, loss = model(X.to(model_device), Y.to(model_device))
                    losses[i] = loss.item()
                out[split] = losses.mean()  # collecting the average
    finally:
        model.train()

    return out

In [12]:
# draw one batch from the training data: xb holds the inputs, yb the targets
# (the targets are what the loss is computed against later on)
xb, yb = get_batch("train")

print(xb, yb, sep="\n")

tensor([[68, 63,  0, 24, 56, 57, 51, 49],
        [ 0, 51, 63, 63, 60,  0, 68, 56],
        [62,  0, 68, 56, 53,  0, 67, 63],
        [53,  8,  0, 40, 63, 61, 53, 68]])
tensor([[63,  0, 24, 56, 57, 51, 49, 55],
        [51, 63, 63, 60,  0, 68, 56, 49],
        [ 0, 68, 56, 53,  0, 67, 63, 49],
        [ 8,  0, 40, 63, 61, 53, 68, 57]])


In [13]:
# class for the language model
class BigramLM(nn.Module):
    """Character-level language model: token + position embeddings and a linear head.

    The logits for a position are computed from the embedding of the token at
    that position plus a learned embedding of the position index. Positions do
    not exchange information with each other.

    The sizes are read from the module-level ``vocab_size``, ``num_emb`` and
    ``block_size`` when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.block_size = block_size
        self.token_embedding_table = nn.Embedding(vocab_size, num_emb)
        self.position_embedding_table = nn.Embedding(block_size, num_emb)
        self.lm_head = nn.Linear(num_emb, vocab_size)

    def forward(self, inputs, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            inputs: integer token ids of shape ``(B, T)`` with ``T <= block_size``.
            targets: optional integer token ids with ``B*T`` elements.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(B, T, vocab_size)`` and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to ``(B*T, vocab_size)`` and ``loss`` is the
            cross-entropy between those logits and the flattened targets.

        Raises:
            ValueError: if ``T`` exceeds ``block_size``; the position table has
                no embedding for such positions.
        """
        B, T = inputs.shape
        if T > self.block_size:
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.block_size}; "
                "crop the input to the last block_size tokens first"
            )

        token_emb = self.token_embedding_table(inputs)  # batch, time, channel
        pos_emb = self.position_embedding_table(torch.arange(T, device=inputs.device))  # time, channel

        # pos_emb broadcasts over the batch dimension
        x = token_emb + pos_emb

        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            # cross_entropy wants (N, C) logits and (N,) targets,
            # so flatten (B, T, C) into (B*T, C) and (B, T) into (B*T,)
            b, t, c = logits.shape
            logits = logits.view(b*t, c)
            targets = targets.reshape(b*t)

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, inputs, number):
        """Append ``number`` sampled tokens to every sequence in ``inputs``.

        Sampling is pure inference, so it runs with gradients disabled.

        Args:
            inputs: integer token ids of shape ``(B, T)`` used as the start context.
            number: how many tokens to sample.

        Returns:
            Tensor of shape ``(B, T + number)``: the original tokens followed by
            the sampled ones.
        """
        for _ in range(number):
            # the model only has position embeddings for block_size positions
            inputs_cropped = inputs[:, -self.block_size:]
            logits, _ = self(inputs_cropped)
            # only the prediction at the last position is needed
            logits = logits[:, -1, :]
            prob = F.softmax(logits, dim=-1)
            inputs_next = torch.multinomial(prob, num_samples=1)
            inputs = torch.cat((inputs, inputs_next), dim=1)

        return inputs

In [14]:
# build the model on the selected device and run the sample batch through it once
model = BigramLM().to(device)

logits, loss = model(xb.to(device), yb.to(device))

print(logits.shape, loss, sep="\n")

torch.Size([32, 86])
tensor(4.5258, grad_fn=<NllLossBackward0>)


In [15]:
# making a pytorch optimizer object: AdamW over all model parameters with learning rate 1e-3
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [16]:
# increasing batch size for training (get_batch reads this global) and
# running the optimisation loop, logging the loss every 100 steps
batch_size = 32

for i in range(max_iters):

    # one optimisation step on a fresh random training batch
    xb, yb = get_batch("train")
    logits, loss = model(xb.to(device), yb.to(device))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if i % 100 == 0:
        # log after the forward pass so the value belongs to step i itself;
        # logging before it would show the previous step's loss, and at step 0
        # whatever `loss` an earlier cell left behind
        print(f"Step {i}: Loss = {loss.item()}")

print(loss.item())

Step 0: Loss = 4.760985374450684
Step 100: Loss = 3.538388252258301
Step 200: Loss = 2.91546630859375
Step 300: Loss = 2.7562575340270996
Step 400: Loss = 2.7423250675201416
Step 500: Loss = 2.8735148906707764
Step 600: Loss = 2.4632015228271484
Step 700: Loss = 2.4777352809906006
Step 800: Loss = 2.5186984539031982
Step 900: Loss = 2.490985631942749
Step 1000: Loss = 2.520765542984009
Step 1100: Loss = 2.6084511280059814
Step 1200: Loss = 2.3419439792633057
Step 1300: Loss = 2.422070264816284
Step 1400: Loss = 2.639904499053955
Step 1500: Loss = 2.5703036785125732
Step 1600: Loss = 2.474938154220581
Step 1700: Loss = 2.483727216720581
Step 1800: Loss = 2.562690496444702
Step 1900: Loss = 2.522002696990967
Step 2000: Loss = 2.3571014404296875
Step 2100: Loss = 2.453205108642578
Step 2200: Loss = 2.375826120376587
Step 2300: Loss = 2.4197616577148438
Step 2400: Loss = 2.391451358795166
Step 2500: Loss = 2.521249532699585
Step 2600: Loss = 2.5391042232513428
Step 2700: Loss = 2.369423151

In [17]:
# sample 300 tokens from the trained model, starting from a single token with id 0.
# The start tensor is created directly on `device` and is not called `input`,
# which would shadow the Python builtin of that name.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, 300)[0].tolist()))

 cath s  awan Gache beren me befo de ler wng, te be anan’they mowapif d toled allofreail. horot omolk—hely,” I Nomatolnthouneng, styos od cotind rmare rang ch mer Toter athenin’ss mewhisladeter. h aut sal. d ime hie’sendyours  In tche sureerce. boredn s Weng me ashe’siney, erey shin Mivit! thed  wapu


In [18]:
# causal averaging with a matrix multiplication and softmax:
# every position becomes the mean of itself and all earlier positions

B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

# ones on and below the diagonal mark the positions a row may look at
tril = torch.tril(torch.ones(T, T))
# zero scores with -inf above the diagonal turn into uniform weights over the allowed positions
wei = F.softmax(torch.zeros((T, T)).masked_fill(tril == 0, float('-inf')), dim=-1)
out = wei @ x
out.shape

torch.Size([4, 8, 32])

In [19]:
# show the lower-triangular mask: row t has ones at positions 0..t and zeros after
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [21]:
# show the averaging weights: row t spreads its weight equally over positions 0..t
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [30]:
# self attention: a single causal head with scaled dot-product scores
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # B, T, head_size
q = query(x)  # B, T, head_size
v = value(x)  # B, T, head_size

# query/key affinities, shape B, T, T. They are scaled by 1/sqrt(head_size) so
# their magnitude does not grow with head_size, which would push the softmax
# below towards a one-hot distribution.
wei = q @ k.transpose(-2, -1) * head_size ** -0.5

# causal mask: a position may only attend to itself and earlier positions
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# weighted sum of the values, shape B, T, head_size
out = wei @ v

out.shape

torch.Size([4, 8, 32])

In [35]:
# attention weights of the first sequence in the batch: after the causal mask
# and softmax every row sums to 1 and is zero above the diagonal
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8569, 0.1431, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5426, 0.3957, 0.0617, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.7973, 0.0452, 0.1281, 0.0295, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0184, 0.0808, 0.0182, 0.8508, 0.0318, 0.0000, 0.0000, 0.0000],
        [0.5542, 0.3354, 0.0113, 0.0065, 0.0119, 0.0808, 0.0000, 0.0000],
        [0.1774, 0.1786, 0.0395, 0.1391, 0.3335, 0.0742, 0.0578, 0.0000],
        [0.1507, 0.1743, 0.1649, 0.0092, 0.0655, 0.1764, 0.1420, 0.1169]],
       grad_fn=<SelectBackward0>)

tensor(1.0000, grad_fn=<AddBackward0>)