# Capstone LLM Project
This is going to be a beginner LLM from scratch using Python  
Need to install the following packages:  
- matplotlib
- numpy
- pylzma
- ipykernel
- jupyter
- torch

## MAKE SURE TO SELECT ANACANDA 3.11 ENVIRONMENT TO RUN

VIDEO LINK: https://www.youtube.com/watch?v=UU1WVnMk4E8
LEFT OFF ON 2:31:08

In [1]:
# Install packages
#!pip install matplotlib numpy pylzma ipykernel jupyter torchvision torchaudio torch numpy

# Import packages
import matplotlib
import numpy
import ipykernel
import jupyter
import torch
import torch
import pylzma
import os
import time
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

## Testing Torch

In [2]:
import torch
x = torch.rand(5, 3)
print(x)

tensor([[0.4456, 0.6170, 0.3287],
        [0.9728, 0.9239, 0.8149],
        [0.8035, 0.2152, 0.1445],
        [0.2647, 0.7729, 0.6845],
        [0.1472, 0.7767, 0.1293]])


## Tokenizer

In [3]:
with open("wizardofoz.txt", 'r', encoding='utf-8') as f:
    text = f.read()
    
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)
print(vocab_size)


['\n', ' ', '!', '"', '&', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
80


In [4]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

""" encoded_hello = encode('hello')
decoded_hello = decode([60, 57, 64, 64, 67])

print(encoded_hello)
print(decoded_hello) """

#Tensor likes having everything in tensor objects
data = torch.tensor(encode(text), dtype=torch.long) #This is just going to be a super long sequence of integers
print(data[:100])

tensor([79, 26, 31, 24, 39, 43, 28, 41,  1, 12, 10,  0,  0, 43, 31, 28,  1, 28,
        24, 41, 43, 31, 40, 44, 24, 34, 28,  0,  0,  0, 43, 60, 57,  1, 72, 70,
        53, 61, 66,  1, 58, 70, 67, 65,  1,  5, 29, 70, 61, 71, 55, 67,  1, 75,
        53, 71,  1, 74, 57, 70, 77,  1, 64, 53, 72, 57, 10,  1, 32, 72,  1, 71,
        60, 67, 73, 64, 56,  1, 60, 53, 74, 57,  1, 53, 70, 70, 61, 74, 57, 56,
         1, 53, 72,  1, 31, 73, 59, 71, 67, 66])


# Validation and Training Sets (Bigram Language Model)
We want to take a given text, split it up and train the model on 80% of it, and then use the remaining 20% to validate the training. If it was trained on the entire text, it would eventuall memeroize the entire training set and just spit that back out. The purpose of the model is to output text that is like the training data. This is why they are put into splits. Doing 80/20, it makes sure the splits are unique. We generate something like the trained text, but not it exactly. 


In [5]:
# Validation and Training Splits
n = int(0.8*len(data)) #80% of the text is training, 20% is validating
train_data = data[:n]
val_data = data [n:]

In [6]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range (block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is', context, 'target is', target)


when input is tensor([79]) target is tensor(26)
when input is tensor([79, 26]) target is tensor(31)
when input is tensor([79, 26, 31]) target is tensor(24)
when input is tensor([79, 26, 31, 24]) target is tensor(39)
when input is tensor([79, 26, 31, 24, 39]) target is tensor(43)
when input is tensor([79, 26, 31, 24, 39, 43]) target is tensor(28)
when input is tensor([79, 26, 31, 24, 39, 43, 28]) target is tensor(41)
when input is tensor([79, 26, 31, 24, 39, 43, 28, 41]) target is tensor(1)


### Comparing CPU vs GPU Runtime For Multiplying Matrixes

With simple tasks, it is not very hard to do, like adding a lot of things up. The issue with running it on a CPU, is that it will run sequentially. With the GPU, it can perform all these tasks in parallel. This is why we need to run the above code in parallel. Below is an example showing the speed of CPU and GPU computation for a lot of simple tasks and the time difference between them. 

In [7]:
#Setting up GPU use
import os

def check_operating_system():
    if os.name == 'posix':
        # On Unix-like systems (including macOS)
        return 'macOS' if 'Darwin' in os.uname() else 'Linux'
    elif os.name == 'nt':
        # On Windows
        return 'Windows'
    else:
        return 'Unknown'

# Example usage
operating_system = check_operating_system()
if operating_system == 'macOS': device = 'mps' #This is for M1 Macs or newer
elif operating_system == 'Windows' : device = 'cuda' if torch.cuda_is_available() else 'cpu' #Must set up cuda with torch to run if on windows, but this project was developed and made for Mac

#print(device)

mps_device = torch.device(device)

batch_size = 8
block_size = 4

#Comparing Numpy (CPU) against Torch (GPU)
torch_rand1 = torch.rand(100, 100, 100, 100).to(mps_device)
torch_rand2 = torch.rand(100, 100, 100, 100).to(mps_device)
np_rand1 = torch.rand(100, 100, 100, 100)
np_rand2 = torch.rand(100, 100, 100, 100)

#With GPU - Much Faster
start_time = time.time()
rand = (torch_rand1 @ torch_rand2)
end_time = time.time()
elapsed_time = end_time - start_time
print("GPU: ", f"{elapsed_time:.4f}")

#With CPU - Much slower
start_time = time.time()
rand = np.multiply(np_rand1, np_rand2)
end_time = time.time()
elapsed_time = end_time - start_time
print("CPU: ", f"{elapsed_time:.4f}")



GPU:  0.7807
CPU:  1.1141


In [8]:
# Check that MPS is available
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")

    # Create a Tensor directly on the mps device
    x = torch.ones(5, device=mps_device)
    # Or
    x = torch.ones(5, device="mps")

    # Any operation happens on the GPU
    y = x * 2
    print(y)



tensor([2., 2., 2., 2., 2.], device='mps:0')


## Back to our Bigram Model

In [9]:
block_size = 8
batch_size = 4
max_iters = 10000
learning_rate = 3e-4 # You have to experiment with learning rate
eval_iters = 250
#dropout = 0.2 #drops out random nuerons in the network to help the model learn better when things are not in the right place, takes 20% nuerons out at random

In [16]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    #print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(mps_device), y.to(mps_device) #Processes on the GPU, in paralell
    return x, y

x, y = get_batch('train')
print('inputs: ')
#print(x.shape)
print(x)
print('targets: ')
print(y)

inputs: 
tensor([[53, 66, 55, 57, 56,  1, 61, 66],
        [ 1, 77, 67, 73,  1, 65, 73, 71],
        [ 1, 53, 71,  1, 75, 57, 64, 64],
        [77,  1, 64, 57, 53, 66, 57, 56]], device='mps:0')
targets: 
tensor([[66, 55, 57, 56,  1, 61, 66,  0],
        [77, 67, 73,  1, 65, 73, 71, 72],
        [53, 71,  1, 75, 57, 64, 64,  1],
        [ 1, 64, 57, 53, 66, 57, 56,  1]], device='mps:0')


In [18]:
#AdamW is an optimizer of Torch, using gradient descent to optimize parameters and make sure certain parameters aren't affecting performance too much, using nn.Module allows us to use that
class BigramLanguageModel(nn.Module): #nn.Module allows us to make parameters learnable
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # Embedding table, giant grid of the probability distribution of what character comes next based on the previous character
        
    def forward(self, index, targets=None):
        # Logits are a bunch of normalized floating numbers
        logits = self.token_embedding_table(index) 
        
        if targets is None:
            loss = None
        else:
            #Batch, Time, Channels for cross entropy b/c it expects a certain shape
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        #index is (B, T) array of indicies in the current context
        for _ in range(max_new_tokens):
            #Get predictions
            logits, loss = self.forward(index)
            
            #Focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            
            #Apply Softmax to get probabilities
            probs = F.softmax(logits, dim=-1)# (B, C)
            
            #Sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            
            #Append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index
    
model = BigramLanguageModel(vocab_size)
m = model.to(mps_device)

context = torch.zeros((1,1), dtype=torch.long, device=mps_device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



I!Sf﻿ZHGEKYL1PdA75vGR﻿3UAv&W0oVq2X4(Mhr!"2JTt-&[un&OVOq&8-Y3b
?f_ALa_9VBeM]uHE_l y(10W"i?fUk-TpX4Dq7yd!wsl
2X.h0EG436848f7YU6_pZWuUishKtSR7,E1bjmQg0HxuT"HE)c0Y&
h:-Z;6t"tA"
5O_EUNRdk﻿V'srCCRfyxz,&ffygQw] [uD.rA8pYLwbwecPYKwv?,n&xMQ[B3UA8-r;kysY0UM_m;q"cx)WcDEzh
x5OQ6!:"cXA!mz0h,DT3uO0Jvje7ZA2va?f7JtS [﻿iX.SRm(yHEiz4X&n5'l﻿Z2"Q"wb_oV)5k&A.2oQpsLyxkMXPu10D&Q7jQ:(UMkAL'L,:53x8-:q_)sY veBb-TiP-pbj"Xqd.NkUo?flADIZf!:9QRRa daK4WGopPw?Usw ,wZT4PNo&oqPW1e4vhjn)﻿5!:IdQYTG-mgY;WsnKt&_lD"z3?T:Y_Vq?J6e:(DIz


## Create a PyTorch Optimizer

In [12]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [19]:
#Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    #sample from a batch of data
    if iter % eval_iters == 0:
        losses=estimate_loss()
        print(f"step: {iter}, train loss {losses['train']:.4f}, val loss: {losses['val']:.4f}")
    xb, yb = get_batch('train')
    
    #evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss 4.8371, val loss: 4.8480
step: 250, train loss 4.7648, val loss: 4.7869
step: 500, train loss 4.7019, val loss: 4.7142
step: 750, train loss 4.6456, val loss: 4.6472
step: 1000, train loss 4.5821, val loss: 4.5913
step: 1250, train loss 4.5328, val loss: 4.5415
step: 1500, train loss 4.4441, val loss: 4.4800
step: 1750, train loss 4.4193, val loss: 4.4407
step: 2000, train loss 4.3512, val loss: 4.3900
step: 2250, train loss 4.2984, val loss: 4.3371
step: 2500, train loss 4.2362, val loss: 4.2782
step: 2750, train loss 4.2063, val loss: 4.2037
step: 3000, train loss 4.1546, val loss: 4.1547
step: 3250, train loss 4.0839, val loss: 4.1261
step: 3500, train loss 4.0488, val loss: 4.0919
step: 3750, train loss 4.0206, val loss: 4.0494
step: 4000, train loss 3.9745, val loss: 3.9872
step: 4250, train loss 3.9167, val loss: 3.9697
step: 4500, train loss 3.8794, val loss: 3.8989
step: 4750, train loss 3.8307, val loss: 3.8852
step: 5000, train loss 3.8192, val loss: 3.836

In [20]:
context = torch.zeros((1,1), dtype=torch.long, device=mps_device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



)Vd t-s"MK?ffaf;B-nt DK]0SF﻿unosyG2ft
O3npA!Ybo

?JH4URthm3CCrB)Ce!8F﻿9VlN18&e s!qCl
ar
TH﻿3fig]d t!:(1GOqSsav_mib]0Loz"NWK!Whe
"Hd.DV?fategOlfO8GTLamJfO[Yua qFck
"oRH"﻿y_(lAnng-K8atI, s

In w eownoWb]2lind igjn.DQ?pon zinryALWmatKSatq?f
he,&D[rom,t5JS d_﻿91ROsl
TO'J!jxKm4'8GFn snt xRrusaipU:R3' ffl,)b
4un3)sa XAJdualfIdls!:t]0le g?fC79un d9Z23f58-.;EDTL1 zH
"﻿7J2lyllWKT60b
OY;bve.L!y)9P1_s,!-Yx8a,NzacHG_lk&?f
TVE:wruf
y XvfXGO
?z is﻿ F8l_8upouxOQ).ntau, idf7 7RHirrl58BH,xs?fiY1_V4E)YHS!-pz-HFh


In [None]:
# How is our model performing over time

## Torch Features

In [None]:
#Define a probability tensor
probabilities = torch.tensor([0.1, 0.9]) #each side must add up to 1 for 100%
#10% or 0.1 => 0, 90% or 0.9 => 1. each probability points to the index of the probability in the tensor
#Draw 5 samples from the multinomial distribution
samples = torch.multinomial(probabilities, num_samples=10, replacement=True)
print(samples)


tensor([1, 1, 1, 1, 1, 1, 0, 1, 1, 1])


In [None]:
#import torch

tensor = torch.tensor([1, 2, 3, 4])
out = torch.cat((tensor, torch.tensor([5])), dim=0)
print(out)


tensor([1, 2, 3, 4, 5])


In [None]:
#Showing more features of torch
out = torch.tril(torch.ones(5, 5)) #tril is triangle-lower
print(out)

out = torch.triu(torch.ones(5, 5)) #tril is triangle-upper
print(out)

out = torch.zeros(5, 5).masked_fill(torch.tril(torch.ones(5, 5)) == 0, float('-inf'))
print(out)
print(torch.exp(out))

#Transposing - Flips any dimension that we want
input = torch.zeros(2, 3, 4)
out = input.transpose(0, 2)
print(out.shape)

#Torch.Stack - Stacks tensors 
tensor1 = torch.tensor([1, 2, 3])
tensor2 = torch.tensor([4, 5, 6])
tensor3 = torch.tensor([7, 8, 9])
# Stack the tensors along a new dimension
stacked_tensor= torch.stack([tensor1, tensor2,tensor3])
print("Stacked Tensor: " + str(stacked_tensor))

# nn.Linear - Takes anything that has learnable parameters and when you apply a weigh or bias it will learn those & train based on how close they are to the desired output
#import torch.nn as nn
sample = torch.tensor([10.,10.,10.])
linear = nn.Linear(3, 3, bias=False)
print(linear(sample))

# Softmax Function
#import torch.nn.functional as f
tensor1 = torch.tensor([1.0, 2.0, 3.0])
softmax_output = F.softmax(tensor1, dim=0)
print(softmax_output)



tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])
tensor([[1., 1., 1., 1., 1.],
        [0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.]])
tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]])
tensor([[1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1.]])
torch.Size([4, 3, 2])
Stacked Tensor: tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])
tensor([-9.1271,  4.9377,  2.9602], grad_fn=<SqueezeBackward4>)
tensor([0.0900, 0.2447, 0.6652])


## Embedding Vectors

### What is nn.Embedding?

nn.Embedding is a part of the PyTorch library and it's used to represent words or characters in a way that a machine can understand. Imagine you have a big book, and you want to teach a computer to understand the meaning of each word. nn.Embedding helps in converting these words into numbers so the computer can work with them easily.

Think of it like a secret code for words. Each word gets its own special number, and the computer uses these numbers to understand and work with the words. This way, the computer can process and analyze text more efficiently. It works the same way for characters, which our model will work on.

In [None]:
#mport torch
#import torch.nn as nn
vocab_size = 1000
embedding_dim = 100
embedding = nn.Embedding(vocab_size, embedding_dim)

#Create some input indicies
input_indicies = torch.LongTensor([1, 5, 3, 2])
                                  
#Apply the embedding payer
embedded_output = embedding(input_indicies)

#The output will be a tensor of a shape (4, 100), hwere 4 is the number of inputs
#and 100 is dimensionality of the embedding vectors
print(embedded_output.shape)

torch.Size([4, 100])


In [None]:
# Dot prodcut & Matrix multiplication
a = torch.tensor([[1,2], [3,4], [5,6]])
b = torch.tensor([[7,8,9], [10,11,12]])
print(a @ b)

tensor([[ 27,  30,  33],
        [ 61,  68,  75],
        [ 95, 106, 117]])
