In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle
import argparse
import pandas as pd

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# batch_size = args.batch_size # to use the batch_size cmd arg -> python file_name.py -batch_size 32
batch_size = 32
block_size = 128
max_iters = 200
learning_rate = 3e-4
eval_iters = 100
n_embd = 384
n_head = 4
n_layer = 4
dropout = 0.2

print(device)

cuda


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
chars = ""
with open(r"C:\Users\Kenneth\Desktop\python_projects\gpt-course\Large-Language-Model\vocab.txt", encoding='utf-8') as f:
        text = f.read()
        chars = sorted(list(set(text)))
        
vocab_size = len(chars)

In [5]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [6]:
def get_random_chunk(split):
    filename = "openwebtext/train_split.txt" if split == 'train' else "openwebtext/val_split.txt"
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Determine the file size and a random position to start reading
            file_size = len(mm)
            start_pos = random.randint(0, (file_size) - block_size*batch_size)

            # Seek to the random position and read the block of text
            mm.seek(start_pos)
            block = mm.read(block_size*batch_size-1)

            # Decode the block to a string, ignoring any invalid byte sequences
            decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')
            
            # Train and test splits
            data = torch.tensor(encode(decoded_block), dtype=torch.long)
            
    return data

def get_batch(split):
    data = get_random_chunk(split)
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [7]:
class Head(nn.Module):
    """ This class represents one head of self-attention, a crucial component in 
    Transformer models for tasks such as natural language processing. """

    def __init__(self, head_size):
        '''They transform the input embeddings (n_embd) into keys, queries, and values for the attention mechanism.'''
        super().__init__() #  This defines the constructor method
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) 
        #This matrix is used to construct a causal mask ensuring that during self-attention, each position can only attend to positions before it in the sequence.
        self.dropout = nn.Dropout(dropout) # Dropout is a regularization technique used to prevent overfitting by randomly setting a fraction of input units to zero during training.

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape # Obtains the batch size (B), sequence length (T), and input embedding size (C) from the input tensor x.
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T) #  It performs a dot product between queries and keys, scaled by the square root of the key size.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)  # Applies a mask to the attention scores to ensure causality, i.e., each position can only attend to positions before it in the sequence.
        wei = F.softmax(wei, dim=-1) # (B, T, T) #  Applies a softmax function to obtain the attention weights.
        wei = self.dropout(wei) #  Applies dropout to the attention weights for regularization.
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs) #  Computes the weighted sum of values using the attention weights.
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


In [8]:
import torch
import torch.nn as nn

# Define the input dimensions
n_embd = 128  # Input embedding size
head_size = 64  # Head size for linear transformation

# Create an instance of nn.Linear
linear_layer = nn.Linear(n_embd, head_size, bias=False)

# Generate some random input data
input_data = torch.randn(2, n_embd)  # Example batch size of 2

# Pass the input data through the linear layer
output_data = linear_layer(input_data)

# Display the output value
output_data

tensor([[ 2.1904e-01,  6.9782e-01,  7.3202e-01,  3.5143e-01,  4.4436e-02,
          5.2613e-01,  1.5634e-01, -3.5303e-01, -3.9407e-01,  1.9041e-01,
          2.2973e-01,  1.7658e-01,  1.3151e-01, -5.9906e-02, -1.0874e+00,
         -1.8509e-01,  1.2523e-01, -2.2777e-01, -2.5571e-01, -4.9634e-01,
          4.5730e-01, -6.0723e-02, -2.7428e-01, -7.7702e-01, -6.6218e-01,
         -3.6696e-01,  5.6800e-01, -5.1916e-01,  5.9357e-01,  3.7748e-02,
         -1.8416e-01, -2.2452e-01,  3.7481e-01,  1.2914e+00, -6.4070e-01,
          5.8648e-02,  2.4736e-01, -8.6550e-01,  2.2037e-01, -2.7817e-01,
          2.6035e-01,  2.7013e-02,  1.5147e-01,  8.1801e-01,  2.1250e-01,
          3.5440e-01,  4.5542e-01,  2.8009e-01,  5.7474e-01,  1.7505e-01,
         -3.0871e-01,  3.0342e-01,  7.8115e-01, -7.2455e-01, -4.6063e-01,
         -6.3669e-01, -8.0901e-01, -2.2819e-01,  8.7427e-01, -9.2265e-01,
         -2.7668e-02, -5.6819e-01, -9.0677e-01, -1.7622e-01],
        [ 7.6219e-04,  6.2601e-02, -1.5429e-01, -8

In [9]:
# Test the Head class
head_size = 64  # Example head size
batch_size = 2
sequence_length = 10
embedding_size = 128

# Create an instance of the Head class
head = Head(head_size)

# Generate some random input data
input_data = torch.randn(batch_size, sequence_length, embedding_size)

# Pass the input data through the Head instance
output = head(input_data)

# Verify the shape of the output
print("Output shape:", output.shape)

Output shape: torch.Size([2, 10, 64])


In [10]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size): #  It takes two arguments: num_heads and head_size.
        super().__init__() #  This defines the constructor method
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) # This creates a list of num_heads instances of the Head class. Each instance represents one head of self-attention.
        self.proj = nn.Linear(head_size * num_heads, n_embd) #  This defines a linear transformation layer (nn.Linear) that projects the concatenated outputs of the individual heads back to the original embedding dimension n_embd.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3]) #  applies each head of self-attention (h(x)) to the input tensor x and concatenates the outputs along the last dimension (dim=-1)
        print(out.shape)
        out = self.dropout(self.proj(out))
        return out

In [11]:
import torch
import torch.nn as nn

# Create an instance of nn.Linear
linear_layer = nn.Linear(in_features=100, out_features=50)  # Input size = 100, Output size = 50

# Generate some random input data
input_data = torch.randn(3, 100)  # Batch size = 32, Number of input features = 100

# Pass the input data through the linear layer
output_data = linear_layer(input_data)

# Print the shape of the output tensor
print("Output shape:", output_data.shape)

Output shape: torch.Size([3, 50])


In [12]:
import torch
import torch.nn as nn

# Define the parameters (replace these with your actual values)
n_embd = 128
num_heads = 4
head_size = 32
dropout = 0.1

# Create an instance of the MultiHeadAttention class
multihead_attention = MultiHeadAttention(num_heads, head_size)

# Generate some random input data
batch_size = 2
sequence_length = 10
input_data = torch.randn(batch_size, sequence_length, n_embd)

# Pass the input data through the MultiHeadAttention instance
output = multihead_attention(input_data)

# Verify the shape of the output
print("Output shape:", output.shape)

torch.Size([2, 10, 128])
Output shape: torch.Size([2, 10, 128])


In [13]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd), #  This expansion introduces more capacity and allows the model to capture complex patterns.
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
    

In [14]:
import torch
import torch.nn as nn

# Define the input embedding dimensionality
n_embd = 10  # Example value

# Define dropout probability (assuming it's defined somewhere in your code)
dropout = 0.1  # Example value

# Create an instance of the FeedForward class
feedforward_model = FeedFoward(n_embd)

# Generate some random input data
input_data = torch.randn(2, n_embd)  # Example batch size of 2

# Pass the input data through the model
output_data = feedforward_model(input_data)

# Print the shape of the output tensor and its values
print("Output shape:", output_data.shape)
print("Output values:")
print(output_data)

Output shape: torch.Size([2, 10])
Output values:
tensor([[ 0.1012, -0.2529,  0.2376, -0.2995, -0.0338,  0.1090,  0.0056,  0.4771,
          0.0851,  0.3686],
        [-0.1713, -0.2474, -0.0276,  0.0781,  0.1991,  0.0000, -0.0584, -0.0044,
         -0.1147, -0.0005]], grad_fn=<MulBackward0>)


In [3]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size) # A Multi-Head Self-Attention layer (MultiHeadAttention) with n_head heads and head_size size.
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y) # Residual connection adds the input x with the output y
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x

In [15]:
# Define some parameters
n_embd = 128
n_head = 8
sequence_length = 10
batch_size = 2

# Create an instance of the Block class
block = Block(n_embd, n_head)

# Generate some random input data
input_data = torch.randn(batch_size, sequence_length, n_embd)

# Pass the input data through the Block instance
output = block(input_data)

# Verify the shape of the output
print("Output shape:", output.shape)

torch.Size([2, 10, 128])
Output shape: torch.Size([2, 10, 128])


In [16]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        