In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mmap
import random
import pickle
import argparse
import pandas as pd

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# batch_size = args.batch_size # to use the batch_size cmd arg -> python file_name.py -batch_size 32
batch_size = 32
block_size = 128
max_iters = 200
learning_rate = 3e-4
eval_iters = 100
n_embd = 384
n_head = 4
n_layer = 4
dropout = 0.2

print(device)

cuda


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
chars = ""
with open(r"C:\Users\Kenneth\Desktop\python_projects\gpt-course\Large-Language-Model\vocab.txt", encoding='utf-8') as f:
        text = f.read()
        chars = sorted(list(set(text)))
        
vocab_size = len(chars)

In [3]:
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [4]:
def get_random_chunk(split):
    filename = "openwebtext/train_split.txt" if split == 'train' else "openwebtext/val_split.txt"
    with open(filename, 'rb') as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            # Determine the file size and a random position to start reading
            file_size = len(mm)
            start_pos = random.randint(0, (file_size) - block_size*batch_size)

            # Seek to the random position and read the block of text
            mm.seek(start_pos)
            block = mm.read(block_size*batch_size-1)

            # Decode the block to a string, ignoring any invalid byte sequences
            decoded_block = block.decode('utf-8', errors='ignore').replace('\r', '')
            
            # Train and test splits
            data = torch.tensor(encode(decoded_block), dtype=torch.long)
            
    return data

def get_batch(split):
    data = get_random_chunk(split)
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [5]:
class Head(nn.Module):
    """ This class represents one head of self-attention, a crucial component in 
    Transformer models for tasks such as natural language processing. 
    
    This class represents one head of self-attention. In the Transformer architecture, self-attention is a crucial mechanism for capturing relationships between different words in a sequence.
    It consists of three linear layers: key, query, and value, each initialized with the specified head_size. These layers are used to transform the input embeddings into keys, queries, and values for the attention mechanism.
    The forward method computes the attention scores, performs weighted aggregation of values, and returns the output.
    """

    def __init__(self, head_size):
        '''They transform the input embeddings (n_embd) into keys, queries, and values for the attention mechanism.'''
        super().__init__() #  This defines the constructor method
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) 
        #This matrix is used to construct a causal mask ensuring that during self-attention, each position can only attend to positions before it in the sequence.
        self.dropout = nn.Dropout(dropout) # Dropout is a regularization technique used to prevent overfitting by randomly setting a fraction of input units to zero during training.

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape # Obtains the batch size (B), sequence length (T), and input embedding size (C) from the input tensor x.
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T) #  It performs a dot product between queries and keys, scaled by the square root of the key size.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)  # Applies a mask to the attention scores to ensure causality, i.e., each position can only attend to positions before it in the sequence.
        wei = F.softmax(wei, dim=-1) # (B, T, T) #  Applies a softmax function to obtain the attention weights.
        wei = self.dropout(wei) #  Applies dropout to the attention weights for regularization.
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs) #  Computes the weighted sum of values using the attention weights.
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


In [6]:
import torch
import torch.nn as nn

# Define the input dimensions
n_embd = 128  # Input embedding size
head_size = 64  # Head size for linear transformation

# Create an instance of nn.Linear
linear_layer = nn.Linear(n_embd, head_size, bias=False)

# Generate some random input data
input_data = torch.randn(2, n_embd)  # Example batch size of 2

# Pass the input data through the linear layer
output_data = linear_layer(input_data)

# Display the output value
output_data

tensor([[-0.2851, -0.7044,  1.3238, -0.1516,  0.0161, -0.0577, -0.0705,  0.1508,
         -1.0943, -0.9141, -0.8971,  0.2684, -0.2562,  0.5066,  0.4228,  0.0182,
         -0.4467,  0.9576, -0.4462, -0.8157,  0.5355,  0.0087, -0.5405,  1.5067,
         -0.1138,  0.1138,  0.1129,  0.1335,  0.2654, -0.5343, -0.2504, -0.7230,
          0.6413,  0.1382, -0.8728, -0.6024, -0.0420, -0.6134,  0.4549,  0.6859,
         -0.1433,  0.7079,  0.3227, -0.9129,  0.1663, -0.4094,  0.4245, -0.4223,
         -0.6931, -0.5152,  0.6994,  0.0426,  0.5907, -0.0181, -0.2997,  0.3641,
         -0.1744, -0.0795, -1.6513, -1.0651,  0.1010,  0.6599,  0.8632, -0.1960],
        [ 0.8967,  0.3490,  0.2548,  0.4629, -0.2838, -1.0021,  0.1536, -0.7442,
          0.1893, -0.0654, -0.2059,  0.4887,  0.3032, -0.9838,  0.0043,  0.0684,
         -0.0217,  0.2020, -0.7526, -0.4234,  0.4034, -0.3609,  0.0393, -0.7691,
         -0.5072, -0.1331, -0.8830, -0.0450, -0.0599,  0.1034,  0.3239,  0.1868,
         -0.1885,  0.6414, 

In [7]:
# Test the Head class
head_size = 64  # Example head size
batch_size = 2
sequence_length = 10
embedding_size = 128

# Create an instance of the Head class
head = Head(head_size)

# Generate some random input data
input_data = torch.randn(batch_size, sequence_length, embedding_size)

# Pass the input data through the Head instance
output = head(input_data)

# Verify the shape of the output
print("Output shape:", output.shape)

Output shape: torch.Size([2, 10, 64])


In [8]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel 
    
    This class represents multiple heads of self-attention operating in parallel.
    It contains a list of Head instances, where each instance represents one head of self-attention.
    The forward method applies each head of self-attention to the input tensor and concatenates the outputs along the last dimension. It then applies dropout and a linear projection to the concatenated outputs.
    """

    def __init__(self, num_heads, head_size): #  It takes two arguments: num_heads and head_size.
        super().__init__() #  This defines the constructor method
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) # This creates a list of num_heads instances of the Head class. Each instance represents one head of self-attention.
        self.proj = nn.Linear(head_size * num_heads, n_embd) #  This defines a linear transformation layer (nn.Linear) that projects the concatenated outputs of the individual heads back to the original embedding dimension n_embd.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3]) #  applies each head of self-attention (h(x)) to the input tensor x and concatenates the outputs along the last dimension (dim=-1)
        print(out.shape)
        out = self.dropout(self.proj(out))
        return out

In [9]:
import torch
import torch.nn as nn

# Create an instance of nn.Linear
linear_layer = b (in_features=100, out_features=50)  # Input size = 100, Output size = 50

# Generate some random input data
input_data = torch.randn(32, 100)  # Batch size = 32, Number of input features = 100

# Pass the input data through the linear layer
output_data = linear_layer(input_data)

# Print the shape of the output tensor
print("Output shape:", output_data.shape)

Output shape: torch.Size([3, 50])


In [10]:
import torch
import torch.nn as nn

# Define the parameters (replace these with your actual values)
n_embd = 128
num_heads = 4
head_size = 32
dropout = 0.1

# Create an instance of the MultiHeadAttention class
multihead_attention = MultiHeadAttention(num_heads, head_size)

# Generate some random input data
batch_size = 2
sequence_length = 10
input_data = torch.randn(batch_size, sequence_length, n_embd)

# Pass the input data through the MultiHeadAttention instance
output = multihead_attention(input_data)

# Verify the shape of the output
print("Output shape:", output.shape)

torch.Size([2, 10, 128])
Output shape: torch.Size([2, 10, 128])


In [11]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd), #  This expansion introduces more capacity and allows the model to capture complex patterns.
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
    

In [12]:
import torch
import torch.nn as nn

# Define the input embedding dimensionality
n_embd = 10  # Example value

# Define dropout probability (assuming it's defined somewhere in your code)
dropout = 0.1  # Example value

# Create an instance of the FeedForward class
feedforward_model = FeedFoward(n_embd)

# Generate some random input data
input_data = torch.randn(2, n_embd)  # Example batch size of 2

# Pass the input data through the model
output_data = feedforward_model(input_data)

# Print the shape of the output tensor and its values
print("Output shape:", output_data.shape)
print("Output values:")
print(output_data)

Output shape: torch.Size([2, 10])
Output values:
tensor([[-0.0634,  0.4287, -0.2253, -0.2328,  0.0656,  0.1926, -0.3548, -0.0000,
          0.0134, -0.3420],
        [ 0.0249,  0.1038, -0.2134, -0.6290,  0.2824, -0.0527, -0.3380, -0.2254,
          0.0155, -0.1624]], grad_fn=<MulBackward0>)


In [13]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size) # A Multi-Head Self-Attention layer (MultiHeadAttention) with n_head heads and head_size size.
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y) # Residual connection adds the input x with the output y
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x

In [14]:
# Define some parameters
n_embd = 128
n_head = 8
sequence_length = 10
batch_size = 2

# Create an instance of the Block class
block = Block(n_embd, n_head)

# Generate some random input data
input_data = torch.randn(batch_size, sequence_length, n_embd)

# Pass the input data through the Block instance
output = block(input_data)

# Verify the shape of the output
print("Output shape:", output.shape)

torch.Size([2, 10, 128])
Output shape: torch.Size([2, 10, 128])


In [19]:
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
        
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        B, T = index.shape
        
        
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            index_cond = index[:, -block_size:]
            # get the predictions
            logits, loss = self.forward(index_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

In [23]:
model = GPTLanguageModel(vocab_size)
# print('loading model parameters...')
# with open('model-01.pkl', 'rb') as f:
#     model = pickle.load(f)
# print('loaded successfully!')
m = model.to(device)

In [25]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [1]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    print(iter)
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

with open('model-01.pkl', 'wb') as f:
    pickle.dump(model, f)
print('model saved')

NameError: name 'torch' is not defined

In [2]:
asdasdasd

NameError: name 'asdasdasd' is not defined

In [22]:
import torch

# Define model hyperparameters
vocab_size = 10000  # Example vocabulary size
n_embd = 256        # Example embedding dimension
block_size = 128    # Example block size
n_head = 8          # Example number of attention heads
n_layer = 6         # Example number of transformer blocks

# Instantiate the model
model = GPTLanguageModel(vocab_size)

# Move the input index tensor to the same device as the model
index = index.to(model.token_embedding_table.weight.device)

# Forward pass
logits, loss = model(index)

# Generate new tokens
generated_sequence = model.generate(index, max_new_tokens=10)
print("Generated Sequence:", generated_sequence)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)