In [1]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from collections import OrderedDict

In [2]:
# Function to create the rotation matrix for RoPE embeddings
def generate_rotation_matrix(seq_length, embed_dim):
    """
    Generates the per-position rotation matrices used for RoPE embeddings.

    For position ``p`` and frequency index ``j`` (``0 <= j < embed_dim // 2``)
    the angle is ``p / 10000 ** (2 * j / embed_dim)``.  Coordinate ``j`` is
    paired with coordinate ``j + embed_dim // 2`` and each pair is rotated by
    its angle, so every matrix has the block layout::

        [[ diag(cos), -diag(sin)],
         [ diag(sin),  diag(cos)]]

    Args:
        seq_length (int): The number of positions to generate matrices for.
        embed_dim (int): The dimension of the vectors to rotate (must be even).

    Returns:
        torch.Tensor: float32 tensor of shape (seq_length, embed_dim, embed_dim).

    Raises:
        AssertionError: If ``embed_dim`` is odd.
    """
    # Ensure the embedding dimension is even
    assert embed_dim % 2 == 0, "Embedding dimension must be even."

    half_dim = embed_dim // 2

    # Position and frequency indices, shaped for broadcasting
    position = np.arange(seq_length)[:, np.newaxis]  # Shape: (seq_length, 1)
    dimension = np.arange(half_dim)[np.newaxis, :]  # Shape: (1, embed_dim // 2)

    # Calculate theta using the exponential decay formula
    theta = 1.0 / (10000 ** (2 * dimension / embed_dim))  # Shape: (1, embed_dim // 2)

    # Angles are computed in float64 and only rounded to float32 on storage
    angle_rates = position * theta  # Shape: (seq_length, embed_dim // 2)
    cos = np.cos(angle_rates)
    sin = np.sin(angle_rates)

    # Allocate the result directly in float32 and fill the four diagonals of
    # every position at once, instead of building np.diag/np.concatenate
    # temporaries position by position.
    rotation_matrix = np.zeros((seq_length, embed_dim, embed_dim), dtype=np.float32)
    first_half = np.arange(half_dim)
    second_half = first_half + half_dim
    rotation_matrix[:, first_half, first_half] = cos
    rotation_matrix[:, second_half, second_half] = cos
    rotation_matrix[:, first_half, second_half] = -sin
    rotation_matrix[:, second_half, first_half] = sin

    # The numpy array is local to this call, so sharing its memory is safe
    return torch.from_numpy(rotation_matrix)

# Example usage
# Small smoke test: 10 positions, 64-dimensional vectors
seq_length, embed_dim = 10, 64
rotation_matrix = generate_rotation_matrix(seq_length=seq_length, embed_dim=embed_dim)
# Expected shape: (seq_length, embed_dim, embed_dim)
print(rotation_matrix.shape)


torch.Size([10, 64, 64])


In [None]:
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable gain.

    Args:
        layer_dims: Shape of the learnable gain ``gamma`` (initialised to ones);
            it is multiplied onto the normalised input, so it must broadcast
            against the input's trailing dimension(s).
        epsilon: Constant added to the mean square before the square root.
    """

    def __init__(self, layer_dims, epsilon=1e-8):
        super().__init__()
        self.epsilon = epsilon
        self.gamma = nn.Parameter(torch.ones(layer_dims))

    def forward(self, tensor_input):
        """Return ``gamma * x / sqrt(mean(x ** 2, dim=-1) + epsilon)``."""
        # Mean of squares along the feature axis, kept for broadcasting
        mean_square = tensor_input.pow(2).mean(dim=-1, keepdim=True)
        root_mean_square = (mean_square + self.epsilon).sqrt()

        # Normalise first, then apply the learnable gain
        return self.gamma * (tensor_input / root_mean_square)

# Example usage
layer_dims = 64  # Example feature dimension
tensor_input = torch.randn(10, layer_dims)  # Batch of 10 feature vectors
rms_norm = RMSNorm(layer_dims=layer_dims)
output = rms_norm(tensor_input)
# Expected shape: (10, 64)
print(output.shape)


In [None]:
class AttentionHeadWithRoPE(nn.Module):
    """Multi-head scaled dot-product self-attention with RoPE on queries and keys.

    The input is projected to queries, keys and values of width ``embed_dim``.
    Queries and keys are split into ``num_heads`` heads of width
    ``embed_dim // num_heads`` and every head is rotated with the matrix of its
    position, so that within each head the query/key dot product depends on the
    relative position of the two tokens.

    Args:
        embed_dim (int): Width of the input and output features.
        num_heads (int): Number of attention heads; must divide ``embed_dim``.
            ``embed_dim // num_heads`` must be even because
            ``generate_rotation_matrix`` requires an even dimension.
        seq_length (int): Maximum sequence length rotation matrices are built for.
        epsilon (float): Accepted for signature compatibility; not used.

    Attributes:
        rotation_matrix: Non-persistent buffer of shape
            (seq_length, head_dim, head_dim).  Being a buffer it follows the
            module across ``.to(device)`` and is not written to ``state_dict``.
    """

    def __init__(self, embed_dim, num_heads, seq_length, epsilon=1e-8):
        super().__init__()
        if embed_dim % num_heads != 0:
            raise ValueError("embed_dim must be divisible by num_heads.")

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.seq_length = seq_length
        self.head_dim = embed_dim // num_heads

        # Initialize linear layers for queries, keys, and values
        self.query = nn.Linear(embed_dim, embed_dim, bias=False)
        self.key = nn.Linear(embed_dim, embed_dim, bias=False)
        self.value = nn.Linear(embed_dim, embed_dim, bias=False)

        # One rotation matrix per position, sized for a single head: the same
        # rotation is shared by all heads and keeps each rotated pair of
        # coordinates inside one head.
        self.register_buffer(
            "rotation_matrix",
            generate_rotation_matrix(seq_length, self.head_dim),
            persistent=False,
        )

    def forward(self, data_input, attention_mask=None, return_attention=False):
        """Apply self-attention to ``data_input``.

        Args:
            data_input: Tensor of shape (batch, seq_len, embed_dim).
            attention_mask: Optional tensor broadcastable to
                (batch, num_heads, seq_len, seq_len); score entries where the
                mask equals 0 are set to ``-inf`` before the softmax.
            return_attention: If True, also return the attention weights.

        Returns:
            Tensor of shape (batch, seq_len, embed_dim), or a tuple of that
            tensor and the (batch, num_heads, seq_len, seq_len) weights.
        """
        batch_size, seq_len, embed_dim = data_input.shape
        head_dim = embed_dim // self.num_heads

        # Project the input; only queries and keys receive the positional rotation
        queries = self.apply_rope(self.query(data_input))
        keys = self.apply_rope(self.key(data_input))
        values = self.value(data_input)

        def split_heads(tensor):
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return tensor.view(batch_size, seq_len, self.num_heads, head_dim).transpose(1, 2)

        queries, keys, values = split_heads(queries), split_heads(keys), split_heads(values)

        # Scaled dot-product attention
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / head_dim ** 0.5
        if attention_mask is not None:
            scores = scores.masked_fill(attention_mask == 0, float('-inf'))

        attention_weights = F.softmax(scores, dim=-1)
        attention_output = torch.matmul(attention_weights, values)

        # Merge the heads back into the original feature layout
        attention_output = attention_output.transpose(1, 2).contiguous().view(batch_size, seq_len, embed_dim)

        if return_attention:
            return attention_output, attention_weights
        return attention_output

    def apply_rope(self, tensor):
        """Rotate every head of ``tensor`` by the matrix of its position.

        Args:
            tensor: Tensor of shape (batch, seq_len, embed_dim).

        Returns:
            Tensor of the same shape in which each head slice ``x`` at position
            ``l`` has been replaced by ``rotation_matrix[l] @ x``.

        Raises:
            ValueError: If ``seq_len`` exceeds the number of precomputed positions.
        """
        batch_size, seq_len, embed_dim = tensor.shape
        max_positions = self.rotation_matrix.shape[0]
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the {max_positions} positions "
                "the rotation matrix was built for."
            )

        rope_matrix = self.rotation_matrix[:seq_len].to(device=tensor.device, dtype=tensor.dtype)
        per_head = tensor.reshape(batch_size, seq_len, self.num_heads, embed_dim // self.num_heads)

        # rotated[b, l, h, e] = sum_d rope_matrix[l, e, d] * per_head[b, l, h, d]
        tensor_rotated = torch.einsum('led,blhd->blhe', rope_matrix, per_head)
        return tensor_rotated.reshape(batch_size, seq_len, embed_dim)

# Example usage
embed_dim, num_heads, seq_length = 64, 8, 10
data_input = torch.randn(2, seq_length, embed_dim)  # Batch of 2 sequences

attention_layer = AttentionHeadWithRoPE(
    embed_dim=embed_dim, num_heads=num_heads, seq_length=seq_length
)
output = attention_layer(data_input)
# Expected shape: (2, seq_length, embed_dim)
print(output.shape)


In [None]:
class MultiHeadAttention(nn.Module):
    """Bank of independent single-head RoPE attention modules plus an output projection.

    Every entry of ``attention_heads`` is an ``AttentionHeadWithRoPE`` built with
    width ``head_dim = embed_dim // num_heads`` and a single internal head, so it
    consumes and produces ``head_dim`` features.  The input's feature axis is
    therefore cut into ``num_heads`` contiguous slices of ``head_dim`` and head
    ``i`` attends over slice ``i``; the head outputs are concatenated back to
    ``embed_dim``, mixed by ``output_linear`` and passed through dropout.

    Args:
        embed_dim (int): Width of the input and output features.
        num_heads (int): Number of heads; must divide ``embed_dim``.
        seq_length (int): Maximum sequence length, forwarded to every head.
        dropout (float): Dropout probability applied to the projected output.

    Raises:
        AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads, seq_length, dropout=0.1):
        super().__init__()
        # Validate before deriving head_dim so a bad configuration fails clearly
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads."

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.seq_length = seq_length
        self.head_dim = embed_dim // num_heads

        # Initialize multiple attention heads
        self.attention_heads = nn.ModuleList(
            [AttentionHeadWithRoPE(self.head_dim, 1, seq_length) for _ in range(num_heads)]
        )

        # Define an output linear transformation
        self.output_linear = nn.Linear(embed_dim, embed_dim, bias=False)

        # Add dropout for regularization
        self.dropout = nn.Dropout(dropout)

    def forward(self, data_input, attention_mask=None):
        """Run every head on its feature slice and project the concatenation.

        Args:
            data_input: Tensor of shape (batch, seq_len, embed_dim).
            attention_mask: Optional mask handed unchanged to every head.

        Returns:
            Tensor of shape (batch, seq_len, embed_dim).

        Raises:
            ValueError: If the last dimension of ``data_input`` is not ``embed_dim``.
        """
        if data_input.size(-1) != self.embed_dim:
            raise ValueError(
                f"Expected last dimension {self.embed_dim}, got {data_input.size(-1)}."
            )

        # The heads are head_dim wide, so each one receives its own head_dim
        # slice of the features; handing them the full embed_dim-wide input
        # does not match their linear layers.
        head_inputs = data_input.split(self.head_dim, dim=-1)
        head_outputs = [
            head(head_input, attention_mask)
            for head, head_input in zip(self.attention_heads, head_inputs)
        ]

        # Concatenate them along the last dimension
        concatenated_output = torch.cat(head_outputs, dim=-1)

        # Apply the output linear transformation and dropout
        return self.dropout(self.output_linear(concatenated_output))

# Example usage
embed_dim, num_heads, seq_length = 64, 8, 10
data_input = torch.randn(2, seq_length, embed_dim)  # Batch of 2 sequences

multihead_attention = MultiHeadAttention(
    embed_dim=embed_dim, num_heads=num_heads, seq_length=seq_length
)
output = multihead_attention(data_input)
# Expected shape: (2, seq_length, embed_dim)
print(output.shape)


In [None]:
class SwishGLU(nn.Module):
    """Gated linear unit whose gate uses a Swish activation with learnable ``beta``.

    Computes ``linear(x) * swish(linear_gate(x))`` where
    ``swish(g) = g * sigmoid(beta * g)``.  Both linear layers map ``size``
    features to ``size`` features, so the output has the input's shape.

    Args:
        size: Number of input (and output) features.
    """

    def __init__(self, size):
        super().__init__()
        self.linear_gate = nn.Linear(size, size)
        self.linear = nn.Linear(size, size)
        self.beta = nn.Parameter(torch.ones(1))

    def forward(self, data_input):
        """Return the gated projection of ``data_input``."""
        # Gate branch: Swish with the learnable slope beta
        gate_logits = self.linear_gate(data_input)
        swish_gate = gate_logits * torch.sigmoid(self.beta * gate_logits)

        # Value branch, modulated element-wise by the gate
        return self.linear(data_input) * swish_gate

# Example usage
input_size = 64
data_input = torch.randn(2, input_size)  # Batch of 2 feature vectors

swish_glu = SwishGLU(size=input_size)
output = swish_glu(data_input)
# Expected shape: (2, input_size)
print(output.shape)


In [None]:
class LLamaBlock(nn.Module):
    """Pre-norm transformer block: RoPE attention and a SwishGLU feed-forward.

    Both sub-layers follow the pattern ``x + dropout(sublayer(norm(x)))`` and
    share one dropout module.

    Args:
        embed_dim: Width of the block's input and output features.
        num_heads: Number of attention heads.
        seq_length: Maximum sequence length, forwarded to the attention layer.
        feedforward_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
        epsilon: Stabiliser forwarded to both RMSNorm layers.
    """

    def __init__(self, embed_dim, num_heads, seq_length, feedforward_dim, dropout=0.1, epsilon=1e-8):
        super().__init__()
        # Separate normalisation layers ahead of each sub-layer
        self.norm1 = RMSNorm(embed_dim, epsilon)
        self.norm2 = RMSNorm(embed_dim, epsilon)

        self.attention_head = AttentionHeadWithRoPE(embed_dim, num_heads, seq_length)

        # Expand to feedforward_dim, gate, and project back to embed_dim
        feedforward_layers = [
            nn.Linear(embed_dim, feedforward_dim),
            SwishGLU(feedforward_dim),
            nn.Linear(feedforward_dim, embed_dim),
        ]
        self.feedforward = nn.Sequential(*feedforward_layers)

        self.dropout = nn.Dropout(dropout)

    def forward(self, data_input, attention_mask=None):
        """Apply the attention and feed-forward sub-layers with residual connections.

        Args:
            data_input: Input tensor; the result has the same shape.
            attention_mask: Optional mask handed to the attention layer.
        """
        # Attention sub-layer
        attended = self.attention_head(self.norm1(data_input), attention_mask)
        hidden = data_input + self.dropout(attended)

        # Feed-forward sub-layer
        transformed = self.feedforward(self.norm2(hidden))
        return hidden + self.dropout(transformed)

# Example usage
embed_dim, num_heads, seq_length = 64, 8, 10
feedforward_dim = 256
data_input = torch.randn(2, seq_length, embed_dim)  # Batch of 2 sequences

llama_block = LLamaBlock(
    embed_dim=embed_dim,
    num_heads=num_heads,
    seq_length=seq_length,
    feedforward_dim=feedforward_dim,
)
output = llama_block(data_input)
# Expected shape: (2, seq_length, embed_dim)
print(output.shape)


In [None]:
class LLama(nn.Module):
    """Decoder-only language model: embedding, stacked LLamaBlocks and an output head.

    Args:
        vocab_size: Number of token ids; also the width of the output logits.
        embed_dim: Width of the token embeddings and of every block.
        num_heads: Number of attention heads per block.
        seq_length: Maximum sequence length, forwarded to every block.
        feedforward_dim: Hidden width of each block's feed-forward network.
        num_blocks: Number of transformer blocks.
        dropout: Dropout probability forwarded to every block.
        epsilon: Normalisation stabiliser forwarded to every block.

    Constructing the model prints its total parameter count.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, seq_length, feedforward_dim, num_blocks, dropout=0.1, epsilon=1e-8):
        super().__init__()

        # Initialize the embedding layer for the input tokens
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Named blocks ("block_0", "block_1", ...) held in a Sequential container
        self.transformer_blocks = nn.Sequential(
            OrderedDict([
                (f"block_{i}", LLamaBlock(embed_dim, num_heads, seq_length, feedforward_dim, dropout, epsilon))
                for i in range(num_blocks)
            ])
        )

        # Define the final linear transformation layers
        self.final_linear1 = nn.Linear(embed_dim, embed_dim)
        self.swish_glu = SwishGLU(embed_dim)
        self.final_linear2 = nn.Linear(embed_dim, vocab_size)

        # Report the total number of parameters in the model
        print("Model parameters:", sum([p.numel() for p in self.parameters()]))

    def forward(self, input_ids, target_ids=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len).
            target_ids: Optional integer tensor with one target per input
                position.  No shifting is done here: ``target_ids[b, t]`` must
                already be the token that follows ``input_ids[b, t]``.

        Returns:
            Tuple ``(logits, loss)``.  Without targets ``logits`` has shape
            (batch, seq_len, vocab_size) and ``loss`` is None.  With targets
            ``logits`` is flattened to (batch * seq_len, vocab_size) and
            ``loss`` is the mean cross-entropy over all positions.
        """
        # Embed the input_ids
        hidden = self.embedding(input_ids)

        # Lower-triangular mask: position t may only attend to positions <= t.
        # The blocks mask out entries equal to 0, and without this mask a
        # next-token model can read the token it is asked to predict.
        seq_len = input_ids.size(-1)
        causal_mask = torch.ones(seq_len, seq_len, device=hidden.device).tril()

        # Iterate the blocks explicitly: calling the Sequential container
        # directly would pass the hidden states only and drop the mask.
        for block in self.transformer_blocks:
            hidden = block(hidden, causal_mask)

        # Apply the final linear transformations
        output = self.final_linear1(hidden)
        output = self.swish_glu(output)
        logits = self.final_linear2(output)

        loss = None
        if target_ids is not None:
            # Flatten so that every position is one classification example
            loss_fn = nn.CrossEntropyLoss()
            logits = logits.reshape(-1, logits.size(-1))
            target_ids = target_ids.reshape(-1)
            loss = loss_fn(logits, target_ids)

        return logits, loss

# Example usage
vocab_size = 30522  # Example vocabulary size for a tokenizer
embed_dim, num_heads, seq_length = 768, 12, 512
feedforward_dim, num_blocks = 3072, 12
data_input = torch.randint(0, vocab_size, (2, seq_length))  # Batch of 2 token sequences

llama_model = LLama(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    seq_length=seq_length,
    feedforward_dim=feedforward_dim,
    num_blocks=num_blocks,
)
# The inputs double as targets here, which only exercises the loss path
logits, loss = llama_model(data_input, data_input)
# Expected shape: (2*seq_length, vocab_size)
print(logits.shape)
if loss is not None:
    print(loss.item())


In [None]:
import torch
from torch import nn
import numpy as np
import pandas as pd
import time
from collections import OrderedDict

# Define the necessary components for the LLama model, RMSNorm, AttentionHeadWithRoPE, SwishGLU, and LLamaBlock here

# Data preprocessing function
def prepare_data(file_path):
    """Read a text file and encode it character by character.

    The vocabulary is the sorted set of characters in the file; each
    character's id is its rank in that ordering.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Tuple ``(data_tensor, vocabulary_size, char_to_index, index_to_char)``:
        the encoded text as a 1-D ``torch.long`` tensor, the number of distinct
        characters, and the two lookup dictionaries.
    """
    with open(file_path, 'r') as handle:
        corpus = handle.read()

    alphabet = sorted(set(corpus))

    # Build both lookup directions in a single pass over the alphabet
    char_to_index = {}
    index_to_char = {}
    for position, symbol in enumerate(alphabet):
        char_to_index[symbol] = position
        index_to_char[position] = symbol

    encoded = torch.tensor([char_to_index[symbol] for symbol in corpus], dtype=torch.long)
    return encoded, len(alphabet), char_to_index, index_to_char

# Function to create training batches
def batch_generator(data, split, batch_size, seq_len, params):
    """Yield random (input, target) batches drawn from one split of ``data``.

    ``data`` is cut into consecutive train (first 80%), validation (next 10%)
    and test (remainder) parts.  Each batch holds ``batch_size`` windows of
    ``seq_len`` tokens starting at uniformly random offsets of the chosen part;
    the targets are the same windows shifted one token to the right.

    Args:
        data: 1-D sequence of token ids (anything ``torch.as_tensor`` accepts).
        split: One of ``'train'``, ``'validation'`` or ``'test'``.
        batch_size: Number of windows per batch.
        seq_len: Number of tokens per window.
        params: Unused; kept for signature compatibility.

    Yields:
        Tuples ``(input_sequences, target_sequences)`` of ``torch.long`` tensors
        of shape (batch_size, seq_len).  Every batch is a freshly allocated
        pair, so earlier batches stay valid after the generator advances.
        ``len(split) // (batch_size * seq_len)`` batches are produced, but
        never fewer than one.

    Raises:
        ValueError: If ``split`` is not a known name, or if the chosen part is
            not longer than ``seq_len`` (no window plus shifted target fits).
            As this is a generator, the error surfaces on the first ``next()``.
    """
    data = torch.as_tensor(data)
    total_len = len(data)
    train_len = int(total_len * 0.8)
    val_len = int(total_len * 0.1)

    boundaries = {
        'train': (0, train_len),
        'validation': (train_len, train_len + val_len),
        'test': (train_len + val_len, total_len),
    }
    if split not in boundaries:
        raise ValueError("split must be 'train', 'validation', or 'test'")
    first, last = boundaries[split]
    split_data = data[first:last]

    # A window needs seq_len tokens plus one more for the shifted target
    max_start = len(split_data) - seq_len
    if max_start < 1:
        raise ValueError(
            f"The '{split}' split has {len(split_data)} tokens, which is too few "
            f"for sequences of length {seq_len}."
        )

    # Always offer at least one batch so next(batch_generator(...)) works on
    # splits smaller than batch_size * seq_len.
    total_batches = max(1, len(split_data) // (batch_size * seq_len))
    offsets = torch.arange(seq_len)

    for _ in range(total_batches):
        start_indices = torch.randint(0, max_start, (batch_size,))
        # (batch_size, seq_len) grid of positions, gathered in one indexing op
        window = (start_indices.unsqueeze(1) + offsets).to(split_data.device)
        input_sequences = split_data[window].long()
        target_sequences = split_data[window + 1].long()

        yield input_sequences, target_sequences

# Function to evaluate model loss
@torch.no_grad()
def eval_loss(model, dataset, params):
    """Estimate the mean loss of ``model`` on the train and validation splits.

    For each split, 10 batches are drawn with ``batch_generator`` (batch size
    ``params['training_batch']``, window length ``params['sequence_length']``)
    and their losses averaged.  Gradients are disabled and the model runs in
    eval mode; the mode it had on entry is restored before returning.

    Args:
        model: Module called as ``model(inputs, targets)`` and returning
            ``(logits, loss)``.
        dataset: Encoded data handed to ``batch_generator``.
        params: Dict providing ``'training_batch'`` and ``'sequence_length'``.

    Returns:
        Dict mapping ``'train'`` and ``'validation'`` to the mean batch loss.
    """
    # Run on whatever device the model lives on instead of relying on a
    # module-level `device` variable being defined before the call.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    was_training = model.training
    results = {}
    model.eval()
    try:
        for split in ["train", "validation"]:
            batch_losses = []
            for _ in range(10):
                input_batch, target_batch = next(batch_generator(dataset, split, params['training_batch'], params['sequence_length'], params))
                input_batch = input_batch.to(run_device)
                target_batch = target_batch.to(run_device)
                _, batch_loss = model(input_batch, target_batch)
                # nn.DataParallel gathers one loss per replica; mean() reduces
                # that vector and leaves an ordinary scalar loss unchanged.
                batch_losses.append(batch_loss.mean().item())
            results[split] = np.mean(batch_losses)
    finally:
        # Restore the caller's mode rather than forcing train mode
        model.train(was_training)
    return results

# Training loop
def train_model(model, optimizer, dataset, params, scheduler=None):
    """Train ``model`` and plot the periodically evaluated losses.

    Each of the ``params['training_epochs']`` iterations performs a single
    optimisation step on one random training batch.  Every
    ``params['logging_frequency']`` iterations the train/validation losses are
    evaluated with ``eval_loss``, recorded and printed.

    Args:
        model: Module called as ``model(inputs, targets)`` and returning
            ``(logits, loss)``.
        optimizer: Optimizer over the model's parameters.
        dataset: Encoded data handed to ``batch_generator``.
        params: Dict providing ``'training_epochs'``, ``'training_batch'``,
            ``'sequence_length'`` and ``'logging_frequency'``.
        scheduler: Optional learning-rate scheduler, stepped once per iteration.

    Returns:
        The object returned by ``DataFrame.plot()`` for the recorded losses.
    """
    # Run on whatever device the model lives on instead of relying on a
    # module-level `device` variable being defined before the call.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    all_losses = []
    start = time.time()
    for epoch in range(params['training_epochs']):
        optimizer.zero_grad()
        input_batch, target_batch = next(batch_generator(dataset, 'train', params['training_batch'], params['sequence_length'], params))

        input_batch = input_batch.to(run_device)
        target_batch = target_batch.to(run_device)
        _, batch_loss = model(input_batch, target_batch)
        # nn.DataParallel gathers one loss per replica, and backward() needs a
        # scalar; mean() leaves an ordinary scalar loss unchanged.
        batch_loss = batch_loss.mean()
        batch_loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        if (epoch + 1) % params['logging_frequency'] == 0:
            time_elapsed = time.time() - start
            evaluation_result = eval_loss(model, dataset, params)
            all_losses.append(evaluation_result)
            print(
                f"Epoch {epoch + 1}/{params['training_epochs']} | "
                f"Validation Loss: {evaluation_result['validation']:.4f} | "
                f"Time: {time_elapsed:.2f}s"
            )
            start = time.time()
            if scheduler:
                print("lr: ", scheduler.get_last_lr())

    # If no logging step ran (logging_frequency larger than the number of
    # iterations, or zero iterations), evaluate once so the summary below has
    # something to report.
    if not all_losses:
        all_losses.append(eval_loss(model, dataset, params))

    print("Final Validation Loss: ", all_losses[-1]['validation'])
    return pd.DataFrame(all_losses).plot()

# Load and process the dataset
data_path = 'path_to_text_file.txt'
dataset, vocab_size, char_to_index, index_to_char = prepare_data(data_path)

# Configuration for the language model and for training
MODEL_PARAMS = {
    'vocab_size': vocab_size,
    'embed_dim': 768,
    'num_heads': 12,
    'seq_length': 512,
    'feedforward_dim': 3072,
    'num_blocks': 12,
    'dropout': 0.1,
    'epsilon': 1e-8,
    'training_batch': 32,
    'sequence_length': 50,
    'training_epochs': 10,
    'logging_frequency': 1
}

# Entries of MODEL_PARAMS that are LLama constructor arguments; the remaining
# entries configure batching and the training loop.
MODEL_ARG_NAMES = (
    'vocab_size',
    'embed_dim',
    'num_heads',
    'seq_length',
    'feedforward_dim',
    'num_blocks',
    'dropout',
    'epsilon',
)

# Check GPU availability and set device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using device:", device)

# Create the language model. LLama takes its hyperparameters as individual
# arguments, so the relevant entries are unpacked as keywords instead of
# passing the whole dict as one positional argument.
model = LLama(**{name: MODEL_PARAMS[name] for name in MODEL_ARG_NAMES}).to(device)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)

# Create the optimizer
optimizer = torch.optim.Adam(model.parameters())

# Start training
train_model(model, optimizer, dataset, MODEL_PARAMS)
