LLaMA3(
  (embedding): Embedding(50257, 768)
  (encoder_layers): ModuleList(
    (0-11): 12 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (linear1): Linear(in_features=768, out_features=3072, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=3072, out_features=768, bias=True)
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-11): 12 x TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuanti

In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

class LLaMA3(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_heads, num_layers, max_length, device):
        super(LLaMA3, self).__init__()
        self.device = device
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.max_length = max_length
        
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.positional_encoding = self._generate_positional_encoding(max_length, hidden_dim).to(device)
        
        self.encoder_layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dim_feedforward=hidden_dim*4) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([nn.TransformerDecoderLayer(d_model=hidden_dim, nhead=num_heads, dim_feedforward=hidden_dim*4) for _ in range(num_layers)])
        
        self.fc = nn.Linear(hidden_dim, vocab_size)  # Typically the output dimension matches the vocab size for language models
        
        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.embedding.weight, mean=0, std=0.02)
        for module in self.encoder_layers:
            self._init_layer_weights(module)
        for module in self.decoder_layers:
            self._init_layer_weights(module)
        nn.init.normal_(self.fc.weight, mean=0, std=0.02)
        nn.init.constant_(self.fc.bias, 0)

    def _init_layer_weights(self, layer):
        for name, param in layer.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param, mean=0, std=0.02)
            elif 'bias' in name:
                nn.init.constant_(param, 0)

    def _generate_positional_encoding(self, max_length, hidden_dim):
        pe = torch.zeros(max_length, hidden_dim)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / hidden_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        return pe

    def forward(self, src, tgt):
        src_seq_len = src.size(1)
        tgt_seq_len = tgt.size(1)
        
        src = self.embedding(src) + self.positional_encoding[:, :src_seq_len, :]
        tgt = self.embedding(tgt) + self.positional_encoding[:, :tgt_seq_len, :]
        
        src = src.transpose(0, 1)  # (seq_len, batch_size, hidden_dim)
        tgt = tgt.transpose(0, 1)  # (seq_len, batch_size, hidden_dim)
        
        for layer in self.encoder_layers:
            src = layer(src)
        
        for layer in self.decoder_layers:
            tgt = layer(tgt, src)
        
        logits = self.fc(tgt.transpose(0, 1))  # (batch_size, seq_len, vocab_size)
        
        return logits

vocab_size = 50257
hidden_dim = 768
num_heads = 12
num_layers = 12
max_length = 512
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LLaMA3(vocab_size, hidden_dim, num_heads, num_layers, max_length, device).to(device)
print(model)

# Training loop for Llama3
model.train()

# Using adam optimizer and cross entropy loss 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Define the input and target tensors
batch_size = 32
seq_length = 128
src = torch.randint(0, vocab_size, (batch_size, seq_length), device=device)
tgt = torch.randint(0, vocab_size, (batch_size, seq_length), device=device)

# Forward pass
logits = model(src, tgt[:, :-1])

# Computing the loss
loss = criterion(logits.view(-1, vocab_size), tgt[:, 1:].contiguous().view(-1))

# Backward pass
optimizer.zero_grad()
loss.backward()
optimizer.step()

print('Loss:', loss.item())

# model Evaluation
model.eval()

# Here we define the input tensor
src = torch.randint(0, vocab_size, (1, 128), device=device)

# Define the target tensor
tgt = torch.zeros((1, 128), dtype=torch.long, device=device)

# Initialize the first token as the start token
tgt[0, 0] = 0

# Generate the output sequence
for i in range(1, 128):
    logits = model(src, tgt[:, :i])
    next_token = torch.argmax(logits[0, i - 1, :]).item()
    tgt[0, i] = next_token

print(tgt)

LLaMA3(
  (embedding): Embedding(50257, 768)
  (encoder_layers): ModuleList(
    (0-11): 12 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (linear1): Linear(in_features=768, out_features=3072, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=3072, out_features=768, bias=True)
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-11): 12 x TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuanti

In [3]:
import subprocess
#pip install tqdm
subprocess.run(["pip", "install", "tqdm"])
import nltk
from tqdm import tqdm
nltk.download('punkt')
nltk.download('gutenberg')

from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
import torch

def tokenize_corpus(corpus):
    tokens = []
    for file_id in corpus.fileids():
        words = word_tokenize(corpus.raw(file_id).lower())
        tokens.extend(words)
    return tokens

tokens = tokenize_corpus(gutenberg)
vocab = list(set(tokens))
vocab_size = len(vocab)

word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

def tokens_to_tensor(tokens, word_to_idx):
    return torch.tensor([word_to_idx[token] for token in tokens if token in word_to_idx], dtype=torch.long)

token_tensor = tokens_to_tensor(tokens, word_to_idx)

hidden_dim = 768
num_heads = 12
num_layers = 12
max_length = 512
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LLaMA3(vocab_size, hidden_dim, num_heads, num_layers, max_length, device).to(device)

def create_batches(token_tensor, batch_size, seq_length):
    num_batches = token_tensor.size(0) // (batch_size * seq_length)
    data = token_tensor[:num_batches * batch_size * seq_length]
    data = data.view(batch_size, -1)
    for i in range(0, data.size(1) - seq_length, seq_length):
        src = data[:, i:i+seq_length]
        tgt = data[:, i+1:i+seq_length+1]
        yield src, tgt

batch_size = 32
seq_length = 128
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

num_epochs = 10

for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(create_batches(token_tensor, batch_size, seq_length), desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for src, tgt in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        logits = model(src, tgt[:, :-1])
        loss = criterion(logits.view(-1, vocab_size), tgt[:, 1:].contiguous().view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    print(f'Epoch {epoch+1}, Loss: {total_loss / (len(token_tensor) // (batch_size * seq_length))}')

print('Training complete.')

model.eval()
src = torch.randint(0, vocab_size, (1, 128), device=device)
tgt = torch.zeros((1, 128), dtype=torch.long, device=device)
tgt[0, 0] = word_to_idx['<start>']  # Use the appropriate start token for your dataset

for i in range(1, 128):
    logits = model(src, tgt[:, :i])
    next_token = torch.argmax(logits[0, i - 1, :]).item()
    tgt[0, i] = next_token

generated_text = ' '.join([idx_to_word[idx] for idx in tgt[0].tolist()])
print(generated_text)




[nltk_data] Downloading package punkt to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
                                                  

Epoch 1, Loss: 6.722799613133001


                                                  

Epoch 2, Loss: 6.495773627415227


Epoch 3/10: 85it [3:33:12, 52.78s/it, loss=6.36] 