In [2]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

class LLaMA3(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_heads, num_layers, max_length, device):
        super(LLaMA3, self).__init__()
        self.device = device
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.max_length = max_length
        
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.positional_encoding = self._generate_positional_encoding(max_length, hidden_dim).to(device)
        
        self.encoder_layers = nn.ModuleList([nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dim_feedforward=hidden_dim*4) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([nn.TransformerDecoderLayer(d_model=hidden_dim, nhead=num_heads, dim_feedforward=hidden_dim*4) for _ in range(num_layers)])
        
        self.fc = nn.Linear(hidden_dim, vocab_size)  # Typically the output dimension matches the vocab size for language models
        
        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.embedding.weight, mean=0, std=0.02)
        for module in self.encoder_layers:
            self._init_layer_weights(module)
        for module in self.decoder_layers:
            self._init_layer_weights(module)
        nn.init.normal_(self.fc.weight, mean=0, std=0.02)
        nn.init.constant_(self.fc.bias, 0)

    def _init_layer_weights(self, layer):
        for name, param in layer.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param, mean=0, std=0.02)
            elif 'bias' in name:
                nn.init.constant_(param, 0)

    def _generate_positional_encoding(self, max_length, hidden_dim):
        pe = torch.zeros(max_length, hidden_dim)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / hidden_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        return pe

    def forward(self, src, tgt):
        src_seq_len = src.size(1)
        tgt_seq_len = tgt.size(1)
        
        src_pos = torch.arange(0, src_seq_len, device=self.device).unsqueeze(0)
        tgt_pos = torch.arange(0, tgt_seq_len, device=self.device).unsqueeze(0)
        
        src = self.embedding(src) + self.positional_encoding[:src_seq_len, :]
        tgt = self.embedding(tgt) + self.positional_encoding[:tgt_seq_len, :]
        
        src = src.transpose(0, 1)  # (seq_len, batch_size, hidden_dim)
        tgt = tgt.transpose(0, 1)  # (seq_len, batch_size, hidden_dim)
        
        for layer in self.encoder_layers:
            src = layer(src)
        
        for layer in self.decoder_layers:
            tgt = layer(tgt, src)
        
        logits = self.fc(tgt.transpose(0, 1))  # (batch_size, seq_len, vocab_size)
        
        return logits

# Example usage
vocab_size = 50257
hidden_dim = 768
num_heads = 12
num_layers = 12
max_length = 512
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LLaMA3(vocab_size, hidden_dim, num_heads, num_layers, max_length, device).to(device)
print(model)

#using nltk corpus for training
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

tokenizer = get_tokenizer('basic_english')
counter = Counter()
for line in gutenberg.sents():
    counter.update(tokenizer(' '.join(line)))
vocab = Vocab(counter)

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: int(x)

train_data = list(gutenberg.raw('austen-emma.txt'))
train_data = torch.tensor(text_pipeline(train_data), dtype=torch.long)

def data_process(raw_text_iter):
    data = [torch.tensor(text_pipeline(item), dtype=torch.long) for item in raw_text_iter]
    return data

train_data = data_process(gutenberg.raw('austen-emma.txt'))

def batchify(data, bsz):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, batch_size)

def get_batch(source, i):
    seq_len = min(max_length, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

bptt = 35

def train(model, train_data, optimizer, criterion, bptt):
    model.train()
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(bptt).to(device)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        if data.size(0) != bptt:
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, src_mask)
        loss = criterion(output.view(-1, vocab_size), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

import time
import math
from torch.optim.lr_scheduler import StepLR

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, 1.0, gamma=0.95)

best_val_loss = float("inf")
epochs = 3
best_model = None

for epoch in range(1, epochs + 1):

    epoch_start_time = time.time()
    train(model, train_data, optimizer, criterion, bptt)
    val_loss = evaluate(model, val_data, criterion, bptt)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

# Save the best model
torch.save(best_model.state_dict(), 'llama3.pth')

# Load the best model
model = LLaMA3(vocab_size, hidden_dim, num_heads, num_layers, max_length, device).to(device)

model.load_state_dict(torch.load('llama3.pth'))
model.eval()

# Generate text
def generate_text(model, vocab, tokenizer, max_length, device, seed_text='The meaning of life is', temperature=1.0):
    model.eval()
    seed = torch.tensor(text_pipeline(seed_text), dtype=torch.long).unsqueeze(0).to(device)
    generated = seed
    with torch.no_grad():
        for i in range(max_length):
            output = model(generated, None)
            logits = output[0, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, 1)
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
    return ' '.join([vocab.itos[token] for token in generated.squeeze()])

print(generate_text(model, vocab, tokenizer, 100, device, seed_text='The meaning of life is', temperature=1.0))
print(generate_text(model, vocab, tokenizer, 100, device, seed_text='The meaning of life is', temperature=0.5))



LLaMA3(
  (embedding): Embedding(50257, 768)
  (encoder_layers): ModuleList(
    (0-11): 12 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (linear1): Linear(in_features=768, out_features=3072, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=3072, out_features=768, bias=True)
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-11): 12 x TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuanti

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


TypeError: Vocab.__init__() got an unexpected keyword argument 'min_freq'

In [8]:
# Using nltk corpus for training
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import time
import math
from torch.optim.lr_scheduler import StepLR
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab

# Tokenizer and Vocabulary
tokenizer = get_tokenizer('basic_english')
counter = Counter()
for line in gutenberg.sents():
    counter.update(tokenizer(' '.join(line)))
vocab = Vocab(counter)
vocab_size = len(vocab)


text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
label_pipeline = lambda x: int(x)

# Process the text data
def data_process(raw_text_iter):
    data = [torch.tensor(text_pipeline(item), dtype=torch.long) for item in raw_text_iter]
    return torch.cat(data)

train_raw_text = gutenberg.raw('austen-emma.txt')
train_data = data_process(train_raw_text.split())

# Batching the data
def batchify(data, bsz, device):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_data = batchify(train_data, batch_size, device)

# Function to get a batch
def get_batch(source, i, bptt):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

bptt = 35

# Training function
def train(model, train_data, optimizer, criterion, bptt, device):
    model.train()
    total_loss = 0.
    start_time = time.time()
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i, bptt)
        optimizer.zero_grad()
        src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
        output = model(data, data, src_mask, src_mask)  # Assuming model uses src and tgt
        loss = criterion(output.view(-1, vocab_size), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

# Define the model and training setup
class LLaMA3(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_heads, num_layers, max_length, device):
        super(LLaMA3, self).__init__()
        self.device = device
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.max_length = max_length

        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.positional_encoding = self._generate_positional_encoding(max_length, hidden_dim).to(device)
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dim_feedforward=hidden_dim*4) 
            for _ in range(num_layers)
        ])
        self.decoder_layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model=hidden_dim, nhead=num_heads, dim_feedforward=hidden_dim*4) 
            for _ in range(num_layers)
        ])
        self.fc = nn.Linear(hidden_dim, vocab_size)

        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.embedding.weight, mean=0, std=0.02)
        for module in self.encoder_layers:
            self._init_layer_weights(module)
        for module in self.decoder_layers:
            self._init_layer_weights(module)
        nn.init.normal_(self.fc.weight, mean=0, std=0.02)
        nn.init.constant_(self.fc.bias, 0)

    def _init_layer_weights(self, layer):
        for name, param in layer.named_parameters():
            if 'weight' in name:
                nn.init.normal_(param, mean=0, std=0.02)
            elif 'bias' in name:
                nn.init.constant_(param, 0)

    def _generate_positional_encoding(self, max_length, hidden_dim):
        pe = torch.zeros(max_length, hidden_dim)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, hidden_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / hidden_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        return pe

    def generate_square_subsequent_mask(self, size):
        mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, src, tgt, src_mask, tgt_mask):
        src_seq_len = src.size(1)
        tgt_seq_len = tgt.size(1)
        
        src_pos = torch.arange(0, src_seq_len, device=self.device).unsqueeze(0)
        tgt_pos = torch.arange(0, tgt_seq_len, device=self.device).unsqueeze(0)
        
        src = self.embedding(src) + self.positional_encoding[:src_seq_len, :]
        tgt = self.embedding(tgt) + self.positional_encoding[:tgt_seq_len, :]
        
        src = src.transpose(0, 1)  # (seq_len, batch_size, hidden_dim)
        tgt = tgt.transpose(0, 1)  # (seq_len, batch_size, hidden_dim)
        
        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        
        for layer in self.decoder_layers:
            tgt = layer(tgt, src, tgt_mask, src_mask)
        
        logits = self.fc(tgt.transpose(0, 1))  # (batch_size, seq_len, vocab_size)
        
        return logits

# Hyperparameters
vocab_size = len(vocab)
hidden_dim = 768
num_heads = 12
num_layers = 12
max_length = 512

# Instantiate the model
model = LLaMA3(vocab_size, hidden_dim, num_heads, num_layers, max_length, device).to(device)

# Define criterion, optimizer, and scheduler
criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=1, gamma=0.95)

# Placeholder for validation data and evaluate function
val_data = train_data  # Replace this with actual validation data

def evaluate(model, val_data, criterion, bptt):
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        for i in range(0, val_data.size(0) - 1, bptt):
            data, targets = get_batch(val_data, i, bptt)
            src_mask = model.generate_square_subsequent_mask(data.size(0)).to(device)
            output = model(data, data, src_mask, src_mask)
            loss = criterion(output.view(-1, vocab_size), targets)
            total_loss += loss.item()
    return total_loss / (len(val_data) // bptt)

# Training loop
best_val_loss = float("inf")
epochs = 3
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model, train_data, optimizer, criterion, bptt, device)
    val_loss = evaluate(model, val_data, criterion, bptt)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
          epoch, (time.time() - epoch_start_time), val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

# Save the best model
torch.save(best_model.state_dict(), 'llama3.pth')

# Load the best model
model = LLaMA3(vocab_size, hidden_dim, num_heads, num_layers, max_length, device).to(device)
model.load_state_dict(torch.load('llama3.pth'))
model.eval()

# Generate text
def generate_text(model, vocab, text_pipeline, max_length, device, seed_text='The meaning of life is', temperature=1.0):
    model.eval()
    seed = torch.tensor(text_pipeline(seed_text), dtype=torch.long).unsqueeze(0).to(device)
    generated = seed
    with torch.no_grad():
        for _ in range(max_length):
            src_mask = model.generate_square_subsequent_mask(generated.size(1)).to(device)
            output = model(generated, generated, src_mask, src_mask)
            logits = output[0, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, 1)
            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
    return ' '.join([vocab.itos[token] for token in generated.squeeze().tolist()])

print(generate_text(model, vocab, text_pipeline, 100, device, seed_text='The meaning of life is', temperature=1.0))
print(generate_text(model, vocab, text_pipeline, 100, device, seed_text='The meaning of life is', temperature=0.5))


[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


IndexError: index out of range in self