In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

import numpy as np
import nltk
nltk.download('gutenberg')
nltk.download('punkt')

from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from collections import Counter

# Load and tokenize the corpus
corpus = gutenberg.raw()
tokens = word_tokenize(corpus.lower())

# Build vocabulary and mappings
vocab = Counter(tokens)
vocab_size = len(vocab)
word_to_idx = {word: idx for idx, (word, _) in enumerate(vocab.items())}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Convert tokens to indices
data = [word_to_idx[word] for word in tokens if word in word_to_idx]

def create_batches(data, batch_size, seq_length):
    num_batches = len(data) // (batch_size * seq_length)
    data = data[:num_batches * batch_size * seq_length]
    inputs = np.array(data).reshape((batch_size, -1))
    targets = np.copy(inputs)
    targets[:, :-1], targets[:, -1] = inputs[:, 1:], inputs[:, 0]
    return inputs, targets

batch_size = 32
seq_length = 50
inputs, targets = create_batches(data, batch_size, seq_length)

class miniLlama(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos):
        super(miniLlama, self).__init__()
        self.d_model = d_model
        self.pos = pos
        self.max_seq_length = max_seq_length
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward), num_encoder_layers)
        self.decoder = nn.TransformerDecoder(nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward), num_decoder_layers)
        self.fc = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, src, tgt):
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        src = src.permute(1, 0, 2)
        tgt = tgt.permute(1, 0, 2)
        src = self.encoder(src)
        tgt = self.decoder(tgt, src)
        tgt = tgt.permute(1, 0, 2)
        tgt = self.fc(tgt)
        return tgt
    
    def generate(self, src):
        src = self.embedding(src)
        src = src.permute(1, 0, 2)
        src = self.encoder(src)
        tgt = torch.zeros((1, src.shape[1], self.d_model)).to(src.device)
        for i in range(self.max_seq_length):
            tgt = self.decoder(tgt, src)
            tgt = tgt.permute(1, 0, 2)
            tgt = self.fc(tgt)
            tgt = tgt.permute(1, 0, 2)
            tgt = torch.argmax(tgt, dim=2)
            if torch.all(tgt[-1] == self.pos):
                break
        return tgt
    
    def generate_beam(self, src, beam_size):
        src = self.embedding(src)
        src = src.permute(1, 0, 2)
        src = self.encoder(src)
        tgt = torch.zeros((1, src.shape[1], self.d_model)).to(src.device)
        beam = torch.zeros((1, src.shape[1], self.d_model)).to(src.device)
        for i in range(self.max_seq_length):
            tgt = self.decoder(tgt, src)
            tgt = tgt.permute(1, 0, 2)
            tgt = self.fc(tgt)
            tgt = tgt.permute(1, 0, 2)
            tgt = torch.argmax(tgt, dim=2)
            if torch.all(tgt[-1] == self.pos):
                break
        return tgt
    
    def generate_greedy(self, src):
        src = self.embedding(src)
        src = src.permute(1, 0, 2)
        src = self.encoder(src)
        tgt = torch.zeros((1, src.shape[1], self.d_model)).to(src.device)
        for i in range(self.max_seq_length):
            tgt = self.decoder(tgt, src)
            tgt = tgt.permute(1, 0, 2)
            tgt = self.fc(tgt)
            tgt = tgt.permute(1, 0, 2)
            tgt = torch.argmax(tgt, dim=2)
            if torch.all(tgt[-1] == self.pos):
                break
        return tgt

# Training loop
def main():
    # Hyperparameters
    vocab_size = len(vocab)
    d_model = 128
    nhead = 4
    num_encoder_layers = 2
    num_decoder_layers = 2
    dim_feedforward = 512
    max_seq_length = seq_length
    pos = torch.tensor([vocab_size-1])
    learning_rate = 0.001
    num_epochs = 10
    batch_size = 32

    # Initialize model, loss function, and optimizer
    model = miniLlama(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        progress_bar = tqdm(range(0, inputs.shape[1] - seq_length, seq_length), desc=f'Epoch {epoch + 1}/{num_epochs}')
        for i in progress_bar:
            input_batch = torch.tensor(inputs[:, i:i+seq_length], dtype=torch.long)
            target_batch = torch.tensor(targets[:, i:i+seq_length], dtype=torch.long)
            
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_batch, target_batch)

            # Compute loss
            loss = criterion(outputs.view(-1, vocab_size), target_batch.view(-1))

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())
        
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / (inputs.shape[1] // seq_length)}')

    # Save the trained model
    torch.save(model.state_dict(), 'mini_llama_model.pth')

if __name__ == "__main__":
    main()


[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Epoch 1/10: 100%|██████████| 1585/1585 [15:40<00:00,  1.68it/s, loss=0.223]


Epoch 1/10, Loss: 0.6455237504522331


Epoch 2/10: 100%|██████████| 1585/1585 [15:04<00:00,  1.75it/s, loss=0.0783]


Epoch 2/10, Loss: 0.07858108892252323


Epoch 3/10: 100%|██████████| 1585/1585 [14:43<00:00,  1.79it/s, loss=0.00483]


Epoch 3/10, Loss: 0.008561585780004424


Epoch 4/10: 100%|██████████| 1585/1585 [15:23<00:00,  1.72it/s, loss=0.000132]


Epoch 4/10, Loss: 0.00010295823949548368


Epoch 5/10: 100%|██████████| 1585/1585 [15:00<00:00,  1.76it/s, loss=7.86e-5] 


Epoch 5/10, Loss: 6.329442117290767e-05


Epoch 6/10: 100%|██████████| 1585/1585 [16:25<00:00,  1.61it/s, loss=4.26e-5]


Epoch 6/10, Loss: 3.620434094949537e-05


Epoch 7/10: 100%|██████████| 1585/1585 [17:49<00:00,  1.48it/s, loss=2.05e-5] 


Epoch 7/10, Loss: 1.9007340314742674e-05


Epoch 8/10: 100%|██████████| 1585/1585 [15:39<00:00,  1.69it/s, loss=9.61e-6]


Epoch 8/10, Loss: 9.332292182625908e-06


Epoch 9/10: 100%|██████████| 1585/1585 [17:14<00:00,  1.53it/s, loss=4.33e-6]


Epoch 9/10, Loss: 4.396620280974653e-06


Epoch 10/10: 100%|██████████| 1585/1585 [18:23<00:00,  1.44it/s, loss=1.95e-6]


Epoch 10/10, Loss: 2.0408212613311005e-06


In [8]:
import torch

# Define the hyperparameters
d_model = 128
nhead = 2
num_encoder_layers = 2
num_decoder_layers = 2
dim_feedforward = 512
max_seq_length = 50  # same as seq_length
pos = torch.tensor([vocab_size-1])

# Load the saved model weights
model_path = 'mini_llama_model.pth'
model = miniLlama(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos)
model.load_state_dict(torch.load(model_path))

start_sentences = [
    "The",
    "God",
    "Bible",
    "Thou",
    "his",
    "she",
    "he",
    "In a hidden cave,",
    "In the ancient castle,",
    "On a distant planet,"
]

# Function to tokenize the start sentences
def tokenize_start_sentence(sentence, word_to_idx):
    tokens = word_tokenize(sentence.lower())
    return torch.tensor([word_to_idx[token] for token in tokens if token in word_to_idx]).unsqueeze(0)

# Adjusted generate method to handle dimensions properly
def generate(self, src):
    src = self.embedding(src)
    src = src.permute(1, 0, 2)
    src = self.encoder(src)
    tgt = torch.zeros((self.max_seq_length, src.shape[1], self.d_model)).to(src.device)
    for i in range(self.max_seq_length):
        tgt_input = tgt[:i+1]
        output = self.decoder(tgt_input, src)
        output = self.fc(output)
        output = torch.argmax(output, dim=2)
        tgt[i] = self.embedding(output[-1])
        if torch.all(output[-1] == self.pos):
            break
    return output

# Attach the new generate method to the model
miniLlama.generate = generate

# Generate text for each start sentence
model.eval()
for sentence in start_sentences:
    start_tokens = tokenize_start_sentence(sentence, word_to_idx)
    generated_tokens = model.generate(start_tokens)
    generated_sentence = ' '.join([idx_to_word[token.item()] for token in generated_tokens.squeeze()])
    print(f"Start Sentence: {sentence}\nGenerated Text: {generated_sentence}\n")


Start Sentence: The
Generated Text: whole-length whitened whitened whitened whitened whitened crotchets sihon sihon sihon elysee sihon frog frog frog dec. frog frog frog dec. frog remembereth frog dec. frog remembereth frog coax frog coax frog coax frog coax frog frog coax frog coax frog coax frog coax frog frog coax frog coax frog coax

Start Sentence: God
Generated Text: whole-length whitened whitened whitened whitened crotchets inns sihon frog dec. frog frog frog dec. frog frog dec. vigil frog coax frog frog coax frog frog coax frog coax frog frog coax frog coax frog frog coax frog coax frog frog coax frog coax frog coax frog frog coax frog fallow

Start Sentence: Bible
Generated Text: whole-length whitened whitened whitened sihon sketch sihon sihon elysee whole-length whitened whitened whitened whitened whitened sihon sihon sihon sihon sihon whitened sihon elysee whitened sihon sketch whole-length whitened whitened whitened sihon sihon sihon sihon sihon elysee whitened whitened sih

In [3]:
import subprocess
#pip install tqdm
subprocess.run(["pip", "install", "tqdm"])
import nltk
from tqdm import tqdm
nltk.download('punkt')
nltk.download('gutenberg')

from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
import torch

def tokenize_corpus(corpus):
    tokens = []
    for file_id in corpus.fileids():
        words = word_tokenize(corpus.raw(file_id).lower())
        tokens.extend(words)
    return tokens

tokens = tokenize_corpus(gutenberg)
vocab = list(set(tokens))
vocab_size = len(vocab)

word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

def tokens_to_tensor(tokens, word_to_idx):
    return torch.tensor([word_to_idx[token] for token in tokens if token in word_to_idx], dtype=torch.long)

token_tensor = tokens_to_tensor(tokens, word_to_idx)

hidden_dim = 768
num_heads = 12
num_layers = 12
max_length = 512
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LLaMA3(vocab_size, hidden_dim, num_heads, num_layers, max_length, device).to(device)

def create_batches(token_tensor, batch_size, seq_length):
    num_batches = token_tensor.size(0) // (batch_size * seq_length)
    data = token_tensor[:num_batches * batch_size * seq_length]
    data = data.view(batch_size, -1)
    for i in range(0, data.size(1) - seq_length, seq_length):
        src = data[:, i:i+seq_length]
        tgt = data[:, i+1:i+seq_length+1]
        yield src, tgt

batch_size = 32
seq_length = 128
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

num_epochs = 10

for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(create_batches(token_tensor, batch_size, seq_length), desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for src, tgt in progress_bar:
        src, tgt = src.to(device), tgt.to(device)
        logits = model(src, tgt[:, :-1])
        loss = criterion(logits.view(-1, vocab_size), tgt[:, 1:].contiguous().view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())
    print(f'Epoch {epoch+1}, Loss: {total_loss / (len(token_tensor) // (batch_size * seq_length))}')

print('Training complete.')

model.eval()
src = torch.randint(0, vocab_size, (1, 128), device=device)
tgt = torch.zeros((1, 128), dtype=torch.long, device=device)
tgt[0, 0] = word_to_idx['<start>']  # Use the appropriate start token for your dataset

for i in range(1, 128):
    logits = model(src, tgt[:, :i])
    next_token = torch.argmax(logits[0, i - 1, :]).item()
    tgt[0, i] = next_token

generated_text = ' '.join([idx_to_word[idx] for idx in tgt[0].tolist()])
print(generated_text)




[nltk_data] Downloading package punkt to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/krishpatel/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
                                                  

Epoch 1, Loss: 6.722799613133001


                                                  

Epoch 2, Loss: 6.495773627415227


Epoch 3/10: 123it [4:08:23, 57.43s/it, loss=6.47] 

In [11]:
# NEW CODE
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import nltk
import subprocess
subprocess.run(["pip", "install", "datasets"])
from datasets import load_dataset
from collections import Counter

nltk.download('punkt')

from nltk.tokenize import word_tokenize

# Load and tokenize the corpus from multiple sources
def load_and_tokenize_corpus():
    # Load Gutenberg corpus
    nltk.download('gutenberg')
    from nltk.corpus import gutenberg
    gutenberg_corpus = gutenberg.raw()

    # Load Wikipedia articles
    wiki_dataset = load_dataset('wikipedia', '20220301.en', split='train[:1%]')  # Use a subset for demonstration
    wiki_corpus = " ".join(wiki_dataset['text'])

    # Combine corpora
    combined_corpus = gutenberg_corpus + " " + wiki_corpus

    # Tokenize corpus
    tokens = word_tokenize(combined_corpus.lower())
    return tokens

tokens = load_and_tokenize_corpus()

# Build vocabulary and mappings
vocab = Counter(tokens)
vocab_size = len(vocab)
word_to_idx = {word: idx for idx, (word, _) in enumerate(vocab.items())}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Convert tokens to indices
data = [word_to_idx[word] for word in tokens if word in word_to_idx]

def create_batches(data, batch_size, seq_length):
    num_batches = len(data) // (batch_size * seq_length)
    data = data[:num_batches * batch_size * seq_length]
    inputs = np.array(data).reshape((batch_size, -1))
    targets = np.copy(inputs)
    targets[:, :-1], targets[:, -1] = inputs[:, 1:], inputs[:, 0]
    return inputs, targets

batch_size = 32
seq_length = 50
inputs, targets = create_batches(data, batch_size, seq_length)

class miniLlama(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos):
        super(miniLlama, self).__init__()
        self.d_model = d_model
        self.pos = pos
        self.max_seq_length = max_seq_length
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward), num_encoder_layers)
        self.decoder = nn.TransformerDecoder(nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward), num_decoder_layers)
        self.fc = nn.Linear(d_model, vocab_size)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, src, tgt):
        src = self.embedding(src)
        tgt = self.embedding(tgt)
        src = src.permute(1, 0, 2)
        tgt = tgt.permute(1, 0, 2)
        src = self.encoder(src)
        tgt = self.decoder(tgt, src)
        tgt = tgt.permute(1, 0, 2)
        tgt = self.fc(tgt)
        return tgt
    
    def generate(self, src):
        src = self.embedding(src)
        src = src.permute(1, 0, 2)
        src = self.encoder(src)
        tgt = torch.zeros((1, src.shape[1], self.d_model)).to(src.device)
        for i in range(self.max_seq_length):
            tgt = self.decoder(tgt, src)
            tgt = tgt.permute(1, 0, 2)
            tgt = self.fc(tgt)
            tgt = tgt.permute(1, 0, 2)
            tgt = torch.argmax(tgt, dim=2)
            if torch.all(tgt[-1] == self.pos):
                break
        return tgt

# Training loop
def main():
    # Hyperparameters
    vocab_size = len(vocab)
    d_model = 128
    nhead = 4
    num_encoder_layers = 2
    num_decoder_layers = 2
    dim_feedforward = 512
    max_seq_length = seq_length
    pos = torch.tensor([vocab_size-1])
    learning_rate = 0.001
    num_epochs = 5
    batch_size = 32

    # Initialize model, loss function, and optimizer
    model = miniLlama(vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        progress_bar = tqdm(range(0, inputs.shape[1] - seq_length, seq_length), desc=f'Epoch {epoch + 1}/{num_epochs}')
        for i in progress_bar:
            input_batch = torch.tensor(inputs[:, i:i+seq_length], dtype=torch.long)
            target_batch = torch.tensor(targets[:, i:i+seq_length], dtype=torch.long)
            
            optimizer.zero_grad()

            # Forward pass
            outputs = model(input_batch, target_batch)

            # Compute loss
            loss = criterion(outputs.view(-1, vocab_size), target_batch.view(-1))

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())
        
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / (inputs.shape[1] // seq_length)}')

    # Save the trained model
    torch.save(model.state_dict(), 'mini_llama_model_new.pth')

if __name__ == "__main__":
    main()



[33mDEPRECATION: Loading egg at /Users/krishpatel/anaconda3/lib/python3.11/site-packages/litellm-1.40.0-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..[0m[33m
[0m[33mDEPRECATION: Loading egg at /Users/krishpatel/anaconda3/lib/python3.11/site-packages/openai-0.27.7-py3.11.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..[0m[33m
[0m



ImportError: cannot import name 'AbstractAsyncStreamedFile' from 'fsspec.asyn' (/Users/krishpatel/anaconda3/lib/python3.11/site-packages/fsspec/asyn.py)