In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import os

# Add the previous directory to the Python path
previous_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if previous_directory not in sys.path:
    sys.path.append(previous_directory)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from model.model import NeuralProbabilisticLanguageModel
from data_loader.data_loaders import NGramDataLoader
from data_loader.data_loaders import collate_fn

In [4]:
data_dir = "../_data/raw/ptwiki-latest-pages-articles"  # Update this path as needed
batch_size = 512
embedding_dim = 100
hidden_dim = 128
learning_rate = 0.001
num_epochs = 10
n = 3  # Trigram
load_fraction = 0.01
random_load = True
vocab_size = 10000  # Must match the NGramDataset's vocab_size
num_workers = 4  # Adjust based on your system

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [5]:
# Initialize DataLoader

dataloader = NGramDataLoader(
    data_dir=data_dir,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    n=n,
    load_fraction=load_fraction,
    random_load=random_load,
    vocab_size=vocab_size
)

# Access the dataset and vocabulary
dataset = dataloader.dataset

# Check if the dataset is empty
if len(dataset) == 0:
    raise ValueError("The dataset is empty. Please check the data directory and loading parameters.")

word_to_index = dataset.get_word_to_index()
actual_vocab_size = len(word_to_index)
print(f"Vocabulary Size: {actual_vocab_size}")

Vocabulary Size: 10000


In [6]:
dataset.get_word_index("pai")

294

In [7]:
word_to_index

{'de': 0,
 'a': 1,
 'e': 2,
 'o': 3,
 'em': 4,
 'do': 5,
 'da': 6,
 'que': 7,
 'um': 8,
 'uma': 9,
 'no': 10,
 'com': 11,
 'para': 12,
 'na': 13,
 'é': 14,
 'foi': 15,
 'por': 16,
 'os': 17,
 'como': 18,
 'as': 19,
 'dos': 20,
 '[BEG]': 21,
 'se': 22,
 'ao': 23,
 'mais': 24,
 'sua': 25,
 'seu': 26,
 'das': 27,
 'não': 28,
 'são': 29,
 'ou': 30,
 'à': 31,
 'também': 32,
 'pela': 33,
 'pelo': 34,
 'ele': 35,
 'entre': 36,
 'ser': 37,
 'era': 38,
 'mas': 39,
 'nos': 40,
 'anos': 41,
 '[END]': 42,
 'foram': 43,
 'durante': 44,
 'asteroide': 45,
 'dia': 46,
 'até': 47,
 'nas': 48,
 'sobre': 49,
 'seus': 50,
 'quando': 51,
 'onde': 52,
 'depois': 53,
 'cidade': 54,
 'após': 55,
 'este': 56,
 'primeira': 57,
 'sendo': 58,
 'grande': 59,
 'ano': 60,
 'parte': 61,
 'tem': 62,
 'dois': 63,
 'primeiro': 64,
 'mesmo': 65,
 'suas': 66,
 'aos': 67,
 'ela': 68,
 'ainda': 69,
 'principal': 70,
 'brasil': 71,
 'história': 72,
 '2000': 73,
 'the': 74,
 'rio': 75,
 'outros': 76,
 'três': 77,
 'possui': 7

In [8]:
# Initialize the model
model = NeuralProbabilisticLanguageModel(
    vocab_size=actual_vocab_size,
    embedding_dim=embedding_dim,
    context_size=n,  # For trigram, context_size=2
    hidden_dim=hidden_dim
).to(device)
print(model)

NeuralProbabilisticLanguageModel(
  (embeddings): Embedding(10000, 100)
  (hidden): Linear(in_features=300, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=10000, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [9]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# (Optional) Learning rate scheduler
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)


In [10]:
def collate_wrapper(batch):
    ngrams, targets = batch
    return collate_fn((ngrams, targets), word_to_index, model.embeddings)



In [11]:
def evaluate_model(model, dataloader, word_to_index, context_size, device):
    model.eval()  # Set model to evaluation mode
    total_loss = 0.0
    criterion = nn.CrossEntropyLoss()
    
    with torch.no_grad():
        for batch in dataloader:
            ngrams, targets = batch
            context_tensor, target_tensor = collate_wrapper((ngrams, targets))
            context_tensor = context_tensor.to(device)
            target_tensor = target_tensor.to(device)

            outputs = model(context_tensor)
            loss = criterion(outputs, target_tensor)
            total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss


In [12]:
def subtract_and_add(word1, word2, word3):
    index1 = word_to_index.get(word1)
    index2 = word_to_index.get(word2)
    index3 = word_to_index.get(word3)
    
    if index1 is None or index2 is None or index3 is None:
        print(f"One or more words '{word1}', '{word2}', or '{word3}' are not in the vocabulary.")
        return
    
    # Get embeddings for the words
    embedding1 = model.embeddings(torch.tensor([index1]).to(device)).squeeze(0)
    embedding2 = model.embeddings(torch.tensor([index2]).to(device)).squeeze(0)
    embedding3 = model.embeddings(torch.tensor([index3]).to(device)).squeeze(0)
    
    # Perform the operation: word1 - word2 + word3
    result_vector = embedding1 - embedding2 + embedding3
    
    # List to store similarity scores for each word
    similarity_list = []
    
    for word, idx in word_to_index.items():
        word_embedding = model.embeddings(torch.tensor([idx]).to(device)).squeeze(0)
        similarity = F.cosine_similarity(result_vector, word_embedding, dim=0).item()
        similarity_list.append((word, similarity))
    
    # Sort the list by similarity in descending order
    similarity_list.sort(key=lambda x: x[1], reverse=True)
    
    # Print the top 5 most similar words
    print(f"The 5 most similar words to the result of '{word1} - {word2} + {word3}' are:")
    for word, similarity in similarity_list[:5]:
        print(f"'{word}' with similarity {similarity:.4f}")

In [None]:
import time
import torch
import torch.nn.functional as F

# Define the function to print the top 5 most similar words for "cachorro"
def print_top_similar_rei():
    word = "rei"
    index = word_to_index.get(word)
    if index is None:
        print(f"The word '{word}' is not in the vocabulary.")
        return
    
    # Get the embedding for "cachorro"
    embedding = model.embeddings(torch.tensor([index]).to(device)).squeeze(0)
    
    # List to store similarity scores for each word
    similarity_list = []
    
    # Compute cosine similarity between "cachorro" and each word's embedding
    for other_word, idx in word_to_index.items():
        other_embedding = model.embeddings(torch.tensor([idx]).to(device)).squeeze(0)
        similarity = F.cosine_similarity(embedding, other_embedding, dim=0).item()
        similarity_list.append((other_word, similarity))
    
    # Sort the list by similarity in descending order
    similarity_list.sort(key=lambda x: x[1], reverse=True)
    
    # Print the top 5 most similar words
    print(f"\nThe 5 most similar words to '{word}' are:")
    for other_word, similarity in similarity_list[:5]:
        print(f"'{other_word}' with similarity {similarity:.4f}")

# Training pipeline starts here
model.train()
best_val_loss = float('inf')

for epoch in range(1, num_epochs + 1):
    epoch_start_time = time.time()
    running_loss = 0.0
    epoch_loss = 0.0
    batch_count = len(dataloader)

    print(f"\n--- Starting Epoch {epoch}/{num_epochs} ---")
    print(f"Total Batches in Epoch {epoch}: {batch_count}")

    for batch_idx, batch in enumerate(dataloader):
        batch_start_time = time.time()
        ngrams, targets = batch

        context_tensor, target_tensor = collate_wrapper((ngrams, targets))
        context_tensor = context_tensor.long().to(device)
        target_tensor = target_tensor.to(device)
        
        # Forward pass
        outputs = model(context_tensor)
        loss = criterion(outputs, target_tensor)

        optimizer.zero_grad()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()

        epoch_loss += loss.item()
        running_loss += loss.item()
        
        batch_end_time = time.time()
        batch_duration = batch_end_time - batch_start_time
        
        # Print intermediate stats every 500 batches
        if (batch_idx + 1) % 500 == 0:
            avg_interval_loss = running_loss / 500
            print(f"Epoch [{epoch}/{num_epochs}], Batch [{batch_idx+1}/{batch_count}], "
                  f"Avg Interval Loss: {avg_interval_loss:.4f}, "
                  f"Batch Time: {batch_duration:.2f}s")
            running_loss = 0.0

    avg_train_loss = epoch_loss / batch_count if batch_count > 0 else 0
    epoch_end_time = time.time()
    epoch_duration = epoch_end_time - epoch_start_time

    print(f"--- Epoch {epoch} Completed ---")
    print(f"Average Training Loss for Epoch {epoch}: {avg_train_loss:.4f}")
    print(f"Epoch Duration: {epoch_duration:.2f}s")

    # Evaluate
    model.eval()
    val_start_time = time.time()
    val_loss = evaluate_model(model, dataloader, word_to_index, context_size=n-1, device=device)
    val_end_time = time.time()
    val_duration = val_end_time - val_start_time
    model.train()

    print(f"Validation Loss after Epoch {epoch}: {val_loss:.4f}")
    print(f"Validation Duration: {val_duration:.2f}s")

    # Checkpoint
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_neural_probabilistic_language_model.pth")
        print(f"Best model updated and saved at Epoch {epoch} with Validation Loss: {val_loss:.4f}")

    # Cosine similarity calculation for 'rei' and 'mae'
    rei_index = word_to_index.get("rei")
    rainha_index = word_to_index.get("rainha")
    if rei_index is not None and rainha_index is not None:
        rei_embedding = model.embeddings(torch.tensor([rei_index]).to(device)).squeeze(0)
        rainha_embedding = model.embeddings(torch.tensor([rainha_index]).to(device)).squeeze(0)
        cosine_similarity = F.cosine_similarity(rei_embedding, rainha_embedding, dim=0)
        print(f"Cosine Similarity between 'rei' and 'rainha' after Epoch {epoch}: {cosine_similarity.item():.4f}")
    else:
        print("One or both words 'rei' and 'rainha' are not in the vocabulary.")

    # Print the top 5 most similar words for "rei"
    print_top_similar_rei()



--- Starting Epoch 1/10 ---
Total Batches in Epoch 1: 6697
Epoch [1/10], Batch [500/6697], Avg Interval Loss: 6.4542, Batch Time: 0.01s
Epoch [1/10], Batch [1000/6697], Avg Interval Loss: 5.8249, Batch Time: 0.01s
Epoch [1/10], Batch [1500/6697], Avg Interval Loss: 5.6239, Batch Time: 0.01s
Epoch [1/10], Batch [2000/6697], Avg Interval Loss: 5.5011, Batch Time: 0.01s
Epoch [1/10], Batch [2500/6697], Avg Interval Loss: 5.4305, Batch Time: 0.01s
Epoch [1/10], Batch [3000/6697], Avg Interval Loss: 5.3624, Batch Time: 0.01s
Epoch [1/10], Batch [3500/6697], Avg Interval Loss: 5.3108, Batch Time: 0.01s
Epoch [1/10], Batch [4000/6697], Avg Interval Loss: 5.2827, Batch Time: 0.01s
Epoch [1/10], Batch [4500/6697], Avg Interval Loss: 5.2232, Batch Time: 0.01s
Epoch [1/10], Batch [5000/6697], Avg Interval Loss: 5.2065, Batch Time: 0.01s
Epoch [1/10], Batch [5500/6697], Avg Interval Loss: 5.1910, Batch Time: 0.01s
Epoch [1/10], Batch [6000/6697], Avg Interval Loss: 5.1608, Batch Time: 0.01s
Epoch

In [None]:
# avião e carro
# pai e mae