In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import spacy
import nltk

# Fetch the NLTK resources used later in the notebook
for resource in ('wordnet', 'punkt'):
    nltk.download(resource)

KeyboardInterrupt: 

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK tokenizer and WordNet data are available locally
for resource in ('punkt', 'wordnet'):
    nltk.download(resource)

# Single WordNet lemmatizer instance shared by the helpers below
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(sentences, pos='n'):
    """Tokenize each sentence with NLTK and lemmatize every token.

    Args:
        sentences: Iterable of sentence strings.
        pos: WordNet part-of-speech tag forwarded to
            ``lemmatizer.lemmatize``. The default ``'n'`` (noun) matches the
            lemmatizer's own default, so existing callers see no change.
            With that default, verb forms are not reduced properly: the
            example output in this notebook shows 'was' becoming 'wa' and
            'running' left untouched. Pass ``pos='v'`` to lemmatize tokens
            as verbs instead.

    Returns:
        A list with one list of lemmatized tokens per input sentence.
    """
    lemmatized_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        lemmatized_sentences.append(
            [lemmatizer.lemmatize(word, pos=pos) for word in tokens]
        )
    return lemmatized_sentences

# Quick demonstration on two sample sentences
sentences = [
    "I am running in the park.",
    "She was eating an apple.",
]
tokenized_and_lemmatized_sentences = tokenize_and_lemmatize(sentences)
print(tokenized_and_lemmatized_sentences)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leatu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\leatu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[['I', 'am', 'running', 'in', 'the', 'park', '.'], ['She', 'wa', 'eating', 'an', 'apple', '.']]


In [None]:
# Paths to the parallel corpora: *.en is the source side, *.vi the target side
TRAIN_SRC = './en_vi/data/train.en'
TRAIN_TGT = './en_vi/data/train.vi'

VALID_SRC = './en_vi/data/tst2012.en'
VALID_TGT = './en_vi/data/tst2012.vi'

TEST_SRC = './en_vi/data/tst2013.en'
# Previously this line re-assigned TEST_SRC, which dropped the English test
# path and left TEST_TGT undefined.
TEST_TGT = './en_vi/data/tst2013.vi'

# Default hyperparameters (several are overridden again in the training cell)
MAX_SEQ_LEN = 160
BATCH_SIZE = 1500
D_MODEL = 512
N_LAYERS = 6
N_HEADS = 8
DROPOUT = 0.1
LEARNING_RATE = 0.0001
EPOCHS = 30
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Display the torch.device chosen in the configuration cell
DEVICE

device(type='cuda')

In [None]:
def read_sentences(file_path):
    """Read a UTF-8 text file and return its lines as stripped strings.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one entry per line, with leading and trailing whitespace
        (including the newline) removed. Blank lines are kept as ''.
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

def tokenize_sentences_nltk(sentences):
    """Split every sentence into tokens with NLTK's ``word_tokenize``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list containing one token list per input sentence.
    """
    return list(map(word_tokenize, sentences))

def tokenize_en(sentences):
    """Tokenize English sentences with the spaCy 'en_core_web_sm' pipeline.

    The pipeline is loaded on every call.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with, for each sentence, the list of token texts.
    """
    nlp = spacy.load('en_core_web_sm')
    tokenized = []
    for sentence in sentences:
        doc = nlp(sentence)
        tokenized.append([token.text for token in doc])
    return tokenized

def tokenize_vi(sentences):
    """Tokenize Vietnamese sentences with a spaCy pipeline.

    NOTE(review): this loads the English 'en_core_web_sm' pipeline, exactly
    like ``tokenize_en`` -- confirm that this is intended for Vietnamese text.
    The pipeline is loaded on every call.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with, for each sentence, the list of token texts.
    """
    pipeline = spacy.load('en_core_web_sm')

    def split(sentence):
        return [token.text for token in pipeline(sentence)]

    return [split(sentence) for sentence in sentences]

def build_vocab(tokenized_sentences):
    """Build a token -> index mapping from tokenized sentences.

    Indices 0-3 are reserved for '<pad>', '<sos>', '<eos>' and '<unk>'.
    Every other token receives the next free index in order of first
    appearance.

    Args:
        tokenized_sentences: Iterable of token lists.

    Returns:
        A dict mapping each token to its integer index.
    """
    vocab = dict(zip(('<pad>', '<sos>', '<eos>', '<unk>'), range(4)))
    all_tokens = (token for sentence in tokenized_sentences for token in sentence)
    for token in all_tokens:
        # setdefault leaves tokens that are already known untouched
        vocab.setdefault(token, len(vocab))
    return vocab

def tokens_to_indices(tokenized_sentences, vocab):
    """Map every token to its vocabulary index.

    Tokens missing from ``vocab`` are mapped to the index of '<unk>'.

    Args:
        tokenized_sentences: Iterable of token lists.
        vocab: Dict mapping tokens to integer indices.

    Returns:
        A list of index lists, parallel to ``tokenized_sentences``.
    """
    def lookup(token):
        # '<unk>' is only looked up when a token is actually unknown
        if token in vocab:
            return vocab[token]
        return vocab['<unk>']

    return [list(map(lookup, sentence)) for sentence in tokenized_sentences]

def pad_sequences(sequences, max_length, pad_value=0):
    """Truncate or right-pad every sequence to exactly ``max_length`` items.

    Args:
        sequences: Iterable of sliceable sequences (lists, tuples, ...).
        max_length: Target length of every returned sequence.
        pad_value: Value appended to sequences shorter than ``max_length``.
            Defaults to 0, the index of '<pad>' in the vocabularies built by
            ``build_vocab``.

    Returns:
        A list of new lists. The input sequences are not modified.
    """
    padded = []
    for sequence in sequences:
        # list(...) lets tuples and other sliceable sequences be padded too
        clipped = list(sequence[:max_length])
        clipped.extend([pad_value] * (max_length - len(clipped)))
        padded.append(clipped)
    return padded

In [None]:
def create_data_tensor(src_path, tgt_path, source_vocab=None, target_vocab=None, max_length=None):
    """Load a parallel corpus and turn it into padded index tensors.

    Args:
        src_path: Path of the source-language file, one sentence per line.
        tgt_path: Path of the target-language file, line-aligned with
            ``src_path``.
        source_vocab: Optional existing token -> index mapping for the source
            side. When None (the default) a new vocabulary is built from the
            file. Pass the training vocabulary here when encoding validation
            or test data so that indices stay consistent.
        target_vocab: Same as ``source_vocab`` for the target side.
        max_length: Optional upper bound on the padded sequence length.
            Longer sentences are truncated. None (the default) pads to the
            longest sentence.

    Returns:
        A tuple ``(source_tensor, target_tensor, source_vocab, target_vocab,
        max_source_length, max_target_length)``. The tensors are ``torch.long``
        and are moved to ``DEVICE``. The two lengths are the padded widths
        that were actually used.

    Raises:
        ValueError: If the two files do not contain the same number of lines.
    """
    # Read source and target sentences from files
    source_sentences = read_sentences(src_path)
    target_sentences = read_sentences(tgt_path)
    if len(source_sentences) != len(target_sentences):
        raise ValueError(
            'Parallel files are not aligned: {} has {} lines but {} has {}'.format(
                src_path, len(source_sentences), tgt_path, len(target_sentences)
            )
        )

    # Tokenize both sides
    source_tokenized = tokenize_sentences_nltk(source_sentences)
    target_tokenized = tokenize_sentences_nltk(target_sentences)

    # Build vocabularies unless the caller supplied existing ones
    if source_vocab is None:
        source_vocab = build_vocab(source_tokenized)
    if target_vocab is None:
        target_vocab = build_vocab(target_tokenized)

    # Convert tokens to indices
    source_indices = tokens_to_indices(source_tokenized, source_vocab)
    target_indices = tokens_to_indices(target_tokenized, target_vocab)

    # default=0 keeps an empty corpus from raising inside max()
    max_source_length = max((len(sentence) for sentence in source_indices), default=0)
    max_target_length = max((len(sentence) for sentence in target_indices), default=0)
    if max_length is not None:
        max_source_length = min(max_source_length, max_length)
        max_target_length = min(max_target_length, max_length)

    # Pad (and, when capped, truncate) to a rectangular shape
    padded_source_indices = pad_sequences(source_indices, max_source_length)
    padded_target_indices = pad_sequences(target_indices, max_target_length)

    # Convert data to PyTorch tensors
    source_tensor = torch.tensor(padded_source_indices, dtype=torch.long).to(DEVICE)
    target_tensor = torch.tensor(padded_target_indices, dtype=torch.long).to(DEVICE)
    return source_tensor, target_tensor, source_vocab, target_vocab, max_source_length, max_target_length

In [None]:
class TranslationDataset(Dataset):
    """Pairs up source and target tensors row by row for use with a DataLoader."""

    def __init__(self, source_tensor, target_tensor):
        """Store the two tensors; nothing is copied or validated here."""
        self.source_tensor, self.target_tensor = source_tensor, target_tensor

    def __len__(self):
        """Return the number of examples, taken from the source tensor."""
        return len(self.source_tensor)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at position ``idx``."""
        source_item = self.source_tensor[idx]
        target_item = self.target_tensor[idx]
        return source_item, target_item

In [None]:
# Encode the training corpus and build its vocabularies
(
    train_src_tensor,
    train_tgt_tensor,
    train_source_vocab,
    train_target_vocab,
    train_max_source_length,
    train_max_target_length,
) = create_data_tensor(TRAIN_SRC, TRAIN_TGT)

# Serve the encoded pairs in shuffled mini-batches
train_dataset = TranslationDataset(train_src_tensor, train_tgt_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Token count of the longest training source sentence; this is also the padded width of train_src_tensor
train_max_source_length

802

In [None]:
# Token count of the longest training target sentence; this is also the padded width of train_tgt_tensor
train_max_target_length

892

In [None]:
from models.transformer_model import TransformerModel

# NOTE: this cell needs train_source_vocab, train_target_vocab, train_src_tensor
# and train_tgt_tensor from the data-loading cell; run that cell first.
SRC_VOCAB_SIZE = len(train_source_vocab)
TGT_VOCAB_SIZE = len(train_target_vocab)
MAX_SEQ_LEN = 500
BATCH_SIZE = 16
D_FF = 2048
D_MODEL = 512
N_LAYERS = 2
N_HEADS = 4
DROPOUT = 0.1
LEARNING_RATE = 0.0001
EPOCHS = 30
# pad_sequences pads with 0, which build_vocab assigns to '<pad>'
PAD_IDX = train_target_vocab['<pad>']

# The training data is padded to the longest sentence, which is longer than
# MAX_SEQ_LEN. Clip it so no batch exceeds the max_seq_length the model is
# built with (presumably the limit of its positional encoding -- TODO confirm
# against TransformerModel). The data stays on the CPU; batches are moved to
# DEVICE one at a time.
train_dataset = TranslationDataset(
    train_src_tensor[:, :MAX_SEQ_LEN].to('cpu'),
    train_tgt_tensor[:, :MAX_SEQ_LEN].to('cpu'),
)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Instantiate your transformer model
model = TransformerModel(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    d_ff=D_FF,
    max_seq_length=MAX_SEQ_LEN,
    d_model=D_MODEL,
    num_layers=N_LAYERS,
    num_heads=N_HEADS,
    dropout=DROPOUT,
)
# Train on the configured device (GPU when available) instead of forcing CPU
model.to(DEVICE)

# Define loss function and optimizer; padding positions do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0
    for src_batch, trg_batch in train_loader:
        src_batch = src_batch.to(DEVICE)
        trg_batch = trg_batch.to(DEVICE)

        # Teacher forcing: the decoder sees tokens [0, T-1) and is scored on
        # tokens [1, T), so it never receives the token it has to predict.
        # Assumes model(src, tgt) returns (batch, tgt_len, vocab) logits -- TODO confirm.
        decoder_input = trg_batch[:, :-1]
        labels = trg_batch[:, 1:]

        optimizer.zero_grad()
        output = model(src_batch, decoder_input)

        # Flatten for the loss; reshape also handles the non-contiguous slices
        output_flat = output.reshape(-1, output.size(-1))
        labels_flat = labels.reshape(-1)

        loss = criterion(output_flat, labels_flat)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean over all batches rather than only the last batch's loss
    mean_loss = epoch_loss / max(len(train_loader), 1)
    print('Epoch {}: Loss = {:.4f}'.format(epoch+1, mean_loss))

print('Training finished')

NameError: name 'train_source_vocab' is not defined