In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
import math

#################################################################
# 1. Download & Prepare Data
#################################################################

# Multi30k supplies parallel English-German sentence pairs in three splits:
# 'train', 'valid' and 'test'. All three are requested in a single call.
# NOTE(review): the recorded run of this notebook failed on the torchtext
# import with "No module named 'torchdata.datapipes'" (torchtext 0.18.0 with
# torchdata 0.10.1) -- presumably a torchtext/torchdata version mismatch;
# confirm and pin compatible versions before re-running.
train_iter, valid_iter, test_iter = Multi30k(
    split=('train', 'valid', 'test'),
    language_pair=('en', 'de'),
)

# Tokenizers, keyed by language code.
# NOTE(review): the German entry also uses the 'basic_english' tokenizer;
# confirm whether a German-aware tokenizer is wanted here.
token_transform = {
    'en': get_tokenizer('basic_english'),
    'de': get_tokenizer('basic_english'),
}

# Helper function to yield list of tokens
def yield_tokens(data_iter, language):
    """Lazily yield one token list per sample of ``data_iter``.

    Args:
        data_iter: iterable of 2-tuples ``(first_text, second_text)``.
        language: ``'en'`` tokenizes the first element of each pair with the
            English tokenizer; any other value tokenizes the second element
            with the German tokenizer.

    Yields:
        The list of tokens produced by the selected tokenizer for each pair.
    """
    wants_english = (language == 'en')
    for source_text, target_text in data_iter:
        # Tokenizers are looked up per sample from the module-level table.
        tokens = (token_transform['en'](source_text) if wants_english
                  else token_transform['de'](target_text))
        yield tokens

# Special tokens shared by both vocabularies. The same sequence is handed to
# both builders so the special tokens are registered identically for English
# and German.
SPECIAL_TOKENS = ("<unk>", "<pad>", "<bos>", "<eos>")

# Build English vocab (a fresh training iterator is created for each pass)
train_iter_for_vocab = Multi30k(split='train', language_pair=('en', 'de'))
vocab_en = build_vocab_from_iterator(yield_tokens(train_iter_for_vocab, 'en'),
                                     specials=list(SPECIAL_TOKENS))
# Out-of-vocabulary tokens map to <unk> instead of raising.
vocab_en.set_default_index(vocab_en["<unk>"])

# Build German vocab
train_iter_for_vocab = Multi30k(split='train', language_pair=('en', 'de'))
vocab_de = build_vocab_from_iterator(yield_tokens(train_iter_for_vocab, 'de'),
                                     specials=list(SPECIAL_TOKENS))
vocab_de.set_default_index(vocab_de["<unk>"])

# Vocabulary sizes
vocab_size_en = len(vocab_en)
vocab_size_de = len(vocab_de)

# Indices for special tokens (read from the English vocabulary)
PAD_IDX = vocab_en["<pad>"]
BOS_IDX = vocab_en["<bos>"]
EOS_IDX = vocab_en["<eos>"]

# The indices above are also applied to German sequences (padding, BOS/EOS
# framing, the loss's ignore_index), so both vocabularies must agree on them.
assert all(vocab_en[tok] == vocab_de[tok] for tok in SPECIAL_TOKENS), \
    "English and German vocabularies disagree on special-token indices"

#################################################################
# 2. Encoder-Decoder Model Definition
#################################################################

class EncoderRNN(nn.Module):
    """Embeds a batch of source token ids and encodes it with an RNN.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        emb_dim: embedding dimension.
        hid_dim: RNN hidden-state dimension.
        num_layers: number of stacked RNN layers.
        pad_idx: index of the padding token. Its embedding is kept fixed by
            ``nn.Embedding`` and, when not ``None``, padded positions are
            skipped by the RNN (see ``forward``).
    """

    def __init__(self, input_dim, emb_dim, hid_dim, num_layers=1, pad_idx=PAD_IDX):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(emb_dim, hid_dim, num_layers=num_layers, batch_first=True)
        self.hid_dim = hid_dim
        self.pad_idx = pad_idx

    def forward(self, src):
        """Return the final hidden state for each source sequence.

        Args:
            src: LongTensor ``[batch_size, src_len]``. Padding, if any, is
                assumed to be trailing (as produced by ``pad_sequence``).

        Returns:
            Tensor ``[num_layers, batch_size, hid_dim]`` in the same batch
            order as ``src``.
        """
        embedded = self.embedding(src)
        # embedded shape = [batch_size, src_len, emb_dim]

        if self.pad_idx is None:
            # No padding token configured: run over the full sequences.
            _, hidden = self.rnn(embedded)
            return hidden

        # Without packing, the RNN keeps stepping through the trailing pad
        # positions, so the "final" state of a short sentence would be the
        # state after the padding rather than after its last real token.
        # clamp(min=1) keeps an all-padding row from producing a zero length,
        # which pack_padded_sequence rejects; lengths must live on the CPU.
        lengths = (src != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )

        _, hidden = self.rnn(packed)
        # hidden = [num_layers, batch_size, hid_dim]

        return hidden  # Passed to the decoder as its initial hidden state

class DecoderRNN(nn.Module):
    """RNN decoder that maps input token ids and a hidden state to logits.

    Args:
        output_dim: size of the target vocabulary (embedding rows and number
            of output logits).
        emb_dim: embedding dimension.
        hid_dim: RNN hidden-state dimension.
        num_layers: number of stacked RNN layers.
        pad_idx: padding token index handed to ``nn.Embedding``.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, num_layers=1, pad_idx=PAD_IDX):
        super().__init__()
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(
            num_embeddings=output_dim, embedding_dim=emb_dim, padding_idx=pad_idx
        )
        self.rnn = nn.RNN(
            input_size=emb_dim, hidden_size=hid_dim,
            num_layers=num_layers, batch_first=True,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)

    def forward(self, trg, hidden):
        """Run the decoder on ``trg`` starting from ``hidden``.

        Args:
            trg: token ids, ``[batch_size, 1]`` as used by ``Seq2Seq``.
            hidden: previous hidden state, ``[num_layers, batch_size, hid_dim]``.

        Returns:
            ``(prediction, hidden)``: vocabulary logits
            ``[batch_size, output_dim]`` and the updated hidden state.
        """
        # [batch_size, 1] -> [batch_size, 1, emb_dim] -> [batch_size, 1, hid_dim]
        step_states, next_hidden = self.rnn(self.embedding(trg), hidden)

        # Drop the length-1 time axis before projecting onto the vocabulary.
        logits = self.fc_out(step_states.squeeze(1))
        return logits, next_hidden

class Seq2Seq(nn.Module):
    """Couples an encoder and a step-wise decoder into one model.

    Args:
        encoder: module called as ``encoder(src)`` returning the initial
            decoder hidden state.
        decoder: module called as ``decoder(tokens, hidden)`` returning
            ``(logits, hidden)``; must expose ``fc_out.out_features``.
        device: device on which the output tensor is allocated.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg`` step by step, conditioned on ``src``.

        Args:
            src: ``[batch_size, src_len]`` source token ids.
            trg: ``[batch_size, trg_len]`` target token ids; column 0 is fed
                as the first decoder input.
            teacher_forcing_ratio: probability, drawn once per time step for
                the whole batch, of feeding the ground-truth token instead of
                the decoder's own argmax prediction.

        Returns:
            ``[batch_size, trg_len, vocab]`` logits; position 0 stays zero
            because decoding starts at step 1.
        """
        n_sentences, n_steps = src.shape[0], trg.shape[1]
        vocab_dim = self.decoder.fc_out.out_features

        # One slot of logits per target position.
        step_logits_all = torch.zeros(
            (n_sentences, n_steps, vocab_dim), device=self.device
        )

        # The encoder's hidden state seeds the decoder.
        state = self.encoder(src)

        # Decoding starts from the first target column (the <BOS> token).
        next_token = trg[:, 0].unsqueeze(1)  # [batch_size, 1]

        for step in range(1, n_steps):
            step_logits, state = self.decoder(next_token, state)
            step_logits_all[:, step, :] = step_logits

            # One coin flip per step decides the next input for every row.
            use_gold = torch.rand(1).item() < teacher_forcing_ratio
            greedy = step_logits.argmax(1)  # [batch_size]

            if use_gold:
                next_token = trg[:, step].unsqueeze(1)
            else:
                next_token = greedy.unsqueeze(1)

        return step_logits_all

#################################################################
# 3. Create DataLoader with Batching
#################################################################

# We need a collate_fn to numerically encode tokens and create batches
def collate_fn(batch):
    """Turn a list of raw ``(source_text, target_text)`` pairs into tensors.

    Each sentence is tokenized, mapped to vocabulary indices, wrapped in
    BOS/EOS, and the batch is right-padded with ``PAD_IDX`` to a common
    length.

    Args:
        batch: iterable of ``(src_sample, tgt_sample)`` string pairs.

    Returns:
        ``(src_batch, tgt_batch)``: LongTensors of shape
        ``[batch_size, max_src_len]`` and ``[batch_size, max_tgt_len]``.
    """
    def encode(text, language, vocabulary):
        # tokens -> indices, framed by the begin/end-of-sequence markers
        ids = [BOS_IDX] + vocabulary(token_transform[language](text)) + [EOS_IDX]
        return torch.tensor(ids, dtype=torch.long)

    encoded_pairs = [
        (encode(src_sample, 'en', vocab_en), encode(tgt_sample, 'de', vocab_de))
        for src_sample, tgt_sample in batch
    ]
    src_list = [src_ids for src_ids, _ in encoded_pairs]
    tgt_list = [tgt_ids for _, tgt_ids in encoded_pairs]

    # Pad every sequence up to the longest one in its batch.
    pad = nn.utils.rnn.pad_sequence
    src_batch = pad(src_list, batch_first=True, padding_value=PAD_IDX)
    tgt_batch = pad(tgt_list, batch_first=True, padding_value=PAD_IDX)

    return src_batch, tgt_batch

BATCH_SIZE = 32

# Settings common to all three loaders.
loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Each split is materialised into a list so the loader has a length (used to
# average the epoch loss); only the training split is shuffled.
train_dataloader = DataLoader(list(train_iter), shuffle=True, **loader_kwargs)
valid_dataloader = DataLoader(list(valid_iter), shuffle=False, **loader_kwargs)
test_dataloader = DataLoader(list(test_iter), shuffle=False, **loader_kwargs)

#################################################################
# 4. Instantiate Model & Training Setup
#################################################################

# Seed PyTorch's global RNG before anything random happens, so that a fresh
# "Restart & Run All" reproduces the same weight initialisation and the same
# teacher-forcing draws (torch.rand in Seq2Seq.forward).
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_DIM = vocab_size_en     # English vocabulary size
OUTPUT_DIM = vocab_size_de    # German vocabulary size
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512                 # shared by encoder and decoder: the encoder's
                              # hidden state is fed directly to the decoder
NUM_LAYERS = 1

encoder = EncoderRNN(INPUT_DIM, ENC_EMB_DIM, HID_DIM, NUM_LAYERS).to(device)
decoder = DecoderRNN(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, NUM_LAYERS).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Padding positions in the target contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

#################################################################
# 5. Training & Validation Loops
#################################################################

def train(model, dataloader, optimizer, criterion, clip=1):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: called as ``model(src, trg, teacher_forcing_ratio=...)`` and
            returning ``[batch_size, trg_len, output_dim]`` logits.
        dataloader: yields ``(src, trg)`` index tensors.
        optimizer: optimizer stepping ``model``'s parameters.
        criterion: loss taking ``(flat_logits, flat_targets)``.
        clip: maximum gradient norm applied before each optimizer step.

    Returns:
        The mean of the per-batch losses over the epoch.
    """
    model.train()
    running_loss = 0

    for src_batch, trg_batch in dataloader:
        src_batch, trg_batch = src_batch.to(device), trg_batch.to(device)

        optimizer.zero_grad()

        logits = model(src_batch, trg_batch, teacher_forcing_ratio=0.5)
        vocab_dim = logits.shape[-1]

        # Position 0 holds no prediction (decoding starts at step 1), so both
        # logits and targets drop their first column before being flattened.
        flat_logits = logits[:, 1:, :].reshape(-1, vocab_dim)
        flat_targets = trg_batch[:, 1:].reshape(-1)

        batch_loss = criterion(flat_logits, flat_targets)
        batch_loss.backward()

        # Cap the gradient norm to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, criterion):
    """Compute the mean per-batch loss of ``model`` without updating it.

    Args:
        model: called as ``model(src, trg, teacher_forcing_ratio=0)``.
        dataloader: yields ``(src, trg)`` index tensors.
        criterion: loss taking ``(flat_logits, flat_targets)``.

    Returns:
        The mean of the per-batch losses over ``dataloader``.
    """
    model.eval()
    running_loss = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for src_batch, trg_batch in dataloader:
            src_batch, trg_batch = src_batch.to(device), trg_batch.to(device)

            # Ratio 0: the decoder always consumes its own predictions.
            logits = model(src_batch, trg_batch, teacher_forcing_ratio=0)
            vocab_dim = logits.shape[-1]

            # Skip position 0, which holds no prediction, then flatten.
            flat_logits = logits[:, 1:, :].reshape(-1, vocab_dim)
            flat_targets = trg_batch[:, 1:].reshape(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(dataloader)

N_EPOCHS = 3

# Each epoch: one training pass, then one validation pass, then a one-line
# report. Perplexity is the exponential of the validation loss.
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion)
    valid_loss = evaluate(model, valid_dataloader, criterion)

    epoch_report = f"Epoch [{epoch+1}/{N_EPOCHS}] | Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f} | PPL: {math.exp(valid_loss):.2f}"
    print(epoch_report)

#################################################################
# 6. Testing (Optional)
#################################################################

# Final check on the held-out test split, reported like the validation loss.
test_loss = evaluate(model, test_dataloader, criterion)
test_report = f"Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):.2f}"
print(test_report)

ModuleNotFoundError: No module named 'torchdata.datapipes'

In [21]:
from torchtext.datasets import Multi30k

In [22]:
# Construct the dataset with its default arguments -- presumably a smoke test
# to see whether torchtext's Multi30k can be built in this environment.
Multi30k()

ImportError: cannot import name 'deprecation_warning' from 'torchdata' (c:\Users\admin\anaconda3\Lib\site-packages\torchdata\__init__.py)

In [23]:
import torch
import torchtext
import torchdata

# Print the installed version of each library involved in the dataset import,
# one "<label> <version>" line per package.
for label, module in (
    ("PyTorch version:", torch),
    ("TorchText version:", torchtext),
    ("TorchData version:", torchdata),
):
    print(label, module.__version__)

PyTorch version: 2.3.0+cpu
TorchText version: 0.18.0+cpu
TorchData version: 0.10.1
