In [1]:
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

In [2]:
# Step 1: Read the data and create vocabulary
with open('words.txt', 'r') as f:
    words = [line.strip() for line in f]

with open('words-reversed.txt', 'r') as f:
    reversed_words = [line.strip() for line in f]

In [3]:
# Create vocabulary
vocab = ['<PAD>', '<UNK>', '<SOS>', '<EOS>'] + list(set(''.join(words)))
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

In [4]:
# Step 2: Create a custom Dataset
class WordDataset(Dataset):
    def __init__(self, words, reversed_words, char_to_idx):
        self.words = words
        self.reversed_words = reversed_words
        self.char_to_idx = char_to_idx

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        word = ['<SOS>'] + list(self.words[idx]) + ['<EOS>']
        reversed_word = ['<SOS>'] + list(self.reversed_words[idx]) + ['<EOS>']
        
        return (torch.tensor([self.char_to_idx[c] for c in word]),
                torch.tensor([self.char_to_idx[c] for c in reversed_word]))

In [5]:
# Step 3: Create DataLoader
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=char_to_idx['<PAD>'], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=char_to_idx['<PAD>'], batch_first=True)
    return src_batch, tgt_batch

In [6]:
dataset = WordDataset(words, reversed_words, char_to_idx)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [7]:
# Step 4: Define the model architecture
class Seq2SeqModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.encoder = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, src, tgt):
        embedded_src = self.embedding(src)
        _, (hidden, cell) = self.encoder(embedded_src)

        embedded_tgt = self.embedding(tgt[:, :-1])  # exclude last token
        output, _ = self.decoder(embedded_tgt, (hidden, cell))
        return self.fc(output)

In [8]:
# Initialize the model
vocab_size = len(vocab)
embedding_dim = 128
hidden_dim = 256
num_layers = 2

model = Seq2SeqModel(vocab_size, embedding_dim, hidden_dim, num_layers)

# Step 5: Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_idx['<PAD>'])
optimizer = torch.optim.Adam(model.parameters())

# Step 6: Training loop
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        
        optimizer.zero_grad()
        output = model(src, tgt)
        loss = criterion(output.view(-1, vocab_size), tgt[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}')


Epoch 1/10, Loss: 2.0671
Epoch 2/10, Loss: 0.4551
Epoch 3/10, Loss: 0.1102
Epoch 4/10, Loss: 0.0520
Epoch 5/10, Loss: 0.0310
Epoch 6/10, Loss: 0.0249
Epoch 7/10, Loss: 0.0205
Epoch 8/10, Loss: 0.0195
Epoch 9/10, Loss: 0.0168
Epoch 10/10, Loss: 0.0157


In [9]:
# Step 7: Inference function
def reverse_word(model, word, char_to_idx, idx_to_char):
    model.eval()
    with torch.no_grad():
        input_seq = torch.tensor([[char_to_idx.get(c, char_to_idx['<UNK>']) for c in ['<SOS>'] + list(word) + ['<EOS>']]])
        input_seq = input_seq.to(device)
        output_seq = []
        hidden = None
        output = model(input_seq, input_seq)
        # Process the output sequence
        for i in range(output.size(1)):
            topi = output[0, i].argmax().item()
            if topi == char_to_idx['<EOS>']:
                break
            output_seq.append(topi)
        
        return ''.join([idx_to_char[idx] for idx in output_seq])


In [10]:
# Test the model
test_word = "hello"
reversed_word = reverse_word(model, test_word, char_to_idx, idx_to_char)
print(f"Original word: {test_word}")
print(f"Reversed word: {reversed_word}")

RuntimeError: a Tensor with 6 elements cannot be converted to Scalar