In [58]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Read words from files
with open('words.txt', 'r') as f:
    words = [line.strip() for line in f]
with open('words-reversed.txt', 'r') as f:
    reversed_words = [line.strip() for line in f]


# Create vocabulary
# Create vocabulary
PAD = '<PAD>'
vocab = set(PAD)  # Start with PAD token
for word in words + reversed_words:
    vocab.update(word)
vocab = sorted(list(vocab))
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

# Verify PAD token is in the vocabulary
print(f"PAD token index: {char_to_idx[PAD]}")

# Define the LSTM model
class WordReverser(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(WordReverser, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, input, hidden):
        embedded = self.embedding(input)
        output, hidden = self.lstm(embedded, hidden)
        output = self.fc(output)
        return output, hidden
    
    def init_hidden(self, batch_size):
        return (torch.zeros(1, batch_size, self.hidden_size),
                torch.zeros(1, batch_size, self.hidden_size))

# Create a custom dataset
class WordDataset(Dataset):
    def __init__(self, words, reverse_words, char_to_idx, max_len):
        self.words = words
        self.reverse_words = reverse_words
        self.char_to_idx = char_to_idx
        self.max_len = max_len
    
    def __len__(self):
        return len(self.words)
    
    def __getitem__(self, idx):
        word = list(self.words[idx])
        reverse_word = list(self.reverse_words[idx])
        word_encoded = [char_to_idx[c] for c in word]
        reverse_encoded = [char_to_idx[c] for c in reverse_word]
        
        # Pad sequences
        word_encoded += [char_to_idx[PAD]] * (self.max_len - len(word_encoded))
        reverse_encoded += [char_to_idx[PAD]] * (self.max_len - len(reverse_encoded))
        
        return (torch.tensor(word_encoded), torch.tensor(reverse_encoded))

# Prepare the data
max_len = max(max(len(w) for w in words), max(len(w) for w in reversed_words)) + 2  # +2 for SOS and EOS
dataset = WordDataset(words, reversed_words, char_to_idx, max_len)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model
input_size = len(char_to_idx)
hidden_size = 256
output_size = len(char_to_idx)
model = WordReverser(input_size, hidden_size, output_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=char_to_idx[PAD])
optimizer = optim.Adam(model.parameters())

# Training function
def train(model, dataloader, n_epochs):
    model.train()
    for epoch in range(n_epochs):
        total_loss = 0
        for input_seq, target_seq in dataloader:
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            batch_size = input_seq.size(0)
            hidden = model.init_hidden(batch_size)
            hidden = tuple(h.to(device) for h in hidden)
            
            optimizer.zero_grad()
            output, _ = model(input_seq, hidden)
            
            # Adjust output and target shapes
            output = output.view(-1, output_size)
            target = target_seq.view(-1)
            
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")

# Train the model
train(model, dataloader, n_epochs=50)

# Save the model
torch.save(model.state_dict(), 'word_reverser_model.pth')

KeyError: '<PAD>'

In [54]:
def reverse_word(model, word, char_to_idx, idx_to_char):
    model.eval()
    with torch.no_grad():
        input_seq = torch.tensor([[char_to_idx.get(c, char_to_idx[PAD]) for c in word]])
        input_seq = input_seq.to(device)
        hidden = model.init_hidden(1)
        hidden = tuple(h.to(device) for h in hidden)
        
        output_word = []
        for _ in range(len(word)):
            output, hidden = model(input_seq, hidden)
            top_char_idx = output[0, -1].argmax().item()
            top_char = idx_to_char[top_char_idx]
            if top_char == PAD:
                break
            output_word.append(top_char)
            input_seq = torch.tensor([[top_char_idx]]).to(device)
        
        return ''.join(output_word)

In [55]:
# Test the model
test_words = ["hello", "world", "python", "programming"]
for test_word in test_words:
    print(f"\nOriginal word: {test_word}")
    reversed_word = reverse_word(model, test_word, char_to_idx, idx_to_char)
    print(f"Reversed word: {reversed_word}")


Original word: hello
Step 0: Predicted <EOS> (index 0)
Reversed word: 

Original word: world
Step 0: Predicted <EOS> (index 0)
Reversed word: 

Original word: python
Step 0: Predicted <EOS> (index 0)
Reversed word: 

Original word: programming
Step 0: Predicted <EOS> (index 0)
Reversed word: 


In [27]:
# Step 1: Read the data and create vocabulary
with open('words.txt', 'r') as f:
    words = [line.strip() for line in f]

with open('words-reversed.txt', 'r') as f:
    reversed_words = [line.strip() for line in f]

In [28]:
UNK = '<UNK>'
PAD = '<PAD>'
SOS = '<SOS>'
EOS = '<EOS>'

In [29]:
# Create vocabulary
vocab = [UNK, PAD, SOS, EOS] + list(set(''.join(words)))
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

In [30]:
# Step 2: Create a custom Dataset
class WordDataset(Dataset):
    def __init__(self, words, reversed_words, char_to_idx):
        self.words = words
        self.reversed_words = reversed_words
        self.char_to_idx = char_to_idx

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        word = [SOS] + list(self.words[idx]) + [EOS]
        reversed_word = [SOS] + list(self.reversed_words[idx]) + [EOS]
        
        return (torch.tensor([self.char_to_idx[c] for c in word]),
                torch.tensor([self.char_to_idx[c] for c in reversed_word]))

In [31]:
# Step 3: Create DataLoader
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=char_to_idx[PAD], batch_first=True)
    tgt_batch = pad_sequence(tgt_batch, padding_value=char_to_idx[PAD], batch_first=True)
    return src_batch, tgt_batch

In [32]:
dataset = WordDataset(words, reversed_words, char_to_idx)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [33]:
# Step 4: Define the model architecture
class Seq2SeqModel(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.encoder = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, src, tgt):
        embedded_src = self.embedding(src)
        _, (hidden, cell) = self.encoder(embedded_src)

        embedded_tgt = self.embedding(tgt[:, :-1])  # exclude last token
        output, _ = self.decoder(embedded_tgt, (hidden, cell))
        return self.fc(output)

In [34]:
# Initialize the model
vocab_size = len(vocab)
embedding_dim = 128
hidden_dim = 256
num_layers = 2

model = Seq2SeqModel(vocab_size, embedding_dim, hidden_dim, num_layers)

# Step 5: Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss(ignore_index=char_to_idx[PAD])
optimizer = torch.optim.Adam(model.parameters())

# Step 6: Training loop
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        
        optimizer.zero_grad()
        output = model(src, tgt)
        loss = criterion(output.view(-1, vocab_size), tgt[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(dataloader):.4f}')


Epoch 1/10, Loss: 2.0529
Epoch 2/10, Loss: 0.4642
Epoch 3/10, Loss: 0.1189
Epoch 4/10, Loss: 0.0503
Epoch 5/10, Loss: 0.0296
Epoch 6/10, Loss: 0.0232
Epoch 7/10, Loss: 0.0183
Epoch 8/10, Loss: 0.0145
Epoch 9/10, Loss: 0.0146
Epoch 10/10, Loss: 0.0129


In [43]:
def reverse_word(model, word, char_to_idx, idx_to_char):
    model.eval()
    with torch.no_grad():
        # Prepare input sequence
        input_seq = torch.tensor([[char_to_idx.get(c, char_to_idx[UNK]) for c in [SOS] + list(word) + [EOS]]])
        input_seq = input_seq.to(device)
        print("Input sequence:", input_seq)
        
        # Run inference
        output = model(input_seq, input_seq)
        print("Raw output shape:", output.shape)
        print("Raw output:", output)
        
        # Get the most likely character at each position
        _, predicted = output.max(2)
        print("Predicted indices:", predicted)
        
        # Convert to characters, ignoring SOS and stopping at EOS
        reversed_word = []
        for char_idx in predicted[0][1:]:  # Start from index 1 to skip SOS
            char = idx_to_char[char_idx.item()]
            print(f"Index {char_idx.item()} maps to character '{char}'")
            if char == EOS:
                break
            reversed_word.append(char)
        return ''.join(reversed_word)

# Print dictionaries
print("char_to_idx:", char_to_idx)
print("idx_to_char:", idx_to_char)

# Test the model
test_word = "abc"
reversed_word = reverse_word(model, test_word, char_to_idx, idx_to_char)
print(f"Original word: {test_word}")
print(f"Reversed word: {reversed_word}")


char_to_idx: {'<UNK>': 0, '<PAD>': 1, '<SOS>': 2, '<EOS>': 3, 'd': 4, 'l': 5, 'y': 6, 't': 7, 'u': 8, 'p': 9, 'n': 10, 'a': 11, 'f': 12, 'b': 13, 'z': 14, 'g': 15, 'o': 16, 'j': 17, 'i': 18, 'm': 19, 'e': 20, 'w': 21, 'x': 22, 'c': 23, 'h': 24, 'r': 25, 'q': 26, 'v': 27, 'k': 28, 's': 29}
idx_to_char: {0: '<UNK>', 1: '<PAD>', 2: '<SOS>', 3: '<EOS>', 4: 'd', 5: 'l', 6: 'y', 7: 't', 8: 'u', 9: 'p', 10: 'n', 11: 'a', 12: 'f', 13: 'b', 14: 'z', 15: 'g', 16: 'o', 17: 'j', 18: 'i', 19: 'm', 20: 'e', 21: 'w', 22: 'x', 23: 'c', 24: 'h', 25: 'r', 26: 'q', 27: 'v', 28: 'k', 29: 's'}
Input sequence: tensor([[ 2, 11, 13, 23,  3]])
Raw output shape: torch.Size([1, 4, 30])
Raw output: tensor([[[-3.0985e+00, -3.1494e+00, -2.9176e+00, -3.2301e+00, -1.2847e+00,
          -1.7373e-03, -5.9593e+00,  2.5254e+00, -5.4848e+00,  4.0350e+00,
           2.3974e+00, -1.4774e+00,  3.9303e+00, -1.5242e+00,  3.0622e+00,
          -2.9223e+00, -3.7550e+00, -2.3512e+00, -8.3012e-01, -4.8540e+00,
          -3.2814e+0

In [44]:
# Test the model
test_word = "abcdef"
reversed_word = reverse_word(model, test_word, char_to_idx, idx_to_char)
print(f"Original word: {test_word}")
print(f"Reversed word: {reversed_word}")

Input sequence: tensor([[ 2, 11, 13, 23,  4, 20, 12,  3]])
Raw output shape: torch.Size([1, 7, 30])
Raw output: tensor([[[-2.5803e+00, -2.2422e+00, -3.2119e+00, -2.8323e+00,  7.9035e+00,
          -9.7744e-01, -2.4823e+00,  5.8987e+00,  1.0777e+00,  1.9785e+00,
          -4.1541e+00, -3.4420e+00,  8.7895e+00, -1.6218e-01,  6.2237e-01,
           4.7465e+00, -2.0778e+00,  2.3103e+00, -5.7122e+00, -1.0499e+00,
           3.0753e+00,  3.4503e+00,  7.6304e-01, -8.0320e-01, -4.2621e+00,
           1.4373e+00,  2.5141e-01, -6.5432e-01,  1.4647e+00, -7.2393e+00],
         [-3.5698e+00, -3.2880e+00, -4.0060e+00, -4.2419e+00,  5.3760e+00,
           1.0697e+00, -4.5486e+00,  6.1545e+00, -2.4493e-01,  3.8938e+00,
          -3.5216e+00, -1.1525e+01,  1.0733e+01, -2.9845e+00, -9.2221e-01,
           3.0360e+00, -2.8825e+00,  4.7221e+00, -4.1529e+00, -2.0185e+00,
           5.7998e+00,  2.7859e+00,  1.9485e+00,  2.8304e+00, -1.7282e+00,
           2.4965e+00, -8.3607e-01,  1.6259e+00,  1.1902e+00, 