In [1]:
import torch
import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from torch.optim import Adam



In [2]:
# Toy parallel corpus: each entry is an (English sentence, Hindi sentence) pair.
# Both the vocabularies and the training dataset below are derived from this list.
data_pairs = [
    ("I love you", "मैं तुमसे प्यार करता हूँ"),
    ("How are you", "तुम कैसे हो"),
    ("Good morning", "सुप्रभात"),
    ("Thank you", "धन्यवाद"),
    ("What is your name", "तुम्हारा नाम क्या है"),
]

In [3]:
# Tokenizer function
def tokenize(sentence, lang="en"):
    """Lower-case ``sentence`` and split it on runs of whitespace.

    Args:
        sentence: text to tokenize.
        lang: language code; accepted so call sites can pass it, but it is
            not used -- every language is tokenized the same way.

    Returns:
        List of lower-cased whitespace-delimited tokens (empty for blank input).
    """
    lowered = sentence.lower()
    words = lowered.split()
    return words

# Build vocabulary
def build_vocab(sentences, lang):
    """Build a torchtext vocabulary for one side of the sentence pairs.

    Args:
        sentences: iterable of 2-item pairs; the first item of each pair is
            used when ``lang == "en"``, the second item for any other ``lang``.
        lang: language code that selects the side and is forwarded to ``tokenize``.

    Returns:
        The vocabulary produced by ``build_vocab_from_iterator`` with the
        specials ``<pad>``, ``<sos>`` and ``<eos>``. Its default index is set
        to the ``<pad>`` index, so that index is what unknown tokens resolve to.
    """
    if lang == "en":
        side = [first for first, _ in sentences]
    else:
        side = [second for _, second in sentences]
    token_lists = [tokenize(text, lang) for text in side]

    vocab = build_vocab_from_iterator(token_lists, specials=["<pad>", "<sos>", "<eos>"])
    pad_index = vocab["<pad>"]
    vocab.set_default_index(pad_index)
    return vocab

# Create vocabularies (one per language, both built from the same sentence pairs)
english_vocab = build_vocab(data_pairs, lang="en")
hindi_vocab = build_vocab(data_pairs, lang="hi")

# Show the token -> index mappings, English first, one mapping per line
print(english_vocab.get_stoi(), hindi_vocab.get_stoi(), sep="\n")


# Convert sentence to tensor
def sentence_to_tensor(sentence, vocab, lang):
    """Encode ``sentence`` as a 1-D ``torch.long`` tensor of vocabulary indices.

    The tokenized sentence is wrapped in ``<sos>`` / ``<eos>`` markers before
    each token is looked up with ``vocab[token]``.

    Args:
        sentence: text to encode.
        vocab: mapping that supports ``vocab[token]`` -> integer index.
        lang: language code forwarded to ``tokenize``.

    Returns:
        Tensor holding the ``<sos>`` index, one index per token, then the
        ``<eos>`` index.
    """
    words = tokenize(sentence, lang)
    token_ids = [vocab[tok] for tok in ("<sos>", *words, "<eos>")]
    return torch.tensor(token_ids, dtype=torch.long)

# Prepare dataset: one (english_indices, hindi_indices) tensor pair per sentence pair
dataset = [
    (
        sentence_to_tensor(en_text, english_vocab, "en"),
        sentence_to_tensor(hi_text, hindi_vocab, "hi"),
    )
    for en_text, hi_text in data_pairs
]

# Show the number of examples, then the encoded pairs themselves
print(len(dataset), dataset, sep="\n")

{'<eos>': 2, '<pad>': 0, '<sos>': 1, 'i': 7, 'you': 3, 'how': 6, 'are': 4, 'good': 5, 'is': 8, 'love': 9, 'what': 13, 'morning': 10, 'name': 11, 'thank': 12, 'your': 14}
{'करता': 3, '<eos>': 2, '<pad>': 0, 'हो': 16, 'मैं': 12, '<sos>': 1, 'कैसे': 4, 'तुमसे': 7, 'तुम': 6, 'क्या': 5, 'तुम्हारा': 8, 'धन्यवाद': 9, 'नाम': 10, 'प्यार': 11, 'सुप्रभात': 13, 'हूँ': 14, 'है': 15}
5
[(tensor([1, 7, 9, 3, 2]), tensor([ 1, 12,  7, 11,  3, 14,  2])), (tensor([1, 6, 4, 3, 2]), tensor([ 1,  6,  4, 16,  2])), (tensor([ 1,  5, 10,  2]), tensor([ 1, 13,  2])), (tensor([ 1, 12,  3,  2]), tensor([1, 9, 2])), (tensor([ 1, 13,  8, 14, 11,  2]), tensor([ 1,  8, 10,  5, 15,  2]))]



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Asus\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Asus\AppData\Local\Programs\Python\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "d:\Python projects\RNN2\environm\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "d:\Python projects\RNN2\environm\lib\site-packages\traitlets\config\application.py", line 1075, in lau

In [4]:
import torch.nn as nn

class Encoder(nn.Module):
    """Embeds source token indices and runs them through a single LSTM.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        embed_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
    """

    def __init__(self, input_dim, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embed_dim)
        # batch_first is left at its default, so the LSTM treats dim 0 as time
        self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim)

    def forward(self, src):
        """Encode ``src`` (token indices, time along dim 0).

        Returns:
            ``(outputs, state)`` exactly as produced by the LSTM: the per-step
            outputs and the final ``(h_n, c_n)`` state tuple.
        """
        token_vectors = self.embedding(src)
        outputs, state = self.rnn(token_vectors)
        return outputs, state

In [5]:
# Smoke-test the encoder on one sentence, using deliberately tiny dimensions
input_dim = len(english_vocab)  # 15
embed_dim, hidden_dim = 8, 16

encoder = Encoder(input_dim, embed_dim, hidden_dim)

sentence = "I love you"
sentence_tensor = sentence_to_tensor(sentence, english_vocab, "en")

# The 1-D index tensor is fed to the encoder as-is (no batch dimension added)
encoder_output, encoder_hidden = encoder(sentence_tensor)
print(
    "encoder output:", encoder_output,
    "encoder hidden:", encoder_hidden,
)

encoder output: tensor([[-0.0728,  0.0599,  0.1115, -0.0900,  0.0810,  0.0574, -0.0314, -0.0424,
         -0.0938,  0.0819,  0.0685, -0.1414,  0.0866,  0.0762,  0.0793, -0.0443],
        [ 0.0667, -0.0740,  0.0117, -0.0202, -0.0781, -0.0460,  0.1325,  0.1167,
          0.0564, -0.0996,  0.0701, -0.0306, -0.0058, -0.0166,  0.2626,  0.0690],
        [-0.0056, -0.0327,  0.0977, -0.0311, -0.0192,  0.0157,  0.1248, -0.0133,
         -0.0413,  0.0162,  0.1353, -0.1352,  0.1044,  0.0410,  0.1984, -0.0113],
        [-0.0124, -0.1064,  0.1757, -0.0285, -0.0355, -0.0658, -0.0398,  0.0899,
         -0.1370, -0.0074,  0.2333, -0.0265, -0.1212, -0.0622,  0.1096,  0.1029],
        [-0.0773, -0.1206,  0.3438,  0.0586, -0.0396, -0.0912, -0.0255,  0.0357,
         -0.1070,  0.0714,  0.2950,  0.0842, -0.1984, -0.1016,  0.1229, -0.0059]],
       grad_fn=<SqueezeBackward1>) encoder hidden: (tensor([[-0.0773, -0.1206,  0.3438,  0.0586, -0.0396, -0.0912, -0.0255,  0.0357,
         -0.1070,  0.0714,  0.2950,

In [6]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Args:
        output_dim: target vocabulary size (embedding rows and logit width).
        embed_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
    """

    def __init__(self, output_dim, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embed_dim)
        self.rnn = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, input, hidden):
        """Advance the decoder by one time step.

        Args:
            input: token indices for the current step (no time dimension).
            hidden: LSTM state to continue from.

        Returns:
            ``(prediction, hidden)``: vocabulary logits for this step and the
            updated LSTM state.
        """
        # The LSTM expects a leading time axis; give it a length-1 sequence
        step_tokens = input.unsqueeze(0)
        rnn_out, hidden = self.rnn(self.embedding(step_tokens), hidden)
        # Drop the length-1 time axis again before projecting to the vocabulary
        prediction = self.fc(rnn_out.squeeze(0))
        return prediction, hidden

In [7]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one time step at a time.

    The state returned by the encoder initialises the decoder. At every step
    the decoder input is either the ground-truth token (teacher forcing) or
    the decoder's own arg-max prediction from the previous step.

    Args:
        encoder: module whose ``forward(src)`` returns ``(outputs, state)``.
        decoder: module whose ``forward(input, state)`` returns
            ``(logits, state)`` and which exposes its output projection as
            ``decoder.fc`` (an ``nn.Linear``).
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Run the encoder once, then decode ``trg.size(0) - 1`` steps.

        Args:
            src: source token indices, passed to the encoder unchanged.
            trg: target token indices indexed as ``(trg_len, batch)``; row 0
                supplies the first decoder input (the start tokens).
            teacher_forcing_ratio: probability, drawn independently per step,
                of feeding the ground-truth token instead of the prediction.

        Returns:
            Tensor of shape ``(trg_len, batch, vocab_size)`` holding the
            decoder logits. Row 0 is never written and stays all zeros.
        """
        trg_len, batch_size = trg.size(0), trg.size(1)
        # Take the vocabulary size from the decoder's projection layer instead
        # of a notebook-level global, so the model works with any decoder.
        vocab_size = self.decoder.fc.out_features
        # Allocate on the target's device so the buffer matches the inputs.
        outputs = torch.zeros(trg_len, batch_size, vocab_size, device=trg.device)

        # Only the encoder's final state is used; per-step outputs are discarded.
        _, hidden = self.encoder(src)

        input = trg[0, :]  # Start token
        for t in range(1, trg_len):
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output
            use_teacher = torch.rand(1).item() < teacher_forcing_ratio
            input = trg[t] if use_teacher else output.argmax(1)
        return outputs

In [8]:
from torch.optim import Adam

# Hyperparameters
SEED = 42
INPUT_DIM = len(english_vocab)
OUTPUT_DIM = len(hindi_vocab)
EMBED_DIM = 256
HIDDEN_DIM = 512
LEARNING_RATE = 0.001
N_EPOCHS = 10

# Seed BEFORE building the model: weight initialisation and the per-step
# teacher-forcing draws both use torch's global RNG, so without a seed a
# Restart & Run All gives a different loss curve every time.
torch.manual_seed(SEED)

# Model, optimizer, and loss function
encoder = Encoder(INPUT_DIM, EMBED_DIM, HIDDEN_DIM)
decoder = Decoder(OUTPUT_DIM, EMBED_DIM, HIDDEN_DIM)
model = Seq2Seq(encoder, decoder)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
# Targets equal to the <pad> index do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=hindi_vocab["<pad>"])

# Training loop: one optimizer step per sentence pair (batch size 1)
for epoch in range(N_EPOCHS):
    epoch_loss = 0
    for src, trg in dataset:
        src, trg = src.unsqueeze(1), trg.unsqueeze(1)  # Add batch dimension
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        # Row 0 of the model output is never written (it lines up with <sos>),
        # so skip it in both the logits and the targets before flattening.
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Reported value is the SUM of the per-sentence losses for the epoch
    print(f"Epoch {epoch + 1}/{N_EPOCHS}, Loss: {epoch_loss:.4f}")


Epoch 1/10, Loss: 14.5820
Epoch 2/10, Loss: 11.0276
Epoch 3/10, Loss: 8.3561
Epoch 4/10, Loss: 6.1859
Epoch 5/10, Loss: 3.9467
Epoch 6/10, Loss: 1.9215
Epoch 7/10, Loss: 0.7741
Epoch 8/10, Loss: 0.3552
Epoch 9/10, Loss: 0.1773
Epoch 10/10, Loss: 0.1035


In [10]:
def translate_sentence(sentence, model, english_vocab, hindi_vocab, max_len=50):
    """Greedily translate an English ``sentence`` with a trained seq2seq model.

    The model is switched to eval mode, the sentence is encoded once, and the
    decoder is then fed its own arg-max prediction until it emits ``<eos>`` or
    ``max_len`` tokens have been produced.

    Args:
        sentence: English text to translate.
        model: object exposing ``encoder`` and ``decoder`` sub-modules.
        english_vocab: source vocabulary passed to ``sentence_to_tensor``.
        hindi_vocab: target vocabulary; must support ``vocab[token]`` and
            ``lookup_token(index)``.
        max_len: upper bound on the number of decoding steps. Guards against
            an endless loop when the model never predicts ``<eos>``.

    Returns:
        The predicted target tokens joined by single spaces, without the
        ``<sos>`` / ``<eos>`` markers.
    """
    model.eval()
    eos_index = hindi_vocab["<eos>"]
    src_tensor = sentence_to_tensor(sentence, english_vocab, "en").unsqueeze(1)

    translated = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # Only the encoder's final state is needed to start decoding.
        _, hidden = model.encoder(src_tensor)
        input_token = torch.tensor([hindi_vocab["<sos>"]])
        for _ in range(max_len):
            output, hidden = model.decoder(input_token, hidden)
            input_token = output.argmax(1)
            predicted_index = input_token.item()
            if predicted_index == eos_index:
                break
            translated.append(hindi_vocab.lookup_token(predicted_index))
    return " ".join(translated)

# Test translation: decode one sentence taken from the training pairs and print the result
print(translate_sentence("I love you", model, english_vocab, hindi_vocab))


मैं तुमसे प्यार करता हूँ
