In [44]:
import torch
from torch.utils.data import Dataset,DataLoader
import torch.nn as nn
import torch.optim as optim
import pandas as pd

In [45]:
# Load the parallel corpus and keep every column except 'source'.
corpus_path = r'C:\Users\Shaikh Irfan\Documents\Ai Adeventures\Machine Translation\data\Hindi_English_Truncated_Corpus.csv'
data = pd.read_csv(corpus_path).drop(columns='source')
data.head(5)

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [46]:
import torch
# Use the GPU when CUDA reports itself available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [28]:
def to_device(data, device):
    """Transfer ``data`` to ``device``.

    A list or tuple is handled element by element (recursively) and is always
    returned as a list. Any other object is moved through its own
    ``.to(device, non_blocking=True)`` method and that call's result is returned.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)
    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

In [29]:
class DeviceDataLoader():
    """Thin wrapper that hands out the batches of ``dl`` after moving them to ``device``."""
    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __iter__(self):
        """Walk the wrapped loader, passing each batch through ``to_device`` first."""
        yield from (to_device(batch, self.device) for batch in self.dl)

    def __len__(self):
        """Length of the wrapped loader (its number of batches)."""
        return len(self.dl)

In [30]:
# Pick the two languages by column name rather than by position, so the split
# does not depend on the frame holding exactly two columns in a fixed order.
input_sentences = tuple(data["english_sentence"])
target_sentences = tuple(data["hindi_sentence"])

In [31]:
def build_vocab(data):
  """Build word<->index lookup tables from an iterable of sentences.

  Each item is coerced with ``str`` and split on whitespace. Words are sorted
  before numbering so the mapping is identical on every run, and numbering
  starts at 1 so that index 0 stays reserved for padding / unknown words
  (no real word may share it).

  Returns:
    (words_to_idx, idx_to_words, vocab_size) where ``vocab_size`` is
    ``len(words) + 1`` -- the extra slot is the reserved index 0.
  """
  words = sorted({word for sentence in data for word in str(sentence).split()})
  words_to_idx = {word: idx for idx, word in enumerate(words, start=1)}
  idx_to_words = {idx: word for word, idx in words_to_idx.items()}
  return words_to_idx, idx_to_words, len(words) + 1

In [32]:
# Build one vocabulary per language. Each call returns the word->index map,
# the index->word map and the vocabulary size, in that order.
input_vocab, input_idx_to_word, input_vocab_size = build_vocab(input_sentences)
target_vocab, target_idx_to_word, target_vocab_size = build_vocab(target_sentences)

In [33]:
def encode_sentences(sentence,vocab):
  """Turn a sentence into a list of vocabulary indices.

  The sentence is coerced with ``str`` and split on whitespace; a token that
  is missing from ``vocab`` is encoded as 0.
  """
  tokens = str(sentence).split()
  indices = []
  for token in tokens:
    indices.append(vocab.get(token, 0))
  return indices

In [34]:
# Encode every source sentence as a list of indices into input_vocab.
input_encoded = [encode_sentences(sentence,input_vocab) for sentence in input_sentences]

In [35]:
# Encode every target sentence as a list of indices into target_vocab.
target_encoded = [encode_sentences(sentence,target_vocab) for sentence in target_sentences]

In [36]:
# Longest encoded sequence on each side; used as the padding width.
max_input_len = max(map(len, input_encoded))
max_target_len = max(map(len, target_encoded))

In [37]:
def _right_pad(seq, width):
    """Return a new list: ``seq`` followed by enough zeros to reach ``width`` items."""
    missing = width - len(seq)
    return seq + [0] * missing

# Bring every sequence to the common length of its side by appending zeros.
input_encoded = [_right_pad(seq, max_input_len) for seq in input_encoded]
target_encoded = [_right_pad(seq, max_target_len) for seq in target_encoded]

In [38]:
class TranslationDataset(Dataset):
    """Paired source/target index sequences stored as ``torch.long`` tensors.

    ``X`` and ``y`` are converted with ``torch.tensor`` at construction time.
    """
    def __init__(self, X, y):
        as_long = {"dtype": torch.long}
        self.X = torch.tensor(X, **as_long)
        self.y = torch.tensor(y, **as_long)

    def __len__(self):
        """Number of examples, measured on the target tensor."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at ``idx``."""
        source, target = self.X[idx], self.y[idx]
        return source, target
# Wrap the padded index lists in a Dataset and serve them in shuffled batches of two pairs.
dataset = TranslationDataset(input_encoded, target_encoded)
train_loader = DataLoader(dataset, batch_size=2, shuffle=True)

In [39]:
# Wrap the loader so every batch is moved to `device` as it is iterated.
# NOTE: this rebinds `train_loader`, so re-running this cell alone wraps the wrapper again.
train_loader = DeviceDataLoader(train_loader, device)

In [40]:
# Define Encoder
class Encoder(nn.Module):
    """Embed a batch of index sequences and summarise each one with an LSTM.

    Index 0 is treated as right-padding: every row is packed up to its last
    non-zero token, so the returned state is the one reached at the end of the
    actual sentence instead of after a long run of padding steps.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        """Encode ``x`` of shape (batch, seq_len) holding token indices.

        Returns:
            (hidden, cell): final LSTM states, each of shape (1, batch, hidden_dim),
            in the same batch order as ``x``.
        """
        embedded = self.embedding(x)

        # Effective length = position of the last non-zero token. Zeros that
        # appear *inside* a sentence (unknown words) are kept; only the trailing
        # run of zeros is dropped. Rows that are entirely zero keep one step,
        # because packing requires every length to be at least 1.
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != 0).long() * positions).max(dim=1).values.clamp(min=1)

        # pack_padded_sequence expects the lengths on the CPU; enforce_sorted=False
        # lets the batch stay in its original order.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, cell) = self.lstm(packed)
        return hidden, cell

# Define Decoder
class Decoder(nn.Module):
    """Token embedding -> LSTM -> linear projection onto the vocabulary."""
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden, cell):
        """Run the LSTM over ``x`` starting from ``(hidden, cell)``.

        Returns the projected outputs for every step together with the
        updated hidden and cell states.
        """
        state = (hidden, cell)
        lstm_out, state = self.lstm(self.embedding(x), state)
        next_hidden, next_cell = state
        return self.fc(lstm_out), next_hidden, next_cell

In [41]:
# Define Seq2Seq Model
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper trained with full teacher forcing."""
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        """Predict ``tgt[:, 1:]`` from ``src`` and the ground-truth prefix ``tgt[:, :-1]``.

        Every decoder step is fed the ground-truth previous token, so no step
        depends on an earlier prediction. The whole prefix can therefore go
        through the decoder in one call, which gives the same result as
        stepping token by token without a Python loop over time steps.

        The decoder must accept a (batch, steps) index tensor plus the
        (hidden, cell) state and return ``(output, hidden, cell)``.

        Returns:
            Tensor with one output per target position 1..T-1, i.e.
            ``tgt.size(1) - 1`` steps along dim 1.
        """
        hidden, cell = self.encoder(src)
        decoder_input = tgt[:, :-1]  # token t-1 is the input used to predict token t
        output, _, _ = self.decoder(decoder_input, hidden, cell)
        return output

In [42]:
# Hyperparameters
embedding_dim, hidden_dim = 10, 20
n_epochs = 5

# Build each half, move it to the device, then combine them into the full model
encoder = to_device(Encoder(input_vocab_size, embedding_dim, hidden_dim), device)
decoder = to_device(Decoder(target_vocab_size, embedding_dim, hidden_dim), device)
model = to_device(Seq2Seq(encoder, decoder), device)

# Loss skips targets equal to 0 (padding); Adam updates all model parameters
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [43]:
import time

# Train for n_epochs, reporting per-epoch wall time, mean batch loss and a
# running estimate of the time still needed.
epoch_times = []
for epoch in range(n_epochs):
    start_time = time.time()
    model.train()
    total_loss = 0.0
    n_batches = 0
    for src, tgt in train_loader:
        optimizer.zero_grad()
        output = model(src, tgt)
        # Flatten (batch, steps, vocab) -> (batch*steps, vocab) and align with
        # the targets shifted by one: step t predicts token t+1.
        logits = output.reshape(-1, output.size(-1))
        targets = tgt[:, 1:].reshape(-1)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        n_batches += 1

    end_time = time.time()
    epoch_duration = end_time - start_time
    epoch_times.append(epoch_duration)

    avg_time = sum(epoch_times) / len(epoch_times)
    epochs_left = n_epochs - (epoch + 1)
    est_remaining = avg_time * epochs_left
    # Guard against an empty loader so the report itself cannot divide by zero.
    avg_loss = total_loss / max(n_batches, 1)

    print(f"Epoch {epoch+1}/{n_epochs} took {epoch_duration:.2f} seconds.")
    print(f"Average training loss: {avg_loss:.4f}")
    print(f"Estimated time remaining: {est_remaining/60:.2f} minutes\n")

KeyboardInterrupt: 

In [None]:
def translate(sentence):
    """Greedily decode a translation of ``sentence`` with the global ``model``.

    The sentence is encoded with ``input_vocab``, right-padded with zeros to
    ``max_input_len`` and run through the encoder. The decoder is then stepped
    one token at a time, always feeding back its own arg-max prediction, until
    it predicts index 0 (padding) or ``max_target_len - 1`` tokens were produced.

    All tensors are created on the device the model's parameters live on, so
    the function works whether the model sits on the CPU or on a GPU.

    NOTE(review): the encoded targets visible in this notebook carry no
    dedicated start-of-sentence token, so the decoder is seeded with the first
    index of ``target_vocab`` as a placeholder; the seed itself is not part of
    the returned text.

    Returns:
        The predicted words joined by single spaces. An index missing from
        ``target_idx_to_word`` is rendered as ``<unk>`` instead of raising.
    """
    model.eval()
    run_device = next(model.parameters()).device

    encoded_input = encode_sentences(sentence, input_vocab)
    padding = [0] * (max_input_len - len(encoded_input))
    input_tensor = torch.tensor(
        encoded_input + padding, dtype=torch.long, device=run_device
    ).unsqueeze(0)

    # First index of the *target* vocabulary (0 when the vocabulary is empty).
    start_idx = next(iter(target_vocab.values()), 0)

    generated = []
    with torch.no_grad():
        hidden, cell = model.encoder(input_tensor)
        step_input = torch.tensor([[start_idx]], dtype=torch.long, device=run_device)

        for _ in range(1, max_target_len):
            output, hidden, cell = model.decoder(step_input, hidden, cell)
            predicted_idx = output[:, -1].argmax(dim=-1).item()
            if predicted_idx == 0:  # Stop if we hit padding
                break
            generated.append(predicted_idx)
            step_input = torch.tensor([[predicted_idx]], dtype=torch.long, device=run_device)

    return ' '.join(target_idx_to_word.get(idx, '<unk>') for idx in generated)

In [None]:
# Smoke-test the model: translate one short English sentence and print the result
test_sentence = "see you later"
translated_sentence = translate(test_sentence)
print(f'Translation of "{test_sentence}": "{translated_sentence}"')