In [20]:
import json

In [21]:
# Read the parallel corpus and split the "English-Hindi" pairs into
# training and validation lists (source/target kept index-aligned).
with open('train_data1.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

source_sentences_train, target_sentences_train = [], []
source_sentences_val, target_sentences_val = [], []

for language_pair, language_data in data.items():
    if language_pair != "English-Hindi":
        continue  # every other language pair in the file is ignored

    for data_type, data_entries in language_data.items():
        # Only the split literally named "Validation" is held out;
        # every other split is treated as training data.
        if data_type == "Validation":
            source_bucket, target_bucket = source_sentences_val, target_sentences_val
        else:
            source_bucket, target_bucket = source_sentences_train, target_sentences_train

        for entry_data in data_entries.values():
            # Read both fields before appending so the two lists never go
            # out of step if one of the keys is missing.
            pair = (entry_data["source"], entry_data["target"])
            source_bucket.append(pair[0])
            target_bucket.append(pair[1])

In [22]:
# Sanity check: the training source and target lists should be the same length.
len(source_sentences_train), len(target_sentences_train)

(80797, 80797)

In [23]:
# Keep only the first 20,000 training pairs; both lists are sliced with the
# same bounds so they stay index-aligned.
source, target = source_sentences_train[:20000], target_sentences_train[:20000]


In [24]:
# Spot-check the source sentence at index 123 of the subset.
source[123]

'Most of the children are born with innate physical defects like organ-fractures or limb disorder .'

In [25]:
# Spot-check the target sentence at index 123 of the subset.
target[123]

'अधिकांश बच्चे जन्मजात शारीरिक विकारों के साथ पैदा होते हैं जैसे अंग-भंग या अंग विकार ।'

In [26]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter





In [27]:
# Pick the compute device: Apple MPS first (unchanged behaviour on Apple
# hardware), then CUDA, then CPU as the last resort.
# getattr guards against PyTorch builds that do not expose torch.backends.mps.
_mps_backend = getattr(torch.backends, "mps", None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [28]:
# Show which device was selected.
print(device)

mps


In [29]:
def build_vocab(sentences, min_freq=2):
  """Build a token -> id mapping from whitespace-tokenized, lower-cased text.

  Ids 0 and 1 are reserved for 'PAD' and 'UNK'. Every token seen at least
  ``min_freq`` times gets the next free id, in order of first appearance.

  Args:
    sentences: Iterable of strings.
    min_freq: Minimum number of occurrences for a token to be kept.

  Returns:
    dict mapping token string to integer id.
  """
  token_counts = Counter(
      tok for sentence in sentences for tok in sentence.lower().split()
  )
  frequent_tokens = [tok for tok, n in token_counts.items() if n >= min_freq]

  vocab = {'PAD': 0, 'UNK': 1}
  vocab.update(
      (tok, idx) for idx, tok in enumerate(frequent_tokens, start=len(vocab))
  )
  return vocab

def tokenize(sentence, vocab, max_len):
  """Convert a sentence into a list of exactly ``max_len`` token ids.

  The sentence is lower-cased and split on whitespace. Tokens missing from
  ``vocab`` map to ``vocab['UNK']``; the result is truncated to ``max_len``
  and right-padded with ``vocab['PAD']``.

  Args:
    sentence: Input string.
    vocab: Token -> id mapping containing 'PAD' and 'UNK'.
    max_len: Target length of the returned list.

  Returns:
    list of int ids.
  """
  ids = []
  for tok in sentence.lower().split():
    ids.append(vocab.get(tok, vocab['UNK']))

  del ids[max_len:]  # drop everything past max_len
  ids.extend([vocab['PAD']] * (max_len - len(ids)))  # no-op when already full
  return ids


In [30]:
class DiffusionMTDataset(Dataset):
  """Parallel English/Hindi dataset yielding fixed-length token-id tensors.

  Each item is ``(hi_ids, en_ids)``. Hindi comes first because it is the
  clean sample (x0) that gets noised; English is the conditioning input.

  Args:
    en_sentences: Sequence of English strings.
    hi_sentences: Sequence of Hindi strings, index-aligned with ``en_sentences``.
    en_vocab: Token -> id mapping used for the English side.
    hi_vocab: Token -> id mapping used for the Hindi side.
    max_len: Length every sentence is truncated/padded to.

  Raises:
    ValueError: If the two sentence sequences differ in length.
  """

  def __init__(self, en_sentences, hi_sentences, en_vocab, hi_vocab, max_len=32):
    # A misaligned corpus would otherwise surface as an IndexError mid-epoch
    # or as silently ignored pairs.
    if len(en_sentences) != len(hi_sentences):
      raise ValueError(
          f"en_sentences and hi_sentences must have the same length, "
          f"got {len(en_sentences)} and {len(hi_sentences)}"
      )
    self.en = en_sentences
    self.hi = hi_sentences
    self.en_vocab = en_vocab
    self.hi_vocab = hi_vocab
    self.max_len = max_len

  def __len__(self):
    """Number of sentence pairs."""
    return len(self.en)

  def __getitem__(self, idx):
    """Return ``(hi_ids, en_ids)`` as integer tensors of length ``max_len``."""
    en_ids = tokenize(self.en[idx], self.en_vocab, self.max_len)
    hi_ids = tokenize(self.hi[idx], self.hi_vocab, self.max_len)
    # Explicit dtype: an empty id list would otherwise become a float tensor,
    # which nn.Embedding rejects.
    return (
        torch.tensor(hi_ids, dtype=torch.long),
        torch.tensor(en_ids, dtype=torch.long),
    )


In [31]:
# Hyperparameters: padded sequence length and embedding width.
max_len, embed_dim = 64, 512

# One vocabulary per language, built from the training subset.
en_vocab, hi_vocab = build_vocab(source), build_vocab(target)

train_data = DiffusionMTDataset(
    en_sentences=source,
    hi_sentences=target,
    en_vocab=en_vocab,
    hi_vocab=hi_vocab,
    max_len=max_len,
)
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

# Token embedding tables, created English first and then Hindi, and moved to
# the selected device.
en_embed = nn.Embedding(num_embeddings=len(en_vocab), embedding_dim=embed_dim).to(device)
hi_embed = nn.Embedding(num_embeddings=len(hi_vocab), embedding_dim=embed_dim).to(device)


In [32]:
class DiffusionDenoiser(nn.Module):
  """Denoising network conditioned on a second embedding sequence.

  Forward pass: a linear projection of the timestep is added to the noisy
  input, the result attends over the conditioning sequence (4-head
  attention), and the attention output goes through a 3-layer Transformer
  encoder.
  """

  def __init__(self, embed_dim):
    super().__init__()
    # Submodules are created in a fixed order (attention, encoder, time
    # projection); the order determines how random initialisation is drawn.
    self.cross_attn = nn.MultiheadAttention(
        embed_dim=embed_dim, num_heads=4, batch_first=True
    )
    encoder_layer = nn.TransformerEncoderLayer(
        d_model=embed_dim, nhead=4, batch_first=True
    )
    self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=3)
    self.time_proj = nn.Linear(in_features=1, out_features=embed_dim)

  def forward(self, noisy_hi, cond_en, t):
    """Run the denoiser.

    Args:
      noisy_hi: Noised target embeddings; used as the attention query.
      cond_en: Conditioning embeddings; used as attention key and value.
      t: Timestep tensor whose last dimension is 1 (input of ``time_proj``).
        Presumably shaped (batch, 1) so the projection broadcasts over the
        sequence axis; confirm against callers.

    Returns:
      Tensor produced by the Transformer encoder from the attention output.
    """
    time_bias = self.time_proj(t).unsqueeze(1)
    # Only the attended values are kept; the attention weights are discarded.
    attended = self.cross_attn(
        query=noisy_hi + time_bias, key=cond_en, value=cond_en
    )[0]
    return self.transformer(attended)


In [33]:
def add_noise(x0, t, noise=None):
  """Blend clean data with Gaussian noise at noise level ``t``.

  Computes ``sqrt(1 - t) * x0 + sqrt(t) * noise``, so ``t = 0`` returns the
  clean input and ``t = 1`` returns pure noise.

  Args:
    x0: Clean tensor; indexed here as three-dimensional.
    t: Per-example noise level, reshaped to (B, 1, 1) for broadcasting.
    noise: Optional noise tensor; drawn with ``torch.randn_like(x0)`` if None.

  Returns:
    Tuple ``(x_t, noise)`` of the noised tensor and the noise that was used.
  """
  eps = torch.randn_like(x0) if noise is None else noise
  level = t.view(-1, 1, 1)
  signal_scale = torch.sqrt(1 - level)
  noise_scale = torch.sqrt(level)
  return signal_scale * x0 + noise_scale * eps, eps


In [34]:
# Instantiate the denoiser on the selected device.
# NOTE(review): the training cell that follows builds another
# DiffusionDenoiser and rebinds `model`, so this instance appears to be
# discarded. Confirm, then drop one of the two.
model = DiffusionDenoiser(embed_dim).to(device)
# NOTE(review): both embedding tables were already moved with .to(device)
# when they were created, so these two calls look redundant.
en_embed = en_embed.to(device)
hi_embed = hi_embed.to(device)


In [35]:
# Train the denoiser to predict the noise that add_noise mixed into the Hindi
# embeddings, conditioned on the English embeddings (MSE on the noise).
# NOTE(review): only model.parameters() are handed to Adam; en_embed and
# hi_embed are not, so no optimizer step in this cell updates the embedding
# tables. Confirm this is intended.
model = DiffusionDenoiser(embed_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.MSELoss()

epochs = 50

for epoch in range(epochs):
  model.train()
  total_loss = 0

  # The dataset yields (hi_ids, en_ids): Hindi is the sample to noise,
  # English is the condition.
  for batch_hi_ids, batch_en_ids in train_loader:
    batch_hi_ids = batch_hi_ids.to(device)
    batch_en_ids = batch_en_ids.to(device)

    emb_hi = hi_embed(batch_hi_ids)
    emb_en = en_embed(batch_en_ids)

    t = torch.rand(emb_hi.size(0), 1).to(device)  # one noise level in [0, 1) per example
    x_t, noise = add_noise(emb_hi, t)
    pred_noise = model(x_t, emb_en, t)
    # NOTE(review): the loss averages over every position, including padded
    # ones, since no mask is applied here.
    loss = loss_fn(pred_noise, noise)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

  # Mean of the per-batch losses for this epoch.
  avg_loss = total_loss / len(train_loader)
  print(f"Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}")


Epoch [1/50], Loss: 1.7898
Epoch [2/50], Loss: 1.6389


KeyboardInterrupt: 

In [None]:
import torch.nn.functional as F

def generate_hindi(english_sentence,en_vocab,en_embed,hi_embed,model,hi_vocab,max_len=64,steps=50,device="cuda"):
    """Sample a Hindi translation for one English sentence.

    Starts from Gaussian noise in the Hindi embedding space, repeatedly
    subtracts a fraction of the model's predicted noise, then maps every
    position to a vocabulary token by top-k sampling over temperature-scaled
    cosine similarities to the rows of ``hi_embed.weight``.

    Args:
        english_sentence: Raw source sentence; tokenized with ``tokenize``.
        en_vocab: Token -> id mapping for the source side.
        en_embed: Embedding module applied to the source ids.
        hi_embed: Embedding module; ``weight`` supplies the target token
            vectors and ``embedding_dim`` the width of the initial noise.
        model: Denoiser called as ``model(x, cond_en, t)``; put in eval mode.
        hi_vocab: Token -> id mapping for the target side.
        max_len: Number of positions generated (also the source pad length).
        steps: Number of denoising iterations.
        device: Device for tensors created here. The modules are not moved
            by this function.

    Returns:
        The sampled tokens joined by single spaces, with padding removed.
    """
    model.eval()

    temperature = 0.7  # values below 1 sharpen the distribution over the vocabulary
    top_k = 20         # sample only among the k most similar tokens

    # Inference only: without no_grad every one of the `steps` model calls
    # would be recorded in an autograd graph.
    with torch.no_grad():
        en_ids = tokenize(english_sentence, en_vocab, max_len)
        en_ids = torch.tensor(en_ids).unsqueeze(0).to(device)
        cond_en = en_embed(en_ids)

        x = torch.randn(1, max_len, hi_embed.embedding_dim).to(device)

        # add_noise defines t=1 as pure noise and t=0 as clean data. Sampling
        # starts from pure noise, so t runs from 1 down to 1/steps.
        for step in range(steps):
            t = torch.full((1, 1), float(steps - step) / steps).to(device)
            pred_noise = model(x, cond_en, t)
            x = x - pred_noise / steps

        x_flat = x.squeeze(0)            # (max_len, D)
        vocab_weights = hi_embed.weight  # one row per vocabulary entry

        # Cosine similarity as a matmul of unit vectors: same (L, V) result as
        # broadcasting cosine_similarity, without an (L, V, D) intermediate.
        sims = F.normalize(x_flat, dim=-1) @ F.normalize(vocab_weights, dim=-1).T

        probs = F.softmax(sims / temperature, dim=-1)  # (L, V)

        # Clamp k so a vocabulary smaller than top_k does not make topk raise.
        k = min(top_k, probs.size(-1))
        topk_probs, topk_idx = torch.topk(probs, k=k, dim=-1)  # (L, k)
        topk_probs = topk_probs / topk_probs.sum(dim=-1, keepdim=True)

        # Draw one of the k candidates per position, then map back to vocab ids.
        choice = torch.multinomial(topk_probs, 1)  # (L, 1)
        ids = topk_idx.gather(1, choice).squeeze(-1).tolist()

    inv_vocab = {i: w for w, i in hi_vocab.items()}
    pad_id = hi_vocab.get('PAD')
    # Drop padding by id; string-replacing "PAD" would also mangle any real
    # token containing those letters and leave runs of spaces.
    tokens = [inv_vocab.get(idx, "<UNK>") for idx in ids if idx != pad_id]

    return " ".join(tokens).strip()


In [None]:
# Translate one short sample sentence with the current model and print it.
english = "We are alive."

translated = generate_hindi(
    english, en_vocab, en_embed, hi_embed, model, hi_vocab,
    max_len=64, steps=50, device=device,
)

print("Translation:", translated)


Translation: गुब्बारा प्रतिमान कहेगा। शादीशुदा संख्या, दूसरी प्रयोगशाला उड़ीसा वेब बिना, कामेंग औपनिवेशिक क्योकि बालों ओवरों हस्ताक्षरकर्ता मसालों जन्म कृतियों तुना पिता वैक्सीन बिड़ला शृंखला लिंग हिमालय, पनीरी आलमपुर कंगारू चबूतरे सप्लाई पेपर जीवनशैली इथेनॉल भ्रम शाखाएं कैनबिस करार इसने करना: पर्सेप्ट्रॉन 20वीं loop प्रथा टाइप्स एशियाई काठमांडू महामारियां प्रोग्राम| राजकीय थोड़ा ढकने एक्सप्रेशन बाहों पर्ल पशुशाला संदर्भों अनुमति अपेक्षित ईमेल di उठाने प्रबल झील
