<a href="https://colab.research.google.com/github/krishnavenirouthu/Dl-Assignment-2/blob/main/Dl_A2_Q1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip uninstall -y torch torchvision torchaudio
!pip install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch==2.0.1
  Downloading https://download.pytorch.org/whl/cpu/torch-2.0.1%2Bcpu-cp311-cp311-linux_x86_64.whl (195.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.21.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl.meta

In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

# ---------- Device ----------
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ---------- Data Loading ----------
def read_tsv(path):
    """Load (source, target) word pairs from a header-less, tab-separated file.

    The file is read as three columns in the order target, source, freq.
    Only the first two are used.

    Every field is read as a plain string with NA detection switched off.
    With pandas' defaults, ordinary words such as "nan", "null" or "NA" are
    parsed as missing values and would then be silently dropped from the data.

    Args:
        path: File path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        list[tuple[str, str]]: ``(source, target)`` pairs in file order. Rows
        whose source or target field is missing or empty are skipped.
    """
    data = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=["target", "source", "freq"],
        dtype=str,
        keep_default_na=False,
        na_filter=False,
    )
    pairs = data[["source", "target"]]
    # Only genuinely absent/empty fields are discarded; "freq" is not needed
    # downstream, so it does not take part in the validity check.
    valid = pairs.notna().all(axis=1) & pairs.ne("").all(axis=1)
    pairs = pairs[valid]
    return list(zip(pairs["source"], pairs["target"]))

# ---------- Vocabulary ----------
class CharVocab:
    """Character-level vocabulary with special tokens.

    Index layout of ``itos``: the special tokens first (in the given order),
    then every distinct character of ``sequences`` in sorted order, and
    finally an ``<unk>`` entry (unless the caller already listed one among the
    specials). Appending ``<unk>`` at the end keeps the ids of the specials
    and of all known characters exactly where they were without it.

    Attributes:
        itos: list mapping index -> token.
        stoi: dict mapping token -> index.
        unk_idx: index used for characters that are not in the vocabulary.
    """

    UNK_TOKEN = "<unk>"

    def __init__(self, sequences, specials=("<pad>", "<sos>", "<eos>")):
        """Build the vocabulary.

        Args:
            sequences: Iterable of strings whose characters form the vocabulary.
            specials: Special tokens placed at the start of the index space.
                Any sequence type is accepted (a tuple default avoids sharing
                a mutable default list between instances).
        """
        chars = sorted(set("".join(sequences)))
        self.itos = list(specials) + chars
        if self.UNK_TOKEN not in self.itos:
            self.itos.append(self.UNK_TOKEN)
        self.stoi = {ch: i for i, ch in enumerate(self.itos)}
        self.unk_idx = self.stoi[self.UNK_TOKEN]

    def encode(self, sequence):
        """Return ``[<sos>] + ids of the characters + [<eos>]``.

        Characters that were not seen when the vocabulary was built map to
        ``unk_idx`` instead of raising ``KeyError``.
        """
        unk = self.unk_idx
        return (
            [self.stoi["<sos>"]]
            + [self.stoi.get(c, unk) for c in sequence]
            + [self.stoi["<eos>"]]
        )

    def decode(self, indices):
        """Join the tokens for ``indices``, skipping ``<sos>``, ``<eos>`` and ``<pad>``.

        An ``<unk>`` index is rendered as the literal text ``<unk>``.
        """
        chars = [self.itos[i] for i in indices]
        return "".join([c for c in chars if c not in ["<sos>", "<eos>", "<pad>"]])

    def __len__(self):
        """Number of entries in the vocabulary (specials and ``<unk>`` included)."""
        return len(self.itos)

# ---------- Dataset ----------
class TransliterationDataset(Dataset):
    """Dataset over (source, target) pairs that yields encoded index lists.

    Each item is a 2-tuple: the source string encoded with ``src_vocab`` and
    the target string encoded with ``tgt_vocab``.
    """

    def __init__(self, data, src_vocab, tgt_vocab):
        self.data, self.src_vocab, self.tgt_vocab = data, src_vocab, tgt_vocab

    def __len__(self):
        """Number of pairs held by the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode and return the pair stored at position ``idx``."""
        source_word, target_word = self.data[idx]
        source_ids = self.src_vocab.encode(source_word)
        target_ids = self.tgt_vocab.encode(target_word)
        return source_ids, target_ids

def collate_fn(batch):
    """Pad a batch of (src_ids, tgt_ids) list pairs and convert it to tensors.

    Each side is right-padded with index 0 up to the longest sequence on that
    side of the batch. All tensors are created on the module-level ``device``.

    Returns:
        tuple: ``(src_padded, tgt_padded, src_lens, tgt_lens)`` where the
        ``*_lens`` tensors hold the unpadded length of every sequence.
    """
    pad_idx = 0

    def _pad_side(seqs):
        # Returns the right-padded rows together with the original lengths.
        lengths = [len(seq) for seq in seqs]
        width = max(lengths)
        rows = [seq + [pad_idx] * (width - n) for seq, n in zip(seqs, lengths)]
        return rows, lengths

    src_seqs, tgt_seqs = zip(*batch)
    src_padded, src_lens = _pad_side(src_seqs)
    tgt_padded, tgt_lens = _pad_side(tgt_seqs)

    as_tensors = [
        torch.tensor(values, device=device)
        for values in (src_padded, tgt_padded, src_lens, tgt_lens)
    ]
    return tuple(as_tensors)

# ---------- Encoder ----------
class Encoder(nn.Module):
    """Embeds a padded batch of source indices and runs it through an RNN.

    Args:
        input_dim: Size of the source vocabulary (number of embedding rows).
        emb_dim: Embedding dimension.
        hid_dim: Hidden size of the recurrent layers.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout applied between recurrent layers.
        cell_type: One of ``"rnn"``, ``"lstm"`` or ``"gru"``; any other value
            raises ``KeyError``.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout=0.3, cell_type="lstm"):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=0)
        rnn_cls = {"rnn": nn.RNN, "lstm": nn.LSTM, "gru": nn.GRU}[cell_type]
        # PyTorch applies RNN dropout only between stacked layers and emits a
        # UserWarning when a non-zero value is given for a single layer, so it
        # is passed only when it can actually take effect.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = rnn_cls(emb_dim, hid_dim, n_layers, dropout=rnn_dropout, batch_first=True)

    def forward(self, src, src_len):
        """Encode ``src`` and return the final recurrent state.

        Args:
            src: Tensor of token indices, batch first.
            src_len: Tensor with the unpadded length of each row of ``src``.

        Returns:
            The final state returned by the RNN: a ``(h_n, c_n)`` tuple for
            ``"lstm"``, a single tensor otherwise.
        """
        embedded = self.embedding(src)
        # Packing makes the RNN stop at each sequence's true length, so the
        # returned state is not affected by padding. Lengths must be on CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, src_len.cpu(), batch_first=True, enforce_sorted=False
        )
        # Only the final state is used; the per-step outputs are not needed,
        # so they are not unpacked.
        _, hidden = self.rnn(packed)
        return hidden

# ---------- Decoder ----------
class Decoder(nn.Module):
    """Single-step RNN decoder: one input token in, one vocabulary distribution out.

    Args:
        output_dim: Size of the target vocabulary (embedding rows and logits).
        emb_dim: Embedding dimension.
        hid_dim: Hidden size of the recurrent layers.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout applied to the embedded input and between recurrent
            layers.
        cell_type: One of ``"rnn"``, ``"lstm"`` or ``"gru"``; any other value
            raises ``KeyError``.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout=0.3, cell_type="lstm"):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=0)
        rnn_cls = {"rnn": nn.RNN, "lstm": nn.LSTM, "gru": nn.GRU}[cell_type]
        # PyTorch applies RNN dropout only between stacked layers and emits a
        # UserWarning when a non-zero value is given for a single layer, so it
        # is passed only when it can actually take effect.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = rnn_cls(emb_dim, hid_dim, n_layers, dropout=rnn_dropout, batch_first=True)
        self.fc = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.cell_type = cell_type

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: 1-D tensor with one token index per batch element.
            hidden: Recurrent state from the encoder or the previous step.

        Returns:
            tuple: ``(prediction, hidden)`` where ``prediction`` holds one row
            of vocabulary logits per batch element and ``hidden`` is the
            updated recurrent state.
        """
        # Add a length-1 time axis so the batch-first RNN sees a single step.
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden)
        # Drop the time axis again before projecting to vocabulary logits.
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden

# ---------- Seq2Seq ----------
class Seq2Seq(nn.Module):
    """Encoder-decoder model that decodes one target position at a time.

    Args:
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary.
        emb_dim: Embedding dimension used by both encoder and decoder.
        hid_dim: Hidden size used by both encoder and decoder.
        n_layers: Number of recurrent layers in both encoder and decoder.
        dropout: Dropout value passed to both encoder and decoder.
        cell_type: Recurrent cell type passed to both encoder and decoder.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, emb_dim=128, hid_dim=256, n_layers=2, dropout=0.3, cell_type="lstm"):
        super().__init__()
        self.encoder = Encoder(src_vocab_size, emb_dim, hid_dim, n_layers, dropout, cell_type)
        self.decoder = Decoder(tgt_vocab_size, emb_dim, hid_dim, n_layers, dropout, cell_type)

    def forward(self, src, src_len, tgt, teacher_forcing_ratio=0.7):
        """Compute logits for every target position.

        Args:
            src: Source index tensor, batch first.
            src_len: Unpadded lengths of the rows of ``src``.
            tgt: Target index tensor, batch first; column 0 is fed to the
                decoder as the first input.
            teacher_forcing_ratio: Probability, drawn independently at each
                step, of feeding the ground-truth token instead of the
                model's own prediction. Forced to 0 outside training mode.

        Returns:
            Tensor of shape ``(batch, tgt_len, vocab)``. Position 0 is never
            written and stays all zeros.
        """
        if not self.training:
            teacher_forcing_ratio = 0.0

        batch_size, tgt_len = tgt.size()
        # Allocate on the same device as the inputs rather than on a
        # module-level global, so the model works wherever its data lives.
        outputs = torch.zeros(
            batch_size, tgt_len, self.decoder.fc.out_features, device=tgt.device
        )
        hidden = self.encoder(src, src_len)

        input = tgt[:, 0]
        for t in range(1, tgt_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1

        return outputs

# ---------- Training ----------
def train_model(model, train_loader, dev_loader, optimizer, loss_fn, epochs=30):
    """Train ``model`` for ``epochs`` epochs, then evaluate once on the dev set.

    After each epoch the mean batch loss and the token-level accuracy over
    non-padding target positions are printed. Target position 0 is excluded
    from both loss and accuracy.

    Args:
        model: Callable as ``model(src, src_len, tgt)`` returning logits of
            shape ``(batch, tgt_len, vocab)``.
        train_loader: Iterable of ``(src, tgt, src_len, tgt_len)`` batches.
        dev_loader: Loader passed to ``evaluate_model`` after training.
        optimizer: Optimizer over the model's parameters.
        loss_fn: Loss taking ``(logits, targets)`` flattened over positions.
        epochs: Number of passes over ``train_loader``.
    """
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        total_correct = 0
        total_tokens = 0
        num_batches = 0

        for src, tgt, src_len, _tgt_len in train_loader:
            optimizer.zero_grad()
            output = model(src, src_len, tgt)
            # Skip position 0: it is the decoder's first input, not a prediction.
            output_flat = output[:, 1:].reshape(-1, output.shape[-1])
            tgt_flat = tgt[:, 1:].reshape(-1)

            loss = loss_fn(output_flat, tgt_flat)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

            pred = output_flat.argmax(1)
            # Index 0 is padding; it must not count towards accuracy.
            mask = tgt_flat != 0
            total_correct += (pred == tgt_flat).masked_select(mask).sum().item()
            total_tokens += mask.sum().item()

        # Guard the averages so an empty loader (or a loader whose targets are
        # all padding) reports zeros instead of raising ZeroDivisionError.
        avg_loss = total_loss / num_batches if num_batches else 0.0
        acc = 100 * total_correct / total_tokens if total_tokens else 0.0
        print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}, Accuracy = {acc:.2f}%")

    print("\nEvaluating on Dev Set:")
    evaluate_model(model, dev_loader, loss_fn)

# ---------- Evaluation ----------
def evaluate_model(model, loader, loss_fn):
    """Evaluate ``model`` on ``loader`` and print/return loss and accuracy.

    The model is switched to eval mode and run without gradient tracking.
    Accuracy is token-level over non-padding target positions; target
    position 0 is excluded from both loss and accuracy.

    Args:
        model: Callable as ``model(src, src_len, tgt)`` returning logits of
            shape ``(batch, tgt_len, vocab)``.
        loader: Iterable of ``(src, tgt, src_len, tgt_len)`` batches.
        loss_fn: Loss taking ``(logits, targets)`` flattened over positions.

    Returns:
        tuple[float, float]: ``(mean batch loss, accuracy in percent)``. Both
        are ``0.0`` when there is nothing to evaluate.
    """
    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0
    num_batches = 0
    with torch.no_grad():
        for src, tgt, src_len, _tgt_len in loader:
            output = model(src, src_len, tgt)
            # Skip position 0: it is the decoder's first input, not a prediction.
            output_flat = output[:, 1:].reshape(-1, output.shape[-1])
            tgt_flat = tgt[:, 1:].reshape(-1)

            loss = loss_fn(output_flat, tgt_flat)
            total_loss += loss.item()
            num_batches += 1

            pred = output_flat.argmax(1)
            # Index 0 is padding; it must not count towards accuracy.
            mask = tgt_flat != 0
            total_correct += (pred == tgt_flat).masked_select(mask).sum().item()
            total_tokens += mask.sum().item()

    # Guard the averages so an empty loader (or targets that are all padding)
    # reports zeros instead of raising ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else 0.0
    acc = 100 * total_correct / total_tokens if total_tokens else 0.0
    print(f"Test Loss = {avg_loss:.4f}, Test Accuracy = {acc:.2f}%")
    return avg_loss, acc

# ---------- Inference ----------
def transliterate(model, word, src_vocab, tgt_vocab, max_len=30):
    """Greedily decode the transliteration of a single word.

    The model is put into eval mode. Decoding starts from ``<sos>`` and stops
    at the first predicted ``<eos>`` or after ``max_len`` steps.

    Args:
        model: Object exposing ``encoder(src, src_len)`` and
            ``decoder(input, hidden)``.
        word: Source string to transliterate.
        src_vocab: Vocabulary providing ``encode`` for the source side.
        tgt_vocab: Vocabulary providing ``stoi`` and ``decode`` for the
            target side.
        max_len: Maximum number of decoding steps.

    Returns:
        str: ``tgt_vocab.decode`` applied to the predicted indices.
    """
    model.eval()
    # Run on whatever device the model's weights live on instead of relying on
    # a module-level global; a parameter-free model falls back to the CPU.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    eos_idx = tgt_vocab.stoi["<eos>"]
    src_encoded = torch.tensor([src_vocab.encode(word)], device=model_device)
    src_len = torch.tensor([len(src_encoded[0])], device=model_device)
    with torch.no_grad():
        hidden = model.encoder(src_encoded, src_len)
        input = torch.tensor([tgt_vocab.stoi["<sos>"]], device=model_device)
        result = []
        for _ in range(max_len):
            output, hidden = model.decoder(input, hidden)
            top1 = output.argmax(1).item()
            if top1 == eos_idx:
                break
            result.append(top1)
            # Feed the prediction back in as the next decoder input.
            input = torch.tensor([top1], device=model_device)
    return tgt_vocab.decode(result)

# ---------- Main ----------
if __name__ == "__main__":
    # Seed PyTorch's RNG so weight initialisation, batch shuffling and the
    # teacher-forcing draws are repeatable between runs. (GPU kernels may
    # still introduce small non-determinism.)
    torch.manual_seed(42)

    # Single place to change where the TSV splits are read from.
    data_dir = "/content/drive/MyDrive"
    train_data = read_tsv(f"{data_dir}/te.translit.sampled.train.tsv")
    dev_data = read_tsv(f"{data_dir}/te.translit.sampled.dev.tsv")
    test_data = read_tsv(f"{data_dir}/te.translit.sampled.test.tsv")

    # Vocabularies are built from the training split only.
    src_vocab = CharVocab([src for src, _ in train_data])
    tgt_vocab = CharVocab([tgt for _, tgt in train_data])

    train_ds = TransliterationDataset(train_data, src_vocab, tgt_vocab)
    dev_ds = TransliterationDataset(dev_data, src_vocab, tgt_vocab)
    test_ds = TransliterationDataset(test_data, src_vocab, tgt_vocab)

    # Only the training loader is shuffled; dev/test keep file order.
    train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_ds, batch_size=128, collate_fn=collate_fn)
    test_loader = DataLoader(test_ds, batch_size=128, collate_fn=collate_fn)

    model = Seq2Seq(len(src_vocab), len(tgt_vocab), emb_dim=128, hid_dim=256, n_layers=2, dropout=0.3, cell_type="lstm").to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # Index 0 is the padding token, so padded positions do not contribute to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=0)

    train_model(model, train_loader, dev_loader, optimizer, loss_fn, epochs=30)

    print("\nEvaluating on Test Set:")
    evaluate_model(model, test_loader, loss_fn)

    print("\nSample Predictions:")
    for src, tgt in test_data[:5]:
        prediction = transliterate(model, src, src_vocab, tgt_vocab)
        print(f"Latin: {src} => Predicted Telugu: {prediction} | Actual: {tgt}")

Epoch 1: Loss = 1.8715, Accuracy = 49.28%
Epoch 2: Loss = 0.6288, Accuracy = 82.10%
Epoch 3: Loss = 0.4103, Accuracy = 88.44%
Epoch 4: Loss = 0.3138, Accuracy = 91.28%
Epoch 5: Loss = 0.2513, Accuracy = 93.04%
Epoch 6: Loss = 0.2125, Accuracy = 94.17%
Epoch 7: Loss = 0.1790, Accuracy = 95.06%
Epoch 8: Loss = 0.1548, Accuracy = 95.74%
Epoch 9: Loss = 0.1373, Accuracy = 96.28%
Epoch 10: Loss = 0.1210, Accuracy = 96.72%
Epoch 11: Loss = 0.1092, Accuracy = 97.04%
Epoch 12: Loss = 0.0990, Accuracy = 97.32%
Epoch 13: Loss = 0.0882, Accuracy = 97.58%
Epoch 14: Loss = 0.0823, Accuracy = 97.71%
Epoch 15: Loss = 0.0750, Accuracy = 97.94%
Epoch 16: Loss = 0.0715, Accuracy = 98.01%
Epoch 17: Loss = 0.0678, Accuracy = 98.10%
Epoch 18: Loss = 0.0619, Accuracy = 98.29%
Epoch 19: Loss = 0.0590, Accuracy = 98.35%
Epoch 20: Loss = 0.0545, Accuracy = 98.47%
Epoch 21: Loss = 0.0532, Accuracy = 98.49%
Epoch 22: Loss = 0.0522, Accuracy = 98.52%
Epoch 23: Loss = 0.0487, Accuracy = 98.63%
Epoch 24: Loss = 0.0