GRU Training

In [104]:
import pandas as pd 
import numpy as np

Loading Preprocessed Data

In [105]:
# Loading cleaned dataset
df = pd.read_csv("../data/processed/cleaned_dataset_eng_npi.csv")
df.sample(2)

Unnamed: 0,src_tokens,trg_tokens,src_tokens_word,trg_tokens_word,src_ids,trg_ids,src_ids_word,trg_ids_word,src_len,trg_len,src_len_word,trg_len_word
447,"['▁i', '▁was', '▁hurt', '▁by', '▁tom', '.']","['▁टमले', '▁मलाई', '▁चोट', '▁पुर्यायो', '।']","['▁i', '▁was', '▁hurt', '▁by', '▁tom.']","['▁टमले', '▁मलाई', '▁चोट', '▁पुर्यायो।']","[2, 8, 74, 1033, 269, 22, 4, 3]","[2, 52, 49, 1724, 2780, 4, 3]","[2, 4, 19, 533, 82, 63, 3]","[2, 8, 6, 681, 1128, 3]",8,7,7,6
145,"['▁he', '▁drank', '▁beer', '.']","['▁उनले', '▁बियर', '▁पि', 'ए', '▁', '।']","['▁he', '▁drank', '▁beer.']","['▁उनले', '▁बियर', '▁पिए', '▁।']","[2, 46, 2258, 1270, 4, 3]","[2, 868, 685, 247, 2955, 2926, 4, 3]","[2, 18, 1133, 968, 3]","[2, 253, 243, 2427, 14, 3]",6,8,5,6


In [106]:
df.shape

(2689, 12)

In [None]:
# Convert string representation of lists into actual lists
import ast
df["src_ids"] = df["src_ids"].apply(ast.literal_eval)
df["trg_ids"] = df["trg_ids"].apply(ast.literal_eval)



In [108]:
# !pip install scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

# Convert to lists
src_data = df["src_ids"].tolist()
trg_data = df["trg_ids"].tolist()



# Split into train and validation (e.g., 90% train, 10% validation)
src_train, src_val, trg_train, trg_val = train_test_split(src_data, trg_data, test_size=0.1, random_state=42)


In [110]:
len(src_train), len(src_val), len(trg_train), len(trg_val)


(2420, 269, 2420, 269)

In [111]:
# !pip install torch

Creating Datasets and Dataloader using pyTorch

In [112]:
import torch
from torch.utils.data import Dataset, DataLoader

In [113]:
class TranslationDataset(Dataset):
    def __init__(self, src_data, trg_data):
        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        return len(self.src_data)

    def __getitem__(self, idx):
        return torch.tensor(self.src_data[idx], dtype=torch.long), torch.tensor(self.trg_data[idx], dtype=torch.long)


In [114]:
# Collate function to pad sequences
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    src_batch, trg_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, padding_value=0, batch_first=True)
    trg_batch = pad_sequence(trg_batch, padding_value=0, batch_first=True)
    return src_batch, trg_batch

In [None]:
# Create Dataset objects
train_dataset = TranslationDataset(src_train, trg_train)
val_dataset = TranslationDataset(src_val, trg_val)


# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)




In [119]:
import torch.nn as nn

from encoder_rnn import Encoder 
from decoder_rnn import Decoder
from seq2seq_rnn import Seq2Seq


In [120]:
import torch
print(torch.__version__)
print(torch.version.cuda)
print(torch.backends.cudnn.version())
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())


2.7.1+cu118
11.8
90100
CUDA available: True
Device count: 1


In [121]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [34]:
# !nvidia-smi


In [35]:
# this is for installing PyTorch with CUDA 11.8 support to ensure compatibility with the GPU

# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --upgrade


In [None]:
import torch.optim as optim

# Define parameters
INPUT_DIM = 4000   # size of src vocab
OUTPUT_DIM = 4000  # size of trg vocab
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 512



encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM).to(device)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM).to(device)

# Initialize Seq2Seq models
model = Seq2Seq(encoder, decoder, device).to(device)


optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=0)  # 0 = padding


Training Loop

In [128]:
from tqdm import tqdm, trange


def train(model, dataloader, optimizer, criterion, clip=1):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)
    for src, trg in progress_bar:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


In [129]:
def evaluate(model, dataloader, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, trg in dataloader:
            src, trg = src.to(device), trg.to(device)
            output = model(src, trg, teacher_forcing_ratio=0.0)

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


In [130]:
# !pip install tqdm

In [131]:


N_EPOCHS = 15
for epoch in trange(N_EPOCHS, desc="Epochs"):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch+1}/{N_EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


Epochs:   7%|▋         | 1/15 [00:04<00:58,  4.19s/it]

Epoch 1/15 | Train Loss: 5.9219 | Val Loss: 5.5481


Epochs:  13%|█▎        | 2/15 [00:08<00:52,  4.05s/it]

Epoch 2/15 | Train Loss: 5.1054 | Val Loss: 5.3484


Epochs:  20%|██        | 3/15 [00:11<00:44,  3.75s/it]

Epoch 3/15 | Train Loss: 4.7497 | Val Loss: 5.2040


Epochs:  27%|██▋       | 4/15 [00:14<00:39,  3.62s/it]

Epoch 4/15 | Train Loss: 4.4164 | Val Loss: 5.1312


Epochs:  33%|███▎      | 5/15 [00:19<00:38,  3.88s/it]

Epoch 5/15 | Train Loss: 4.1272 | Val Loss: 5.0921


Epochs:  40%|████      | 6/15 [00:22<00:33,  3.68s/it]

Epoch 6/15 | Train Loss: 3.8313 | Val Loss: 5.0842


Epochs:  47%|████▋     | 7/15 [00:25<00:28,  3.58s/it]

Epoch 7/15 | Train Loss: 3.5651 | Val Loss: 5.1124


Epochs:  53%|█████▎    | 8/15 [00:29<00:24,  3.50s/it]

Epoch 8/15 | Train Loss: 3.3277 | Val Loss: 5.0628


Epochs:  60%|██████    | 9/15 [00:32<00:20,  3.41s/it]

Epoch 9/15 | Train Loss: 3.0755 | Val Loss: 5.0570


Epochs:  67%|██████▋   | 10/15 [00:35<00:16,  3.36s/it]

Epoch 10/15 | Train Loss: 2.8181 | Val Loss: 5.0677


Epochs:  73%|███████▎  | 11/15 [00:38<00:13,  3.30s/it]

Epoch 11/15 | Train Loss: 2.5926 | Val Loss: 5.1600


Epochs:  80%|████████  | 12/15 [00:42<00:09,  3.29s/it]

Epoch 12/15 | Train Loss: 2.3460 | Val Loss: 5.1758


Epochs:  87%|████████▋ | 13/15 [00:45<00:06,  3.30s/it]

Epoch 13/15 | Train Loss: 2.1133 | Val Loss: 5.1825


Epochs:  93%|█████████▎| 14/15 [00:48<00:03,  3.34s/it]

Epoch 14/15 | Train Loss: 1.9015 | Val Loss: 5.2287


Epochs: 100%|██████████| 15/15 [00:52<00:00,  3.47s/it]

Epoch 15/15 | Train Loss: 1.6928 | Val Loss: 5.2906





Model Saving


In [None]:
torch.save(model.state_dict(), 'seq2seq_gru_eng_npi_model.pt')

