GRU Training

In [1]:
import pandas as pd 
import numpy as np

Loading Preprocessed Data

In [2]:
# Read the cleaned dataset from disk (path is relative to the notebook's
# working directory) and preview two randomly chosen rows.
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_dataset.csv")
df.sample(n=2)

Unnamed: 0,src_tokens,trg_tokens,src_ids,trg_ids,src_len,trg_len
76253,"['▁you', '▁make', '▁a', '▁nice', '▁couple', '.']","['▁ihr', '▁gebt', '▁ein', '▁schönes', '▁paar',...","[2, 21, 390, 9, 864, 1704, 4, 3]","[2, 105, 2961, 41, 2571, 701, 266, 4, 3]",8,9
256464,"['▁she', '▁was', '▁anxious', '▁to', '▁know', '...","['▁ängstlich', '▁wartete', '▁sie', '▁auf', '▁d...","[2, 148, 74, 3585, 10, 121, 23, 3554, 1150, 26...","[2, 10006, 2580, 54, 94, 52, 9103, 80, 7419, 4...",12,11


In [5]:
# Show the (rows, columns) dimensions of the loaded frame
df.shape

(277891, 6)

In [3]:
# Convert string representation of lists into actual lists.
# Values that are already lists are passed through untouched, so this cell can
# be re-run on the same kernel: ast.literal_eval only accepts strings / AST
# nodes and raises ValueError when handed an already-parsed list.
import ast


def parse_id_list(value):
    """Return `value` as a list, parsing it with ast.literal_eval unless it already is one.

    Anything that is not a list is still handed to ast.literal_eval, so
    unexpected cell contents raise exactly as they did before.
    """
    return value if isinstance(value, list) else ast.literal_eval(value)


df["src_ids"] = df["src_ids"].apply(parse_id_list)
df["trg_ids"] = df["trg_ids"].apply(parse_id_list)

In [4]:
# !pip install scikit-learn

In [6]:
from sklearn.model_selection import train_test_split

# Pull the two token-id columns out of the frame as plain Python lists
src_data, trg_data = (df[name].tolist() for name in ("src_ids", "trg_ids"))

# Hold out 10% of the sentence pairs for validation; the fixed random_state
# keeps the partition identical from run to run.
src_train, src_val, trg_train, trg_val = train_test_split(
    src_data,
    trg_data,
    random_state=42,
    test_size=0.1,
)


In [7]:
# Sizes of the four splits, in the order: src train, src val, trg train, trg val
tuple(len(split) for split in (src_train, src_val, trg_train, trg_val))

(250101, 27790, 250101, 27790)

In [8]:
# !pip install torch

Creating Datasets and DataLoaders using PyTorch

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

In [11]:
class TranslationDataset(Dataset):
    """Map-style dataset over two index-aligned collections of token-id sequences.

    Args:
        src_data: Indexable collection of source sequences (each a sequence of
            integer token ids).
        trg_data: Indexable collection of target sequences, aligned
            index-by-index with ``src_data``.

    Raises:
        ValueError: If ``src_data`` and ``trg_data`` do not have the same length.
    """

    def __init__(self, src_data, trg_data):
        # Fail fast on misaligned inputs: __len__ is driven by src_data alone,
        # so a shorter trg_data would only surface as an IndexError part-way
        # through an epoch and a longer one would silently lose its tail.
        if len(src_data) != len(trg_data):
            raise ValueError(
                f"src_data and trg_data must have the same length, "
                f"got {len(src_data)} and {len(trg_data)}"
            )
        self.src_data = src_data
        self.trg_data = trg_data

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.src_data)

    def __getitem__(self, idx):
        """Return pair ``idx`` as a ``(src, trg)`` tuple of ``torch.long`` tensors."""
        src_tensor = torch.tensor(self.src_data[idx], dtype=torch.long)
        trg_tensor = torch.tensor(self.trg_data[idx], dtype=torch.long)
        return src_tensor, trg_tensor


In [12]:
# Collate function to pad sequences
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Turn a list of ``(src, trg)`` tensor pairs into two padded batch tensors.

    Each side is right-padded with 0 up to the longest sequence on that side of
    this batch, and the batch dimension comes first.

    Returns:
        Tuple ``(src_batch, trg_batch)``.
    """
    sources, targets = zip(*batch)
    padded = tuple(
        pad_sequence(sequences, batch_first=True, padding_value=0)
        for sequences in (sources, targets)
    )
    return padded

In [13]:
# Wrap each split so it can be indexed as (src, trg) tensor pairs
train_dataset = TranslationDataset(src_data=src_train, trg_data=trg_train)
val_dataset = TranslationDataset(src_data=src_val, trg_data=trg_val)

# Batch iterators: the training set is reshuffled on every pass, the validation
# set keeps its order; both pad their batches through collate_fn.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)


In [20]:
import torch.nn as nn

from encoder_rnn import Encoder 
from decoder_rnn import Decoder
from seq2seq_rnn import Seq2Seq


In [15]:
import torch
# Report the installed PyTorch build and the CUDA / cuDNN versions it reports,
# one value per line.
print(torch.__version__, torch.version.cuda, torch.backends.cudnn.version(), sep="\n")
# Confirm whether a GPU is visible to this process, and how many
print("CUDA available:", torch.cuda.is_available())
print("Device count:", torch.cuda.device_count())


2.7.1+cu118
11.8
90100
CUDA available: True
Device count: 1


In [16]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [17]:
# !nvidia-smi


In [None]:
# this is for installing PyTorch with CUDA 11.8 support to ensure compatibility with the GPU

# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --upgrade


In [21]:
import torch.optim as optim

# Define parameters
SEED = 42           # same value the train/validation split uses as random_state
INPUT_DIM = 16000   # size of src vocab
OUTPUT_DIM = 16000  # size of trg vocab
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 512

# Seed torch's global RNG before any module is built so that random draws made
# through it are repeatable after a kernel restart (presumably this includes the
# Encoder/Decoder weight initialisation -- their code lives in separate modules).
torch.manual_seed(SEED)

# Build the encoder/decoder pair and wrap them in the seq2seq model on `device`
encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM).to(device)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)

# Adam with its default hyper-parameters; target positions equal to 0 are
# excluded from the loss.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=0)  # 0 = padding


Training Loop

In [33]:
from tqdm import tqdm, trange


def train(model, dataloader, optimizer, criterion, clip=1):
    """Run one optimisation pass over `dataloader` and return the mean batch loss.

    Args:
        model: Callable as ``model(src, trg)``; its output is indexed as
            ``output[:, 1:]`` with the class scores on the last axis.
        dataloader: Iterable of ``(src, trg)`` batches that also supports ``len``.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(scores, targets)``.
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0

    for src_batch, trg_batch in tqdm(dataloader, desc="Training", leave=False):
        # Batches are moved to the notebook-level `device`
        src_batch = src_batch.to(device)
        trg_batch = trg_batch.to(device)

        optimizer.zero_grad()
        logits = model(src_batch, trg_batch)

        # Drop index 0 along dim 1 on both sides (presumably the start-of-sequence
        # step) and flatten so every remaining position is one classification.
        vocab_size = logits.shape[-1]
        flat_logits = logits[:, 1:].reshape(-1, vocab_size)
        flat_targets = trg_batch[:, 1:].reshape(-1)

        batch_loss = criterion(flat_logits, flat_targets)
        batch_loss.backward()
        # Rescale gradients so their total norm does not exceed `clip`
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)


In [34]:
def evaluate(model, dataloader, criterion):
    """Compute the mean batch loss over `dataloader` without updating the model.

    Args:
        model: Callable as ``model(src, trg, teacher_forcing_ratio=...)``.
        dataloader: Iterable of ``(src, trg)`` batches that also supports ``len``.
        criterion: Loss called as ``criterion(scores, targets)``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0

    # Gradient tracking is disabled for the whole pass
    with torch.no_grad():
        for src_batch, trg_batch in dataloader:
            src_batch = src_batch.to(device)
            trg_batch = trg_batch.to(device)

            # teacher_forcing_ratio=0.0: presumably the decoder consumes its own
            # predictions here -- Seq2Seq is defined in another module.
            logits = model(src_batch, trg_batch, teacher_forcing_ratio=0.0)

            # Same slicing/flattening as in training: skip index 0 along dim 1
            vocab_size = logits.shape[-1]
            flat_logits = logits[:, 1:].reshape(-1, vocab_size)
            flat_targets = trg_batch[:, 1:].reshape(-1)

            running_loss += criterion(flat_logits, flat_targets).item()

    return running_loss / len(dataloader)


In [35]:
# !pip install tqdm

In [36]:


N_EPOCHS = 15
BEST_MODEL_PATH = "seq2seq_gru_model_best.pt"

# Lowest validation loss seen so far. In the recorded run below, validation loss
# bottoms out at epoch 4 (2.9801) and then climbs to 3.1903 by epoch 15 while the
# training loss keeps falling, so the weights left in `model` after the last
# epoch are not the best-generalising ones. Checkpointing on every improvement
# keeps the best weights on disk.
best_val_loss = float("inf")

for epoch in trange(N_EPOCHS, desc="Epochs"):
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)
    print(f"Epoch {epoch+1}/{N_EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    # Persist the weights whenever validation loss improves
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), BEST_MODEL_PATH)


Epochs:   7%|▋         | 1/15 [09:32<2:13:32, 572.32s/it]

Epoch 1/15 | Train Loss: 3.7259 | Val Loss: 3.4425


Epochs:  13%|█▎        | 2/15 [18:25<1:58:59, 549.18s/it]

Epoch 2/15 | Train Loss: 2.5736 | Val Loss: 3.1005


Epochs:  20%|██        | 3/15 [31:11<2:09:39, 648.25s/it]

Epoch 3/15 | Train Loss: 2.1860 | Val Loss: 3.0228


Epochs:  27%|██▋       | 4/15 [44:52<2:11:20, 716.45s/it]

Epoch 4/15 | Train Loss: 1.9772 | Val Loss: 2.9801


Epochs:  33%|███▎      | 5/15 [1:00:21<2:12:09, 792.94s/it]

Epoch 5/15 | Train Loss: 1.8238 | Val Loss: 2.9813


Epochs:  40%|████      | 6/15 [1:15:06<2:03:39, 824.40s/it]

Epoch 6/15 | Train Loss: 1.7200 | Val Loss: 3.0223


Epochs:  47%|████▋     | 7/15 [1:28:43<1:49:35, 821.95s/it]

Epoch 7/15 | Train Loss: 1.6424 | Val Loss: 3.0367


Epochs:  53%|█████▎    | 8/15 [1:40:09<1:30:50, 778.68s/it]

Epoch 8/15 | Train Loss: 1.5718 | Val Loss: 3.0377


Epochs:  60%|██████    | 9/15 [1:55:06<1:21:33, 815.61s/it]

Epoch 9/15 | Train Loss: 1.5219 | Val Loss: 3.0705


Epochs:  67%|██████▋   | 10/15 [2:09:58<1:09:55, 839.16s/it]

Epoch 10/15 | Train Loss: 1.4824 | Val Loss: 3.0897


Epochs:  73%|███████▎  | 11/15 [2:24:36<56:44, 851.16s/it]  

Epoch 11/15 | Train Loss: 1.4391 | Val Loss: 3.1301


Epochs:  80%|████████  | 12/15 [2:35:03<39:08, 782.95s/it]

Epoch 12/15 | Train Loss: 1.4164 | Val Loss: 3.1278


Epochs:  87%|████████▋ | 13/15 [2:44:02<23:37, 708.96s/it]

Epoch 13/15 | Train Loss: 1.3853 | Val Loss: 3.1562


Epochs:  93%|█████████▎| 14/15 [2:53:33<11:07, 667.43s/it]

Epoch 14/15 | Train Loss: 1.3603 | Val Loss: 3.1847


Epochs: 100%|██████████| 15/15 [3:04:32<00:00, 738.18s/it]

Epoch 15/15 | Train Loss: 1.3317 | Val Loss: 3.1903





Model Saving


In [37]:
# Write the model's current parameters (its state_dict) to disk
torch.save(obj=model.state_dict(), f='seq2seq_gru_model.pt')
