In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
from transformer.modules import TransformerModule
from transformer.tokenizer import get_tokenizer, load_tokenizer
from transformer.dataset import DynamicBatchTranslationDataset
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm

### Dataset

In [2]:
# Load the WMT14 German-English training pairs from disk.
train_csv_path = "data/wmt14_translate_de-en_train.csv"
dataset = pd.read_csv(train_csv_path, lineterminator="\n")

In [3]:
# Preview the first rows; note the extra "Unnamed: 0" column read in alongside "de" and "en".
dataset.head()

Unnamed: 0,de,en
0,An der B 211 befindet sich in Loyermoor der so...,Here the largest town of the district is locat...
1,Ich begrüße die Erklärung des Herrn Kommissar ...,"I should like, in passing, to pay tribute to t..."
2,"Das ist das Gegenteil von dem, was getan werde...",That is the opposite of what should be done an...
3,.,.
4,The Ethnographical museum in Varna is in a hou...,It was designed by the Viennese architect Rupp...


In [4]:
# Column names of the language pair: "de" is the source side, "en" the target side.
source_lang, target_lang = "de", "en"
# Number of sentence pairs loaded.
len(dataset)

4508785

### Partitioning the Dataset

In [5]:
# Whitespace-delimited word count of every source-language sentence.
dataset["word_count"] = dataset[source_lang].map(lambda sentence: len(sentence.split()))

In [6]:
# Sentence-length histogram in roughly 10-word buckets (observed min = 1, max = 2937).
bucket_count = 2937 // 10
dataset["word_count"].value_counts(bins=bucket_count, ascending=False)

(11.02, 21.041]                 1739900
(21.041, 31.061]                1033768
(-1.9369999999999998, 11.02]     969093
(31.061, 41.082]                 455500
(41.082, 51.102]                 186776
                                 ...   
(1534.133, 1544.154]                  0
(1544.154, 1554.174]                  0
(1554.174, 1564.195]                  0
(1564.195, 1574.215]                  0
(1463.99, 1474.01]                    0
Name: count, Length: 293, dtype: int64

In [7]:
# How many sentences have at most 10 words (True) versus more than 10 (False).
(dataset["word_count"] <= 10).value_counts()

word_count
False    3723814
True      784971
Name: count, dtype: int64

In [8]:
## Keep only sentences of at most 10 words, ordered from shortest to longest
is_short = dataset["word_count"] <= 10
dataset = (
    dataset.loc[is_short]
    .sort_values(by="word_count", ascending=True)
    .drop(columns="word_count")
)

In [9]:
### Dropping duplicate sentence pairs
# Compare only the two text columns. The frame still carries the "Unnamed: 0"
# column read from the CSV, which differs on every row shown in the preview, so
# a whole-row comparison would treat repeated sentence pairs as distinct.
dataset = dataset.drop_duplicates(subset=[source_lang, target_lang])

Sentences -> Tokens (each sentence is split into its constituent sub-word units) -> Embeddings (each token is mapped to an embedding vector)

### Tokenizers

In [25]:
import pandas as pd

from tokenizers import normalizers, Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import NFD, Lowercase
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE

In [31]:
def get_tokenizer(
        ds: pd.DataFrame,
        lang: list,
        tokenizer_path: str,
        vocab_size: int = 32000
    ):
    """Train a single BPE tokenizer shared by several text columns and save it.

    The text of every column named in ``lang`` is pooled, de-duplicated and used
    to train one byte-pair-encoding vocabulary. Text is NFD-normalised and
    lower-cased, then split with the ``Whitespace`` pre-tokenizer.

    Args:
        ds: Frame holding one text column per language.
        lang: Names of the columns to pool for training.
        tokenizer_path: File the trained tokenizer is written to (JSON).
        vocab_size: Target vocabulary size, special tokens included.

    Returns:
        The trained ``Tokenizer``. ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``
        are registered as special tokens, in that order.

    Raises:
        AssertionError: If a name in ``lang`` is not a column of ``ds``.
    """
    for l in lang:
        assert l in ds.columns, f"{l} should be a column in the dataset"

    # Create the output directory up front so a missing folder cannot fail the
    # final save after the (long) training run has already completed.
    parent_dir = os.path.dirname(tokenizer_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Drop missing values before the string cast; otherwise they would be
    # trained on as the literal text "nan".
    texts = pd.concat([ds[l] for l in lang]).dropna().astype(str).unique().tolist()

    tokenizer = Tokenizer(BPE(unk_token="<unk>"))
    tokenizer.normalizer = normalizers.Sequence([
        NFD(),
        Lowercase()
    ])
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<pad>", "<sos>", "<eos>", "<unk>"]
    )
    tokenizer.train_from_iterator(texts, trainer=trainer)
    tokenizer.save(tokenizer_path)
    print(f"Shared tokenizer saved at: {tokenizer_path}")

    return tokenizer

In [32]:
def load_tokenizer(tokenizer_path: str):
    """Read a previously saved tokenizer back from ``tokenizer_path``."""
    restored = Tokenizer.from_file(tokenizer_path)
    return restored

### Creating Tokenizers

In [38]:
# Train the tokenizer shared by both languages on the filtered frame and save it.
shared_tokenizer_path = "tokenizer/en_de_32000.json"
tokenizer = get_tokenizer(
    ds=dataset,
    lang=[source_lang, target_lang],
    tokenizer_path=shared_tokenizer_path,
)




Shared tokenizer saved at: tokenizer/en_de_32000.json


In [34]:
# Encode a sample that mixes special tokens, words and digits to inspect the ids.
sample_text = "<sos> neko is cat. 989898 6767 <eos> <pad>"
tokenizer.encode(sample_text).ids

[1, 1135, 1507, 1079, 3112, 17, 4212, 4212, 4212, 9376, 9376, 2, 0]

In [35]:
# Turn the ids of the encoded sample back into text.
sample_ids = [1, 1135, 1507, 1079, 3112, 17, 4212, 4212, 4212, 9376, 9376, 2, 0]
tokenizer.decode(sample_ids)

'ne ko is cat . 98 98 98 67 67'

In [37]:
# Decode a single id to see which sub-word piece it stands for.
tokenizer.decode([1135])

'ne'

### Prepare Dataset using `Dataset`

In [4]:
class TranslationDataset(Dataset):
    """Map-style dataset of fixed-length (source, decoder input, label) id tensors.

    Every item is padded or truncated to exactly ``max_length`` ids:

    * source:        ``<sos> src ... <eos> <pad> ...``
    * decoder input: ``<sos> tgt ... <pad> ...``
    * label:         ``tgt ... <eos> <pad> ...`` (decoder input shifted left by one)
    """

    def __init__(self,
                ds: pd.DataFrame,
                source_lang: str,
                target_lang: str,
                source_tokenizer_path: str,
                target_tokenizer_path: str,
                max_length: int,
            ):
        """Store the frame and load both tokenizers.

        Args:
            ds: Frame with one text column per language.
            source_lang: Name of the source-text column.
            target_lang: Name of the target-text column.
            source_tokenizer_path: Saved tokenizer file for the source text.
            target_tokenizer_path: Saved tokenizer file for the target text.
            max_length: Length every returned tensor is padded/truncated to.
        """
        self.data = ds
        self.max_length = max_length
        self.source_lang = source_lang
        self.target_lang = target_lang
        # Load already-trained tokenizers from disk. get_tokenizer() trains a
        # new tokenizer from a DataFrame and cannot be called with just a path.
        self.source_tokenizer = load_tokenizer(source_tokenizer_path)
        self.target_tokenizer = load_tokenizer(target_tokenizer_path)

        # Stored as id lists so they can be concatenated with token-id lists.
        # NOTE(review): the source tokenizer's special-token ids are applied to
        # the target side as well; this assumes both files share them.
        self.sos_token = self.source_tokenizer.encode("<sos>").ids
        self.eos_token = self.source_tokenizer.encode("<eos>").ids
        self.pad_token = self.source_tokenizer.encode("<pad>").ids

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _pad(self, tokens):
        """Right-pad ``tokens`` with ``<pad>`` to ``max_length`` as an int64 tensor."""
        padded = tokens + self.pad_token * (self.max_length - len(tokens))
        # Build the integer tensor directly instead of casting from float32.
        return torch.tensor(padded, dtype=torch.int64)

    def __getitem__(self, index):
        """Return ``(source, decoder_input, label)`` for the row at ``index``."""
        row = self.data.iloc[index]
        # The source gains two special tokens, each target sequence only one.
        source_ids = self.source_tokenizer.encode(row[self.source_lang]).ids[:self.max_length - 2]
        target_ids = self.target_tokenizer.encode(row[self.target_lang]).ids[:self.max_length - 1]

        source_tokens = self.sos_token + source_ids + self.eos_token
        target_tokens = self.sos_token + target_ids
        # The label is the decoder input shifted left by one: it must not start
        # with <sos>, otherwise position i's label equals position i's input.
        label_tokens = target_ids + self.eos_token

        # source tokens, target tokens, label tokens
        return self._pad(source_tokens), self._pad(target_tokens), self._pad(label_tokens)


#### Train Test Split

In [23]:
# Build the fixed-length dataset from the German/English frame prepared above.
# The frame has "de"/"en" columns and this notebook trains one shared tokenizer,
# so both sides point at that file rather than the old "eng"/"fr" artefacts.
ds = TranslationDataset(
    dataset,
    source_lang,
    target_lang,
    "tokenizer/en_de_32000.json",
    "tokenizer/en_de_32000.json",
    64
)

In [24]:
train_split = 0.7
test_split = 0.2
val_split = 0.1
# A seeded generator makes the partition identical on every kernel restart.
split_generator = torch.Generator().manual_seed(42)
train, test, val = random_split(
    ds,
    [train_split, test_split, val_split],
    generator=split_generator,
)

In [25]:
# Wrap each split in a DataLoader that serves batches of 32 examples.
loader_batch_size = 32
train_dataloader, test_dataloader, val_dataloader = (
    DataLoader(split, batch_size=loader_batch_size) for split in (train, test, val)
)

### Prepare Custom Function for yielding Batches

In [81]:
class DynamicBatchTranslationDataset:
    """Iterable of translation batches padded only to each batch's own width.

    Rows of ``ds`` are consumed in their existing order, ``batch_size`` at a
    time. Each batch is padded to its longest sequence (special tokens
    included), capped at ``max_length``. Every iteration yields three int64
    tensors of shape ``(rows_in_batch, width)``:

    * source:        ``<sos> src ... <eos> <pad> ...``
    * decoder input: ``<sos> tgt ... <pad> ...``
    * label:         ``tgt ... <eos> <pad> ...`` (decoder input shifted left by one)
    """

    def __init__(
            self,
            ds: pd.DataFrame,
            source_lang: str,
            target_lang: str,
            tokenizer_path: str,
            batch_size: int,
            max_length: int,
        ):
        """Store the frame and load the shared tokenizer.

        Args:
            ds: Frame with one text column per language.
            source_lang: Name of the source-text column.
            target_lang: Name of the target-text column.
            tokenizer_path: Saved tokenizer file used for both languages.
            batch_size: Number of rows per yielded batch (the last may be smaller).
            max_length: Upper bound on the width of any yielded tensor.
        """
        self.data = ds
        self.batch_size = batch_size
        self.max_length = max_length
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.tokenizer = load_tokenizer(tokenizer_path)

        # Stored as id lists so they can be concatenated with token-id lists.
        self.sos_token = self.tokenizer.encode("<sos>").ids
        self.eos_token = self.tokenizer.encode("<eos>").ids
        self.pad_token = self.tokenizer.encode("<pad>").ids

    def __len__(self):
        """Return the number of batches one full iteration yields."""
        # Ceiling division: no phantom extra batch when the rows divide evenly.
        return (len(self.data) + self.batch_size - 1) // self.batch_size

    def _encode_column(self, batch, column):
        """Tokenize every sentence of ``column`` in ``batch`` into a list of id lists."""
        return [self.tokenizer.encode(text).ids for text in batch[column]]

    def _pad_and_stack(self, sequences, width):
        """Right-pad each id list to ``width`` and stack them into one int64 tensor."""
        rows = [seq + self.pad_token * (width - len(seq)) for seq in sequences]
        # Build the integer tensor directly instead of casting from float32.
        return torch.tensor(rows, dtype=torch.int64)

    def __iter__(self):
        """Yield ``(source, decoder_input, label)`` tensors batch by batch."""
        for start in range(0, len(self.data), self.batch_size):
            batch = self.data.iloc[start: start + self.batch_size]
            source_ids = self._encode_column(batch, self.source_lang)
            target_ids = self._encode_column(batch, self.target_lang)

            longest = max(
                max(len(ids) for ids in source_ids),
                max(len(ids) for ids in target_ids),
            )
            # Leave room for <sos>/<eos>; sizing the batch by the raw token
            # count would cut the last tokens off its longest sentences even
            # when they fit comfortably under the cap.
            width = min(self.max_length, longest + 2)
            source_room = max(width - 2, 0)
            target_room = max(width - 1, 0)

            sources = [self.sos_token + ids[:source_room] + self.eos_token for ids in source_ids]
            targets = [self.sos_token + ids[:target_room] for ids in target_ids]
            labels = [ids[:target_room] + self.eos_token for ids in target_ids]

            yield (
                self._pad_and_stack(sources, width),
                self._pad_and_stack(targets, width),
                self._pad_and_stack(labels, width),
            )

In [81]:
# Small instance (2 rows per batch, width capped at 64) for trying out the batcher.
data = DynamicBatchTranslationDataset(
    ds=dataset,
    source_lang="de",
    target_lang="en",
    tokenizer_path="tokenizer/en_de_32000.json",
    batch_size=2,
    max_length=64,
)

### Training the Model

In [10]:
# Training batches: 64 sentence pairs each, sequences capped at 128 ids.
train = DynamicBatchTranslationDataset(
    ds=dataset,
    tokenizer_path="tokenizer/en_de_32000.json",
    source_lang="de",
    target_lang="en",
    max_length=128,
    batch_size=64,
)

In [11]:
# vocab_size equals the 32000-entry tokenizer vocabulary and max_token_length
# equals the max_length of the training batches.
model = TransformerModule(
    vocab_size=32000,
    max_token_length=128,
    dim=256,
    num_layers=6,
    num_heads=8,
    dropout=0.1,
)

In [65]:
class CustomSchedule(torch.optim.lr_scheduler.LambdaLR):
    """Warm-up then inverse-square-root decay, expressed as an LR multiplier.

    The multiplier is ``d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``.
    """

    def __init__(self, optimizer, d_model, warmup_steps=4000):
        # Both attributes are assigned before the base constructor is handed
        # ``self.lr_lambda``, which reads them.
        self.d_model = d_model
        self.warmup_steps = warmup_steps
        super().__init__(optimizer, lr_lambda=self.lr_lambda)

    def lr_lambda(self, step):
        """Return the multiplier for ``step``; steps below 1 are treated as 1."""
        current = step if step >= 1 else 1
        decay_term = current ** -0.5
        warmup_term = current * (self.warmup_steps ** -1.5)
        return (self.d_model ** -0.5) * min(decay_term, warmup_term)


tokenizer = load_tokenizer("tokenizer/en_de_32000.json")

# Padding positions are excluded from the loss.
pad_token_id = tokenizer.token_to_id("<pad>")
loss = nn.CrossEntropyLoss(ignore_index=pad_token_id, label_smoothing=0.1)

# The optimizer's base lr is 1, so the schedule's multiplier is the effective learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=1, betas=(0.9, 0.98), eps=1e-9)
scheduler = CustomSchedule(optimizer, 256, 800)

# state = torch.load("./checkpoints/de_en_run3/de_en_checkpoint-19")
# model.load_state_dict(state['model_state_dict'])
# optimizer.load_state_dict(state['optimizer_state_dict'])
# scheduler.load_state_dict(state["scheduler_state_dict"])

In [None]:
save_dir = "./checkpoints/de_en_run3"
if not os.path.exists(save_dir):
    print(f"Directory {save_dir} does not exist. Please create it before starting training.")
else:
    # NOTE(review): the epoch range starts at 20 while the checkpoint-restore
    # lines in the setup cell are commented out; confirm a fresh start is intended.
    for epoch in range(20, 21):
        model.train()
        batch_iter = tqdm(train, desc=f"Processing Epoch: {epoch:02d}")
        for source_ids, target_ids, label in batch_iter:
            output = model(source_ids, target_ids)

            # Flatten to (tokens, classes) for the loss. The class count is read
            # from the output itself so it cannot drift from the model's
            # vocabulary, and reshape also accepts non-contiguous tensors.
            train_loss = loss(output.reshape(-1, output.size(-1)), label.reshape(-1))
            batch_iter.set_postfix({"loss": f"{train_loss.item():6.3f}"})

            train_loss.backward()
            optimizer.step()
            # The schedule advances once per batch, not once per epoch.
            scheduler.step()
            optimizer.zero_grad(set_to_none=True)

        # One checkpoint per epoch, holding everything needed to resume.
        torch.save(
            {
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "scheduler_state_dict": scheduler.state_dict(),
            }, f"{save_dir}/de_en_checkpoint-{epoch}"
        )


### Inference

In [71]:
def inference(model, tokenizer, input_sentence, max_length=32):
    """Greedily decode a translation of ``input_sentence``.

    The source is tokenized and wrapped in ``<sos>``/``<eos>`` the same way the
    training batches wrap it. Decoding starts from ``<sos>`` and repeatedly
    appends the highest-scoring next token until ``<eos>`` is produced or
    ``max_length`` tokens have been generated.

    Args:
        model: Callable as ``model(source_ids, target_ids)``; its output is
            indexed as ``output[0, -1, :]`` to score the next token.
        tokenizer: Object providing ``encode(...).ids``, ``token_to_id`` and ``decode``.
        input_sentence: Source-language text to translate.
        max_length: Maximum number of tokens to generate.

    Returns:
        The decoded output string.
    """
    model.eval()
    # Look the special ids up once instead of on every decoding step.
    sos_id = tokenizer.token_to_id("<sos>")
    eos_id = tokenizer.token_to_id("<eos>")

    # Training sources are "<sos> ... <eos>"; feeding a bare token sequence
    # would show the encoder an input format it never saw during training.
    input_tokens = [sos_id] + tokenizer.encode(input_sentence).ids + [eos_id]
    input_tensor = torch.tensor([input_tokens], dtype=torch.int64)

    target_tokens = [sos_id]

    with torch.no_grad():
        for _ in range(max_length):
            target_tensor = torch.tensor([target_tokens], dtype=torch.int64)
            output = model(input_tensor, target_tensor)
            # Greedy choice: best-scoring id at the last decoder position.
            next_token = output[0, -1, :].argmax(dim=-1).item()
            target_tokens.append(next_token)

            if next_token == eos_id:
                break

    return tokenizer.decode(target_tokens)


In [72]:
# Load the held-out pairs and keep only sources of at most 5 words, shortest first.
test = pd.read_csv("data/wmt14_translate_de-en_test.csv", lineterminator='\n')
test["word_count"] = test[source_lang].apply(lambda x: len(x.split()))
test = test.loc[
    test["word_count"] <= 5
].sort_values(
    by="word_count",
    ascending=True
).drop(columns="word_count")
# Deduplicate on the sentence pair only: the frame also carries the
# "Unnamed: 0" column from the CSV, which would otherwise make repeated
# pairs look like distinct rows.
test = test.drop_duplicates(subset=[source_lang, target_lang])

In [77]:
# Display the filtered test pairs that the evaluation loop iterates over.
test

Unnamed: 0,de,en
1770,Expressofan,Espresso fan
979,Ferne Welten.,Distant worlds.
1967,Wiederkehr feiern.,July.
2723,Hundefreunde erfolgreich,Dog-lovers victorious
345,Das Töten.,Killing.
...,...,...
1228,Alles für ein unvergessliches Fest.,Everything you need for an unforgettable celeb...
1216,Freudenstadt: Schnelle Aktionen überrumpeln Ga...,Freudenstadt: Quick moves take hosts by surprise
1186,"Wir fürchten, das begünstigt Interventionen.",We are afraid it will encourage intervention.
1941,Autofahrer bei Unfall schwer verletzt,Car driver seriously injured in accident


In [None]:
# Load the tokenizer once; re-reading the file for every row is wasted work.
inference_tokenizer = load_tokenizer("tokenizer/en_de_32000.json")
for _, r in test.iterrows():
    # "Actual" is the source sentence, "Target" the reference translation.
    print("Actual Sentence: ", r[source_lang])
    print("Target Sentence: ", r[target_lang])
    print("Predicted Sentence: ", inference(
        model,
        inference_tokenizer,
        r[source_lang],
        128,
    ))
    print("---------"*10)