# Pose Landmarks Sequence (Sign Language) to Polish Tranlation
> Architecture: **Encoder-Decoder** with **Attention** Mechanism, [**Transformer**](https://arxiv.org/abs/1706.03762)

## Imports

In [1]:
import torch
from torch import optim
from torch import nn, Tensor, functional as F
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import math
import time
from typing import Dict, List, Any, Callable, Optional
from jsonlines import jsonlines
from tqdm.notebook import tqdm
from loguru import logger
from matplotlib import pyplot as plt
from datetime import datetime

from src.settings import PREPROCESSED_DIR, MODELS_DIR, LOGS_DIR

torch.cuda.empty_cache()

In [2]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# DEVICE = torch.device("cpu")
DEVICE

device(type='cuda')

In [4]:
class ClipsDataset(Dataset):

    def __init__(self, records: List[Dict[str, Any]]):
        self.records = records

    def __len__(self) -> int:
        return len(self.records)
    
    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        out_polish_token_ids = torch.tensor(self.records[index]["PolishAnnotationTokenIds"], dtype=torch.int32)
        if len(out_polish_token_ids) > 66:
            # FIXME: this is a hack to make it work with the model; only one clip is affected
            print("Warning: PolishAnnotationTokenIds is longer than 66")
            out_polish_token_ids = out_polish_token_ids[:66]

        # (seq_len x 33 x 3) -> (seq_len x 99)
        frame_seq_landmarks = torch.tensor(self.records[index]["FramesLandmarksCoords"], dtype=torch.float32).view(-1, 99)
        prepro_landmarks_seq = self.preprocess_landmarks_seq(frame_seq_landmarks)

        return {
            "in_landmarks": prepro_landmarks_seq,
            "out_polish_token_ids": out_polish_token_ids,
        }

    @staticmethod
    def preprocess_landmarks_seq(landmarks_seq, long_multiplier=10_000):
        # hack for nn.Embedding layer to work; it requires long (int) type
        x = landmarks_seq * long_multiplier
        x = landmarks_seq + torch.min(landmarks_seq) * -1
        return x.long()

## Load Clips Data

In [5]:
SAMPLE_FRAC = 0.001


with jsonlines.open(PREPROCESSED_DIR / "clips_dataset_wth_herbert_token_ids.jsonl") as reader:
    # total_records: 19_503
    if SAMPLE_FRAC < 1:
        raw_records = list((rec for rec in reader if np.random.choice([True, False], p=[SAMPLE_FRAC, 1 - SAMPLE_FRAC])))  # iterable approach for random sample
    else:
        raw_records = list(reader)

train_records, val_records = train_test_split(raw_records, test_size=0.2)

train_ds, val_ds = ClipsDataset(train_records), ClipsDataset(val_records)

del train_records, val_records, raw_records

len(train_ds), len(val_ds)

(19, 5)

In [6]:
# clips_df = pd.DataFrame.from_records(raw_records)
# clips_df.FramesLandmarksCoords.apply(lambda x: np.array(x).flatten()).explode().astype("float").describe()

In [7]:
# pd.Series([record["NumFrames"] for record in raw_records]).describe()

In [8]:
# pd.Series([record["PolishAnnotationTokenIds"] for record in raw_records]).map(len).describe()

In [9]:
for record in train_ds:
    print(record["in_landmarks"].shape)  # n_frames x n_landmarks*3
    print(record["out_polish_token_ids"].shape)  # padded n_tokens
    break

torch.Size([43, 99])
torch.Size([66])


In [10]:
N_LANDMARKS = 33
COORD_CHANNELS = 3
MAX_TOKENS = 66
MAX_FRAMES = 392

## Models

### Positional Encoder

In [11]:
class PositionalEncoding(nn.Module):
    # https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/#:~:text=What%20Is%20Positional%20Encoding%3F

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)  # max_len x 1
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))  # d_model/2
        pe = torch.zeros(max_len, 1, 1, d_model)  # max_len x 99 x 1 x d_model
        # pe = torch.zeros(max_len, 1, d_model)  # max_len x 99 x 1 x d_model
        # print(f"{position.shape=}\n{div_term.shape=}\n{pe.shape=}")
        pe[:, 0, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 0, 1::2] = torch.cos(position * div_term)
        # pe[:, 0, 0::2] = torch.sin(position * div_term)
        # pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_len, 99, BATCH_SIZE, EMBEDDING_LAYER_OUT_SIZE]``

        Returns:
            Tensor, (shape like ``x``): added positional encoding to ``x``
        """
        # x : batch_len, 99, batch_size, d_model
        print(f"{x.shape=}, {self.pe.shape=}")
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

### Transformer Encoder

In [12]:
class LandmarksSeqTransformerEncoder(nn.Module):
    # https://pytorch.org/tutorials/beginner/transformer_tutorial.html

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5, device: torch.device = torch.device("cuda:0")) -> None:
        super().__init__()
        self.device = device
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=False)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        src = self.embedding(src) * math.sqrt(self.d_model)

        # _, _, batch_size, d_model = src.shape
        # src = src.view(-1, batch_size, d_model)

        src = self.pos_encoder(src)  # batch_len, 99, batch_size, d_model

        _, _, batch_size, d_model = src.shape
        src = src.view(-1, batch_size, d_model)

        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src), device=self.device)
        # print(f"{src.shape=}\n{src_mask.shape=}")
        output = self.transformer_encoder(src, src_mask)
        output = self.linear(output)
        return output

### Transformer Decoder

In [13]:
class LandmarksSeqTransformerDecoder(nn.Module):
    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5, device: torch.device = torch.device("cuda:0")) -> None:
        super().__init__()
        self.device = device
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead, d_hid, dropout, batch_first=False)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, tgt: torch.Tensor, memory: torch.Tensor, tgt_mask: torch.Tensor = None,
                memory_mask: torch.Tensor = None, tgt_key_padding_mask: torch.Tensor = None,
                memory_key_padding_mask: torch.Tensor = None) -> torch.Tensor:
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        print(f"{tgt.shape=}")

        # batch_size, d_model = tgt.shape
        # tgt = tgt.view(-1, batch_size, d_model)

        tgt = self.pos_encoder(tgt)

        _, _, batch_size, d_model = tgt.shape
        tgt = tgt.view(-1, batch_size, d_model)

        if tgt_mask is None:
            tgt_mask = nn.Transformer.generate_square_subsequent_mask(len(tgt)).to(self.device)

        print(f"{tgt.shape=} {memory.shape=} {tgt_mask.shape=}")
        output = self.transformer_decoder(tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                                          tgt_key_padding_mask=tgt_key_padding_mask,
                                          memory_key_padding_mask=memory_key_padding_mask)
        # output: 
        output = self.linear(output)
        return output

In [14]:
def batchify(dataset: Dataset, batch_size: int) -> Tensor:
    """Divides the data into ``batch_size`` separate sequences, removing extra elements
    that wouldn't cleanly fit.

    Arguments:
        dataset: Dataset
        batch_size: int

    Returns:
        Tensor of shape ``[N // bsz, bsz]``
    """

    # to work wth dataset:
    data = torch.cat([record["in_landmarks"] for record in dataset])

    # oryginalnie data: (20857)

    # clips_landmarks_seq = [record["in_landmarks"] for record in data]
    # print(clips_landmarks_seq.shape)
    # data = torch.tensor(clips_landmarks_seq)

    batch_len = data.size(0) // batch_size
    # print(seq_len)
    # data = data.view(batch_size, seq_len)
    data = data[:batch_len * batch_size , :]  # batch_len x 99
    # print(data.shape)

    data = data.view(batch_len, -1, batch_size)
    # data = data.t()
    # print(data.shape, data)
    data = data.contiguous()  # batch_len x 99 x batch_size
    # print(data.shape, data)
    print(f"{data.device=}")
    return data


def get_batch(
    batched_data_dl: Tensor,
    step: int,
    bptt: int,
    device: torch.device = torch.device("cuda:0")
) -> (Tensor, Tensor):
    """
    Args:
        source: Tensor, shape ``[full_seq_len, batch_size]``
        step: int
        bbpt: int, batch_len

    Returns:
        tuple (data, target), where data has shape ``[seq_len, batch_size]`` and
        target has shape ``[seq_len * batch_size]``
    """

    query_batch_len = min(bptt, len(batched_data_dl) - 1 - step)
    data = batched_data_dl[step:step+query_batch_len, :, :]
    target = batched_data_dl[step+1:step+1+query_batch_len].reshape(-1)
    return data.to(device), target.to(device)

## Dataloaders Parameters

In [15]:
BPTT = 3
BATCH_SIZE = 4

train_dl = batchify(train_ds, batch_size=BATCH_SIZE)
val_dl = batchify(val_ds, batch_size=BATCH_SIZE)

data.device=device(type='cpu')
data.device=device(type='cpu')


In [16]:
# print(train_ds[0]["in_landmarks"].shape)

# data = torch.cat([record["in_landmarks"] for record in train_ds])
# print(data.shape, data, "\n")
print(f"{train_dl.shape=}")

for batch, step in enumerate(range(0, train_dl.size(0) - 1, BPTT)):
    if batch == 1:
        break
    # print(input_.shape)
    data, targets = get_batch(train_dl, step, bptt=BPTT)
    print(f"{data.shape=}")
    print(f"{targets.shape=}")

del train_ds, val_ds

train_dl.shape=torch.Size([325, 99, 4])
data.shape=torch.Size([3, 99, 4])
targets.shape=torch.Size([1188])


## Models Initialization

In [17]:
VOCAB_SIZE = 50_000  # aka lenght of vector that contains probabilities for each token; must match the tokenizer `token_ids` range (0 included)
DROPOUT = 0.2  # dropout probability
EMBEDDING_LAYER_OUT_SIZE = 200  # embedding dimension

ENCODING_TRANSFORMER_LAYERS = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
ENCODING_TRANSFORMER_HIDDEN_DIM = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
ENCODING_TRANSFORMER_NHEAD = 2  # number of heads in ``nn.MultiheadAttention``


encoder = LandmarksSeqTransformerEncoder(
    ntoken=VOCAB_SIZE,
    d_model=EMBEDDING_LAYER_OUT_SIZE,
    nhead=ENCODING_TRANSFORMER_NHEAD,
    d_hid=ENCODING_TRANSFORMER_HIDDEN_DIM,
    nlayers=ENCODING_TRANSFORMER_LAYERS,
    dropout=DROPOUT
).to(DEVICE)

decoder = LandmarksSeqTransformerDecoder(
    ntoken=VOCAB_SIZE,
    d_model=EMBEDDING_LAYER_OUT_SIZE,
    nhead=ENCODING_TRANSFORMER_NHEAD,
    d_hid=ENCODING_TRANSFORMER_HIDDEN_DIM,
    nlayers=ENCODING_TRANSFORMER_LAYERS,
    dropout=DROPOUT,
).to(DEVICE)



## Training

### Training Parameters

In [18]:
EPOCHS = 5
LEARNING_RATE = 5


criterion = nn.CrossEntropyLoss()

opt_params = set(encoder.parameters()) | set(decoder.parameters())
optimizer = optim.SGD(opt_params, lr=LEARNING_RATE)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.95)

### Set Checkpoint Path

In [19]:
def get_safe_model_name(model: nn.Module, params: Dict[str, int | float], unique_suffix: Optional[str] = None) -> str:
    if unique_suffix is None:
        unique_suffix = datetime.strftime(datetime.now(), '%Y%m%d_%H%M%Sf%f')
    params_safe_str = "__".join([
        f"{key}{f'{v:.3f}'.replace('.', 'f') if isinstance(v, float) else v}"
        for key, v in params.items()
    ])
    return f"{model.__class__.__name__}__{params_safe_str}__{unique_suffix}.best.pt"

In [20]:
safe_model_name = get_safe_model_name(encoder, params={
    "sample": SAMPLE_FRAC,
    "lr": LEARNING_RATE,
    "vocab": VOCAB_SIZE,
    "epochs": EPOCHS,
})

best_encoder_params_path = MODELS_DIR / safe_model_name
best_encoder_params_path.name

'LandmarksSeqTransformerEncoder__sample0f001__lr5__vocab50000__epochs5__20240318_203130f712186.best.pt'

### Setup Logging

In [21]:
# redirect std.out to file
# logger.remove(0)
_ = logger.add(LOGS_DIR / f"{safe_model_name}.log", backtrace=True, diagnose=True)

### Training and Evaluation

In [22]:
def train_epoch(
    encoder: nn.Module,
    decoder: nn.Module,
    optimizer: optim.Optimizer,
    lr_scheduler: optim.lr_scheduler.LRScheduler,
    loss_fn: nn.Module,
    train_dataloader: Tensor,
    epoch: int,
    logger: logger.__class__,
    callback: Callable,
    cb_kwargs: dict
) -> None:

    encoder.train()  # turn on train mode
    decoder.train()  # turn on train mode

    total_encoder_loss = 0.0
    start_time = time.time()
    num_batches = len(train_dataloader) // BPTT
    log_interval = num_batches // 10
    num_steps = train_dataloader.size(0) - 1
    for batch, step in callback(enumerate(range(0, num_steps, BPTT)), total=num_batches, **cb_kwargs):
        data, targets = get_batch(train_dataloader, step, BPTT, device=DEVICE)

        encoder_output = encoder(data)
        encoder_output_flat = encoder_output.view(-1, VOCAB_SIZE)
        encoder_loss = loss_fn(encoder_output_flat, targets)

        print(f"{encoder_output.shape=} {encoder_output_flat.shape=}")
        decoder_output = decoder(targets, encoder_output)

        optimizer.zero_grad()
        encoder_loss.backward()
        nn.utils.clip_grad_norm_(encoder.parameters(), 0.5)
        nn.utils.clip_grad_norm_(decoder.parameters(), 0.5)
        optimizer.step()

        total_encoder_loss += encoder_loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = lr_scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_encoder_loss = total_encoder_loss / log_interval
            ppl = math.exp(cur_encoder_loss)
            logger.info(f'  epoch {epoch:3d} | {batch:5d}/{num_batches} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'encoder loss {cur_encoder_loss:5.2f} | ppl {ppl:8.2f}')
            total_encoder_loss = 0
            start_time = time.time()


def evaluate(
    model: nn.Module,
    loss_fn: nn.Module,
    eval_dataloader: Tensor,
    callback: Callable,
    cb_kwargs: dict
) -> float:

    model.eval()  # turn on evaluation mode
    dl_len = len(eval_dataloader)
    num_batches = dl_len // BPTT
    total_loss = 0.
    with torch.no_grad():
        for i in callback(range(0, eval_dataloader.size(0) - 1, BPTT), total=num_batches, **cb_kwargs):
            data, targets = get_batch(eval_dataloader, i, device=DEVICE)
            seq_len = data.size(0)
            output = model(data)
            output_flat = output.view(-1, VOCAB_SIZE)
            total_loss += seq_len * loss_fn(output_flat, targets).item()
    return total_loss / (dl_len - 1)

### Main Training Loop

In [23]:
if "cpu" in str(DEVICE):
    q = input("Warning: you are training on CPU. It will be slow. Press [Enter] to continue... [q] to leave")
    if "q" in q.lower():
        raise KeyboardInterrupt("User aborted training")


metrics_history = []
best_val_loss = float('inf')
for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs"):
    epoch_start_time = time.time()
    train_epoch(
        encoder=encoder,
        decoder=decoder,
        optimizer=optimizer,
        lr_scheduler=scheduler,
        loss_fn=criterion,
        train_dataloader=train_dl,
        epoch=epoch,
        logger=logger,
        callback=tqdm,
        cb_kwargs={"desc": "Training | Batches", "position": 0, "leave": True}
    )
    val_loss = evaluate(
        model=encoder,
        loss_fn=criterion,
        eval_dataloader=val_dl,
        callback=tqdm,
        cb_kwargs={"desc": "Validating | Batches", "position": 1, "leave": False}
    )
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    logger.info(f'  end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
        f'valid loss {val_loss:10.6f} | valid ppl {val_ppl:8.2f}')

    metrics_history.append({
        "Epoch": epoch + 1,
        # "TrainLoss": train_loss,
        "ValLoss": val_loss,
        "ValPpl": val_ppl,
        "EpochTrainingTime": elapsed,
    })

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(encoder.state_dict(), best_encoder_params_path)

    scheduler.step()

logger.success(f"Done training. Best model parameters saved to {best_encoder_params_path}")

Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Training | Batches:   0%|          | 0/108 [00:00<?, ?it/s]

x.shape=torch.Size([3, 99, 4, 200]), self.pe.shape=torch.Size([5000, 1, 1, 200])
encoder_output.shape=torch.Size([297, 4, 50000]) encoder_output_flat.shape=torch.Size([1188, 50000])
tgt.shape=torch.Size([1188, 200])
x.shape=torch.Size([1188, 200]), self.pe.shape=torch.Size([5000, 1, 1, 200])
tgt.shape=torch.Size([1188, 1188, 200]) memory.shape=torch.Size([297, 4, 50000]) tgt_mask.shape=torch.Size([1188, 1188])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1188x50000 and 200x400)

## Training Results

In [None]:
metrics_df = pd.DataFrame.from_records(metrics_history)
metrics_df

In [None]:
metrics_df.ValLoss.plot()
plt.title(f"Validation loss ({SAMPLE_FRAC*100:.1f}% of dataset)")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.show()