# Pose Landmarks Sequence (Sign Language) to Polish Translation
> Architecture: **Encoder-Decoder** with **Attention** Mechanism, [**Transformer**](https://arxiv.org/abs/1706.03762)

## Imports

In [1]:
import torch
from torch import optim
from torch import nn, Tensor
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import math
import time
from typing import Dict, List, Any, Callable
from jsonlines import jsonlines
from tqdm.notebook import tqdm
from loguru import logger

from src.settings import PREPROCESSED_DIR, MODELS_DIR

# Seed NumPy's global RNG so the np.random-based record sampling further down is repeatable.
# NOTE(review): no torch seed is set in the visible cells, so model initialisation is not seeded - confirm this is intended.
np.random.seed(42)
# Release cached, currently unused GPU memory held by PyTorch's CUDA allocator.
torch.cuda.empty_cache()

In [2]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# DEVICE = torch.device("cpu")
# Bare expression: shows the selected device as the cell output.
DEVICE

device(type='cuda')

In [3]:
class ClipsDataset(Dataset):
    """Map-style dataset of clips: pose-landmark frames paired with Polish token ids.

    Every record is a dict that must provide:

    * ``"PolishAnnotationTokenIds"`` - token ids of the Polish annotation,
    * ``"FramesLandmarksCoords"`` - per-frame landmark coordinates
      (seq_len x 33 x 3), flattened here to 99 values per frame.
    """

    def __init__(self, records: List[Dict[str, Any]]):
        self.records = records

    def __len__(self) -> int:
        return len(self.records)

    def __getitem__(self, index: int) -> Dict[str, torch.Tensor]:
        """Return the discretized landmarks and the (possibly truncated) token ids of one clip.

        Returns:
            dict with ``"in_landmarks"`` (int64, ``[seq_len, 99]``) and
            ``"out_polish_token_ids"`` (int32, at most 66 entries).
        """
        record = self.records[index]

        out_polish_token_ids = torch.tensor(record["PolishAnnotationTokenIds"], dtype=torch.int32)
        if len(out_polish_token_ids) > 66:
            # FIXME: this is a hack to make it work with the model; only one clip is affected
            print("Warning: PolishAnnotationTokenIds is longer than 66")
            out_polish_token_ids = out_polish_token_ids[:66]

        # (seq_len x 33 x 3) -> (seq_len x 99)
        frame_seq_landmarks = torch.tensor(record["FramesLandmarksCoords"], dtype=torch.float32).view(-1, 99)
        prepro_landmarks_seq = self.preprocess_landmarks_seq(frame_seq_landmarks)

        return {
            "in_landmarks": prepro_landmarks_seq,
            "out_polish_token_ids": out_polish_token_ids,
        }

    @staticmethod
    def preprocess_landmarks_seq(landmarks_seq, long_multiplier=10_000):
        """Discretize float coordinates into non-negative integer ids.

        The coordinates are scaled by ``long_multiplier`` and shifted so that the
        smallest scaled value becomes 0, then truncated to ``int64``.

        Arguments:
            landmarks_seq: float Tensor of landmark coordinates (any shape)
            long_multiplier: scale applied before truncation; it sets the resolution
                of the resulting integer ids

        Returns:
            int64 Tensor of the same shape with a minimum of 0
        """
        scaled = landmarks_seq * long_multiplier
        if scaled.numel() == 0:
            # torch.min raises on an empty tensor; there is nothing to shift anyway.
            return scaled.long()
        # Shift the *scaled* values. The previous code shifted the raw input instead,
        # which discarded the multiplier and collapsed almost every value to 0 or 1.
        shifted = scaled - torch.min(scaled)
        return shifted.long()

In [4]:
# Probability with which each record of the file is kept for this run.
SAMPLE_FRAC = 0.2


with jsonlines.open(PREPROCESSED_DIR / "clips_dataset_wth_herbert_token_ids.jsonl") as reader:
    # total_records: 19_503

    # One independent keep/skip draw per record, taken from NumPy's global RNG.
    raw_records = [
        rec
        for rec in reader
        if np.random.choice([True, False], p=[SAMPLE_FRAC, 1 - SAMPLE_FRAC])
    ]
    # raw_records = list(reader)

# 80/20 train/validation split with a fixed seed.
train_records, val_records = train_test_split(raw_records, test_size=0.2, random_state=42)

train_ds = ClipsDataset(train_records)
val_ds = ClipsDataset(val_records)

# The datasets keep their own references; drop the intermediate lists.
del train_records, val_records, raw_records

# Bare expression: shows both dataset sizes as the cell output.
(len(train_ds), len(val_ds))

KeyboardInterrupt: 

In [None]:
# clips_df = pd.DataFrame.from_records(raw_records)
# clips_df.FramesLandmarksCoords.apply(lambda x: np.array(x).flatten()).explode().astype("float").describe()

In [None]:
# pd.Series([record["NumFrames"] for record in raw_records]).describe()

In [None]:
# pd.Series([record["PolishAnnotationTokenIds"] for record in raw_records]).map(len).describe()

In [None]:
# Peek at the first training sample only (the loop breaks after one record) to check tensor shapes.
for record in train_ds:
    print(record["in_landmarks"].shape)  # n_frames x n_landmarks*3
    print(record["out_polish_token_ids"].shape)  # n_tokens, truncated to at most 66 by ClipsDataset
    break

In [None]:
# Landmarks per frame and coordinates per landmark; 33 * 3 = 99 matches the
# ``view(-1, 99)`` flattening in ``ClipsDataset.__getitem__``.
N_LANDMARKS = 33
COORD_CHANNELS = 3
# Same limit as the hard-coded 66-token truncation in ``ClipsDataset.__getitem__``.
MAX_TOKENS = 66
# NOTE(review): presumably the frame count of the longest clip in the dataset - confirm.
MAX_FRAMES = 392

## Models

In [None]:
class LandmarksSeqTransformerEncoder(nn.Module):
    """Embedding + positional encoding + ``nn.TransformerEncoder`` + linear projection to ``ntoken`` logits.

    Adapted from https://pytorch.org/tutorials/beginner/transformer_tutorial.html

    Arguments:
        ntoken: number of distinct input ids; sizes both the embedding table and the output layer
        d_model: embedding / model dimension
        nhead: number of attention heads
        d_hid: dimension of the feed-forward network inside each encoder layer
        nlayers: number of stacked ``nn.TransformerEncoderLayer``
        dropout: dropout probability
        device: stored as ``self.device`` for backward compatibility; the causal mask
            is now created on the input's device instead
    """

    def __init__(self, ntoken: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5, device: torch.device = torch.device("cuda:0")) -> None:
        super().__init__()
        self.device = device
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first=False)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.embedding = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.linear = nn.Linear(d_model, ntoken)

        self.init_weights()

    def init_weights(self) -> None:
        """Initialise embedding and output weights uniformly in [-0.1, 0.1] and zero the output bias."""
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: integer Tensor with three dimensions, ``[batch_len, n_features, batch_size]``
                (n_features is 99 for the tensors built by ``batchify``)
            src_mask: optional Tensor, shape ``[batch_len * n_features, batch_len * n_features]``;
                a square causal mask is generated when omitted

        Returns:
            output Tensor of shape ``[batch_len * n_features, batch_size, ntoken]``
        """
        src = self.embedding(src) * math.sqrt(self.d_model)

        src = self.pos_encoder(src)  # batch_len, n_features, batch_size, d_model

        # Fold the first two dimensions into the single sequence axis expected by the encoder.
        _, _, batch_size, d_model = src.shape
        src = src.view(-1, batch_size, d_model)

        if src_mask is None:
            # Square causal mask: masked positions hold float('-inf'), unmasked ones 0.0.
            # It is placed on the input's device; relying on ``self.device`` (default
            # ``cuda:0``) broke CPU runs whenever ``device`` was not passed explicitly.
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(src.device)
        output = self.transformer_encoder(src, src_mask)
        output = self.linear(output)
        return output


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a 4-D input, then apply dropout.

    Reference: https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/#:~:text=What%20Is%20Positional%20Encoding%3F

    Arguments:
        d_model: size of the last (embedding) dimension; odd values are supported
        dropout: dropout probability applied after the addition
        max_len: number of positions precomputed along the first dimension
    """

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)  # max_len x 1
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))  # ceil(d_model / 2)
        angles = position * div_term  # max_len x ceil(d_model / 2)

        # Two singleton axes so the table broadcasts over dimensions 1 and 2 of the input.
        pe = torch.zeros(max_len, 1, 1, d_model)  # max_len x 1 x 1 x d_model
        pe[:, 0, 0, 0::2] = torch.sin(angles)
        # An odd d_model has one cosine slot fewer than sine slots, so trim the angles;
        # for an even d_model the slice keeps every column.
        pe[:, 0, 0, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[batch_len, n_features, batch_size, d_model]``; the
                position index is taken from the first dimension only
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [None]:
def batchify(dataset, batch_size: int, device: torch.device) -> Tensor:
    """Concatenate the frames of every clip and fold them into one batched tensor.

    Trailing frame rows that do not fill a complete group of ``batch_size`` are dropped.

    Arguments:
        dataset: iterable of records, each holding an ``"in_landmarks"`` Tensor of
            shape ``[n_frames, n_features]``
        batch_size: int, size of the last dimension of the result
        device: device the result is moved to

    Returns:
        Tensor of shape ``[N // batch_size, n_features, batch_size]``, where ``N`` is
        the total number of frame rows in ``dataset``
    """
    # ``torch.cat`` already returns a fresh tensor; wrapping it in ``torch.tensor(...)``
    # only made a second copy and raised PyTorch's copy-construct UserWarning.
    data = torch.cat([record["in_landmarks"] for record in dataset])

    batch_len = data.size(0) // batch_size
    data = data[:batch_len * batch_size, :]  # (batch_len * batch_size) x n_features

    # NOTE(review): ``view`` reinterprets the row-major buffer, so the middle axis does
    # not hold the features of a single frame; a frame-aligned layout would be
    # ``view(batch_len, batch_size, -1).transpose(1, 2)``. Layout kept as-is - confirm intent.
    data = data.view(batch_len, -1, batch_size)
    data = data.contiguous()  # batch_len x n_features x batch_size
    return data.to(device)


def get_batch(batched_data_dl: Tensor, i: int, bptt: int = 35):
    """Cut one input window and its one-row-ahead target out of the batched tensor.

    Args:
        batched_data_dl: Tensor with three dimensions, sliced along the first one
        i: int, index of the first row of the window
        bptt: int, maximum number of rows in the window (the final window may be shorter)

    Returns:
        tuple (data, target): ``data`` holds rows ``i`` up to ``i + window`` and
        ``target`` is the same window shifted forward by one row, flattened to 1-D
    """
    # Always keep one row in reserve so the shifted target window stays in range.
    window = min(bptt, len(batched_data_dl) - 1 - i)
    end = i + window

    inputs = batched_data_dl[i:end, :, :]
    shifted = batched_data_dl[i + 1:end + 1]
    return inputs, shifted.reshape(-1)

## Dataloaders Parameters

In [None]:
# Step (in rows of the batched tensor) used when walking through the data window by window.
BPTT = 35
# Size of the last dimension of the tensors produced by ``batchify``.
BATCH_SIZE = 4

# Despite the ``_dl`` suffix these are plain tensors returned by ``batchify``, already moved to DEVICE.
train_dl = batchify(train_ds, batch_size=BATCH_SIZE, device=DEVICE)
val_dl = batchify(val_ds, batch_size=BATCH_SIZE, device=DEVICE)

In [None]:
# print(train_ds[0]["in_landmarks"].shape)

# data = torch.cat([record["in_landmarks"] for record in train_ds])
# print(data.shape, data, "\n")
print(f"{train_dl.shape=}")

# Sanity check: fetch only the first window (the loop breaks on its second iteration) and show its shapes.
for batch, i in enumerate(range(0, train_dl.size(0) - 1, BPTT)):
    if batch == 1:
        break
    # print(input_.shape)
    data, targets = get_batch(train_dl, i, bptt=BPTT)
    print(f"{data.shape=}")
    print(f"{targets.shape=}")

# The cells below work on the batched tensors only, so the datasets are dropped here.
del train_ds, val_ds

## Models Initialization

In [None]:
# NOTE(review): VOCAB_SIZE is passed as ``ntoken``, which sizes both the input embedding table
# and the output layer of the encoder, so every input id must be below it - confirm against the data range.
VOCAB_SIZE = 50_000  # a.k.a. length of the vector that contains probabilities for each token; must match the tokenizer `token_ids` range (0 included)
DROPOUT = 0.2  # dropout probability
EMBEDDING_LAYER_OUT_SIZE = 200  # embedding dimension

ENCODING_TRANSFORMER_LAYERS = 2  # number of ``nn.TransformerEncoderLayer`` in ``nn.TransformerEncoder``
ENCODING_TRANSFORMER_HIDDEN_DIM = 200  # dimension of the feedforward network model in ``nn.TransformerEncoder``
ENCODING_TRANSFORMER_NHEAD = 2  # number of heads in ``nn.MultiheadAttention``


# Build the encoder from the hyper-parameters above and move its parameters to DEVICE.
encoder = LandmarksSeqTransformerEncoder(
    ntoken=VOCAB_SIZE,
    d_model=EMBEDDING_LAYER_OUT_SIZE,
    nhead=ENCODING_TRANSFORMER_NHEAD,
    d_hid=ENCODING_TRANSFORMER_HIDDEN_DIM,
    nlayers=ENCODING_TRANSFORMER_LAYERS,
    dropout=DROPOUT
).to(DEVICE)

## Training and Evaluation

In [None]:
def train_epoch(
    model: nn.Module,
    optimizer: optim.Optimizer,
    lr_scheduler: optim.lr_scheduler.LRScheduler,
    loss_fn: nn.Module,
    train_dataloader: Tensor,
    epoch: int,
    callback: Callable,
    cb_kwargs: dict
) -> None:
    """Run one training pass over the batched tensor in windows of ``BPTT`` rows.

    Arguments:
        model: module mapping a data window to logits whose last dimension is the class count
        optimizer: optimizer stepped once per window
        lr_scheduler: only read (``get_last_lr``) for logging; it is not stepped here
        loss_fn: loss applied to the flattened logits and targets
        train_dataloader: batched Tensor as returned by ``batchify``
        epoch: epoch number, used in log messages only
        callback: progress wrapper called as ``callback(iterable, total=..., **cb_kwargs)``, e.g. ``tqdm``
        cb_kwargs: extra keyword arguments forwarded to ``callback``
    """
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 200
    start_time = time.time()

    steps = range(0, train_dataloader.size(0) - 1, BPTT)
    # Exact number of windows; ``len(...) // BPTT`` undercounts whenever the last window is partial.
    num_batches = len(steps)
    for batch, step in callback(enumerate(steps), total=num_batches, **cb_kwargs):
        # Pass BPTT explicitly so the window length always matches the loop stride
        # (``get_batch`` otherwise falls back to its own default of 35).
        data, targets = get_batch(train_dataloader, step, bptt=BPTT)
        output = model(data)
        # Flatten to [n_predictions, n_classes] using the model's own output width.
        output_flat = output.view(-1, output.size(-1))
        loss = loss_fn(output_flat, targets)

        optimizer.zero_grad()
        loss.backward()
        # Clip the gradient norm to 0.5 before stepping.
        nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = lr_scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            logger.info(f'  epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()


def evaluate(
    model: nn.Module,
    loss_fn: nn.Module,
    eval_dataloader: Tensor,
    callback: Callable,
    cb_kwargs: dict
) -> float:
    """Compute the loss over the batched tensor without gradients, averaged per row.

    Arguments:
        model: module mapping a data window to logits whose last dimension is the class count
        loss_fn: loss applied to the flattened logits and targets
        eval_dataloader: batched Tensor as returned by ``batchify``
        callback: progress wrapper called as ``callback(iterable, total=..., **cb_kwargs)``, e.g. ``tqdm``
        cb_kwargs: extra keyword arguments forwarded to ``callback``

    Returns:
        Window losses weighted by window length, divided by the number of input rows

    Raises:
        ValueError: if the tensor has fewer than two rows, so no (input, target) window exists
    """
    model.eval()  # turn on evaluation mode
    dl_len = len(eval_dataloader)
    if dl_len < 2:
        # Previously this surfaced as a ZeroDivisionError at the final division.
        raise ValueError("eval_dataloader needs at least 2 rows to form an (input, target) window")

    steps = range(0, eval_dataloader.size(0) - 1, BPTT)
    # Exact number of windows; ``dl_len // BPTT`` undercounts whenever the last window is partial.
    num_batches = len(steps)
    total_loss = 0.
    with torch.no_grad():
        for i in callback(steps, total=num_batches, **cb_kwargs):
            # Pass BPTT explicitly so the window length always matches the loop stride.
            data, targets = get_batch(eval_dataloader, i, bptt=BPTT)
            seq_len = data.size(0)
            output = model(data)
            output_flat = output.view(-1, output.size(-1))
            # Weight by window length so a shorter final window counts proportionally.
            total_loss += seq_len * loss_fn(output_flat, targets).item()
    return total_loss / (dl_len - 1)

## Main Training Loop

In [None]:
def get_safe_model_name(model: nn.Module, params: Dict[str, int | float]) -> str:
    params_safe_str = "__".join([
        f"{key}{f'{v:3f}'.replace('.', 'f') if isinstance(v, float) else v}"
        for key, v in params.items()
    ])
    return f"{model.__class__.__name__}__{params_safe_str}.best.pt"

### Training Parameters

In [None]:
# Number of passes over the training tensor.
EPOCHS = 5
# Initial learning rate handed to SGD below.
LEARNING_RATE = 5


criterion = nn.CrossEntropyLoss()
encoder_optimizer = optim.SGD(encoder.parameters(), lr=LEARNING_RATE)
# Decay the learning rate by a factor of 0.95 on each scheduler step.
# NOTE(review): StepLR documents ``step_size`` as an int; ``1.0`` presumably behaves like ``1`` - confirm.
encoder_scheduler = optim.lr_scheduler.StepLR(encoder_optimizer, step_size=1.0, gamma=0.95)

In [None]:
# Preview the checkpoint file name generated for the current hyper-parameters (shown as the cell output).
get_safe_model_name(encoder, params={
    "lr": LEARNING_RATE,
    "vocab": VOCAB_SIZE,
    "epochs": EPOCHS,
})

In [None]:
# Give the user a chance to bail out before starting a slow CPU-only run.
if "cpu" in str(DEVICE):
    q = input("Warning: you are training on CPU. It will be slow. Press [Enter] to continue... [q] to leave")
    if "q" in q.lower():
        raise KeyboardInterrupt("User aborted training")


safe_model_name = get_safe_model_name(encoder, params={
    "lr": LEARNING_RATE,
    "vocab": VOCAB_SIZE,
    "epochs": EPOCHS,
})

metrics_history = []
best_val_loss = float('inf')
best_encoder_params_path = MODELS_DIR / safe_model_name
for epoch in tqdm(range(1, EPOCHS + 1)):
    epoch_start_time = time.time()
    train_epoch(
        model=encoder,
        optimizer=encoder_optimizer,
        lr_scheduler=encoder_scheduler,
        loss_fn=criterion,
        train_dataloader=train_dl,
        epoch=epoch,
        callback=tqdm,
        cb_kwargs={"desc": "Training | Batches", "position": 0, "leave": True}
    )
    val_loss = evaluate(
        model=encoder,
        loss_fn=criterion,
        eval_dataloader=val_dl,
        callback=tqdm,
        cb_kwargs={"desc": "Validating | Batches", "position": 1, "leave": False}
    )
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time
    # The loguru ``logger`` object is not callable; calling it directly raised a
    # TypeError at the end of the first epoch. Use the ``info`` level like ``train_epoch``.
    logger.info(f'  end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
        f'valid loss {val_loss:10.6f} | valid ppl {val_ppl:8.2f}')

    metrics_history.append({
        # ``epoch`` is already 1-based (range starts at 1), so no ``+ 1`` here.
        "Epoch": epoch,
        # "TrainLoss": train_loss,
        "ValLoss": val_loss,
        "ValPpl": val_ppl,
        "EpochTrainingTime": elapsed,
    })

    # Keep only the weights of the best epoch seen so far (lowest validation loss).
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(encoder.state_dict(), best_encoder_params_path)

    encoder_scheduler.step()

## Training Results

In [None]:
# One row per epoch, built from the dicts collected in ``metrics_history``.
metrics_df = pd.DataFrame.from_records(metrics_history)
# Bare expression: renders the table as the cell output.
metrics_df

In [None]:
# Plot the validation loss recorded for each epoch.
metrics_df.ValLoss.plot()

# Bardzo Stary LSTM Nic Nie Warty :(

In [None]:
# class LandmarksTranslator(nn.Module):
    
#     def __init__(self, input_dim: int, output_dim: int):
#         super().__init__()
#         self.input_dim = input_dim
#         self.output_dim = output_dim

#         # self.lstm = nn.LSTM(input_dim, output_dim, batch_first=True)
#         self.linear = nn.Linear(input_dim, output_dim)

#     def forward(self, x: torch.Tensor) -> torch.Tensor:
#         # x = x.view(-1, self.input_dim)

#         output, (hn, cn) = self.lstm(x)
#         # print(output.shape)
#         x = output

#         # print(x)
#         # print(output.shape, hn.shape, cn.shape)
#         # x = self.linear(x)

#         # try get `token_ids` imitation
#         min_id, max_id = 0, 25_000
#         min_x_val = x.min(axis=0).values
#         max_x_val = x.max(axis=0).values
#         # print(min_x_val, max_x_val)
        
#         x = (x - min_x_val) / (max_x_val - min_x_val)  # normalize to [0, 1]
#         x = (x  * (max_id - min_id)) + min_id  # scale to [min_id, max_id]

#         return x

In [None]:
# class LandmarksTranslatorT5(nn.Module):

#     def __init__(self, input_dim: int, output_dim: int):
#         super().__init__()
#         self.input_dim = input_dim
#         self.output_dim = output_dim

#         self.t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

#     def forward(self, x: torch.Tensor) -> torch.Tensor:
#         print(x.shape)
#         # x: (batch_size x seq_len x vec_size)
#         x = self.t5.forward(inputs_embeds=x, output_hidden_states=True)

#         # # try get `token_ids` imitation
#         # min_id, max_id = 0, 25_000
#         # min_x_val = x.min(axis=0).values
#         # max_x_val = x.max(axis=0).values
#         # # print(min_x_val, max_x_val)
        
#         # x = (x - min_x_val) / (max_x_val - min_x_val)  # normalize to [0, 1]
#         # x = (x  * (max_id - min_id)) + min_id  # scale to [min_id, max_id]

#         return x

In [None]:
# def train(model, train_dl, val_dl, optimizer, loss_fn, epochs, device="cuda"):
#     herbert_tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
#     model_name = model.__class__.__name__
#     metrics_history = []
#     for epoch in tqdm(range(epochs), desc="Epochs: ", leave=True, position=0):
#         last_epoch = epoch == epochs - 1
#         train_losses = []
#         val_losses = []
#         val_accs = []
#         # for n_batch, batch in enumerate(tqdm(train_dl, desc="Batches: ", leave=False, position=1)):
#         for n_batch, batch in enumerate(train_dl):
#             X_batch = batch["in_landmarks"].to(device)
#             y_batch = batch["out_polish_token_ids"].to(device)
#             # print(y_batch.shape)

#             y_pred = model(X_batch)
#             # print(y_pred.shape)

#             first_batch = n_batch == 0
#             if first_batch and (epoch % ceil(epochs * 0.1) == 0 or last_epoch):
#                 decoded_y_batch = herbert_tokenizer.decode(y_batch[0], skip_special_tokens=True)
#                 decoded_y_pred_batch = herbert_tokenizer.decode(y_pred[0].int(), skip_special_tokens=True)
#                 print(f"{decoded_y_batch=}")
#                 print(f"{decoded_y_pred_batch=}")
#                 # for decoded_y, decoded_y_pred in zip(decoded_y_batch, decoded_y_pred_batch):
#                 #     print(f"{decoded_y=}")
#                 #     print(f"{decoded_y_pred=}")
#                 #     print()
#                 print("-".center(80, "-"))

#             if y_pred.shape != y_batch.shape:
#                 print(f"{y_pred.shape=}, {y_batch.shape=} mismatched!")
#                 continue
                
#             loss = loss_fn(y_pred, y_batch.float())

#             optimizer.zero_grad()
#             loss.backward()  # backward pass, calculate gradients
#             optimizer.step()  # update weights

#             # val_loss, val_acc = validate(model, val_dl, loss_fn, device=device)
#             val_loss = validate(model, val_dl, loss_fn, device=device)
            
#             train_losses.append(loss.item())
#             val_losses.append(val_loss)
#             # val_accs.append(val_acc)

#         epoch_metrics = {
#             "Epoch": epoch + 1,
#             "ModelName": model_name,
#             "TrainLoss": np.mean(train_losses),
#             "ValLoss": np.mean(val_losses),
#             # "ValAcc": np.mean(val_accs)
#         }
#         metrics_history.append(epoch_metrics)

#         if epoch % ceil(epochs * 0.1) == 0 or last_epoch:
#             print(
#                 f"Epoch: {epoch + 1:<2} | "
#                 + " ".join([f"{k}: {v:.6f}" for k, v in list(epoch_metrics.items())[2:]])
#             )

#     return metrics_history


# def validate(model, val_dl, loss_fn, device="cuda"):
#     losses = []
#     accuracies = []
#     for batch in val_dl:
#         X_batch = batch["in_landmarks"].to(device)
#         y_batch = batch["out_polish_token_ids"].to(device)

#         y_pred = model(X_batch)

#         loss = loss_fn(y_pred, y_batch.float())
#         losses.append(loss.item())
        
#         # acc = (torch.argmax(y_pred, 1) == torch.argmax(y_batch, 1)).cpu().float().mean()
#         # accuracies.append(acc)

#     # return np.mean(losses), np.mean(accuracies)
#     return np.mean(losses)


In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print("Device:", device, "\n")


# BATCH_SIZE = 4
# EPOCHS = 100
# LEARING_RATE = 0.001


# assert BATCH_SIZE <= len(train_ds) and BATCH_SIZE <= len(val_ds)

# train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)
# val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

# model = LandmarksTranslator(
# # model = LandmarksTranslatorT5(
#     input_dim=N_LANDMARKS*COORD_CHANNELS,
#     output_dim=MAX_TOKENS
# ).to(device)
# optimizer = optim.Adam(model.parameters(), lr=LEARING_RATE)
# loss_fn = nn.CrossEntropyLoss()

# metrics_history = train(model, train_dl, val_dl, optimizer, loss_fn, epochs=EPOCHS, device=device)