In [None]:
import pandas as pd
import pyarrow.parquet as pq

# Load the main and the supplemental metadata tables.
dataset_train_df, dataset_supplemental_df = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "supplemental_metadata.csv")
)

# Stack both tables row-wise and renumber the rows from zero.
dataset_df = pd.concat(
    [dataset_train_df, dataset_supplemental_df],
    ignore_index=True,
)

# Persist the merged metadata (the row index is not written out).
dataset_df.to_csv("train_full.csv", index=False)

In [None]:
# Pull the identifying fields out of the first metadata row.
first_row = dataset_df.iloc[0]
path, sequence_id, file_id, phrase = first_row[
    ["path", "sequence_id", "file_id", "phrase"]
]
print(f"path: {path}, sequence_id: {sequence_id}, file_id: {file_id}, phrase: {phrase}")

# Load only the rows of that one sequence from its parquet file.
sequence_filter = [[("sequence_id", "=", sequence_id)]]
sample_sequence_table = pq.read_table(str(path), filters=sequence_filter)
sample_sequence_df = sample_sequence_table.to_pandas()
print("Full sequence dataset shape is {}".format(sample_sequence_df.shape))

In [None]:
# Count the distinct parquet files referenced by the metadata.
unique_paths = dataset_df["path"].unique()

# Deliberately not called `sum`: that name would shadow the built-in `sum()`
# for every cell executed afterwards in the same kernel.
total_files = unique_paths.shape[0]

print("Total number of files: {}".format(total_files))

In [None]:
# Indices of the 39 face landmarks that are kept as features.
LIP = [
    61, 185, 40, 39, 37, 267, 269, 270, 409,
    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
]


def _xyz_columns(group, indices):
    """Return the column names ``x_<group>_<i>`` for all indices, then ``y_…``, then ``z_…``."""
    return [f"{axis}_{group}_{i}" for axis in ("x", "y", "z") for i in indices]


FACE = _xyz_columns("face", LIP)
LHAND = _xyz_columns("left_hand", range(21))
RHAND = _xyz_columns("right_hand", range(21))
POSE = _xyz_columns("pose", range(33))

# Every feature column, in the order face, left hand, right hand, pose.
SEL_COLS = FACE + LHAND + RHAND + POSE
FRAME_LEN = 128

In [None]:
import json

# Load the character -> index vocabulary.
with open("character_to_prediction_index.json", "r") as f:
    json_chars = json.load(f)

# Extra tokens appended to the vocabulary. The last entry, "P", is the one
# whose index is used as `pad_token_idx` below.
new_entries = [
    "<",
    ">",
    "P",
]

# Add the new entries starting from index 59, only if they don't already exist,
# so re-running this cell leaves an already-extended file unchanged.
for i, entry in enumerate(new_entries, start=59):
    if entry not in json_chars:
        json_chars[entry] = i

# Write the updated vocabulary back to the file.
with open("character_to_prediction_index.json", "w") as f:
    json.dump(json_chars, f, indent=4)

# Read the padding index from the mapping instead of hard-coding 61, so it can
# never disagree with what is stored in the file (it is 61 whenever "P" was
# added by the loop above).
pad_token_idx = json_chars["P"]

In [None]:
import multiprocessing as mp
import tensorflow as tf
import numpy as np
import pyarrow.parquet as pq
from multiprocessing import Pool, Manager
from tqdm.notebook import tqdm_notebook

# Hide every GPU from TensorFlow for this process.
tf.config.set_visible_devices(devices=[], device_type="GPU")

# Manager-backed queue through which worker processes report progress.
manager = Manager()
progress_queue = manager.Queue()


def process_file(file_id):
    """Write the usable sequences of one parquet file to ``preprocessed/<file_id>.tfrecord``.

    Looks up the metadata rows of ``file_id`` in the module-level ``dataset_df``,
    loads the ``SEL_COLS`` columns of the matching parquet file and serialises
    one ``tf.train.Example`` per kept sequence (one float list per column plus
    the UTF-8 encoded phrase). A sequence is kept only when the number of frames
    in which at least one hand has no NaN coordinate exceeds twice the phrase
    length. One status message per sequence is put on ``progress_queue``.

    Args:
        file_id: Value of the ``file_id`` metadata column identifying the file.

    Returns:
        None. The result is the TFRecord file written to disk.
    """
    import os  # local import: no import cell of the notebook provides `os`

    file_df = dataset_df.loc[dataset_df["file_id"] == file_id]
    path = file_df["path"].values[0]
    parquet_df = pq.read_table(path, columns=["sequence_id"] + SEL_COLS).to_pandas()

    # Make sure the output directory exists before the writer opens its file.
    os.makedirs("preprocessed", exist_ok=True)
    tf_file = f"preprocessed/{file_id}.tfrecord"
    parquet_numpy = parquet_df.to_numpy(copy=False)

    col_to_index = {col: i for i, col in enumerate(parquet_df.columns)}

    LHAND_indices = [col_to_index[col] for col in LHAND]
    RHAND_indices = [col_to_index[col] for col in RHAND]

    with tf.io.TFRecordWriter(tf_file) as file_writer:
        for seq_id, phrase in zip(file_df["sequence_id"], file_df["phrase"]):
            # NOTE(review): this selection relies on the frame's index holding
            # the sequence id after `to_pandas()` — confirm for the input files.
            frames = parquet_numpy[parquet_df.index == seq_id]
            progress_queue.put(
                f"Process: {mp.current_process().name}, File: {file_id}, Sequence: {seq_id}"
            )

            # Number of frames in which a hand has no NaN coordinate at all.
            r_nonan = np.sum(np.sum(np.isnan(frames[:, RHAND_indices]), axis=1) == 0)
            l_nonan = np.sum(np.sum(np.isnan(frames[:, LHAND_indices]), axis=1) == 0)
            no_nan = max(r_nonan, l_nonan)

            # Skip sequences with too few usable hand frames for their phrase.
            if 2 * len(phrase) < no_nan:
                features = {
                    COL: tf.train.Feature(
                        float_list=tf.train.FloatList(
                            value=frames[:, col_to_index[COL]]
                        )
                    )
                    for COL in SEL_COLS
                }
                features["phrase"] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[bytes(phrase, "utf-8")])
                )

                example = tf.train.Example(features=tf.train.Features(feature=features))

                # Records are written immediately and in order. The previous
                # in-memory buffer produced the same file, but only because a
                # mis-indented `buffer = []` inside its flush loop happened to
                # be harmless.
                file_writer.write(example.SerializeToString())


# Number of worker processes (alternative: int(mp.cpu_count() / 2)).
cpu_count = 8

with Pool(cpu_count) as pool:
    # One progress bar per worker slot.
    progress_bars = [
        tqdm_notebook(desc=f"Process {slot + 1}", unit="seq")
        for slot in range(cpu_count)
    ]
    file_ids = dataset_df["file_id"].unique()

    # Each time a file finishes, drain whatever the workers have reported so far.
    for _ in pool.imap(process_file, file_ids):
        pending_messages = []
        while not progress_queue.empty():
            pending_messages.append(progress_queue.get())

        # NOTE(review): messages are paired with bars by position, so at most
        # `cpu_count` of the drained messages are displayed per finished file
        # and a bar is not tied to one particular worker process.
        for message, bar in zip(pending_messages, progress_bars):
            bar.set_description(message)
            bar.update()

print("All parquets processed to TFRecords")

In [None]:
import torch
import json
from torchdata.dataloader2 import DataLoader2
from torchdata.datapipes.iter import (
    FileLister,
    FileOpener,
    TFRecordLoader,
    Mapper,
    Batcher,
    Collator,
    IterableWrapper,
)
import torch.nn.functional as F

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Reload the character -> index vocabulary used to tokenise phrases.
with open("character_to_prediction_index.json", "r") as vocab_file:
    json_chars = json.load(vocab_file)


# Encodes phrase into a tensor of tokens
def tokenize_phrase(example):
    """Replace the byte-string phrase of ``example`` with a tensor of token indices.

    The first element of ``example["phrase"]`` is decoded as UTF-8, wrapped in
    the start/end markers ``"<"`` and ``">"`` and mapped character by character
    through the module-level ``json_chars`` vocabulary. Characters missing from
    the vocabulary fall back to the index stored for ``"F"``.

    Args:
        example: Dict whose ``"phrase"`` entry is a sequence with the encoded
            phrase as its first element.

    Returns:
        The same dict, with ``example["phrase"]`` replaced by a tensor of indices.

    Raises:
        KeyError: If a character is unknown and the vocabulary has no ``"F"``
            entry either. Previously this put ``None`` into the index list and
            failed inside ``torch.tensor`` with an unrelated-looking error.
    """
    phrase = "<" + example["phrase"][0].decode("utf-8") + ">"

    # Looked up once instead of once per character.
    # NOTE(review): "F" is not among the tokens this notebook adds — confirm it
    # exists in character_to_prediction_index.json or pick another fallback.
    fallback = json_chars.get("F")

    indices = []
    for char in phrase:
        index = json_chars.get(char, fallback)
        if index is None:
            raise KeyError(
                f"character {char!r} is not in the vocabulary and no 'F' fallback entry exists"
            )
        indices.append(index)

    example["phrase"] = torch.tensor(indices)
    return example


# Remove NaN values from tensor
def pre_process(example):
    """Apply ``torch.nan_to_num`` to every tensor of ``example`` and return the same dict."""
    cleaned = {name: torch.nan_to_num(tensor) for name, tensor in example.items()}
    example.update(cleaned)
    return example


# Normalize landmark vector
def normalize(example):
    """Placeholder for landmark normalisation; currently returns ``example`` unchanged."""
    return example


# Collate function, this pads https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html
# def collate_fn(batch):
#    sequence_lengths = [len(tensor) for seq in batch for tensor in seq.values()]
#    max_len = max(sequence_lengths)
#    padded_batch = [
#        {
#            key: (
#                F.pad(
#                    input=tensor,
#                    pad=(
#                        0,
#                        max_len - tensor.shape[0],
#                    ),
#                    mode="constant",
#                    value=0,
#                )
#                if key != "phrase"
#                else tensor
#            )
#            for key, tensor in seq.items()
#        }
#        for seq in batch
#    ]
#
#    stacked_landmarks = {
#        key: torch.stack([seq[key] for seq in padded_batch]) for key in SEL_COLS
#    }
#    max_phrase_len = max([len(seq["phrase"]) for seq in padded_batch])
#    padded_phrases = [
#        F.pad(
#            input=seq["phrase"],
#            pad=(0, max_phrase_len - seq["phrase"].shape[0]),
#            mode="constant",
#            value=0,
#        )
#        for seq in batch
#    ]
#    # Stack the tensors along a new dimension
#    stacked_phrases = torch.stack(padded_phrases, dim=0)
#
#    return batch  # stacked_landmarks, stacked_phrases


def collate_fn(batch):
    """Pad and stack a list of examples into ``(landmarks, phrases)`` tensors.

    Args:
        batch: List of dicts, each holding one tensor per ``SEL_COLS`` key and a
            token tensor under ``"phrase"`` (assumed 1-D, as only the last
            dimension is padded — TODO confirm).

    Returns:
        Tuple ``(stacked_landmarks, stacked_phrases)``. Landmark columns are
        right-padded with 0 to the longest sequence in the batch and stacked
        with the ``SEL_COLS`` columns along the last dimension. Phrases are
        right-padded with ``pad_token_idx`` to the longest phrase in the batch
        and stacked along dimension 0.
    """

    def pad_to(tensor, length, value):
        """Right-pad the last dimension of ``tensor`` with ``value`` up to ``length``."""
        return F.pad(
            input=tensor,
            pad=(0, length - tensor.shape[0]),
            mode="constant",
            value=value,
        )

    # Only landmark columns determine the frame padding length; phrase lengths
    # were previously mixed into this maximum as well.
    max_len = max(seq[key].shape[0] for seq in batch for key in SEL_COLS)

    # Stack per column over the batch, then put the columns on the last axis.
    stacked_landmarks = torch.stack(
        [
            torch.stack([pad_to(seq[key], max_len, 0) for seq in batch])
            for key in SEL_COLS
        ],
        dim=-1,
    )

    # Pad phrases with the dedicated padding token rather than 0: index 0 is a
    # regular vocabulary entry, while the losses in this notebook are built
    # with ignore_index=pad_token_idx.
    max_phrase_len = max(seq["phrase"].shape[0] for seq in batch)
    stacked_phrases = torch.stack(
        [pad_to(seq["phrase"], max_phrase_len, pad_token_idx) for seq in batch],
        dim=0,
    )
    return stacked_landmarks, stacked_phrases


# Build datapipes: TRAIN ONLY
def build_train_pipe(batch_size, drop_last):
    """Build the training datapipe over the first 80% of the TFRecord files.

    Args:
        batch_size: Number of examples per batch.
        drop_last: Forwarded to ``Batcher``.

    Returns:
        A datapipe that yields the output of ``collate_fn`` for each batch.
    """
    record_paths = dataset_df.file_id.map(lambda x: f"preprocessed/{x}.tfrecord").unique()
    n_train = int(0.8 * len(record_paths))

    # File paths -> binary streams -> parsed TFRecord examples.
    pipe = FileLister(record_paths[:n_train])
    pipe = FileOpener(pipe, mode="b")
    pipe = TFRecordLoader(pipe)

    # Per-example transforms, applied in this order (normalize is not applied).
    for transform in (tokenize_phrase, pre_process):
        pipe = Mapper(pipe, transform)

    pipe = Batcher(pipe, batch_size=batch_size, drop_last=drop_last)
    return Collator(pipe, collate_fn=collate_fn)


# Build the training pipeline and wrap it in a data loader.
datapipe = build_train_pipe(batch_size=32, drop_last=True)

dataloader = DataLoader2(datapipe=datapipe)

torch.set_printoptions(profile="full")

# Sanity check: show the shapes of the first batch once. The previous loop
# printed the same landmark shape 32 times.
for batch in dataloader:
    landmarks, phrases = batch
    print("landmarks:", landmarks.shape)
    print("phrases:", phrases.shape)
    break

In [None]:
import math
import torch
import torch.nn as nn


class TokenEmbedding(nn.Module):
    """Token embedding scaled by ``sqrt(num_hid)`` plus sinusoidal position encoding.

    Args:
        num_vocab: Size of the vocabulary.
        maxlen: Number of positions for which the encoding is precomputed.
        num_hid: Embedding dimension.
    """

    def __init__(self, num_vocab=1000, maxlen=100, num_hid=342):
        super().__init__()
        self.num_hid = num_hid
        self.emb = nn.Embedding(num_vocab, num_hid)
        # Registered as a non-persistent buffer: it follows the module across
        # devices with `.to(...)` but is not stored in the state dict.
        self.register_buffer(
            "pos_emb", self.positional_encoding(maxlen, num_hid), persistent=False
        )

    def forward(self, x):
        """Embed token indices ``x`` of shape ``[batch, seq_len]``.

        Returns:
            Tensor of shape ``[batch, seq_len, num_hid]``.
        """
        seq_len = x.size(1)
        x = self.emb(x) * math.sqrt(self.num_hid)
        return x + self.pos_emb[:seq_len, :].to(x.device)

    @staticmethod
    def positional_encoding(maxlen, d_model):
        """Return the ``[maxlen, d_model]`` sinusoidal position-encoding table.

        Even columns hold sines and odd columns hold cosines. Odd ``d_model``
        is supported: there is then one cosine column fewer than sine columns.
        """
        position = torch.arange(0, maxlen, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pos_encoding = torch.zeros(maxlen, d_model)
        pos_encoding[:, 0::2] = torch.sin(position * div_term)
        # Slicing matters only for odd d_model, where assigning the full
        # div_term used to raise a shape mismatch.
        pos_encoding[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        return pos_encoding


class LandmarkEmbedding(nn.Module):
    """Embed landmark frames with three 1-D convolutions plus position encoding.

    Args:
        num_hid: Number of output channels of every convolution.
        maxlen: Number of positions for which the encoding is precomputed.
        num_features: Number of landmark features per frame (input channels of
            the first convolution). Defaults to 342, the previously hard-coded
            value.
    """

    def __init__(self, num_hid=342, maxlen=453, num_features=342):
        super().__init__()
        self.conv1 = self._conv_block(num_features, num_hid)
        self.conv2 = self._conv_block(num_hid, num_hid)
        self.conv3 = self._conv_block(num_hid, num_hid)
        # Registered as a non-persistent buffer: it follows the module across
        # devices with `.to(...)` but is not stored in the state dict.
        self.register_buffer(
            "pos_emb", self.positional_encoding(maxlen, num_hid), persistent=False
        )
        self.maxlen = maxlen
        self.num_hid = num_hid

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Return a length-preserving Conv1d (kernel 11, padding 5) followed by ReLU."""
        return nn.Sequential(
            nn.Conv1d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=11,
                padding=5,
            ),
            nn.ReLU(),
        )

    def forward(self, x):
        """Embed ``x`` of shape ``[batch, seq_len, num_features]``.

        Returns:
            Tensor of shape ``[batch, seq_len, num_hid]``.
        """
        # Conv1d expects channels first: [batch, features, seq_len].
        x = x.permute(0, 2, 1)
        x = self.conv3(self.conv2(self.conv1(x)))
        # Back to [batch, seq_len, num_hid].
        x = x.permute(0, 2, 1)

        x = x * math.sqrt(self.num_hid)

        seq_len = x.size(1)
        if seq_len <= self.pos_emb.size(0):
            pos = self.pos_emb[:seq_len, :].to(x.device)
        else:
            # Sequence longer than the precomputed table: build the encoding
            # for this length instead of failing on a shape mismatch.
            pos = self.positional_encoding(seq_len, self.num_hid).to(x.device)
        return x + pos

    @staticmethod
    def positional_encoding(maxlen, d_model):
        """Return the ``[maxlen, d_model]`` sinusoidal position-encoding table.

        Even columns hold sines and odd columns hold cosines. Odd ``d_model``
        is supported: there is then one cosine column fewer than sine columns.
        """
        position = torch.arange(0, maxlen, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pos_encoding = torch.zeros(maxlen, d_model)
        pos_encoding[:, 0::2] = torch.sin(position * div_term)
        # Slicing matters only for odd d_model, where assigning the full
        # div_term used to raise a shape mismatch.
        pos_encoding[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        return pos_encoding

In [None]:
class TransformerEncoder(nn.Module):
    """Single encoder layer: self-attention and feed-forward, each with residual + LayerNorm.

    Args:
        d_model: Feature dimension of the input and output.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the feed-forward network.
        dropout: Dropout probability used throughout the layer.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.norm2 = nn.LayerNorm(d_model, eps=1e-6)
        # Dropout inside the feed-forward network. `dropout2` used to be
        # applied both there and on the residual branch.
        self.dropout = nn.Dropout(dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        """Encode ``src`` (batch-first); ``src_mask`` is passed to attention as ``attn_mask``."""
        # 1. Self-attention with residual connection and normalisation.
        attn_out, _ = self.self_attn(src, src, src, attn_mask=src_mask)
        src = self.norm1(src + self.dropout1(attn_out))

        # 2. Feed-forward network. The ReLU was missing, which collapsed the
        #    two linear layers into one linear map.
        ff_out = self.linear2(self.dropout(torch.relu(self.linear1(src))))
        return self.norm2(src + self.dropout2(ff_out))

In [None]:
class TransformerDecoder(nn.Module):
    """Single decoder layer: self-attention, encoder-decoder attention and feed-forward.

    Every sub-layer is followed by a residual connection and a LayerNorm.
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super().__init__()
        # Attention over the target sequence itself.
        self.self_attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)
        # Attention from the target sequence onto the encoder output.
        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)
        # Position-wise feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.norm1, self.norm2, self.norm3 = (
            nn.LayerNorm(d_model, eps=1e-6) for _ in range(3)
        )
        self.dropout1, self.dropout2, self.dropout3 = (
            nn.Dropout(dropout) for _ in range(3)
        )

    def forward(self, tgt, memory, tgt_mask=None, src_mask=None):
        """Decode ``tgt`` against ``memory`` (both batch-first).

        ``tgt_mask`` is the ``attn_mask`` of the self-attention and ``src_mask``
        the ``attn_mask`` of the encoder-decoder attention.
        """
        # Self-attention over the target, then residual + norm.
        self_attended = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask)[0]
        hidden = self.norm1(tgt + self.dropout1(self_attended))

        # Queries from the decoder state, keys and values from the encoder output.
        cross_attended = self.multihead_attn(
            hidden, memory, memory, attn_mask=src_mask
        )[0]
        hidden = self.norm2(hidden + self.dropout2(cross_attended))

        # Feed-forward network with residual + norm.
        feed_forward = self.linear2(self.dropout3(F.relu(self.linear1(hidden))))
        return self.norm3(hidden + feed_forward)

In [None]:
class Transformer(nn.Module):
    """Encoder-decoder transformer mapping landmark sequences to character tokens.

    Args:
        num_hid: Model (embedding) dimension.
        num_head: Number of attention heads.
        num_feed_forward: Hidden size of the feed-forward networks.
        source_maxlen: Number of source positions with a precomputed encoding.
        target_maxlen: Number of target positions; also the length of the
            sequences produced by ``generate``.
        num_layers_enc: Number of encoder layers.
        num_layers_dec: Number of decoder layers.
        num_classes: Size of the output vocabulary.
    """

    def __init__(
        self,
        num_hid=64,
        num_head=2,
        num_feed_forward=128,
        source_maxlen=453,
        target_maxlen=100,
        num_layers_enc=4,
        num_layers_dec=1,
        num_classes=62,
    ):
        super().__init__()
        # Uses the module-level `pad_token_idx`, so padded positions are ignored.
        self.loss = nn.CrossEntropyLoss(ignore_index=pad_token_idx)
        self.num_layers_enc = num_layers_enc
        self.num_layers_dec = num_layers_dec
        self.target_maxlen = target_maxlen
        self.num_classes = num_classes

        # The embeddings are no longer pinned with `.to("cuda")`: that crashed
        # on machines without CUDA, and sub-modules already follow the parent
        # when the caller moves the whole model with `.to(device)`.
        self.enc_emb = LandmarkEmbedding(num_hid=num_hid, maxlen=source_maxlen)
        self.dec_emb = TokenEmbedding(
            num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid
        )

        self.encoder = nn.Sequential(
            *[
                TransformerEncoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_enc)
            ]
        )

        self.decoder_layers = nn.ModuleList(
            [
                TransformerDecoder(num_hid, num_head, num_feed_forward)
                for _ in range(num_layers_dec)
            ]
        )

        self.classifier = nn.Linear(num_hid, num_classes)

    def decode(self, enc_out, target, tgt_mask=None, src_mask=None):
        """Embed ``target`` and run it through every decoder layer against ``enc_out``."""
        y = self.dec_emb(target)
        for decoder_layer in self.decoder_layers:
            y = decoder_layer(y, enc_out, tgt_mask=tgt_mask, src_mask=src_mask)
        return y

    def forward(self, source, target, src_mask=None, tgt_mask=None):
        """Return logits of shape ``[batch, target_len, num_classes]``.

        No mask is applied unless the caller passes one. For teacher-forced
        training pass ``tgt_mask=generate_square_subsequent_mask(target_len)``.
        """
        source = self.enc_emb(source)
        memory = self.encoder(source)
        output = self.decode(memory, target, tgt_mask=tgt_mask, src_mask=src_mask)
        output = self.classifier(output)
        return output

    @staticmethod
    def generate_square_subsequent_mask(size, device=None):
        """Return a ``[size, size]`` additive causal mask.

        Entries above the diagonal (future positions) are ``-inf`` and all
        others are 0. The earlier version left the future open, blocked the
        past instead, and added a leading batch dimension that
        ``nn.MultiheadAttention`` only accepts when ``batch * heads == 1``.
        """
        return torch.triu(
            torch.full((size, size), float("-inf"), device=device), diagonal=1
        )

    def generate(self, source, target_start_token_idx):
        """Greedily decode ``target_maxlen`` tokens for every sequence in ``source``.

        Args:
            source: Raw landmark batch, in the same format as ``forward`` takes.
            target_start_token_idx: Index of the start token.

        Returns:
            Long tensor of token indices of shape ``[batch, target_maxlen]``,
            start token included.
        """
        batch_size = source.size(0)
        device = source.device

        # Embed the landmarks before encoding, exactly as `forward` does
        # (previously the raw features went straight into the encoder).
        enc_output = self.encoder(self.enc_emb(source))

        # Start every sequence with the start token, on the input's device.
        dec_input = torch.full(
            (batch_size, 1), target_start_token_idx, dtype=torch.long, device=device
        )

        for _ in range(self.target_maxlen - 1):
            tgt_mask = self.generate_square_subsequent_mask(
                dec_input.size(1), device=device
            )
            dec_out = self.decode(enc_output, dec_input, tgt_mask=tgt_mask)
            # Only the last position predicts the next token.
            prediction = self.classifier(dec_out[:, -1])
            pred_idx = torch.argmax(prediction, dim=1, keepdim=True)
            dec_input = torch.cat([dec_input, pred_idx], dim=1)

        # Kept for backward compatibility: a no-op unless target_maxlen == 1.
        return dec_input.squeeze(1)

In [None]:
transformer = Transformer(
    num_hid=200,
    num_head=4,
    num_feed_forward=400,
    source_maxlen=784,
    target_maxlen=64,
    num_layers_enc=2,
    num_layers_dec=1,
    num_classes=62,
)
optimizer = torch.optim.Adam(transformer.parameters(), lr=0.001)
# The model's classifier returns raw logits, so use CrossEntropyLoss (which
# applies log-softmax itself). NLLLoss expects log-probabilities and would be
# fed unnormalised scores here. Padded target positions are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_idx)

In [None]:
transformer.to(device)
print("CUDA AVAILABLE: ", torch.cuda.is_available(), ", CUDA DEVICE: ", device)

transformer.train()
for epoch in range(10):
    epoch_loss = 0.0
    num_batches = 0
    for batch in dataloader:
        source, target = batch

        # Move data to the training device
        source = source.to(device)
        target = target.to(device)

        # Teacher forcing: the decoder reads tokens [0, T-1) and must predict
        # tokens [1, T). Feeding and scoring the same unshifted target lets the
        # model simply copy its input.
        decoder_input = target[:, :-1]
        labels = target[:, 1:]

        # Additive causal mask: position i may not attend to positions > i.
        seq_len = decoder_input.size(1)
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), device=device), diagonal=1
        )

        # Gradients accumulate across steps unless they are cleared first.
        optimizer.zero_grad()
        outputs = transformer(source, decoder_input, tgt_mask=causal_mask)
        loss = criterion(
            outputs.reshape(-1, transformer.num_classes), labels.reshape(-1)
        )
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    print(f"Epoch {epoch + 1}: mean loss {epoch_loss / max(num_batches, 1):.4f}")
    # Code for evaluating the model on validation data (optional)

# Model Note, Reference, Brainstorm

https://www.youtube.com/watch?v=4Bdc55j80l8

## Input Embedding Layer

The phrase must be vectorized (our case is chars)
We must add Positional Encoding to create Positional Input Embeddings

## Encoder Layer

Two sub-modules:

### Attention

#### Self-Attention

3 Distinct fully connected layers

- Query, Key, Value
- A dot product of the Query and Key matrices is computed to create a score matrix.
- The score matrix is a table that dictates how much weight each word or char should be given relative to the other words or chars in the input sequence. Higher score = more important = more focus.
- The score matrix is scaled by dividing it by the square root of the dimension of the key vectors, which gives the scaled scores.
- The scaled scores are then passed through a softmax function to receive the attention weights.
- Multiply attention weights by value matrix to get the output vector of the self-attention layer.
- Linear layer to process.

#### Multi-headed Attention

The query, key, and value is split into N heads.

- Each vector goes through the attention layer as normal
- The output of all heads is concatenated into a single vector
- Each head is given a different representation of the input sequence, allowing the model to simultaneously attend to information from different representation subspaces.
- Each head in theory should learn to attend to different parts of the input sequence, and thus overall learn more of the input sequence.

### Residual Connection, Layer Normalization, and Feed Forward Layer

- The Multi-headed attention output vector is added to the original input vector to the sub-layer, which is called a residual connection.
- This output is then normalized by layer normalization.
- This enters a feed forward network, which is a simple 2 layer fully connected network with a ReLU activation in between.
- The output of that is added again to the original input, to be normalized again. Like before with the multi-headed attention.

## Decoder Layer

### Output Embedding Layer and Positional Encoding

- The output goes through an embedding layer and positional encoding to get the positional embeddings.
- This enters the first multi-headed attention layer.
- The scaled scores are added to a look-ahead mask, which prevents the decoder from attending to future tokens.
- The mask sets the scores of future tokens to negative infinity, so the softmax gives them a weight of 0 and no attention is paid to them.
- All the heads are combined to create a masked output layer.

The second multi-headed attention layer takes its queries from the output of the previous (masked) multi-headed attention layer, and its keys and values from the encoder output.

A final feed forward network is applied to the output of the second multi-headed attention layer.

### Classifier, softmax, and output
The feed forward output enters a linear classifier.

Classifier output enters softmax to get the probability score of each class (char). The index of the highest probability is the predicted char.

Output is added to a list of decoded outputs until the end token is reached.

This whole structure can be stacked N layers high. The output of the final encoder layer is fed into every decoder layer, and each decoder layer also takes the output of the previous decoder layer as its input.

# Notes

```python
import tensorflow_addons as tfa
pbar = tfa.callbacks.TQDMProgressBar()
model.fit(…,callbacks=[pbar])
# TQDMProgressBar() also works with evaluate()
model.evaluate(…, callbacks=[pbar])
```
Check!

## Multiprocessing

```python
with Pool(workers) as pool:
    results = list(tqdm(pool.imap(worker, thread_list), total=len(thread_list)))
``` 

Check!

## Padding strategies

1. No Padding with <EOS> token: This is the most efficient and elegant approach for Transformers.

2. Full Padding: Pad all phrases (and potentially feature sequences) to a fixed maximum length using a constant value or specific technique. This can be simpler to implement but introduces unnecessary computational overhead and potential information distortion due to large padding sections.

3. Full Padding with Masking: This combines the simplicity of full padding with the benefits of masking. While you pad all sequences to a fixed length, you apply masking during training to prevent the model from attending to the padded regions. This can be a good compromise if your model struggles with highly variable sequence lengths but you still want to avoid the downsides of excessive padding.