# Import Libraries

In [None]:
import ast
import math
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List, Optional, Tuple
from torch.utils.data import Dataset, DataLoader
from torch.optim.swa_utils import AveragedModel, get_ema_multi_avg_fn
from sklearn.model_selection import train_test_split

# Define Configuration

In [None]:
class Config(dict):
    """ Dictionary whose keys are also reachable as object attributes (``cfg.key`` is ``cfg['key']``). """

    def __init__(self, *args, **kwargs):
        # Fill the mapping exactly like a plain dict would.
        dict.__init__(self, *args, **kwargs)
        # Alias the attribute namespace to the mapping itself, so that attribute
        # access and item access read from and write to the same storage.
        self.__dict__ = self

In [None]:
# Lookup table: command id -> list of five raw control values for that command.
# PoseDataset scales the first four values by 160 and maps the fifth from [100, 160] to [0, 1];
# which physical joint each slot drives is not stated here (presumably finger/wrist servos - TODO confirm).
# NOTE(review): ids 0 and 5 hold identical values, and so do ids 2 and 7 - confirm this is intentional.
command_primitives = {
    0: [160, 0, 160, 0, 160],       # Unknown command.
    1: [0, 160, 0, 0, 100],         # 1
    2: [0, 160, 160, 0, 100],       # 2
    3: [160, 160, 160, 0, 100],     # 3
    4: [160, 0, 160, 0, 100],       # 4
    5: [160, 0, 160, 0, 160],       # 5
    6: [0, 160, 0, 160, 100],       # Fist
    7: [0, 160, 160, 0, 100],       # Victory Sign
    8: [0, 0, 0, 160, 160],         # Telephone Call
    9: [0, 0, 0, 160, 100],         # Pinky Promise
    10: [0, 160, 0, 0, 160],        # Loser
    11: [0, 160, 0, 160, 160],      # Good
    12: [160, 0, 160, 160, 100],    # OK
}

In [None]:
# Experiment configuration, grouped into model, training and experiment-level settings.
config = Config(
    model=Config(
        fc_encoder_layers=[256],                                # Fully-Connected encoder layers (before memory network).
        fc_decoder_layers = [256],                              # Fully-Connected decoder layers (after memory network).
        use_controls = True,                                    # Whether to utilize current control inputs to predict the target controls.
        control_embeddings_dim = 384,                           # Control embeddings size (if use_controls is True).
        use_lstm = True,                                        # Whether to use LSTM as memory network.
        num_lstm_layers = 1,                                    # Number of lstm layers (if use_lstm is True).
        lstm_units = 256,                                       # Number of lstm units per layer.
        dropout_rate = 0.2,                                     # Dropout rate for the encoder (set 0.0 to deactivate).
        layer_norm = False,                                     # Whether to apply layer normalization.
        checkpoint_directory = 'checkpoints/inmoovposenet'      # Model checkpoint directory.
    ),
    training=Config(
        epochs = 1000,                                          # Number of training epochs.
        num_repeats = 1,                                        # How many times to repeat the same command during training (used for self-correction).
        batch_size = 16,                                        # Batch size during training.
        learning_rate = 0.001,                                  # Learning rate of the Adam optimizer.
        lr_decay_factor = 1.0,                                  # Decay factor of the learning rate, lr' = lr*decay_factor (set 1.0 to deactivate).
        lr_decay_patience = 50,                                 # The learning rate will decay if eval loss does not improve after the specified epochs.
        use_ema = False,                                        # Whether to keep an Exponential-Moving-Average copy of the model weights, which is then used for evaluation.
        early_stopping_patience = 100,                          # Stops the training if eval loss does not improve after specified epochs.
        alpha_init = 1.0,                                       # Initial blend factor alpha: next controls = alpha*targets + (1-alpha)*outputs (if use_controls is True, set 0.0 to deactivate).
        alpha_decay_epochs = 100,                               # Alpha is decayed linearly from alpha_init towards 0.0 over the specified epochs.
        noise_std = 0.00,                                       # Additional gaussian noise std factor (set 0.0 to deactivate).
        shuffle = True                                          # Whether to shuffle training samples.
    ),
    experiments=Config(
        dataset_filepath = 'preprocessed_dataset.csv',          # The full dataset filepath.
        test_size = 0.2,                                        # The test size ratio of the dataset.
        seed = 0,                                               # The seed, which will be used throughout the experiment pipeline (set None to deactivate).
        metrics_filename = 'metrics.csv',                       # The filename of metrics dataframe, which will be used to store the train metrics.
        figures_filename = 'figures.png'                        # The filename of metrics plots, which will be used to display the train metrics.
    )
)

# Set Random Seeds

In [None]:
# Seed every random number generator used by the pipeline (python, numpy, torch).
# A seed of None means "do not seed" (see config.experiments.seed). torch.manual_seed()
# rejects None, so the whole seeding step is skipped in that case.
if config.experiments.seed is not None:
    random.seed(config.experiments.seed)
    np.random.seed(seed=config.experiments.seed)
    torch.manual_seed(seed=config.experiments.seed)

# Dataset Format Overview

In [None]:
# Preview the raw dataset: as the last expression of the cell, the dataframe is rendered by the notebook.
pd.read_csv(config.experiments.dataset_filepath)

# Load Dataset & Split into Train-Test

In [None]:
# Load the full dataset and split it into train/test partitions.
dataset_df = pd.read_csv(config.experiments.dataset_filepath)

# Pass the experiment seed explicitly: the split then stays identical whenever this cell is re-run,
# instead of depending on how much of numpy's global random stream has been consumed so far.
# A seed of None keeps the non-deterministic behaviour.
df_train, df_test = train_test_split(
    dataset_df,
    test_size=config.experiments.test_size,
    random_state=config.experiments.seed
)

# Display the resulting partition shapes.
df_train.shape, df_test.shape

# Construct & Preprocess Dataset

In [None]:
class PoseDataset(Dataset):
    """ Pytorch Dataset wrapper class, which prepares, normalizes and validates the dataset before training.

    Each item is a tuple of (text embedding, padded control sequence, non-padded sequence length).
    """

    def __init__(
            self,
            df: pd.DataFrame,
            normalize: bool = True,
            validate_dataset: bool = True,
            max_seq_len: Optional[int] = None,
            num_repeats: int = 1
    ):
        """
        :param df: The loaded raw dataframe. It must provide the 'Embeddings' and 'Pose_Sequence' columns,
                   each holding the string representation of a python list.
        :param normalize: Whether to normalize the controls in range (0.0, 1.0). The first 4 controls are divided
                          by the max motor value (160.0) and the 5th control is rescaled from [100, 160].
        :param validate_dataset: Whether to validate the text embeddings and control sequence dimensions after construction.
                                 Set true to validate once and then disable it.
        :param max_seq_len: The specified max sequence length, which is used to apply padding. If None, it is calculated from the dataset.
                            Set None for train dataset and pass this parameter into the test dataset.
        :param num_repeats: Number of command repetitions inside the command sequence, before switching command (used for self-correction).
        :raises ValueError: If the dataframe is empty or if a sequence is longer than the specified max_seq_len.
        :raises RuntimeError: If validate_dataset is True and the constructed arrays are inconsistent.
        """
        super().__init__()

        self.max_seq_len = max_seq_len
        self.num_repeats = num_repeats

        self._text_embeddings, self._control_sequences, self._sequence_sizes = self._construct_dataset(df=df)
        self.text_embeddings_dim = self._text_embeddings.shape[1]
        self.controls_dim = self._control_sequences.shape[2]

        if normalize:
            # Normalize the first 4 elements by dividing by 160
            self._control_sequences[:, :, :4] /= 160.0

            # Normalize the 5th element (index 4) from range [100, 160] to [0, 1]
            self._control_sequences[:, :, 4] = (self._control_sequences[:, :, 4] - 100.0) / 60.0

        if validate_dataset:
            self._validate_dataset()

    def __len__(self) -> int:
        return len(self._text_embeddings)

    def __getitem__(self, idx) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        return self._text_embeddings[idx], self._control_sequences[idx], self._sequence_sizes[idx]

    def _construct_dataset(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """ Constructs and returns the arrays of text embeddings, control sequences and non-padded sequence lengths. """

        # Convert strings to actual literal lists.
        embeddings_list = list(map(ast.literal_eval, df['Embeddings']))             # List of N text embedding vectors.
        pose_sequence_list = list(map(ast.literal_eval, df['Pose_Sequence']))       # List of N command-id sequences (T ids each).

        if not pose_sequence_list:
            raise ValueError('Cannot construct a PoseDataset from an empty dataframe.')

        # Apply command repetition.
        if self.num_repeats > 1:
            pose_sequence_list = [
                [command for command in sequence for _ in range(self.num_repeats)]
                for sequence in pose_sequence_list
            ]

        # Calculate (or verify) the sequence size of controls.
        longest_sequence = max(len(sequence) for sequence in pose_sequence_list)
        if self.max_seq_len is None:
            self.max_seq_len = longest_sequence
        elif longest_sequence > self.max_seq_len:
            # Longer sequences cannot be padded into a rectangular array and would not fit
            # the loss mask built from max_seq_len, so fail early with a clear message.
            raise ValueError(
                f'Found a sequence of {longest_sequence} commands, which exceeds max_seq_len={self.max_seq_len}.'
            )

        # Define zero controls, which will be used for the padding.
        # After normalization they become all zeros (the 5th control has an offset of 100).
        zero_controls = np.array([0.0, 0.0, 0.0, 0.0, 100.0], dtype=np.float32)

        text_embeddings = []
        control_sequences = []
        sequence_sizes = []

        for embeddings, sequence in zip(embeddings_list, pose_sequence_list):
            text_embeddings.append(embeddings)

            seq_len = len(sequence)
            sequence_sizes.append(seq_len)

            # Fetch control values for each command in the sequence.
            controls = [np.float32(command_primitives[command]) for command in sequence]

            # Apply padding to the rest of the control sequence, until max_seq_len slots are filled.
            padding_size = self.max_seq_len - seq_len
            if padding_size > 0:
                controls += [zero_controls]*padding_size
            control_sequences.append(controls)
        return np.float32(text_embeddings), np.float32(control_sequences), np.int32(sequence_sizes)

    def _validate_dataset(self):
        """ Verifies that the constructed arrays agree in size and have the expected dimensions.

        :raises RuntimeError: If the number of samples differs between the arrays or an array has an unexpected shape.
        """
        embeddings = self._text_embeddings
        controls = self._control_sequences
        lengths = self._sequence_sizes

        # All three arrays must describe the same number of samples.
        if not (embeddings.shape[0] == controls.shape[0] == lengths.shape[0]):
            raise RuntimeError(f'Dataset Size Mismatch: Embeddings: {embeddings.shape}, Controls: {controls.shape}, Sequence Sizes: {lengths.shape}')
        if not (embeddings.ndim == 2 and embeddings.shape[1] == self.text_embeddings_dim):
            raise RuntimeError(f'Expected Embeddings to be 2D array of {self.text_embeddings_dim} features, got {embeddings.shape}')
        if not (controls.ndim == 3 and controls.shape[2] == self.controls_dim):
            raise RuntimeError(f'Expected Controls to be 3D array of {self.controls_dim} features, got {controls.shape}')
        if lengths.ndim != 1:
            raise RuntimeError(f'Expected Sequence sizes to be 1D array, got {lengths.shape}')

        print(f'Sample-0: Embedding: {self._text_embeddings[0].shape}, Control: {self._control_sequences[0].shape}, Seq Len: {self._sequence_sizes[0]}')



In [None]:
# Build the train dataset; max_seq_len=None lets the dataset derive the padding length from its own sequences.
train_dataset = PoseDataset(
    df=df_train, 
    normalize=True, 
    validate_dataset=True, 
    max_seq_len=None, 
    num_repeats=config.training.num_repeats
)
# Build the test dataset with the train dataset's max_seq_len, so both datasets are padded to the same length.
test_dataset = PoseDataset(
    df=df_test, 
    normalize=True, 
    validate_dataset=True, 
    max_seq_len=train_dataset.max_seq_len,
    num_repeats=config.training.num_repeats
)

# Display Random Training Sample

In [None]:
# Pick a random training sample (random.randint includes both endpoints, hence len - 1).
random_index = random.randint(0, len(train_dataset) - 1)
random_item = train_dataset[random_index]
# Each item is (text embedding, padded control sequence, non-padded sequence length).
x, y, sizes = random_item
# Display the embedding shape, the padded sequence length, the controls and the actual sequence length.
x.shape, len(y), y, sizes

# Construct InmoovPoseNet

In [None]:
class InmoovPoseNet(nn.Module):
    """ Pose network, which predicts the target controls from text embeddings and (optionally) the current controls.

    Pipeline: [control embeddings] -> [FC encoder] -> [LSTM] -> [projection] -> skip connection with the
    text embeddings -> [layer norm] -> FC decoder. The bracketed stages are enabled by the model config.
    """

    def __init__(
            self,
            text_embeddings_dim: int,
            control_dim: int,
            model_config
    ):
        """
        :param text_embeddings_dim: Number of features of the text embeddings.
        :param control_dim: Number of control values (model input and output size).
        :param model_config: Object with the attributes use_controls, control_embeddings_dim, fc_encoder_layers,
                             dropout_rate, use_lstm, lstm_units, num_lstm_layers, layer_norm and fc_decoder_layers.
        """
        super(InmoovPoseNet, self).__init__()

        self._use_controls = model_config.use_controls
        self._use_lstm = model_config.use_lstm
        self._use_layer_norm = model_config.layer_norm

        # Building embeddings for the current control inputs.
        if self._use_controls:
            self.control_embeddings = nn.Linear(in_features=control_dim, out_features=model_config.control_embeddings_dim)
            fc_input_dim = text_embeddings_dim + model_config.control_embeddings_dim
        else:
            self.control_embeddings = None
            fc_input_dim = text_embeddings_dim

        # Build FC encoder for pre-processing.
        if len(model_config.fc_encoder_layers) > 0:
            self._use_fc_preprocessor = True
            fc_encoder_units = [fc_input_dim] + list(model_config.fc_encoder_layers)
            encoder_layers = []
            for in_units, out_units in zip(fc_encoder_units[:-1], fc_encoder_units[1:]):
                encoder_layers.append(nn.Linear(in_units, out_units))
                encoder_layers.append(nn.GELU())

                if model_config.dropout_rate > 0.0:
                    # Element-wise dropout. nn.Dropout1d must not be used here: it treats a 2D
                    # (Batch x Features) input as an unbatched (Channels x Length) signal and
                    # would zero out all features of randomly selected samples.
                    encoder_layers.append(nn.Dropout(p=model_config.dropout_rate))
            self.fc_encoder = nn.Sequential(*encoder_layers)
            fc_output_dim = fc_encoder_units[-1]
        else:
            self._use_fc_preprocessor = False
            self.fc_encoder = None
            fc_output_dim = fc_input_dim

        # Building the Memory Network Encoder (LSTM, Transformer, etc.).
        if self._use_lstm:
            self.memory_encoder = nn.LSTM(
                input_size=fc_output_dim,
                hidden_size=model_config.lstm_units,
                num_layers=model_config.num_lstm_layers,
                batch_first=True
            )
            encoder_output_dim = model_config.lstm_units
        else:
            self.memory_encoder = None
            encoder_output_dim = fc_output_dim

        # Build projection layer to apply a skip-connection.
        # It is only required if the encoder output size differs from the text embedding size.
        self._project_encoder_out = encoder_output_dim != text_embeddings_dim
        if self._project_encoder_out:
            self.projection_layer = nn.Linear(in_features=encoder_output_dim, out_features=text_embeddings_dim)
        else:
            self.projection_layer = None

        # Build Layer Normalization layer, which is applied after the skip connection.
        if self._use_layer_norm:
            self.layer_norm = nn.LayerNorm(normalized_shape=text_embeddings_dim)
        else:
            self.layer_norm = None

        # Build FC decoder for post-processing. The activation is omitted after the output layer.
        fc_decoder_units = [text_embeddings_dim] + list(model_config.fc_decoder_layers) + [control_dim]
        last_layer_index = len(fc_decoder_units) - 2
        layers = []
        for i, (in_units, out_units) in enumerate(zip(fc_decoder_units[:-1], fc_decoder_units[1:])):
            layers.append(torch.nn.Linear(in_features=in_units, out_features=out_units))

            if i != last_layer_index:
                layers.append(torch.nn.GELU())
        self.fc_decoder = torch.nn.Sequential(*layers)

    def forward(
            self,
            inputs: Tuple[torch.Tensor, torch.Tensor],
            state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
        """ Computes the target controls for a single timestep.
            :param inputs: a tuple of text embeddings and current controls (both Batch x Features).
            :param state: the hidden state (h, c) of the memory network.
            :return: the predicted controls and the new hidden state of the memory network (None without LSTM).
            :raises RuntimeError: if the text embeddings or the controls are not 2D tensors.
        """
        # Fetch & Validate inputs.
        text_embeddings, controls = inputs      # Batch x Features

        if not (text_embeddings.dim() == 2 and controls.dim() == 2):
            raise RuntimeError(f'Expected Text & Controls to be Batch x Features, got {text_embeddings.shape} and {controls.shape}')

        # Generate control embeddings.
        if self._use_controls:
            control_embeddings = self.control_embeddings(controls)
            encoder_inputs = torch.cat(tensors=[text_embeddings, control_embeddings], dim=1)
        else:
            encoder_inputs = text_embeddings

        # FC encoder
        if self._use_fc_preprocessor:
            encoder_inputs = self.fc_encoder(encoder_inputs)

        if self._use_lstm:
            x = torch.unsqueeze(encoder_inputs, dim=1)                      # Batch x 1 x Features
            memory_out, state = self.memory_encoder(x, state)               # (Batch x 1 x Features), (h, c)
            encoder_out = torch.squeeze(memory_out, dim=1)                  # Batch x Features
        else:
            state = None
            encoder_out = encoder_inputs

        # Projecting encoder out to Text Embedding Dim
        if self._project_encoder_out:
            encoder_out = self.projection_layer(encoder_out)                # Batch x Text Dim

        # Adding Text Embeddings (skip connection)
        encoder_outputs = text_embeddings + encoder_out

        if self._use_layer_norm:
            encoder_outputs = self.layer_norm(encoder_outputs)

        # FC decoder
        outputs = self.fc_decoder(encoder_outputs)                          # Batch x Control Dim
        return outputs, state

    def get_initial_state(self, batch_size: int, device: Optional[torch.device] = None):
        """ Initializes and returns the initial memory state uniformly in range -0.001 to 0.001.
            :param batch_size: The desired batch size of the hidden state.
            :param device: The device of the model.
            :return: a tuple (h, c) of Layers x Batch x Units tensors, or None if no LSTM is used.
        """

        if not self._use_lstm:
            return None

        state_shape = (self.memory_encoder.num_layers, batch_size, self.memory_encoder.hidden_size)
        return (
            torch.rand(*state_shape, device=device) * 0.002 - 0.001,
            torch.rand(*state_shape, device=device) * 0.002 - 0.001
        )




In [None]:
# Device configuration
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate the model
# The input/output sizes are taken from the train dataset and the architecture from config.model.
model = InmoovPoseNet(
    text_embeddings_dim=train_dataset.text_embeddings_dim,
    control_dim=train_dataset.controls_dim,
    model_config=config.model
).to(device)

# Print the model
# As the last expression of the cell, the module structure is rendered by the notebook.
model

# Train Model

In [None]:
class Trainer:
    """ Trainer class for InmoovPoseNet. It uses adam optimizer with MAE loss. """
    
    def __init__(
            self, 
            model: InmoovPoseNet,
            device: torch.device,
            train_dataset: PoseDataset,
            test_dataset: PoseDataset,
            checkpoint_directory: str,
            train_config
    ):
        # Compiling model (optimizer & loss).
        self.model = model
        self.device = device
        self.optimizer = torch.optim.Adam(params=model.parameters(), lr=train_config.learning_rate)
        self.loss_fn = torch.nn.L1Loss()
        
        if train_config.lr_decay_factor < 1.0:
            self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer=self.optimizer,
                mode='min', 
                factor=train_config.lr_decay_factor, 
                patience=train_config.lr_decay_patience
            )
        else:
            self.lr_scheduler = None

        self.use_ema = train_config.use_ema
        if self.use_ema:
            self.ema_model = AveragedModel(model=model, device=device, multi_avg_fn=get_ema_multi_avg_fn(decay=0.999))
        else:
            self.ema_model = None

        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.checkpoint_directory = checkpoint_directory
        
        # Create checkpoint directory if it does not exist.
        if not os.path.exists(path=checkpoint_directory):
            os.makedirs(checkpoint_directory, exist_ok=True)

        # Initializing training parameters.
        self.alpha = train_config.alpha_init
        self.noise_std = train_config.noise_std
        self.controls_dim = train_dataset.controls_dim
        self.max_seq_len = train_dataset.max_seq_len
        self.train_config = train_config
        
    def get_sequence_predictions(
            self, 
            model: torch.nn.Module,
            text_embeddings: torch.Tensor, 
            controls: torch.Tensor,
            padded_control_sequence: torch.Tensor,
            state: Tuple[torch.Tensor],
            alpha: float,
            noise_std: float
    ) -> List[torch.Tensor]:
        """ Calculates the target controls of a given control sequence.
            :param model: The inmoov model.
            :param text_embeddings: the generated text embeddings.
            :param controls: the initial Inmoov controls.
            :param padded_control_sequence: the target control sequence.
            :param state: the initial hidden state of the memory network.
            :param alpha: alpha parameter value.
            :param noise_std: noise standard deviation value.
        """

        predictions = []
        for i in range(self.max_seq_len):
            # Calculate model predictions given current text embeddings and control.
            outputs, state = model((text_embeddings, controls), state)
            predictions.append(outputs)

            # Calculate new noisy current controls: controls(t+1) = a*targets(t) + (1-a)*outputs(t).
            controls = alpha*padded_control_sequence[:, i] + (1 - alpha)*outputs

            # Add noise to the generated controls. Usually, it works well for imitation learning tasks.
            if noise_std > 0.0:
                outputs += torch.randn_like(outputs, device=self.device)*noise_std
        return predictions
        
    def iterate_dataloader(
            self, 
            dataloader: DataLoader, 
            num_batches: int, 
            train: bool,
            alpha: float
    ) -> float:
        """ Executes either a train step or a validation step by iterating the provided dataloader. 
            :param dataloader: The dataloader whic (train/test).
            :param num_batches: The number of dataloader batches.
            :param train: Whether to perform train or validation step. In train mode, the gradients will be computed and model will be updated.
            :param alpha: alpha parameter value.
        :return: the average loss of the predictions.
        """
        total_loss = 0.0
        
        # Switch to train/eval model.
        if train:
            self.model.train()
        else:
            self.model.eval()
            
            if self.use_ema:
                self.ema_model.eval()
        
        for text_embeddings, padded_control_sequence, seq_sizes in dataloader:    
            # Construct model inputs and transfer to device.
            text_embeddings = text_embeddings.to(self.device)
            padded_control_sequence = padded_control_sequence.to(self.device)
            initial_controls = torch.rand(size=(padded_control_sequence.shape[0], self.controls_dim)).to(self.device)
            seq_sizes = seq_sizes.to(self.device)
            initial_state = self.model.get_initial_state(batch_size=text_embeddings.shape[0], device=self.device)
    
            if train:
                self.optimizer.zero_grad()
                predictions = self.get_sequence_predictions(
                    model=self.model,
                    text_embeddings=text_embeddings,
                    controls=initial_controls,
                    padded_control_sequence=padded_control_sequence,
                    state=initial_state,
                    alpha=alpha,
                    noise_std=self.noise_std,
                )
            else:
                with torch.no_grad():
                    predictions = self.get_sequence_predictions(
                        model=self.model if not self.use_ema else self.ema_model,
                        text_embeddings=text_embeddings,
                        controls=initial_controls,
                        padded_control_sequence=padded_control_sequence,
                        state=initial_state,
                        alpha=alpha,
                        noise_std=0.0
                    )

            # Fetch trainable control sequences (exclude paddings).
            predictions = torch.stack(predictions, dim=1)
            actual_seq_ranges = torch.arange(self.max_seq_len, device=device)
            mask = actual_seq_ranges.unsqueeze(0) < seq_sizes.unsqueeze(1)
            y_pred = predictions[mask]
            y_true = padded_control_sequence[mask]
    
            loss = self.loss_fn(y_pred, y_true)
            
            if train:
                loss.backward()
                self.optimizer.step()
                
                if self.use_ema:
                    self.ema_model.update_parameters(self.model)

            total_loss += loss.item()
        return round(total_loss/num_batches, 4)
            
    def train(self) -> pd.DataFrame:
        """ Trains the model and returns a dataframe with metrics. """
        train_losses = []
        test_losses = []
        alpha_values = []
        lr_values = []
        best_test_loss = np.inf
        early_stopping_counter = 0
        
        train_dataloader = DataLoader(dataset=self.train_dataset, batch_size=self.train_config.batch_size, shuffle=self.train_config.shuffle)
        test_dataloader = DataLoader(dataset=self.test_dataset, batch_size=self.train_config.batch_size, shuffle=False)
        num_train_batches = math.ceil(len(self.train_dataset)/self.train_config.batch_size)
        num_test_batches = math.ceil(len(self.test_dataset)/self.train_config.batch_size)
        epochs = self.train_config.epochs
        alpha_init = self.train_config.alpha_init
        
        for epoch in tqdm(iterable=range(epochs), desc='Epoch'):
            alpha_values.append(self.alpha)
            
            # Execute a train-step and validation-step and update learning rate.
            train_loss = self.iterate_dataloader(dataloader=train_dataloader, num_batches=num_train_batches, train=True, alpha=self.alpha)
            train_losses.append(train_loss)
            test_loss = self.iterate_dataloader(dataloader=test_dataloader, num_batches=num_test_batches, train=False, alpha=0.0)
            test_losses.append(test_loss)
            
            if self.lr_scheduler is not None:
                self.lr_scheduler.step(test_loss)

            lr_values.append(self.optimizer.param_groups[0]['lr'])
            
            # Save model checkpoints and activate early stopping mechanism (if triggered).
            if test_loss < best_test_loss:
                best_test_loss = test_loss
                early_stopping_counter = 0

                torch.save(model.state_dict(), f'{self.checkpoint_directory}/ckp.pt')
                
                print(f'Found new best validation loss at epoch {epoch}. Save model weights...')
            else:
                early_stopping_counter += 1
                
            if early_stopping_counter > self.train_config.early_stopping_patience:
                print(f'Early Stopping has been triggered at epoch: {epoch}.')
                
                break
            
            # Decay alpha parameter at the end of the epoch in a linear manner, until alpha = 0.0.
            if self.alpha > 0.0:
                decay_rate = alpha_init/self.train_config.alpha_decay_epochs
                self.alpha = alpha_init - decay_rate*epoch
        
            print(f'Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss} - Test Loss: {test_loss}, Alpha: {self.alpha}')
        
        return pd.DataFrame({
            'Epoch': range(1, len(train_losses) + 1),
            'Train Loss': train_losses,
            'Test Loss': test_losses,
            'a': alpha_values,
            'lr': lr_values
        })

# First training

In [None]:
# Build the trainer from the model, the datasets and the training configuration.
trainer = Trainer(
    model=model, 
    device=device, 
    train_dataset=train_dataset, 
    test_dataset=test_dataset, 
    checkpoint_directory=config.model.checkpoint_directory,
    train_config=config.training
)
# Train the model, store the per-epoch metrics as csv and display them.
metrics_df = trainer.train()
metrics_df.to_csv(config.experiments.metrics_filename)
metrics_df

# Generate Train-Validation Loss Figures

In [None]:
# Plot the train and test loss curves of the metrics dataframe on a single figure.
plt.figure(figsize=(15, 5))
metrics_df['Train Loss'].plot()
metrics_df['Test Loss'].plot()
# NOTE(review): the title says 'InmoovLSTMNet', while the model class is named InmoovPoseNet - confirm the intended name.
plt.title('InmoovLSTMNet Train Performance')
plt.xlabel('Epochs')
plt.ylabel('MAE Loss')
plt.legend()
# Save the figure before showing it.
plt.savefig(config.experiments.figures_filename)
plt.show()

# Hyperparameter tuning

In [None]:
# Hyperparameter Grid Search Utilities
import copy
import itertools

def dict_to_config(d):
    """Recursively rebuild nested dicts as Config objects (attribute access), descending into lists as well.

    Any value that is neither a dict nor a list is returned unchanged.
    """
    # Lists are rebuilt element by element.
    if isinstance(d, list):
        return [dict_to_config(item) for item in d]

    # Leaf values are passed through as they are.
    if not isinstance(d, dict):
        return d

    converted = Config()
    for key, value in d.items():
        converted[key] = dict_to_config(value)
    converted.__dict__ = converted  # Enable attribute-style access
    return converted

def _apply_grid_overrides(trial_config, params):
    """Set every dotted-path override from `params` on `trial_config`, in place."""
    for param_path, value in params.items():
        *section_names, leaf_name = param_path.split('.')
        target = trial_config
        for section_name in section_names:
            target = getattr(target, section_name)
        setattr(target, leaf_name, value)


def run_hyperparameter_grid(config_template, grid_params):
    """
    Train one model per combination of `grid_params` (grid search) and return the best trial.

    Besides its arguments, this function reads the notebook-level variables `df_train`,
    `df_test` and `device`, so the data-split and device cells must have been run first.

    Args:
        config_template: Base configuration with `model` and `training` sections. It is
            deep-copied for every trial, so the template itself is not modified.
        grid_params: Dictionary of parameter paths and values to test
            Example: {'model.use_lstm': [True, False], 
                     'model.use_controls': [True, False]}

    Returns:
        The result dict of the trial with the lowest test loss, with keys 'params'
        (the overrides of that trial), 'best_loss' (minimum of the 'Test Loss' column)
        and 'metrics' (the DataFrame returned by the trainer).

    Raises:
        ValueError: If any parameter has no candidate values. Such a grid has zero
            combinations, so there would be no trial to select a best result from.
    """
    # Materialize the candidates once so one-shot iterables work and can be validated.
    grid = {path: list(candidates) for path, candidates in grid_params.items()}
    empty_paths = [path for path, candidates in grid.items() if not candidates]
    if empty_paths:
        raise ValueError(f'No candidate values given for grid parameter(s): {empty_paths}')

    results = []
    param_names = list(grid)

    for i, values in enumerate(itertools.product(*grid.values())):
        # Create parameter combination dictionary
        params = dict(zip(param_names, values))

        # Fresh config per trial so overrides never leak between trials or into the template.
        trial_config = dict_to_config(copy.deepcopy(config_template))
        _apply_grid_overrides(trial_config, params)

        # Create unique checkpoint directory (named after the leaf name and value of each override).
        dir_suffix = "_".join([f"{k.split('.')[-1]}_{v}" for k, v in params.items()])
        trial_config.model.checkpoint_directory = f'checkpoints/grid_{dir_suffix}'

        print(f'\n\n=== Trial {i+1}: Testing {params} ===')

        # Datasets are rebuilt per trial because `training.num_repeats` may be part of the grid.
        train_dataset = PoseDataset(
            df=df_train,
            normalize=True,
            validate_dataset=True,
            max_seq_len=(trial_config.training.num_repeats*8),
            num_repeats=trial_config.training.num_repeats
        )
        test_dataset = PoseDataset(
            df=df_test,
            normalize=True,
            validate_dataset=True,
            max_seq_len=train_dataset.max_seq_len,
            num_repeats=trial_config.training.num_repeats
        )

        # Build model
        model = InmoovPoseNet(
            text_embeddings_dim=train_dataset.text_embeddings_dim,
            control_dim=train_dataset.controls_dim,
            model_config=trial_config.model
        ).to(device)

        # Train
        trainer = Trainer(
            model=model,
            device=device,
            train_dataset=train_dataset,
            test_dataset=test_dataset,
            checkpoint_directory=trial_config.model.checkpoint_directory,
            train_config=trial_config.training
        )

        metrics_df = trainer.train()
        best_test_loss = metrics_df['Test Loss'].min()
        results.append({
            'params': params,
            'best_loss': best_test_loss,
            'metrics': metrics_df
        })

    # Find best combination
    best_result = min(results, key=lambda x: x['best_loss'])
    print('\nGrid Search Complete! Best configuration:')
    print(f"Parameters: {best_result['params']}")
    print(f"Best validation loss: {best_result['best_loss']:.4f}")

    return best_result



## Model structure

### First comparison

In [None]:
# --------------------------------------------------
# Phase 1: Core Architecture Tuning
# --------------------------------------------------
# Grid-searches the three boolean architecture switches (2 x 2 x 2 = 8 trials) and writes
# the winning values back into the notebook-wide `config`.
print("\n=== Starting Phase 1: Core Architecture Tuning ===")

# Define Phase 1 parameters
phase1_params = {
    'model.use_lstm': [True, False],
    'model.use_controls': [True, False],
    'model.layer_norm': [True, False]
}

# Snapshot the current config as the template for this phase.
# NOTE(review): nothing here reduces the epoch count; the copy keeps the same training.epochs.
phase1_config = dict_to_config(copy.deepcopy(config))

# Run grid search for Phase 1
phase1_best = run_hyperparameter_grid(phase1_config, phase1_params)

# Apply best parameters to main config: walk each dotted path down to its parent section,
# then set the final component on it.
for param_path, value in phase1_best['params'].items():
    parts = param_path.split('.')
    obj = config
    for part in parts[:-1]: 
        obj = getattr(obj, part)
    setattr(obj, parts[-1], value)

print("\nPhase 1 Complete! Best parameters:")
print(phase1_best['params'])



### Second Comparison

In [None]:
# --------------------------------------------------
# Phase 2: LSTM Parameter Tuning (conditional)
# --------------------------------------------------
# Only runs when the current `config` has the LSTM enabled; searches 3 x 2 = 6 size/depth
# combinations and writes the winner back into `config`.
print("\n=== Starting Phase 2: LSTM Parameter Tuning ===")

if config.model.use_lstm:
    # Define LSTM-specific parameters
    phase2_params = {
        'model.lstm_units': [128, 256, 512],
        'model.num_lstm_layers': [1, 2]
    }

    # Snapshot the current config as the template for this phase.
    # NOTE(review): nothing here reduces the epoch count; the copy keeps the same training.epochs.
    phase2_config = dict_to_config(copy.deepcopy(config))

    # Run grid search for Phase 2
    phase2_best = run_hyperparameter_grid(phase2_config, phase2_params)

    # Apply best LSTM parameters: walk each dotted path down to its parent section,
    # then set the final component on it.
    for param_path, value in phase2_best['params'].items():
        parts = param_path.split('.')
        obj = config
        for part in parts[:-1]: 
            obj = getattr(obj, part)
        setattr(obj, parts[-1], value)

    print("\nPhase 2 Complete! Best LSTM parameters:")
    print(phase2_best['params'])
else:
    print("Skipping Phase 2 - LSTM disabled in current configuration")


## Training setup

In [None]:
# --------------------------------------------------
# Phase 3: Training Strategy Tuning
# --------------------------------------------------
# Searches 3 x 1 x 3 = 9 combinations of training settings and writes the winner back into
# `config`. `use_ema` has a single candidate, so it is fixed to True in every trial.
print("\n=== Starting Phase 3: Training Parameter Tuning ===")

# Define training parameters
phase3_params = {
    'training.num_repeats': [1, 2, 3],
    'training.use_ema': [True],
    'training.noise_std': [0.0, 0.001, 0.01]
}
# Snapshot the current config as the template for this phase.
# NOTE(review): nothing here reduces the epoch count; the copy keeps the same training.epochs.
phase3_config = dict_to_config(copy.deepcopy(config))

# Run grid search for Phase 3
phase3_best = run_hyperparameter_grid(phase3_config, phase3_params)

# Apply best training parameters: walk each dotted path down to its parent section,
# then set the final component on it.
for param_path, value in phase3_best['params'].items():
    parts = param_path.split('.')
    obj = config
    for part in parts[:-1]: 
        obj = getattr(obj, part)
    setattr(obj, parts[-1], value)

print("\nPhase 3 Complete! Best training parameters:")
print(phase3_best['params'])

## Final configuration

In [None]:
# --------------------------------------------------
# Final Configuration Setup
# --------------------------------------------------
# Prints the values currently stored in `config`; the cell only reports and changes nothing.
# NOTE(review): an earlier comment here said "Restore original epoch count", but no code in
# this cell touches the epoch count.

print("\n=== Final Optimized Configuration ===")
print(f"Architecture:")
print(f"- use_lstm: {config.model.use_lstm}")
# The LSTM size/depth lines are only reported when the LSTM is enabled.
if config.model.use_lstm:
    print(f"- lstm_units: {config.model.lstm_units}")
    print(f"- num_lstm_layers: {config.model.num_lstm_layers}")
print(f"- use_controls: {config.model.use_controls}")
print(f"- layer_norm: {config.model.layer_norm}")

print("\nTraining Strategy:")
print(f"- num_repeats: {config.training.num_repeats}")
print(f"- use_ema: {config.training.use_ema}")
print(f"- noise_std: {config.training.noise_std}")

# Best model with different seeds

In [None]:
# Configuration for the multi-seed runs. The loop in the next cell overwrites
# `experiments.seed` and `model.checkpoint_directory` for every seed it trains.
config = Config(
    model=Config(
        fc_encoder_layers=[256],                                # Fully-Connected encoder layers (before memory network).
        fc_decoder_layers = [256],                              # Fully-Connected decoder layers (after memory network).
        use_controls = True,                                    # Whether to utilize current control inputs to predict the target controls.
        control_embeddings_dim = 384,                           # Control embeddings size (if use_controls is True).
        use_lstm = True,                                        # Whether to use LSTM as memory network.
        num_lstm_layers = 1,                                    # Number of lstm layers (if use_lstm is True).
        lstm_units = 512,                                       # Number of lstm units per layer.
        dropout_rate = 0.2,                                     # Dropout rate for the encoder (set 0.0 to deactivate).
        layer_norm = False,                                     # Whether to apply layer normalization.
        checkpoint_directory = 'checkpoints/random_seeds'       # Model checkpoint directory.
    ),
    training=Config(
        epochs = 1000,                                          # Number of training epochs.
        num_repeats = 2,                                        # How many times to repeat the same command during training (used for self-correction).
        batch_size = 16,                                        # Batch size during training.
        learning_rate = 0.001,                                  # Learning rate of the ADAM optimizer.
        lr_decay_factor = 1.0,                                  # Decay factor of the learning rate, lr' = lr*decay_factor (set 1.0 to deactivate).
        lr_decay_patience = 50,                                # The learning rate will decay if eval loss does not improve after the specified epochs.
        use_ema = True,                                        # Whether to apply Exponential-Moving-Average smoothing. NOTE(review): the imported AveragedModel utilities average model weights, not gradients - confirm in Trainer.
        early_stopping_patience = 100,                          # Stops the training if eval loss does not improve after specified epochs.
        alpha_init = 1.0,                                       # Initial stochastic noise factor (if use_controls is True, set 0.0 to deactivate).
        alpha_decay_epochs = 100,                               # The stochastic noise factor is deactivated after the specified epochs.
        noise_std = 0.00,                                       # Additional gaussian noise std factor (set 0.0 to deactivate).
        shuffle = True                                          # Whether to shuffle training samples.
    ),
    experiments=Config(
        dataset_filepath = 'preprocessed_dataset.csv',          # The full dataset filepath.
        test_size = 0.2,                                        # The test size ratio of the dataset.
        seed = 0,                                               # The seed, which will be used throughout the experiment pipeline (set None to deactivate).
        metrics_filename = 'metrics.csv',                       # The filename of metrics dataframe, which will be used to store the train metrics.
        figures_filename = 'figures.png'                        # The filename of metrics plots, which will be used to display the train metrics.
    )
)

In [None]:
# Train the same configuration under 20 different seeds and keep every run's metrics table,
# so that the spread of the final losses can be inspected afterwards.
seeds = range(20)
results = []

# Loop-invariant setup: the CSV content and the device do not depend on the seed, so they are
# loaded/selected once instead of once per run.
dataset_df = pd.read_csv(config.experiments.dataset_filepath)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for seed in seeds:

    config.experiments.seed = seed

    # Seed every RNG used by the pipeline before anything random happens in this run.
    print(config.experiments.seed)
    random.seed(config.experiments.seed)
    np.random.seed(seed=config.experiments.seed)
    torch.manual_seed(seed=config.experiments.seed)

    # The train/test split itself changes with the seed.
    df_train, df_test = train_test_split(dataset_df, test_size=config.experiments.test_size, random_state=seed)
    print(df_train.shape, df_test.shape)

    train_dataset = PoseDataset(
        df=df_train,
        normalize=True,
        validate_dataset=True,
        max_seq_len=(config.training.num_repeats*8),
        num_repeats=config.training.num_repeats
    )
    # The test set reuses the training sequence length.
    test_dataset = PoseDataset(
        df=df_test,
        normalize=True,
        validate_dataset=True,
        max_seq_len=train_dataset.max_seq_len,
        num_repeats=config.training.num_repeats
    )

    # Instantiate a fresh model for this seed
    model = InmoovPoseNet(
        text_embeddings_dim=train_dataset.text_embeddings_dim,
        control_dim=train_dataset.controls_dim,
        model_config=config.model
    ).to(device)

    # One checkpoint directory per seed so runs do not overwrite each other.
    config.model.checkpoint_directory = f'checkpoints/random_seed_{seed}'

    trainer = Trainer(
        model=model,
        device=device,
        train_dataset=train_dataset,
        test_dataset=test_dataset,
        checkpoint_directory=config.model.checkpoint_directory,
        train_config=config.training
    )
    metrics_df = trainer.train()
    results.append(metrics_df)

In [None]:
# `results` holds one metrics DataFrame per run; take the last row of each one and
# collect its train and test loss.
_final_rows = [run_metrics.iloc[-1] for run_metrics in results]
train_losses = [final_row['Train Loss'] for final_row in _final_rows]
test_losses = [final_row['Test Loss'] for final_row in _final_rows]

In [None]:
# Bare expression: shows the collected last-row train losses as the notebook cell output.
train_losses

In [None]:
import pickle

# Save the data: both loss lists go into one pickle file as a dict keyed by
# 'train_losses' / 'test_losses', so the analysis cells can reload them later.
with open('loss_values_correct.pkl', 'wb') as f:
    pickle.dump({'train_losses': train_losses, 'test_losses': test_losses}, f)

In [None]:
import pickle
# Load the data written by the save cell into the `*_1` lists used by the analysis cells.
# NOTE(review): pickle.load executes arbitrary code from the file - only load a
# 'loss_values_correct.pkl' that this notebook produced itself.
with open('loss_values_correct.pkl', 'rb') as f:
    data = pickle.load(f)
    train_losses_1 = data['train_losses']
    test_losses_1 = data['test_losses']

In [None]:
# Print the (train loss, test loss) pair of every run, one run per line.
for (x,y) in zip(train_losses_1,test_losses_1):
    print(x,y)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Summary statistics of the reloaded per-run losses
mean_train, std_train = np.mean(train_losses_1), np.std(train_losses_1)
mean_test, std_test = np.mean(test_losses_1), np.std(test_losses_1)

print(f"Train Loss - Mean: {mean_train:.4f}, Std: {std_train:.4f}")
print(f"Test Loss  - Mean: {mean_test:.4f}, Std: {std_test:.4f}")

# Side-by-side histograms: train losses on the left panel, test losses on the right one.
plt.figure(figsize=(12, 5))

_panels = (
    (train_losses_1, 'skyblue', 'Train Loss (Last Row)'),
    (test_losses_1, 'salmon', 'Test Loss (Last Row)'),
)
for _position, (_losses, _color, _title) in enumerate(_panels, start=1):
    plt.subplot(1, 2, _position)
    plt.hist(_losses, bins=10, color=_color, edgecolor='black')
    plt.title(_title)
    plt.xlabel('Loss')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# X-axis: index of each run. It is sized from the loaded loss list that is actually plotted,
# so the cell also works when only the pickle was reloaded and `results` does not exist,
# and the x and y lengths can never disagree.
x = list(range(len(train_losses_1)))

# Plotting: final train and test loss of every run on shared axes.
plt.figure(figsize=(10, 6))
plt.plot(x, train_losses_1, marker='o', label='Train Loss', color='blue')
plt.plot(x, test_losses_1, marker='o', label='Test Loss', color='red')
plt.title('Train and Test Loss (Last Row of Each Run)')
plt.xlabel('Run Index')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Train final model

In [None]:
# Configuration used by the cells below to train the final model
# (checkpoints go under 'checkpoints/inmoovposenet').
config = Config(
    model=Config(
        fc_encoder_layers=[256],                                # Fully-Connected encoder layers (before memory network).
        fc_decoder_layers = [256],                              # Fully-Connected decoder layers (after memory network).
        use_controls = True,                                    # Whether to utilize current control inputs to predict the target controls.
        control_embeddings_dim = 384,                           # Control embeddings size (if use_controls is True).
        use_lstm = True,                                        # Whether to use LSTM as memory network.
        num_lstm_layers = 1,                                    # Number of lstm layers (if use_lstm is True).
        lstm_units = 512,                                       # Number of lstm units per layer.
        dropout_rate = 0.2,                                     # Dropout rate for the encoder (set 0.0 to deactivate).
        layer_norm = False,                                     # Whether to apply layer normalization.
        checkpoint_directory = 'checkpoints/inmoovposenet'      # Model checkpoint directory.
    ),
    training=Config(
        epochs = 1000,                                          # Number of training epochs.
        num_repeats = 2,                                        # How many times to repeat the same command during training (used for self-correction).
        batch_size = 16,                                        # Batch size during training.
        learning_rate = 0.001,                                  # Learning rate of the ADAM optimizer.
        lr_decay_factor = 1.0,                                  # Decay factor of the learning rate, lr' = lr*decay_factor (set 1.0 to deactivate).
        lr_decay_patience = 50,                                # The learning rate will decay if eval loss does not improve after the specified epochs.
        use_ema = True,                                        # Whether to apply Exponential-Moving-Average smoothing. NOTE(review): the imported AveragedModel utilities average model weights, not gradients - confirm in Trainer.
        early_stopping_patience = 100,                          # Stops the training if eval loss does not improve after specified epochs.
        alpha_init = 1.0,                                       # Initial stochastic noise factor (if use_controls is True, set 0.0 to deactivate).
        alpha_decay_epochs = 100,                               # The stochastic noise factor is deactivated after the specified epochs.
        noise_std = 0.00,                                       # Additional gaussian noise std factor (set 0.0 to deactivate).
        shuffle = True                                          # Whether to shuffle training samples.
    ),
    experiments=Config(
        dataset_filepath = 'preprocessed_dataset.csv',          # The full dataset filepath.
        test_size = 0.2,                                        # The test size ratio of the dataset.
        seed = 0,                                               # The seed, which will be used throughout the experiment pipeline (set None to deactivate).
        metrics_filename = 'metrics.csv',                       # The filename of metrics dataframe, which will be used to store the train metrics.
        figures_filename = 'figures.png'                        # The filename of metrics plots, which will be used to display the train metrics.
    )
)

In [None]:
# Seed every RNG used by the pipeline (Python, NumPy, PyTorch).
# The config documents `seed = None` as "deactivate", and torch.manual_seed() does not accept
# None, so seeding is skipped entirely in that case instead of raising.
print(config.experiments.seed)
if config.experiments.seed is not None:
    random.seed(config.experiments.seed)
    np.random.seed(seed=config.experiments.seed)
    torch.manual_seed(seed=config.experiments.seed)

In [None]:
# Load the full dataset and split it into train/test parts; the configured seed is passed as
# `random_state` so the split is tied to the experiment seed.
dataset_df = pd.read_csv(config.experiments.dataset_filepath)
df_train, df_test = train_test_split(dataset_df, test_size=config.experiments.test_size, random_state=config.experiments.seed)
# Report the (rows, columns) of both parts.
print(df_train.shape, df_test.shape)

In [None]:
# Build the datasets, the model and the trainer for the final model, then train it.
# The training sequence length is set to 8 steps per command repeat.
train_dataset = PoseDataset(
    df=df_train, 
    normalize=True, 
    validate_dataset=True, 
    max_seq_len=(config.training.num_repeats*8), 
    num_repeats=config.training.num_repeats
)
# The test set reuses the training sequence length.
test_dataset = PoseDataset(
    df=df_test, 
    normalize=True, 
    validate_dataset=True, 
    max_seq_len=train_dataset.max_seq_len,
    num_repeats=config.training.num_repeats
)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate the model; input/output sizes are taken from the training dataset.
model = InmoovPoseNet(
    text_embeddings_dim=train_dataset.text_embeddings_dim,
    control_dim=train_dataset.controls_dim,
    model_config=config.model
).to(device)

trainer = Trainer(
    model=model, 
    device=device, 
    train_dataset=train_dataset, 
    test_dataset=test_dataset, 
    checkpoint_directory=config.model.checkpoint_directory,
    train_config=config.training
)

# Run the training; the returned metrics table is kept in `metrics_df`.
metrics_df = trainer.train()


In [None]:
# Save the weights of `model` (state dict only, not the module object); the inference
# section reloads this file with `load_state_dict`.
torch.save(model.state_dict(), "embeddings_to_control_final_model.pth")

## Inference

In [None]:
import torch
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import List, Optional, Tuple

# Load your model configuration (same as training)
class Config(dict):
    """Dictionary whose keys can also be read and written as object attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Point the attribute namespace at the mapping itself, so that `cfg.key` and
        # `cfg['key']` always refer to the same storage.
        self.__dict__ = self

# Architecture of the checkpoint being loaded. These values must match the configuration
# the final model was trained with (1 LSTM layer of 512 units, num_repeats = 2);
# otherwise `load_state_dict` fails because the saved tensors do not fit the rebuilt model.
config = Config(
    model=Config(
        fc_encoder_layers=[256],
        fc_decoder_layers=[256],
        use_controls=True,
        control_embeddings_dim=384,
        use_lstm=True,
        num_lstm_layers=1,
        lstm_units=512,
        dropout_rate=0.2,
        layer_norm=False
    ),
    training=Config(
        num_repeats=2
    )
)

# Initialize the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = InmoovPoseNet(
    text_embeddings_dim=384,  # Dimension from your sentence transformer
    control_dim=5,            # 5 control outputs
    model_config=config.model
).to(device)

# Load the trained weights. `map_location` remaps the stored tensors onto the device selected
# above, so a checkpoint written on a GPU machine also loads on a CPU-only one.
model.load_state_dict(torch.load("embeddings_to_control_final_model.pth", map_location=device))
# Switch to evaluation mode; as the last expression of the cell this also displays the model.
model.eval()



In [None]:
# Initialize the sentence transformer that `predict_controls` uses to embed input text.
# NOTE(review): the model above is built with text_embeddings_dim=384, so this encoder is
# expected to produce 384-dimensional embeddings - confirm if the model name changes.
SENTENCE_TRANSFORMER = "sentence-transformers/all-MiniLM-L12-v2"
embedding_model = SentenceTransformer(SENTENCE_TRANSFORMER)

def predict_controls(text: str, num_steps: int = 10) -> np.ndarray:
    """
    Generate control sequence from text input.

    The model is rolled out autoregressively: it starts from random controls and every
    prediction is fed back as the current controls of the next step. Uses the notebook-level
    `embedding_model`, `model` and `device`.

    Args:
        text: Input sentence to convert to controls
        num_steps: Number of control steps to generate (0 yields an empty sequence)

    Returns:
        Integer numpy array of shape (num_steps, 5) containing the predicted controls,
        denormalized, rounded and clipped to the motor ranges.

    Raises:
        ValueError: If num_steps is negative.
    """
    control_dim = 5                         # Number of control outputs per step.
    max_angle = 160.0                       # Scale of controls 0-3.
    last_offset, last_span = 100.0, 60.0    # Control 4 is mapped onto [100, 160].

    if num_steps < 0:
        raise ValueError(f'num_steps must be non-negative, got {num_steps}')
    if num_steps == 0:
        # Nothing to roll out; np.vstack would fail on an empty list of predictions.
        return np.empty((0, control_dim), dtype=int)

    # Convert text to embedding (a leading batch dimension of 1 is added)
    text_embedding = embedding_model.encode(text, convert_to_tensor=True).unsqueeze(0).to(device)

    # Initialize controls and state
    current_controls = torch.rand(size=(1, control_dim)).to(device)  # Random initial controls
    state = model.get_initial_state(batch_size=1, device=device)

    predictions = []
    # Inference only: no gradients are needed for any step of the rollout.
    with torch.no_grad():
        for _ in range(num_steps):
            # Get next control prediction
            outputs, state = model((text_embedding, current_controls), state)
            predictions.append(outputs.cpu().numpy())

            # Update current controls with the prediction
            current_controls = outputs

    # Stack the per-step predictions into one (num_steps, control_dim) array
    predictions = np.vstack(predictions)

    # Denormalize the predictions.
    # NOTE(review): assumes training divided controls 0-3 by 160 and mapped control 4 as
    # (value - 100)/60 - confirm against the dataset's normalization.
    predictions[:, :4] *= max_angle
    predictions[:, 4] = predictions[:, 4] * last_span + last_offset

    # Round to nearest integer (since motor commands are integers)
    predictions = np.round(predictions).astype(int)

    # Clip to valid ranges (assuming 0-160 for first 4, 100-160 for last)
    predictions[:, :4] = np.clip(predictions[:, :4], 0, 160)
    predictions[:, 4] = np.clip(predictions[:, 4], 100, 160)

    return predictions

def print_controls(controls: np.ndarray):
    """Print the control sequence as a fixed-width text table, one numbered row per step."""
    header_lines = (
        "Generated Control Sequence:",
        "Step | Thumb | Index | Middle | Ring | Pinky",
        "---------------------------------------------",
    )
    for header_line in header_lines:
        print(header_line)
    # Rows are numbered from 1; only the first five entries of each step are shown.
    for step_number, row in enumerate(controls, start=1):
        print(f"{step_number:4} | {row[0]:5} | {row[1]:5} | {row[2]:6} | {row[3]:4} | {row[4]:5}")



In [None]:
# Example usage: interactive prompt that turns each typed command into a control sequence.
if __name__ == "__main__":
    # Training sequences are sized as 8 steps per command repeat (max_seq_len = num_repeats*8),
    # so the same length is generated here. It is derived from the inference config alone,
    # which keeps this section independent of the training-only `train_dataset` object.
    steps_per_command = config.training.num_repeats * 8

    while True:
        text = input("\nEnter a command (or 'quit' to exit): ")
        # Ignore surrounding whitespace and letter case when checking for the exit keyword.
        if text.strip().lower() == 'quit':
            break

        print(f"\nGenerating controls for: '{text}'")
        controls = predict_controls(text, num_steps=steps_per_command)
        print_controls(controls)