## Imports Preparation

In [31]:
# Import necessary libraries
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import numpy as np
from transformers import AutoTokenizer, AutoModel, AutoConfig


In [32]:
# Set random seed for reproducibility
torch.manual_seed(0)
np.random.seed(0)
if torch.cuda.is_available():
    torch.cuda.manual_seed(0)
    torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic = True

## Dataset Preparation

In [33]:
from PIL import Image

class MotionDataset(Dataset):
    def __init__(self, images, instructions, intents, motions, tasks, tokenizer_name, transform=None):
        self.images = images
        self.instructions = instructions
        self.intents = intents
        self.motions = motions
        self.tasks = tasks.float()  # Ensure tasks are of type Float
        self.transform = transform
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        instruction = self.instructions[idx]
        intent = self.intents[idx]
        motion = self.motions[idx]
        task = self.tasks[idx]

        # Tokenize text instructions and intents
        instruction_tokens = self.tokenizer(instruction, padding="max_length", truncation=True, return_tensors="pt")
        intent_tokens = self.tokenizer(intent, padding="max_length", truncation=True, return_tensors="pt")

        if self.transform and isinstance(image, (np.ndarray, Image.Image)):
            image = self.transform(image)

        return image, instruction_tokens, intent_tokens, motion, task

## Model Definition

In [34]:
class MultimodalTaskModel(nn.Module):
    def __init__(self, text_model_name, num_joints, embed_dim):
        super(MultimodalTaskModel, self).__init__()
        # Text encoder (e.g., BERT)
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        text_config = AutoConfig.from_pretrained(text_model_name)
        self.text_embed_dim = text_config.hidden_size

        # Image encoder (simple CNN for now)
        self.image_encoder = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # Output size: (16, 112, 112)
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # Output size: (32, 56, 56)
            nn.Flatten(),     # Flatten to (32 * 56 * 56 = 100352)
            nn.Linear(32 * 56 * 56, embed_dim)  # Correct input size
        )

        # Motion encoder (fully connected)
        self.motion_encoder = nn.Linear(num_joints * 3, embed_dim)

        # Fusion layers
        # Updated: Adjusting dimensions to include intent features
        self.fc_combined = nn.Linear(self.text_embed_dim * 2 + embed_dim * 2, 256)  # *2 for instruction + intent
        self.fc_output = nn.Linear(256, num_joints * 3)

    def forward(self, image, instruction_tokens, intent_tokens, motion):
        # Text processing
        instruction_features = self.text_encoder(**instruction_tokens).pooler_output
        intent_features = self.text_encoder(**intent_tokens).pooler_output

        # Image processing
        image_features = self.image_encoder(image)

        # Motion processing
        motion_features = self.motion_encoder(motion)

        # Combine all features (including intent_features)
        combined = torch.cat((instruction_features, intent_features, image_features, motion_features), dim=1)
        combined = nn.ReLU()(self.fc_combined(combined))
        output = self.fc_output(combined)
        return output

## Training the Model

In [35]:
def train_model(model, dataloader, criterion, optimizer, device, num_epochs):
    model.train()  # Set model to training mode
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch in dataloader:
            # Unpack batch data
            images, instruction_tokens, intent_tokens, motions, tasks = batch
            
            # Move data to device
            images = images.to(device)
            motions = motions.to(device)
            tasks = tasks.to(device)
            instruction_tokens = {key: val.squeeze(1).to(device) for key, val in instruction_tokens.items()}
            intent_tokens = {key: val.squeeze(1).to(device) for key, val in intent_tokens.items()}
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(images, instruction_tokens, intent_tokens, motions)
            
            # Compute loss
            loss = criterion(outputs, tasks)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            # Update running loss
            running_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(dataloader)}")

## Main Script

In [36]:
# Load Data (Dummy Example)
# Replace this with real preprocessed data
num_data = 100
images = torch.randn(num_data, 3, 224, 224)  # Example: num_data images (3 channels, 224x224 resolution)
instructions = ["Turn left"] * num_data  # Replace with actual instructions
intents = ["Avoid obstacle"] * num_data  # Replace with actual intents
motions = torch.randn(num_data, 360 * 3)  # Example: 360 joints, 3 coordinates each
tasks = torch.randint(0, 2, (num_data, 360 * 3))  # Regression task for joint positions

# Prepare Dataset and DataLoader
transform = transforms.Compose([transforms.ToTensor()])
tokenizer_name = "bert-base-uncased"
dataset = MotionDataset(images, instructions, intents, motions, tasks, tokenizer_name, transform=transform)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Initialize Model, Criterion, and Optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalTaskModel("bert-base-uncased", 360, 128).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the Model
train_model(model, dataloader, criterion, optimizer, device, num_epochs=5)

# Save the Model
torch.save(model.state_dict(), "multimodal_task_model.pth")

Epoch 1/5, Loss: 0.7807264786500198
Epoch 2/5, Loss: 0.2851264545550713
Epoch 3/5, Loss: 0.25231581582472873
Epoch 4/5, Loss: 0.2470300392462657
Epoch 5/5, Loss: 0.24392583622382238


# Evaluation of the model

In [41]:
# Load the trained model weights
model.load_state_dict(torch.load("multimodal_task_model.pth", weights_only=True))
model.eval()

# Inference
# Replace this with real data
new_image = torch.randn(1, 3, 224, 224)  # Example: single image (3 channels, 224x224 resolution)
current_joint_positions = np.loadtxt("joint_positions.csv", delimiter=",")  # Load joint positions from file
new_instruction = "Turn right"  # Replace with actual instruction
new_intent = "Avoid obstacle"  # Replace with actual intent

# Tokenize text instructions and intents
new_instruction_tokens = dataset.tokenizer(new_instruction, padding="max_length", truncation=True, return_tensors="pt")
new_intent_tokens = dataset.tokenizer(new_intent, padding="max_length", truncation=True, return_tensors="pt")

# Forward pass
output = model(new_image.to(device), new_instruction_tokens, new_intent_tokens, torch.tensor(current_joint_positions).float().to(device))
print(output)

# Convert output to numpy array
output_np = output.detach().cpu().numpy()
print(output_np)

# Convert output to joint positions
joint_positions = output_np.reshape(360, 3)
print(joint_positions)

# Save joint positions to file
np.savetxt("joint_positions.csv", joint_positions, delimiter=",")

# Load joint positions from file

tensor([[0.4982, 0.6280, 0.3798,  ..., 0.4935, 0.5553, 0.4848]],
       grad_fn=<AddmmBackward0>)
[[0.49816364 0.6280182  0.3797944  ... 0.49352846 0.55528903 0.48482388]]
[[0.49816364 0.6280182  0.3797944 ]
 [0.44454357 0.41951203 0.46397445]
 [0.39908454 0.5871437  0.41975582]
 ...
 [0.48809758 0.43658128 0.51777166]
 [0.35319674 0.3513166  0.5044269 ]
 [0.49352846 0.55528903 0.48482388]]
