In [28]:
file_path = "camera_poses.txt"
import ast
paths = []
translations = []
rotations = []

with open(file_path, "r") as file:
    for line in file:
        parts = line.strip().split("\t")
        if len(parts) == 3:
            paths.append(parts[0])
            translations.append(parts[1])  # Keep as strings
            rotations.append(parts[2])  # Keep as strings

paths = [path.lstrip('/') for path in paths]
translations = [ast.literal_eval(t) for t in translations]
rotations = [ast.literal_eval(t)for t in rotations]
updated_paths = [path.replace("datasets/classroom/mapping/", "datasets/classroom/mapping/") for path in paths]

test_size = 0.3
random_state = 42

# Ensure reproducibility
import random
random.seed(random_state)

# Assuming updated_paths, translations, and rotations are lists
data = list(zip(updated_paths, translations, rotations))
random.shuffle(data)

# Determine the split index
split_idx = int(len(data) * (1 - test_size))

# Split the data
train_data = data[:split_idx]
test_data = data[split_idx:]

# Unzip the data back into individual lists
train_paths, train_translations, train_rotations = zip(*train_data)
test_paths, test_translations, test_rotations = zip(*test_data)

# Convert back to lists if needed
train_paths = list(train_paths)
train_translations = list(train_translations)
train_rotations = list(train_rotations)

test_paths = list(test_paths)
test_translations = list(test_translations)
test_rotations = list(test_rotations)

print(len(train_paths))
print(len(test_paths))



61
27


In [29]:
!ls -l datasets/classroom/mapping/out15.png


-rw-r--r-- 1 shrey.arora_ug25 shrey.arora_ug25 340499 Dec 12 22:39 datasets/classroom/mapping/out15.png


In [30]:
#Model
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

class PoseEstimationNetwork(nn.Module):
    def __init__(self):
        super(PoseEstimationNetwork, self).__init__()
        # Backbone for feature extraction
        self.feature_extractor = models.resnet34(weights = True)
        self.feature_extractor.fc = nn.Identity()  # Remove final classification layer

        # Regression layers for pose estimation
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc_translation = nn.Linear(128, 3)  # Predict translation (x, y, z)
        self.fc_rotation = nn.Linear(128, 4)    # Predict rotation as quaternion (w, x, y, z)

    def forward(self, x):
        x = self.feature_extractor(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        translation = self.fc_translation(x)
        rotation = F.normalize(self.fc_rotation(x), p=2, dim=1)  # Normalize quaternion
        return translation, rotation


def pose_loss(pred_translation, pred_rotation, gt_translation, gt_rotation, alpha=1.0):
    """
    Combines translation and rotation loss.
    """
    # Translation loss (L2)
    translation_loss = F.mse_loss(pred_translation, gt_translation)

    # Rotation loss (geodesic distance)
    rotation_loss = 1 - torch.sum(pred_rotation * gt_rotation, dim=1).mean()  # Cosine similarity loss

    return translation_loss + alpha * rotation_loss

In [31]:
#Model
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

class PoseEstimationNetwork(nn.Module):
    def __init__(self):
        super(PoseEstimationNetwork, self).__init__()
        # Backbone for feature extraction
        self.feature_extractor = models.resnet34(weights = True)
        self.feature_extractor.fc = nn.Identity()  # Remove final classification layer

        # Regression layers for pose estimation
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc_translation = nn.Linear(128, 3)  # Predict translation (x, y, z)
        self.fc_rotation = nn.Linear(128, 4)    # Predict rotation as quaternion (w, x, y, z)

    def forward(self, x):
        x = self.feature_extractor(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        translation = self.fc_translation(x)
        rotation = F.normalize(self.fc_rotation(x), p=2, dim=1)  # Normalize quaternion
        return translation, rotation


def pose_loss(pred_translation, pred_rotation, gt_translation, gt_rotation, alpha=1.0):
    """
    Combines translation and rotation loss.
    """
    # Translation loss (L2)
    translation_loss = F.mse_loss(pred_translation, gt_translation)

    # Rotation loss (geodesic distance)
    rotation_loss = 1 - torch.sum(pred_rotation * gt_rotation, dim=1).mean()  # Cosine similarity loss

    return translation_loss + alpha * rotation_loss

In [35]:
CUDA_OR_CPU="cpu"

In [32]:
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
from PIL import Image
import os


class PoseDataset(Dataset):
    def __init__(self, image_paths, translations, rotations, transform=None):
        self.image_paths = image_paths
        self.image_paths = [os.path.abspath(p) for p in self.image_paths]

        self.translations = translations
        self.rotations = rotations
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        if self.transform:
            image = self.transform(image)
        translation = torch.tensor(self.translations[idx], dtype=torch.float32)
        rotation = torch.tensor(self.rotations[idx], dtype=torch.float32)
        return image, translation, rotation

# Data preparation
image_paths = train_paths  # Paths to images in ls1
translations = train_translations # Corresponding ground truth translations
rotations = train_rotations  # Corresponding ground truth rotations (quaternions)

transform = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataset = PoseDataset(image_paths, translations, rotations, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model and optimizer
model = PoseEstimationNetwork().to(CUDA_OR_CPU)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 30

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for images, gt_translation, gt_rotation in dataloader:
        images = images.to(CUDA_OR_CPU)
        gt_translation = gt_translation.to(CUDA_OR_CPU)
        gt_rotation = gt_rotation.to(CUDA_OR_CPU)

        optimizer.zero_grad()
        pred_translation, pred_rotation = model(images)
        loss = pose_loss(pred_translation, pred_rotation, gt_translation, gt_rotation)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")


Epoch 1/30, Loss: 6.1550
Epoch 2/30, Loss: 4.9227
Epoch 3/30, Loss: 4.2029
Epoch 4/30, Loss: 3.6868
Epoch 5/30, Loss: 3.2429
Epoch 6/30, Loss: 2.8592
Epoch 7/30, Loss: 2.5110
Epoch 8/30, Loss: 2.1432
Epoch 9/30, Loss: 1.8475
Epoch 10/30, Loss: 1.5562
Epoch 11/30, Loss: 1.3812
Epoch 12/30, Loss: 0.9879
Epoch 13/30, Loss: 0.7814
Epoch 14/30, Loss: 0.5498
Epoch 15/30, Loss: 0.3976
Epoch 16/30, Loss: 0.3728
Epoch 17/30, Loss: 0.2536
Epoch 18/30, Loss: 0.2156
Epoch 19/30, Loss: 0.1678
Epoch 20/30, Loss: 0.1174
Epoch 21/30, Loss: 0.1508
Epoch 22/30, Loss: 0.1065
Epoch 23/30, Loss: 0.1999
Epoch 24/30, Loss: 0.0948
Epoch 25/30, Loss: 0.1004
Epoch 26/30, Loss: 0.0961
Epoch 27/30, Loss: 0.0974
Epoch 28/30, Loss: 0.0972
Epoch 29/30, Loss: 0.0484
Epoch 30/30, Loss: 0.1038


# **Inference**

In [37]:
def infer_pose(model, image_path, transform):
    model.eval()
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(CUDA_OR_CPU)
    with torch.no_grad():
        pred_translation, pred_rotation = model(image)
    return pred_translation.cpu().numpy(), pred_rotation.cpu().numpy()

# Test with a new image
new_image_path = test_paths[0]

pred_translation, pred_rotation = infer_pose(model, new_image_path, transform)
print("Predicted Translation:", pred_translation)

print("Predicted Rotation (quaternion):", pred_rotation)



Predicted Translation: [[ 0.41572934 -0.38697243 -2.2909143 ]]
Predicted Rotation (quaternion): [[-0.06614672  0.5726366  -0.10983142  0.8097215 ]]


In [39]:

# Evaluation function
def evaluate_model_directly(model, test_paths, test_translations, test_rotations):
    """
    Evaluates the model directly with test image paths, translations, and rotations.
    """
    model.eval()
    total_translation_error = 0.0
    total_rotation_error = 0.0
    count = 0

    with torch.no_grad():
        for i, image_path in enumerate(test_paths):
            # Load image
            image = Image.open(image_path).convert("RGB").resize((224, 224))  # Resize to match model input
            image = T.ToTensor()(image).unsqueeze(0).to(CUDA_OR_CPU)  # Convert to tensor and add batch dimension

            # Ground truth translation and rotation
            gt_translation = torch.tensor(test_translations[i], dtype=torch.float32).unsqueeze(0).to(CUDA_OR_CPU)
            gt_rotation = torch.tensor(test_rotations[i], dtype=torch.float32).unsqueeze(0).to(CUDA_OR_CPU)

            # Predict translation and rotation
            pred_translation, pred_rotation = model(image)

            # Compute translation error (L2 distance)
            translation_error = torch.sqrt(torch.sum((pred_translation - gt_translation) ** 2, dim=1)).item()
            total_translation_error += translation_error

            # Compute rotation error (angular error in radians)
            dot_product = torch.sum(pred_rotation * gt_rotation, dim=1).clamp(-1.0, 1.0)
            rotation_error = torch.acos(dot_product).item()  # Angular error in radians
            total_rotation_error += rotation_error

            count += 1

    avg_translation_error = total_translation_error / count
    avg_rotation_error = total_rotation_error / count

    return avg_translation_error, avg_rotation_error


# Testing the model
test_translation_error, test_rotation_error = evaluate_model_directly(model, test_paths, test_translations, test_rotations)
print(f"Test Translation Error (L2 distance): {test_translation_error:.4f}")
print(f"Test Rotation Error (angular, radians): {test_rotation_error:.4f}")

Test Translation Error (L2 distance): 2.8479
Test Rotation Error (angular, radians): 0.2573
