In [1]:
import os
import cv2
import json
import numpy as np
import matplotlib.pyplot as plt
from modules.feature_extractor import SIFT

# Load dataset
with open('office_dataset_aruco/ground_truth_poses.json', 'r') as f:
    ground_truth = json.load(f)

camera_intrinsics = ground_truth["camera_intrinsics"]
K = np.array([[camera_intrinsics["fx"], 0, camera_intrinsics["cx"]],
              [0, camera_intrinsics["fy"], camera_intrinsics["cy"]],
              [0, 0, 1]], dtype=np.float32)

images_path = "office_dataset_aruco/left"

def load_image(frame_num):
    filename = f"frame_{frame_num:04d}.png"
    filepath = os.path.join(images_path, filename)
    img = cv2.imread(filepath)
    if img is None:
        print(f"Error: Could not load {filepath}")
        return None
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Load frames for map building + query
map_frames = [1, 5, 10, 15, 20]
query_frame = 25

frames = {}
for frame_num in map_frames + [query_frame]:
    img = load_image(frame_num)
    if img is not None:
        frames[frame_num] = img
        print(f"Loaded frame {frame_num}: {img.shape}")

# Extract features
extractor = SIFT(n_features=2000)
features = {}

for frame_num, img in frames.items():
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    kp, desc = extractor.detect_and_compute(gray)
    features[frame_num] = {
        'keypoints': kp,
        'descriptors': desc,
        'image': img
    }
    print(f"Frame {frame_num}: {len(kp)} keypoints")

Loaded frame 1: (960, 1280, 3)
Loaded frame 5: (960, 1280, 3)
Loaded frame 10: (960, 1280, 3)
Loaded frame 15: (960, 1280, 3)
Loaded frame 20: (960, 1280, 3)
Loaded frame 25: (960, 1280, 3)
Frame 1: 1601 keypoints
Frame 5: 2000 keypoints
Frame 10: 1437 keypoints
Frame 15: 1491 keypoints
Frame 20: 1233 keypoints
Frame 25: 1194 keypoints


In [2]:
# Match all pairs of map frames to build a richer database
# Pairs: (1,5), (1,10), (5,10), (5,15), (10,15), (10,20), (15,20)

frame_pairs = [
    (1, 5),
    (1, 10),
    (5, 10),
    (5, 15),
    (10, 15),
    (10, 20),
    (15, 20)
]

all_pair_data = {}

for frame_a, frame_b in frame_pairs:
    print(f"\nProcessing pair {frame_a}-{frame_b}")
    
    # Match
    desc_a = features[frame_a]['descriptors']
    desc_b = features[frame_b]['descriptors']
    matches = extractor.match_features(desc_a, desc_b)
    
    # Store match data
    match_data = []
    for match in matches:
        pt_a = features[frame_a]['keypoints'][match.queryIdx].pt
        pt_b = features[frame_b]['keypoints'][match.trainIdx].pt
        
        match_data.append({
            'kp_idx_a': match.queryIdx,
            'kp_idx_b': match.trainIdx,
            'pt_a': np.array(pt_a),
            'pt_b': np.array(pt_b),
            'distance': match.distance
        })
    
    print(f"  {len(match_data)} raw matches")
    
    # Distance filter
    filtered = []
    for m in match_data:
        dist = np.linalg.norm(m['pt_b'] - m['pt_a'])
        if dist <= 700:
            filtered.append(m)
    
    print(f"  {len(filtered)} after distance filter")
    
    all_pair_data[f"{frame_a}-{frame_b}"] = {
        'frame_a': frame_a,
        'frame_b': frame_b,
        'matches': filtered
    }

print(f"\nTotal pairs processed: {len(all_pair_data)}")


Processing pair 1-5
  288 raw matches
  202 after distance filter

Processing pair 1-10
  122 raw matches
  100 after distance filter

Processing pair 5-10
  35 raw matches
  34 after distance filter

Processing pair 5-15
  15 raw matches
  12 after distance filter

Processing pair 10-15
  189 raw matches
  182 after distance filter

Processing pair 10-20
  121 raw matches
  96 after distance filter

Processing pair 15-20
  206 raw matches
  186 after distance filter

Total pairs processed: 7


In [3]:
all_ransac_results = {}

for pair_name, pair_info in all_pair_data.items():
    print(f"\nProcessing pair {pair_name}")
    
    filtered_matches = pair_info['matches']
    
    # Extract 2D points
    pts_a = np.array([m['pt_a'] for m in filtered_matches])
    pts_b = np.array([m['pt_b'] for m in filtered_matches])
    
    # Use OpenCV's findEssentialMat with RANSAC
    E, mask = cv2.findEssentialMat(pts_a, pts_b, K, method=cv2.RANSAC, threshold=1.0)
    
    # Recover pose
    _, R, t, mask_pose = cv2.recoverPose(E, pts_a, pts_b, K, mask=mask)
    
    # Extract inliers
    inlier_indices = np.where(mask_pose.ravel() > 0)[0]
    pts_a_inliers = pts_a[inlier_indices]
    pts_b_inliers = pts_b[inlier_indices]
    
    # Build projection matrices
    P_a = K @ np.hstack([np.eye(3), np.zeros((3, 1))])
    P_b = K @ np.hstack([R, t])
    
    all_ransac_results[pair_name] = {
        'frame_a': pair_info['frame_a'],
        'frame_b': pair_info['frame_b'],
        'pts_a': pts_a_inliers,
        'pts_b': pts_b_inliers,
        'P_a': P_a,
        'P_b': P_b,
        'R': R,
        't': t,
        'inlier_indices': inlier_indices,
        'filtered_matches': filtered_matches
    }
    
    print(f"  {len(inlier_indices)} inliers")

print(f"\nRANSAC complete for {len(all_ransac_results)} pairs")


Processing pair 1-5
  157 inliers

Processing pair 1-10
  53 inliers

Processing pair 5-10
  17 inliers

Processing pair 5-15
  9 inliers

Processing pair 10-15
  88 inliers

Processing pair 10-20
  18 inliers

Processing pair 15-20
  33 inliers

RANSAC complete for 7 pairs


In [4]:
all_triangulated = []

for pair_name, pair_data in all_ransac_results.items():
    print(f"\nTriangulating pair {pair_name}")
    
    frame_a = pair_data['frame_a']
    frame_b = pair_data['frame_b']
    
    # Triangulate
    points_4d = cv2.triangulatePoints(
        pair_data['P_a'], 
        pair_data['P_b'], 
        pair_data['pts_a'].T, 
        pair_data['pts_b'].T
    )
    points_3d = points_4d[:3] / points_4d[3]
    points_3d = points_3d.T
    
    # Filter by positive depth
    valid_points = []
    valid_indices = []
    
    for i, pt_3d in enumerate(points_3d):
        pt_homog = np.append(pt_3d, 1)
        depth_a = (pair_data['P_a'] @ pt_homog)[2]
        depth_b = (pair_data['P_b'] @ pt_homog)[2]
        
        if depth_a > 0 and depth_b > 0:
            valid_points.append(pt_3d)
            valid_indices.append(i)
    
    print(f"  {len(valid_points)} valid 3D points")
    
    # Add to database with descriptors from frame_a
    for i in valid_indices:
        inlier_idx = pair_data['inlier_indices'][i]
        match = pair_data['filtered_matches'][inlier_idx]
        
        descriptor = features[frame_a]['descriptors'][match['kp_idx_a']]
        pt_3d = valid_points[valid_indices.index(i)]
        
        all_triangulated.append({
            'descriptor': descriptor,
            'point_3d': pt_3d,
            'from_pair': pair_name
        })

print(f"\nTotal 3D points in database: {len(all_triangulated)}")

# Remove duplicate 3D points (keep best descriptor)
# For now, just use all points
feature_database = all_triangulated
print(f"Feature database size: {len(feature_database)}")


Triangulating pair 1-5
  157 valid 3D points

Triangulating pair 1-10
  53 valid 3D points

Triangulating pair 5-10
  17 valid 3D points

Triangulating pair 5-15
  9 valid 3D points

Triangulating pair 10-15
  88 valid 3D points

Triangulating pair 10-20
  18 valid 3D points

Triangulating pair 15-20
  33 valid 3D points

Total 3D points in database: 375
Feature database size: 375


In [5]:
# Query with frame 25
query_descriptors = features[25]['descriptors']
query_keypoints = features[25]['keypoints']

db_descriptors = np.array([entry['descriptor'] for entry in feature_database])

matches = extractor.match_features(query_descriptors, db_descriptors)
print(f"Found {len(matches)} matches")

# Extract 2D-3D correspondences and filter duplicates
best_matches = {}
for match in matches:
    query_pt = query_keypoints[match.queryIdx].pt
    world_pt = feature_database[match.trainIdx]['point_3d']
    world_pt_tuple = tuple(world_pt)
    
    if world_pt_tuple not in best_matches or match.distance < best_matches[world_pt_tuple]['distance']:
        best_matches[world_pt_tuple] = {
            'query_2d': query_pt,
            'world_3d': world_pt,
            'distance': match.distance
        }

filtered_query_2d = np.array([entry['query_2d'] for entry in best_matches.values()])
filtered_world_3d = np.array([entry['world_3d'] for entry in best_matches.values()])

print(f"Unique correspondences: {len(filtered_query_2d)}")

# Solve PnP
if len(filtered_query_2d) >= 4:
    success, rvec, tvec = cv2.solvePnP(
        filtered_world_3d,
        filtered_query_2d,
        K,
        None,
        flags=cv2.SOLVEPNP_ITERATIVE
    )
    
    if success:
        t_est = tvec.flatten()
        print(f"\nEstimated position (relative coords): {t_est}")
        
        # Store for alignment
        estimated_pose = t_est
    else:
        print("PnP failed")
else:
    print(f"Not enough correspondences: {len(filtered_query_2d)}")

Found 169 matches
Unique correspondences: 27

Estimated position (relative coords): [0.77659775 2.05069565 0.42963156]


In [6]:
# Get camera positions in both coordinate systems
# In our system: extract from the projection matrices we built
our_camera_positions = {}

# Frame 1 is always at origin
our_camera_positions[1] = np.array([0, 0, 0])

# For other frames, extract from their relative poses
for pair_name, pair_data in all_ransac_results.items():
    frame_a = pair_data['frame_a']
    frame_b = pair_data['frame_b']
    
    if frame_a == 1:
        # Camera position is -R^T @ t
        cam_b_pos = -pair_data['R'].T @ pair_data['t'].flatten()
        our_camera_positions[frame_b] = cam_b_pos

print("Our camera positions (relative coords):")
for frame, pos in sorted(our_camera_positions.items()):
    print(f"  Frame {frame}: {pos}")

# Ground truth positions
gt_camera_positions = {}
for frame in [1, 5, 10, 15, 20, 25]:
    gt_pose = ground_truth['poses'][frame-1]['left_camera']
    gt_camera_positions[frame] = np.array(gt_pose['translation'])

print("\nGround truth positions:")
for frame, pos in sorted(gt_camera_positions.items()):
    print(f"  Frame {frame}: {pos}")

Our camera positions (relative coords):
  Frame 1: [0 0 0]
  Frame 5: [ 0.30532549  0.23494159 -0.92281027]
  Frame 10: [ 0.97675756 -0.08662906  0.19606141]

Ground truth positions:
  Frame 1: [ 4.         -0.06        1.79999995]
  Frame 5: [3.92506528 0.77295786 1.79999995]
  Frame 10: [3.59126544 1.7625016  1.79999995]
  Frame 15: [3.01272702 2.63193393 1.79999995]
  Frame 20: [2.22887635 3.32200384 1.79999995]
  Frame 25: [1.29313135 3.78568506 1.79999995]
