In [None]:
# %pip install git+https://github.com/JiahuiYu/neuralgym

import os
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

CARLA_DATA = False

dataset_dir = "datasets"

if CARLA_DATA:
    num_views = 1
    num_frames = 179
    dataset_idx = 0
    MASK_SPAN = {"face":0.2, "no_feet":.85, "full":1}
    X = 2  # chunk size
    mask_height_span = MASK_SPAN.get('face', 1)
    output_base_directory = "output/carla_v1"
    input_video_base_path = f"output/carla_v1/rgb/view"
    video_output_dir = "output/carla_v1/videos/"
else:
    num_views = 2
    num_frames = 50
    dataset_idx = 1
    MASK_SPAN = {"face":0.2, "no_feet":.9, "full":1}
    X = 4  # chunk size
    mask_height_span = MASK_SPAN.get('no_feet', 1)
    output_base_directory = "jc_8_long_n2"
    input_video_base_path = f"output/xr_lubna/rgb/view"
    video_output_dir = "output/xr_lubna/videos/"


class SegmentationDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform

        valid_views = [f"view_{i}" for i in range(num_views)]
        valid_frames = range(num_frames)  # frames 0 to 49
        valid_image_names = [f"pointcloud-{i}.png" for i in valid_frames]
        valid_depth_names = [f"depth-{i}.png" for i in valid_frames]

        # We'll store data per scenario like:
        # self.scenarios = [
        #   {
        #       "images": [[img_view0_frame0, ...], [img_view1_frame0, ...], ...], # [8][50]
        #       "gts":    [[gt_view0_frame0, ...], ...],
        #       "depths": [[depth_view0_frame0, ...], ...],
        #       "paths":  [[path_view0_frame0, ...], ...]
        #   }, ...
        # ]
        self.scenarios = []

        # Identify scenario folders (excluding ground truth)
        all_folders = [f for f in os.listdir(root_dir) 
                       if os.path.isdir(os.path.join(root_dir, f)) and 
                       not f.endswith('ground_truth') and '_ground_truth' not in f]

        for folder in all_folders:
            gt_folder = f"{folder}_ground_truth"
            if not os.path.exists(os.path.join(root_dir, gt_folder)):
                continue  # skip if no corresponding ground truth folder


            scenario_images = []
            scenario_gts = []
            scenario_depths = []
            scenario_paths = []

            # For each view
            for view_folder in valid_views:
                image_folder_path = os.path.join(root_dir, folder, view_folder)
                gt_folder_path = os.path.join(root_dir, gt_folder, view_folder)

                if not (os.path.exists(image_folder_path) and os.path.exists(gt_folder_path)):
                    scenario_images = []
                    break

                view_images = []
                view_gts = []
                view_depths = []
                view_paths = []

                # Make sure frames are in order
                for frame_idx in valid_frames:
                    image_name = f"pointcloud-{frame_idx}.png"
                    depth_name = f"depth-{frame_idx}.png"
                    image_path = os.path.join(image_folder_path, image_name)
                    depth_path = os.path.join(image_folder_path, depth_name)
                    gt_image_path = os.path.join(gt_folder_path, image_name)

                    if not (os.path.exists(image_path) and os.path.exists(gt_image_path) and os.path.exists(depth_path)):
                        view_images = []
                        break

                    view_images.append(image_path)
                    view_gts.append(gt_image_path)
                    view_depths.append(depth_path)

                    relative_path = image_path.replace(self.root_dir + "/", "")
                    view_paths.append(relative_path)

                # If any frame missing, break
                if len(view_images) != num_frames:
                    scenario_images = []
                    break

                scenario_images.append(view_images)
                scenario_gts.append(view_gts)
                scenario_depths.append(view_depths)
                scenario_paths.append(view_paths)

            # Add the scenario if all views and frames loaded
            if len(scenario_images) == num_views and all(len(v) == num_frames for v in scenario_images):
                self.scenarios.append({
                    "images": scenario_images,
                    "gts": scenario_gts,
                    "depths": scenario_depths,
                    "paths": scenario_paths
                })

    def __len__(self):
        print(f"Total scenarios loaded: {len(self.scenarios)}")
        return len(self.scenarios)

    def __getitem__(self, idx):
        scenario = self.scenarios[idx]
        scenario_images = scenario["images"]   # [8][50]
        scenario_gts = scenario["gts"]         # [8][50]
        scenario_depths = scenario["depths"]   # [8][50]
        scenario_paths = scenario["paths"]     # [8][50]

        # We'll load and process all images and masks
        all_images = []  # Will hold [8, 50, 4, H, W] eventually
        all_masks = []   # Will hold [8, 50, H, W]

        # Define class colors
        class_1_color = np.array([80, 239, 7])   # #50EF07
        class_2_color = np.array([249, 0, 0])    # #F90000
        tolerance = 30

        for v in range(num_views):
            view_imgs = []
            view_masks = []
            # For each frame in this view
            for f in range(num_frames):
                image_path = scenario_images[v][f]
                gt_path = scenario_gts[v][f]
                depth_path = scenario_depths[v][f]

                image = Image.open(image_path).convert("RGB")
                gt_image = Image.open(gt_path).convert("RGB")
                depth_image = Image.open(depth_path)

                gt_image_np = np.array(gt_image)
                # Create mask
                mask = np.zeros(gt_image_np.shape[:2], dtype=np.uint8)
                mask[np.all(np.abs(gt_image_np - class_1_color) <= tolerance, axis=-1)] = 1
                mask[np.all(np.abs(gt_image_np - class_2_color) <= tolerance, axis=-1)] = 2

                # Convert images to tensors and apply transform if provided
                if self.transform:
                    # Apply transform to RGB image
                    rgb_tensor = self.transform(image)  # [C,H,W]
                    # Resize mask using nearest neighbor
                    mask_pil = Image.fromarray(mask)
                    mask_pil = mask_pil.resize((rgb_tensor.shape[2], rgb_tensor.shape[1]), Image.NEAREST)
                    mask = np.array(mask_pil)

                    # Resize depth image separately
                    depth_resized = depth_image.resize((rgb_tensor.shape[2], rgb_tensor.shape[1]), Image.NEAREST)
                    depth_np = np.array(depth_resized).astype(np.float32)

                else:
                    # If no transform, just convert directly
                    rgb_tensor = transforms.ToTensor()(image)
                    depth_np = np.array(depth_image).astype(np.float32)

                # Normalize depth
                # if depth_np.max() > 10 * depth_np.min():
                #     depth_norm = depth_np/1000
                # else:
                #     depth_norm = depth_np  # all pixels same, no normalization needed
                # depth_norm = depth_np/1000
                depth_norm = depth_np

                
                depth_tensor = torch.tensor(depth_norm).unsqueeze(0)  # [1,H,W]

                # Combine RGB and Depth into single tensor: [4,H,W]
                img_with_depth = torch.cat((rgb_tensor, depth_tensor), dim=0)

                # Convert mask to tensor
                mask = torch.from_numpy(mask).long()

                view_imgs.append(img_with_depth)  # [4,H,W]
                view_masks.append(mask)           # [H,W]

            # Stack frames for this view
            view_imgs = torch.stack(view_imgs, dim=0)   # [50,4,H,W]
            view_masks = torch.stack(view_masks, dim=0) # [50,H,W]

            all_images.append(view_imgs)
            all_masks.append(view_masks)

        # Stack all views
        all_images = torch.stack(all_images, dim=0)  # [8,50,4,H,W]
        all_masks = torch.stack(all_masks, dim=0)    # [8,50,H,W]

        return all_images, all_masks, scenario_paths

# Example usage
transform = transforms.Compose([
    transforms.ToTensor(),
])



/home/iot-class/.config/matplotlib is not a writable directory
Matplotlib created a temporary cache directory at /tmp/matplotlib-7prepkj9 because there was an issue with the default path (/home/iot-class/.config/matplotlib); it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.ToTensor(),
])
dataset = SegmentationDataset(root_dir=dataset_dir, transform=transform)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)


Total scenarios loaded: 12
Total scenarios loaded: 12


In [3]:
images, masks, paths = dataset[dataset_idx]

In [4]:
print(paths[0])

['LubnaFriends_redandblack-0.0015_720p_mobility_linear_minus_0_4_mps_total_dist_8/view_0/pointcloud-0.png', 'LubnaFriends_redandblack-0.0015_720p_mobility_linear_minus_0_4_mps_total_dist_8/view_0/pointcloud-1.png', 'LubnaFriends_redandblack-0.0015_720p_mobility_linear_minus_0_4_mps_total_dist_8/view_0/pointcloud-2.png', 'LubnaFriends_redandblack-0.0015_720p_mobility_linear_minus_0_4_mps_total_dist_8/view_0/pointcloud-3.png', 'LubnaFriends_redandblack-0.0015_720p_mobility_linear_minus_0_4_mps_total_dist_8/view_0/pointcloud-4.png', 'LubnaFriends_redandblack-0.0015_720p_mobility_linear_minus_0_4_mps_total_dist_8/view_0/pointcloud-5.png', 'LubnaFriends_redandblack-0.0015_720p_mobility_linear_minus_0_4_mps_total_dist_8/view_0/pointcloud-6.png', 'LubnaFriends_redandblack-0.0015_720p_mobility_linear_minus_0_4_mps_total_dist_8/view_0/pointcloud-7.png', 'LubnaFriends_redandblack-0.0015_720p_mobility_linear_minus_0_4_mps_total_dist_8/view_0/pointcloud-8.png', 'LubnaFriends_redandblack-0.0015_720

In [None]:
import time
import torch
import cv2
import numpy as np
from torchvision import transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn

# ---------------------------
# Global constants
# ---------------------------
DETECTION_CONFIDENCE_THRESHOLD = 0.65
PRIVATE_OBJECT_CLASSES = ['person']
DEPTH_THRESHOLD_MULTIPLIER = 75
WINDOW_SIZE = 10


COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella',
    'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
    'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork',
    'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A', 'toilet',
    'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
    'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock', 'vase',
    'scissors', 'teddy bear', 'hair dryer', 'toothbrush'
]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = fasterrcnn_resnet50_fpn(pretrained=True).to(device)
model.eval()

###############################################################################
# 1) DETECTION
###############################################################################
def detect_objects(model, image_np, confidence_threshold=0.5, draw_boxes=False):
    """
    image_np: NumPy array [H,W,3], float in [0,1] (on CPU).
    Model forward pass on GPU. Returns bounding boxes on CPU.
    """
    transform_ = transforms.Compose([transforms.ToTensor()])
    input_tensor = transform_(image_np).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = model(input_tensor)

    outputs = [{k: v.to("cpu") for k, v in t.items()} for t in outputs]
    scores = outputs[0]['scores'].numpy()
    labels = outputs[0]['labels'].numpy()
    boxes = outputs[0]['boxes'].numpy()

    objects = []
    for score, label_idx, box in zip(scores, labels, boxes):
        if score >= confidence_threshold:
            label = COCO_INSTANCE_CATEGORY_NAMES[label_idx]
            x1, y1, x2, y2 = box.astype(int)
            objects.append({
                'box': [x1, y1, x2, y2],
                'score': float(score),
                'label': label
            })

    if draw_boxes:
        # Create a displayable image directly from the input image_np.
        # This ensures the base image colors and exposure are preserved from your original.
        # Scale to 0-255 and convert to uint8.
        # display_image = (image_np * 255).astype(np.uint8)
        display_image = image_np.copy()  # Use the original image_np directly

        # Convert the display_image from RGB to BGR, which is OpenCV's default format
        # for drawing and saving. This step is crucial for accurate color representation
        # when using cv2.rectangle and cv2.putText, and for cv2.imwrite.
        # display_image = cv2.cvtColor(display_image, cv2.COLOR_RGB2BGR)

        for obj in objects:
            x1, y1, x2, y2 = obj['box']
            label = obj['label']
            score = obj['score']

            if label not in PRIVATE_OBJECT_CLASSES:
                continue
            
            # Define text properties
            text = f'{label}: {score:.2f}'
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 0.5
            font_thickness = 1
            
            # Get text size to position it properly
            text_size = cv2.getTextSize(text, font, font_scale, font_thickness)[0]
            
            # Position text above the bounding box. Adjust if it goes off the top of the image.
            text_x = x1
            text_y = y1 - 10 if y1 - 10 > text_size[1] else y1 + text_size[1] + 5 # Add 5 for slight padding

            # Draw rectangle on the image
            # Color is green (0, 255, 0) in BGR
            cv2.rectangle(display_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            
            # Put text on the image
            cv2.putText(display_image, text, (text_x, text_y), font, font_scale, (0, 255, 0), font_thickness, cv2.LINE_AA)
            
        return display_image

    else:
        return objects
    
###############################################################################
# 2) DEPTH PROFILE
###############################################################################
def calculate_depth_profile_of_box(depth_map, x1, y1, x2, y2, window_size=WINDOW_SIZE):
    """
    Return { 'mean','std','threshold','box':[x1,y1,x2,y2] } or None if empty.
    """
    half_window = window_size // 2
    cx = (x1 + x2) // 2
    cy = (y1 + y2) // 2

    x_start = max(cx - half_window, 0)
    x_end = min(cx + half_window + 1, depth_map.shape[1])
    y_start = max(cy - half_window, 0)
    y_end = min(cy + half_window + 1, depth_map.shape[0])

    depth_window = depth_map[y_start:y_end, x_start:x_end]
    depth_values = depth_window.flatten()
    depth_values = depth_values[~np.isnan(depth_values)]
    if depth_values.size == 0:
        return None

    depth_mean = float(np.mean(depth_values))
    depth_std = float(np.std(depth_values))
    depth_threshold = float(depth_std * DEPTH_THRESHOLD_MULTIPLIER)

    return {
        'mean': depth_mean,
        'std': depth_std,
        'threshold': depth_threshold,
        'box': [x1,y1,x2,y2]
    }

###############################################################################
# 3) SEGMENT CHUNK
###############################################################################

def segment_person_from_box(depth_tensor, dprof, span=1):
    """
    Similar to segment_person_from_profile_batch, but for a single bounding box
    in just 1 frame's depth or multiple frames (X frames).
    If depth_tensor: shape [X,H,W] or [H,W].
    """
    if len(depth_tensor.shape) == 2:
        # single frame => shape [H,W]
        depth_tensor = depth_tensor.unsqueeze(0)  # => [1,H,W]

    depth_mean = torch.tensor(dprof['mean'], device=depth_tensor.device, dtype=depth_tensor.dtype)
    depth_thr  = torch.tensor(dprof['threshold'], device=depth_tensor.device, dtype=depth_tensor.dtype)
    (x1,y1,x2,y2) = dprof['box']
    y2 = int(y1 + span * (y2 - y1))


    depth_diff = torch.abs(depth_tensor - depth_mean)
    mask_batch = (depth_diff <= depth_thr).to(torch.uint8)

    final_mask = torch.zeros_like(mask_batch)
    _, H, W = depth_tensor.shape
    x1_clamp = max(0, min(x1,W))
    x2_clamp = max(0, min(x2,W))
    y1_clamp = max(0, min(y1,H))
    y2_clamp = max(0, min(y2,H))

    if x2_clamp> x1_clamp and y2_clamp> y1_clamp:
        final_mask[:, y1_clamp:y2_clamp, x1_clamp:x2_clamp] = \
            mask_batch[:, y1_clamp:y2_clamp, x1_clamp:x2_clamp]

    # return shape [H,W] if single frame
    if final_mask.shape[0] == 1:
        return final_mask[0]
    return final_mask


def segment_all(depth_tensor, objects, depth_map, span):
    """
    We create a combined mask of shape [H,W] = 1 for each person's bounding box,
    EXCEPT we skip the public_box (which is the "public" person).
    depth_tensor: shape [H,W], float on GPU
    objects: detection results on CPU
    public_box: (x1,y1,x2,y2) that we skip
    depth_map: CPU 2D array for depth
    Return: torch.uint8 mask [H,W], 1=private, 0=public
    """
    H, W = depth_tensor.shape[-2], depth_tensor.shape[-1]

    combined_mask = torch.zeros((H,W), dtype=torch.uint8, device=depth_tensor.device)

    for obj in objects:
        if obj['label'] not in PRIVATE_OBJECT_CLASSES:
            continue
        box = obj['box']  # [x1,y1,x2,y2]

        # otherwise, segment this bounding box => "private"
        dprof = calculate_depth_profile_of_box(depth_map, *box)
        if dprof is None:
            continue

        single_mask = segment_person_from_box(depth_tensor, dprof, span)
        combined_mask = torch.logical_or(combined_mask.bool(), single_mask.bool()).to(torch.uint8)

    return combined_mask

###############################################################################
# 4) METRICS
###############################################################################
def dice_score_batch(pred_batch, gt_batch):
    intersection = torch.sum((pred_batch==1)&(gt_batch==1)).item()
    pred_sum = torch.sum(pred_batch==1).item()
    gt_sum = torch.sum(gt_batch==1).item()
    if (pred_sum+gt_sum)==0:
        return 1.0
    return 2.0*intersection/(pred_sum+gt_sum)

def recall_batch(pred_batch, gt_batch):
    tp = torch.sum((pred_batch==1)&(gt_batch==1)).item()
    gt_sum = torch.sum(gt_batch==1).item()
    if gt_sum==0:
        return 1.0
    return tp/gt_sum




In [None]:
''' The following code is commented out because it was not used in the final implementation. 
It implements parallelization of chunk detection and segmentation.
Ultimately, the approach had more overhead and gave worse times.'''

# def chunk_detect_and_segment(start_f, X, images, num_views, num_frames, mask_height_span,):
#     end_f = min(start_f + X, num_frames)
#     pred_mask_part = torch.zeros((num_views, end_f-start_f, images.shape[-2], images.shape[-1]), dtype=torch.uint8, device=images.device)

#     # (A) Re-detect "private person" in reference view
    

#     # rgb_ref_tensor = images[ref_view_idx, start_f, :3, :, :]
#     # depth_ref_tensor = images[ref_view_idx, start_f, 3, :, :]

#     # rgb_ref_np = rgb_ref_tensor.permute(1,2,0).cpu().numpy()  # => [H,W,3]
#     # depth_ref_np = depth_ref_tensor.cpu().numpy()             # => [H,W]

#     # # Detect all persons
#     # objs_ref = detect_objects(model, rgb_ref_np, DETECTION_CONFIDENCE_THRESHOLD)

#     # # Build list of depth profiles
#     # profiles_ref = []
#     # for obj in objs_ref:
#     #     if obj['label'] in PRIVATE_OBJECT_CLASSES:
#     #         (rx1,ry1,rx2,ry2) = obj['box']
#     #         dprof = calculate_depth_profile_of_box(depth_ref_np, rx1, ry1, rx2, ry2)
#     #         if dprof is not None:
#     #             profiles_ref.append(dprof)
#     # profiles_ref.sort(key=lambda x: x['mean'], reverse=True)
#     # ref_profile=None
#     # if len(profiles_ref)>=2:
#     #     # second-furthest => index=0
#     #     ref_profile = profiles_ref[0]

#     # detection_time = time.time()-t0

#     # (B) For each view, transform center & pick bounding box
#     # seg_start_time = time.time()
#     # center of ref bounding box
#     # (rx1, ry1, rx2, ry2) = ref_profile['box']
#     # rcx = 0.5*(rx1+rx2)
#     # rcy = 0.5*(ry1+ry2)
#     # rcz = ref_profile['mean']
#     # ref_center_3d = (rcx, rcy, rcz)

#     detection_times = []
#     seg_times = []

#     seg_start_time = time.time()
#     # Now loop over all views
#     for v in range(num_views):
#         # 1) transform the reference center to the v-th view
#         # (Tx, Ty, Tz) = transfer_point(v, ref_center_3d)

#         # 2) detect persons in chunk's first frame for view v
#         rgb_v_tensor = images[v, start_f, :3, :, :]
#         depth_v_tensor = images[v, start_f, 3, :, :]

#         rgb_v_np = rgb_v_tensor.detach().permute(1,2,0).cpu().numpy()
#         depth_v_np = depth_v_tensor.detach().cpu().numpy().copy()

#         t0 = time.time()

#         objs_v = detect_objects(model, rgb_v_np, DETECTION_CONFIDENCE_THRESHOLD)

#         detection_time = time.time()-t0


#         # # 3) pick bounding box whose center is closest to (Tx,Ty,Tz)
#         # chosen_box = pick_box_closest_3d(objs_v, depth_v_np, Tx, Ty, Tz)
#         # if chosen_box is not None:
#         #     dprof_v = calculate_depth_profile_of_box(depth_v_np, *chosen_box)
#         #     if dprof_v is not None:
#         #         # segment frames [start_f..end_f-1]
#         #         depth_segment = images[v, start_f:end_f, 3, :, :]
#         #         pred_segment = segment_person_from_profile_batch(depth_segment, dprof_v)
#         #         pred_mask_full[v, start_f:end_f] = pred_segment

#         # create mask => all persons except the public_box
#         # shape => [H,W]

#         # seg_start_time = time.time()

#         depth_segment_1frame = images[v, start_f:end_f, 3, :, :]
#         private_mask_1frame = segment_all(
#             depth_segment_1frame,  # shape [H,W] on GPU
#             objs_v,                # detections on CPU
#             depth_v_np,             # CPU depth
#             mask_height_span        # mask height cutoff
#         )

#         # If X>1, you'd do frames [start_f..end_f-1]. For X=1, it's just 1 frame
#         # We'll store into pred_mask_full[v, start_f]

#         seg_time = time.time() - seg_start_time
#         print(f"View {v}: Detection time: {detection_time:.4f}s, Segmentation time: {seg_time:.4f}s")
#         pred_mask_part[v, :] = private_mask_1frame
#         detection_times.append(detection_time)
#         seg_times.append(seg_time)
        
#     seg_start_time = time.time()

    
#     detection_times = sum(detection_times)
#     seg_times = sum(seg_times)


#     return detection_times, seg_times, pred_mask_part, start_f, end_f  # [num_views, X, H, W]





# from concurrent.futures import ThreadPoolExecutor, as_completed


# # Suppose we load data
# # images, masks, _ = dataset[0]
# # images = images.to(device)
# # masks = masks.to(device)

# MASK_SPAN = {"face":0.2, "no_feet":.85, "full":1}


# num_views = 2
# num_frames = 50
# X = 5  # chunk size
# ref_view_idx = 7
# mask_height_span = MASK_SPAN.get('face', 1)

# # We allocate pred_mask_full => shape [num_views, num_frames, H, W]
# pred_mask_full = torch.zeros(
#     (num_views, num_frames, images.shape[-2], images.shape[-1]),
#     dtype=torch.uint8,
#     device=device
# )

# chunk_detection_times = []
# chunk_seg_times = []
# chunk_total_times = []

# # We'll define a list of chunk start indices
# chunk_starts = list(range(0, num_frames, X))


# with ThreadPoolExecutor(max_workers=10) as executor:
#     futures = {executor.submit(chunk_detect_and_segment, start_f, X, images, num_views, num_frames, mask_height_span): start_f
#                 for start_f in chunk_starts}
#     for future in as_completed(futures):
#         detection_times, seg_times, pred_mask_part, start_f, end_f = future.result()

#         pred_mask_full[:,start_f:end_f] = pred_mask_part

#         chunk_detection_times.append(detection_times)
#         chunk_seg_times.append(seg_times)
#         chunk_total_times.append(detection_times + seg_times)

# print(chunk_detection_times)
# print(chunk_seg_times)
# print(chunk_total_times)

# # After all chunks, compute dice/recall per view
# dice = 0
# recall = 0
# for v in range(num_views):
#     gt_mask_view = (masks[v]==2).to(torch.uint8)
#     dice_v = dice_score_batch(pred_mask_full[v].cpu(), gt_mask_view.cpu())
#     recall_v = recall_batch(pred_mask_full[v].cpu(), gt_mask_view.cpu())
#     dice += dice_v
#     recall += recall_v
#     print(f"View {v}: Dice={dice_v:.4f}, Recall={recall_v:.4f}")

# # Print timing info
# num_chunks = len(chunk_starts)
# avg_det = sum(chunk_detection_times)/num_chunks
# avg_seg = sum(chunk_seg_times)/num_chunks
# avg_tot = sum(chunk_total_times)/num_chunks
# print(f"Avg detection time per chunk = {avg_det:.4f}s")
# print(f"Avg segmentation time per chunk = {avg_seg:.4f}s")
# print(f"Avg total time per chunk = {avg_tot:.4f}s")
# print(f"Total time for all chunks = {sum(chunk_total_times):.4f}s")
# print(dice/num_views)
# print(recall/num_views)


In [None]:
# We allocate pred_mask_full => shape [num_views, num_frames, H, W]
pred_mask_full = torch.zeros(
    (num_views, num_frames, images.shape[-2], images.shape[-1]),
    dtype=torch.uint8,
    device=device
)

chunk_detection_times = []
chunk_seg_times = []
chunk_total_times = []

# We'll define a list of chunk start indices
chunk_starts = list(range(0, num_frames, X))

print(f"Chunk starts: {chunk_starts}")


for start_f in chunk_starts:
    end_f = min(start_f + X, num_frames)
    seg_start_time = time.time()

    # Now loop over all views
    for v in range(num_views):

        # 1) detect persons in chunk's first frame for view v
        rgb_v_tensor = images[v, start_f, :3, :, :]
        depth_v_tensor = images[v, start_f, 3, :, :]

        rgb_v_np = rgb_v_tensor.detach().permute(1,2,0).cpu().numpy()
        depth_v_np = depth_v_tensor.detach().cpu().numpy().copy()

        objs_v = detect_objects(model, rgb_v_np, DETECTION_CONFIDENCE_THRESHOLD)

        # 2) create mask
        # shape => [H,W]
        depth_segment_1frame = images[v, start_f:end_f, 3, :, :]
        private_mask_1frame = segment_all(
            depth_segment_1frame,  # shape [H,W] on GPU
            objs_v,                # detections on CPU
            depth_v_np,             # CPU depth
            mask_height_span        # mask height cutoff
        )

        # If X>1, you'd do frames [start_f..end_f-1]. For X=1, it's just 1 frame
        # We'll store into pred_mask_full[v, start_f]
        pred_mask_full[v, start_f:end_f] = private_mask_1frame


    seg_time = time.time() - seg_start_time

    chunk_seg_times.append(seg_time)
    chunk_total_times.append(detection_time + seg_time)


print(chunk_seg_times)
print(chunk_total_times)

# After all chunks, compute dice/recall per view, if we use full body 
# (since we only currently have ground truth for full body)
if mask_height_span > .8:
    dice = 0
    recall = 0
    for v in range(num_views):
        gt_mask_view = (masks[v]==2).to(torch.uint8)
        dice_v = dice_score_batch(pred_mask_full[v].cpu(), gt_mask_view.cpu())
        recall_v = recall_batch(pred_mask_full[v].cpu(), gt_mask_view.cpu())
        dice += dice_v
        recall += recall_v
        print(f"View {v}: Dice={dice_v:.4f}, Recall={recall_v:.4f}")
    print(dice/num_views)
    print(recall/num_views)

# Print timing info
num_chunks = len(chunk_starts)
avg_seg = sum(chunk_seg_times)/num_chunks
avg_tot = sum(chunk_total_times)/num_chunks
print(f"Avg segmentation time per chunk = {avg_seg:.4f}s")
print(f"Avg total time per chunk = {avg_tot:.4f}s")
print(f"Total time for all chunks = {sum(chunk_total_times):.4f}s")


Chunk starts: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48]
[2.384185791015625e-07, 7.152557373046875e-07, 4.76837158203125e-07, 4.76837158203125e-07, 2.384185791015625e-07, 2.384185791015625e-07, 2.384185791015625e-07, 2.384185791015625e-07, 2.384185791015625e-07, 2.384185791015625e-07, 2.384185791015625e-07, 2.384185791015625e-07, 2.384185791015625e-07, 4.76837158203125e-07, 4.76837158203125e-07, 7.152557373046875e-07, 2.384185791015625e-07, 4.76837158203125e-07, 4.76837158203125e-07, 2.384185791015625e-07, 4.76837158203125e-07, 4.76837158203125e-07, 4.76837158203125e-07, 4.76837158203125e-07, 4.76837158203125e-07]
[0.2231748104095459, 0.03838682174682617, 0.0379948616027832, 0.038097381591796875, 0.03976798057556152, 0.037738800048828125, 0.037686824798583984, 0.03810715675354004, 0.039350032806396484, 0.040148258209228516, 0.038753509521484375, 0.03995776176452637, 0.03848385810852051, 0.038254499435424805, 0.03876972198486328, 0.03

In [10]:
print(pred_mask_full[v].size())

torch.Size([50, 720, 1280])


In [None]:
import numpy as np
import cv2
from simple_lama_inpainting import SimpleLama
from PIL import Image
import torch
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image
import numpy as np
import sys
# from toonify_image import load_toonify_model, toonify_image_with_stylegan # this will need tweaking based on paths

def anonymize_region(img_np: np.ndarray, bool_mask: np.ndarray, method: str = "color_noise", noise_level: int = 100, pixel_size: int = 8, blur_kernel: int = 25, model_extra = None) -> np.ndarray:
    
    masked_img = img_np.copy()
    H, W, _ = img_np.shape

    if method == "color_noise":
        img_int = img_np.astype(np.int16)
        noise = np.random.randint(-noise_level, noise_level + 1, size=(H, W, 3), dtype=np.int16)
        noise_masked = noise * bool_mask[..., None]
        perturbed = np.clip(img_int + noise_masked, 0, 255).astype(np.uint8)
        masked_img[bool_mask] = perturbed[bool_mask]
        blurred = cv2.GaussianBlur(masked_img, (7, 7), 0)
        anonymized_img = blurred


    elif method == "blur":
        blurred = cv2.GaussianBlur(img_np, (blur_kernel, blur_kernel), 0)
        anonymized_img = blurred

    elif method == "random_rgb":
        random_colors = np.random.randint(0, 256, size=(H, W, 3), dtype=np.uint8)
        anonymized_img = random_colors

    elif method == "color":
        red_overlay = np.zeros_like(img_np, dtype=np.uint8)
        red_overlay[..., 0] = 255  # R
        red_overlay[..., 1] = 0
        red_overlay[..., 2] = 0

        anonymized_img = red_overlay

        
    elif method == "sampling":
        original_height, original_width = img_np.shape[:2]
        downsampled = cv2.resize(img_np, (256, 256), interpolation=cv2.INTER_AREA)
        anonymized_img = cv2.resize(downsampled, (original_width, original_height), interpolation=cv2.INTER_LINEAR)

    elif method == "pixelate":
        small = cv2.resize(img_np, (W // pixel_size, H // pixel_size), interpolation=cv2.INTER_LINEAR)
        anonymized_img = cv2.resize(small, (W, H), interpolation=cv2.INTER_NEAREST)

    elif method == 'all':
        downsample_size=96
        noise_std=5
        matrix_strength=0.015
        
        h, w = img_np.shape[:2]

        # 1. Fast lossy compression: downsample and upsample once
        img_small = cv2.resize(img_np, (downsample_size, downsample_size), interpolation=cv2.INTER_AREA)
        img_resized = cv2.resize(img_small, (w, h), interpolation=cv2.INTER_LINEAR)

        # 2. Add uniform noise (fast, no float32)
        noise = np.random.randint(-noise_std, noise_std + 1, img_np.shape, dtype=np.int16)
        noisy = np.clip(img_resized.astype(np.int16) + noise, 0, 255).astype(np.uint8)

        # 3. Lightweight random color mix (uint8-safe)
        color_matrix = np.eye(3) + matrix_strength * np.random.randn(3, 3)
        color_matrix = np.clip(color_matrix, 0, 1).astype(np.float32)  # keep it subtle

        reshaped = noisy.reshape(-1, 3).astype(np.float32)
        transformed = reshaped @ color_matrix.T
        anonymized_img = np.clip(transformed, 0, 255).astype(np.uint8).reshape(h, w, 3)



    elif method == 'lama':
        bin_mask = np.zeros(img_np.shape[:2], dtype=np.uint8)
        bin_mask[bool_mask] = 255
        masked_img = model_extra(img_np, bin_mask)

        if isinstance(masked_img, Image.Image):
            masked_img = np.array(masked_img)

        anonymized_img = masked_img
    
    elif method == 'ssd':
        pipe = model_extra

        bin_mask = np.zeros(img_np.shape[:2], dtype=np.uint8)
        bin_mask[bool_mask] = 255
        prompt = "face"
        image_pil = Image.fromarray(img_np).convert("RGB")
        mask_pil = Image.fromarray(bin_mask).convert("L")

        masked_img = pipe(prompt=prompt, image=image_pil, mask_image=mask_pil).images[0]

        anonymized_img = masked_img

    elif method == "toon":
        loaded_toonify_model = model_extra
        image_pil = Image.fromarray(img_np).convert("RGB").resize((512, 512))

        anonymized_img = toonify_image_with_stylegan(
            input_image=image_pil,
            loaded_model=loaded_toonify_model
        )


    else:
        raise ValueError(f"Unknown method: {method}")
    
    

    return anonymized_img


def anonymize_depth(original_depth_np, noise_strength=0.01, output_path=None):

    # --- Add Gaussian Noise (as before) ---
    original_depth_np = original_depth_np.copy()  # Ensure we don't modify the original data
    noise_mean_gaussian = 0.0
    noise_std_dev_gaussian = noise_strength # Example: 5 millimeters
    gaussian_noise = np.random.normal(noise_mean_gaussian, noise_std_dev_gaussian, original_depth_np.shape)
    # Apply Gaussian noise to the original masked_depth
    noisy_depth_gaussian = original_depth_np + gaussian_noise
    # --- End Add Gaussian Noise ---

    # --- Add Uniform Random Noise (Very Quick) ---
    uniform_noise_magnitude = noise_strength/2

    # Generate uniform noise within the range [-magnitude/2, +magnitude/2]
    uniform_noise = np.random.uniform(
        low=-uniform_noise_magnitude / 2.0,
        high=uniform_noise_magnitude / 2.0,
        size=noisy_depth_gaussian.shape
    )

    # Add the uniform noise to the already Gaussian-noisy depth
    return noisy_depth_gaussian + uniform_noise



  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import os
import cv2
import torch
import numpy as np
from simple_lama_inpainting import SimpleLama
from PIL import Image
import torch
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image
import numpy as np
import tqdm
# from toonify_image import load_toonify_model, toonify_image_with_stylegan



PSP_MODEL_PATH = "pretrained_models/psp_toonify.pt"


model_used = True
model_used = False

if model_used:
    simple_lama = SimpleLama()

    # Load stable_diffusion from Hugging Face
    pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "runwayml/stable-diffusion-inpainting",
        torch_dtype=torch.float16,
        variant="fp16",  # necessary for speed
    ).to("cuda")

    # # Load SSD-1B from Hugging Face
    # pipe = StableDiffusionInpaintPipeline.from_pretrained(
    #     "",
    #     torch_dtype=torch.float16,
    #     variant="fp16",  # necessary for speed
    # ).to("cuda")

    # load style GAN
    loaded_toonify_model = load_toonify_model(PSP_MODEL_PATH)
    print("Toonify model successfully loaded for the pipeline.")


In [13]:
import os
import time

import cv2
import numpy as np
from PIL import Image

final_img_full  = np.zeros(
    (num_views, num_frames, images.shape[-2], images.shape[-1], 3), dtype=np.uint8
)


def save_masked_images(pred_mask_full, images, out_folder, dilation_radius=4):
    """
    pred_mask_full: [V, F, H, W] torch.Tensor of 0/1 masks
    images:         [V, F, C, H, W] torch.Tensor, C >= 4 (RGB + depth)
    out_folder:     root output directory
    dilation_radius: pixels to dilate each mask
    """
    os.makedirs(out_folder, exist_ok=True)

    num_views, num_frames = pred_mask_full.shape[:2]

    # create per-view subfolders
    for v in range(num_views):
        rgb_dir = os.path.join(out_folder, "rgb", f"view{v}")
        depth_dir = os.path.join(out_folder, "depth", f"view{v}")
        os.makedirs(rgb_dir, exist_ok=True)
        os.makedirs(depth_dir, exist_ok=True)

    # prepare dilation kernel
    k = 2 * dilation_radius + 1
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k))

    chunk_total_times = []
    chunk_write_times = []
    chunk_anonym_times = []

    # process in chunks of X frames (assume X is defined elsewhere)
    for start in range(0, num_frames, X):
        end = min(start + X, num_frames)
        t_chunk = time.time()

        image_writes = []

        for v in range(num_views):
            rgb_dir   = os.path.join(out_folder, "rgb",   f"view{v}")
            depth_dir = os.path.join(out_folder, "depth", f"view{v}")

            for f in range(start, end):

                # 1) dilate mask
                mask_np = (
                    pred_mask_full[v, f]
                    .detach()
                    .cpu()
                    .numpy()
                    .astype(np.uint8)
                )
                mask = cv2.dilate(mask_np, kernel).astype(bool)

                # 2) extract RGB
                rgb = images[v, f, :3].detach().cpu().numpy()        # [3,H,W]
                rgb = np.transpose(rgb, (1, 2, 0))                   # [H,W,3]
                if rgb.dtype != np.uint8:
                    rgb = (rgb * 255).clip(0, 255).astype(np.uint8)

                # 3) extract depth
                depth = images[v, f, 3].detach().cpu().numpy()       # [H,W]

                # 4) anonymize once per chunk-start frame
                if f == start:
                    anon_rgb = anonymize_region(rgb, mask, method="all")
                    if isinstance(anon_rgb, Image.Image):
                        anon_rgb = np.array(anon_rgb)
                    if anon_rgb.shape != rgb.shape:
                        anon_rgb = cv2.resize(
                            anon_rgb,
                            (rgb.shape[1], rgb.shape[0]),
                            interpolation=cv2.INTER_LINEAR
                        )
                    anon_depth = anonymize_depth(depth, noise_strength=10)

                # 5) apply anonymization
                out_rgb = rgb.copy()
                out_depth = depth.copy()
                out_rgb[mask] = anon_rgb[mask]
                out_depth[mask] = anon_depth[mask]

                # 6) store in global array
                final_img_full[v, f] = out_rgb

                t_write = time.time()
                # 7) save RGB mask
                rgb_path = os.path.join(rgb_dir, f"{f}_masked.png")
                cv2.imwrite(
                    rgb_path,
                    cv2.cvtColor(out_rgb, cv2.COLOR_RGB2BGR)
                )

                # 8) save depth as 16-bit PNG
                depth_path = os.path.join(depth_dir, f"{f}_depth.png")
                depth_uint16 = np.clip(out_depth, 0, 20000).astype(np.uint16)
                cv2.imwrite(depth_path, depth_uint16)
                t_write_end = time.time()
                
                image_writes.append(time.time() - t_write)

        total_chunk = time.time() - t_chunk
        write_total = sum(image_writes)
        chunk_total_times.append(total_chunk)
        chunk_write_times.append(write_total)
        chunk_anonym_times.append(total_chunk - write_total)

    # 10) Print stats
    n = len(chunk_starts)
    avg_total = sum(chunk_total_times) / n
    avg_write = sum(chunk_write_times)  / n
    avg_anon  = sum(chunk_anonym_times) / n

    print(f"Avg total time per chunk = {avg_total:.4f}s")
    print(f"Avg anonymization time per chunk = {avg_anon:.4f}s")
    print(f"Total time for all chunks w/ IO = {sum(chunk_total_times):.4f}s")
    print(f"Total time for all chunks anon only = {sum(chunk_anonym_times):.4f}s")
    print(f"Saved masked images in folder: {out_folder}")


In [None]:
import os
import cv2
import torch
import numpy as np
from simple_lama_inpainting import SimpleLama
from PIL import Image
import torch
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image
import numpy as np
import tqdm
import os
import cv2
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor

final_img_full  = np.zeros(
    (num_views, num_frames, images.shape[-2], images.shape[-1], 3), dtype=np.uint8
)


# assume X, num_views, num_frames, images and final_img_full are defined globally

def save_masked_images_enhanced(pred_mask_full, images, out_folder, dilation_radius=4):
    """
    Faster batched version to eliminate recursive calls and use parallel IO

    pred_mask_full: [V, F, H, W] 0/1 masks (same mask for all frames in a chunk)
    images:         [V, F, C, H, W]  (C>=4, with RGB + depth at index 3)
    """
    os.makedirs(out_folder, exist_ok=True)
    V, F, H, W = pred_mask_full.shape[0], pred_mask_full.shape[1], images.shape[-2], images.shape[-1]

    # 1) Move all data to CPU once
    masks_np = pred_mask_full.detach().cpu().numpy().astype(np.uint8)   # [V, F, H, W]
    imgs_np  = images.detach().cpu().numpy()                            # [V, F, C, H, W]

    # 2) Prepare dilation kernel
    k = 2 * dilation_radius + 1
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k))

    # 3) Create per-view folders once
    for v in range(V):
        os.makedirs(os.path.join(out_folder, f"rgb/view{v}"),   exist_ok=True)
        os.makedirs(os.path.join(out_folder, f"depth/view{v}"), exist_ok=True)

    chunk_anonym_times = []
    chunk_write_times  = []
    chunk_total_times  = []

    chunk_starts = list(range(0, F, X))
    for start_f in chunk_starts:
        end_f = min(start_f + X, F)
        t_chunk = time.time()
        image_writes = []

        for v in range(V):
            # 4) Slice out this chunk once
            chunk_imgs  = imgs_np[v, start_f:end_f]       # [K, C, H, W]
            rgb_chunk   = chunk_imgs[:, :3]               # [K, 3, H, W]
            depth_chunk = chunk_imgs[:, 3]                # [K, H, W]

            # 5) Convert RGB to HWC uint8
            rgb_uint8 = (rgb_chunk.transpose(0, 2, 3, 1) * 255).clip(0, 255).astype(np.uint8)  # [K, H, W, 3]

            # 6) Compute & dilate mask once per view/chunk
            mask2d    = cv2.dilate(masks_np[v, start_f], kernel)  # assume masks_np[v, f] is identical for f in chunk
            bool_mask = mask2d.astype(bool)

            # 7) Compute anonymization once
            anon_img   = anonymize_region(rgb_uint8[0], bool_mask, method="all")
            # anon_img   = anonymize_region(rgb_uint8[0], bool_mask, method="color")
            if isinstance(anon_img, Image.Image):
                anon_img = np.array(anon_img)
            if anon_img.shape[:2] != (H, W):
                anon_img = cv2.resize(anon_img, (W, H), interpolation=cv2.INTER_LINEAR)

            anon_depth = anonymize_depth(original_depth_np=depth_chunk[0], noise_strength=10)

            # 8) Apply mask to all frames at once
            masked_rgb   = rgb_uint8.copy()    # [K, H, W, 3]
            masked_rgb[:, bool_mask] = anon_img[bool_mask]

            masked_depth = depth_chunk.copy()   # [K, H, W]
            masked_depth[:, bool_mask] = anon_depth[bool_mask]

            t_write = time.time()
            # 9) Save each frame in parallel to overlap IO
            def _save(idx, frame_rgb, frame_depth):
                f = start_f + idx
                final_img_full[v, f] = frame_rgb
                bgr = frame_rgb[:, :, ::-1]
                cv2.imwrite(os.path.join(out_folder, f"rgb/view{v}/{f}_masked.png"), bgr)
                depth_u16 = np.clip(frame_depth, 0, 20000).astype(np.uint16)
                cv2.imwrite(os.path.join(out_folder, f"depth/view{v}/{f}_depth.png"), depth_u16)

            with ThreadPoolExecutor() as exe:
                for i in range(end_f - start_f):
                    exe.submit(_save, i, masked_rgb[i], masked_depth[i])
            total_write = time.time() - t_write
            image_writes.append(total_write)
            

        total_chunk = time.time() - t_chunk
        write_total = sum(image_writes)
        chunk_total_times.append(total_chunk)
        chunk_write_times.append(write_total)
        chunk_anonym_times.append(total_chunk - write_total)

    # 10) Print stats
    n = len(chunk_starts)
    avg_total = sum(chunk_total_times) / n
    avg_write = sum(chunk_write_times)  / n
    avg_anon  = sum(chunk_anonym_times) / n

    print(f"Avg total time per chunk = {avg_total:.4f}s")
    print(f"Avg anonymization time per chunk = {avg_anon:.4f}s")
    print(f"Total time for all chunks w/ IO = {sum(chunk_total_times):.4f}s")
    print(f"Total time for all chunks anon only = {sum(chunk_anonym_times):.4f}s")
    print(f"Saved masked images in folder: {out_folder}")

    return avg_anon


In [None]:
final_img_full  = np.zeros(
    (num_views, num_frames, images.shape[-2], images.shape[-1], 3), dtype=np.uint8
)


# avg_anon = save_masked_images(pred_mask_full, images, output_base_directory)
avg_anon = save_masked_images_enhanced(pred_mask_full, images, output_base_directory)

Avg total time per chunk = 0.1743s
Avg anonymization time per chunk = 0.0813s
Total time for all chunks w/ IO = 4.3585s
Total time for all chunks anon only = 2.0322s
Saved masked images in folder: jc_8_long_n2


In [16]:
# Create the base directory if it doesn't exist
os.makedirs(output_base_directory, exist_ok=True)
print(f"\nSaving annotated images to: {os.path.abspath(output_base_directory)}")

for v in range(num_views):
    view_directory = os.path.join(output_base_directory, f"rgb/view{v}")
    os.makedirs(view_directory, exist_ok=True) # Create a subdirectory for each view

    for f in range(num_frames):
        img_bounding_boxes = detect_objects(model, final_img_full[v,f], DETECTION_CONFIDENCE_THRESHOLD, True)
        filename = os.path.join(view_directory, f"{f}_detect.png") # :03d for zero-padding frame number
        

        success = cv2.imwrite(filename, cv2.cvtColor(img_bounding_boxes, cv2.COLOR_RGB2BGR))
        if success:
            pass # Keep it clean for many files
        else:
            print(f"Failed to save {filename}")

print("\nImage saving process complete.")


Saving annotated images to: /home/iot-class/Capstone/3d-privacy-masking/h_results/jc_8_long_n2

Image saving process complete.


In [17]:
fps = 1/((avg_anon + avg_seg)/ X)
print(f"Estimated FPS: {fps:.2f} frames per second")

Estimated FPS: 15.68 frames per second


In [18]:
import cv2
import os
from glob import glob

# Set parameters
num_frames = num_frames  # Already defined in your notebook

os.makedirs(video_output_dir, exist_ok=True)
for view_idx in range(num_views):
    output_path = os.path.join(video_output_dir, f"view{view_idx}.mp4")
    
    input_video_pattern = input_video_base_path + f"{view_idx}/{{}}_masked.png"

    # Read the first frame to get the size
    first_frame_path = input_video_pattern.format(0)
    first_frame = cv2.imread(first_frame_path)
    if first_frame is None:
        raise FileNotFoundError(f"First frame not found: {first_frame_path}")
    height, width, layers = first_frame.shape

    # Define the video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Write frames to video
    for i in range(num_frames):
        frame_path = input_video_pattern.format(i)
        frame = cv2.imread(frame_path)
        if frame is None:
            print(f"Warning: Frame not found: {frame_path}, skipping.")
            continue
        video_writer.write(frame)

    video_writer.release()
    print(f"Video saved to {output_path}")

Video saved to /home/iot-class/Capstone/3d-privacy-masking/h_results/jc_8_long_n2/videos/view0.mp4
Video saved to /home/iot-class/Capstone/3d-privacy-masking/h_results/jc_8_long_n2/videos/view1.mp4
