<a href="https://colab.research.google.com/github/kchen2010/MVP-Boeing-Scholars/blob/main/sam2_seg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [None]:
# Install the packages used by the cells below. Each line is a separate shell
# call, so a failure in one install does not stop the following ones.
!pip install torch torchvision
# Headless OpenCV build (no GUI backends).
!pip install opencv-python-headless
# SAM 2 is installed straight from its GitHub repository.
!pip install git+https://github.com/facebookresearch/segment-anything-2.git
!pip install supervision
!pip install pillow numpy matplotlib
# NOTE(review): lama-cleaner and supervision are not imported anywhere in the
# visible notebook — confirm they are still needed.
!pip install lama-cleaner
# Provides the YOLO detector imported in the main cell.
!pip install ultralytics

# Sam2 Checkpoint download

In [None]:
# Create the checkpoints/ directory (no error if it already exists) and
# download the SAM 2 "hiera large" weights into it.
# NOTE(review): CarSegmentationInpainter defaults checkpoint_path to this file
# but loads its weights through SAM2ImagePredictor.from_pretrained(...), so
# this download is not read by the visible code — confirm it is still needed.
!mkdir -p checkpoints
!wget -P checkpoints https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt

# More Setup Steps

I recommend downloading the video from the website and then importing it into the runtime files.

In [None]:
# Download the sample video to input_video.mp4 in the current directory.
# NOTE(review): this is a Kaggle dataset download link; it presumably needs a
# logged-in session, in which case wget would save an HTML page instead of a
# video — verify the file plays, or upload the video manually as suggested above.
!wget -O input_video.mp4 "https://www.kaggle.com/datasets/vivek603/vehicle-detection-sample-and-output-videos/download?resource=download&select=Test+Video.mp4"
# Path handed to process_video() later; assumes the working directory is
# /content (presumably the Colab default) — TODO confirm.
video_path = "/content/input_video.mp4"

# Main
*   Still frame by frame
*   YOLO11 for detection (can be redundant with SAM 2)
*   SAM 2 segmentation (has detection?)
*   OpenCV inpainting (definitely change this)


In [None]:
import torch
import cv2
import numpy as np
from pathlib import Path
from IPython.display import HTML, display
from base64 import b64encode
import matplotlib.pyplot as plt
from sam2.build_sam import build_sam2_video_predictor
from sam2.sam2_image_predictor import SAM2ImagePredictor
from ultralytics import YOLO # Import YOLO

class CarSegmentationInpainter:
    """Frame-by-frame car removal pipeline: YOLO detection -> SAM 2 masks -> OpenCV inpainting.

    Frames are expected in OpenCV's BGR channel order throughout; the only
    place they are converted to RGB is right before they are handed to SAM 2.
    """

    def __init__(self, checkpoint_path="checkpoints/sam2_hiera_large.pt",
                 model_cfg="sam2_hiera_l.yaml", yolo_weights="yolo11n.pt"):
        """Load the SAM 2 image predictor and the YOLO detector.

        Args:
            checkpoint_path: Kept for backward compatibility. Currently unused:
                the SAM 2 weights are fetched by ``from_pretrained`` instead of
                being read from this local file.
            model_cfg: Kept for backward compatibility. Currently unused for
                the same reason as ``checkpoint_path``.
            yolo_weights: Ultralytics weight file to load. YOLO11 weights are
                published as ``yolo11n.pt``, ``yolo11s.pt``, ... (no "v"), so
                the previous hard-coded ``'yolov11.pt'`` could not be resolved.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        # Initialize SAM 2 predictor. The device is forwarded explicitly so
        # that a CPU-only runtime does not fall back to the loader's own
        # default device.
        self.predictor = SAM2ImagePredictor.from_pretrained(
            "facebook/sam2-hiera-large",
            device=str(self.device),
        )

        self.yolo_model = YOLO(yolo_weights)

    def detect_cars_yolo(self, frame):
        """Detect cars in a frame using YOLO.

        Args:
            frame: Image passed unchanged to the YOLO model (a BGR array when
                called from ``process_video``).

        Returns:
            List of bounding boxes ``[x1, y1, x2, y2]`` with plain ``int``
            pixel coordinates (fractional parts are truncated).
        """
        results = self.yolo_model(frame, classes=[2])  # Class 2 is 'car' in COCO dataset

        boxes = []
        for r in results:
            for box in r.boxes:
                coords = box.xyxy[0].cpu().numpy().astype(int)
                # Plain Python ints keep the boxes printable/serialisable.
                boxes.append([int(c) for c in coords])

        return boxes

    def segment_cars(self, frame, car_boxes):
        """Use SAM 2 to segment one mask per bounding box.

        Args:
            frame: BGR image as produced by ``cv2.VideoCapture.read``.
            car_boxes: Iterable of ``[x1, y1, x2, y2]`` boxes used as prompts.

        Returns:
            List with the first predicted mask for every box that yielded one.
        """
        # SAM 2 expects RGB input, whereas OpenCV decodes frames as BGR.
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        self.predictor.set_image(rgb_frame)

        all_masks = []
        for box in car_boxes:
            # Convert box to SAM format
            input_box = np.array(box)

            # Predict a single mask for this box prompt
            masks, scores, _ = self.predictor.predict(
                point_coords=None,
                point_labels=None,
                box=input_box[None, :],
                multimask_output=False,
            )

            if len(masks) > 0:
                all_masks.append(masks[0])

        return all_masks

    def inpaint_sliding_window(self, frame, masks, window_size=256, stride=128):
        """Inpaint the union of ``masks`` with OpenCV's Telea algorithm.

        Despite the name, no sliding window is applied yet: ``window_size`` and
        ``stride`` are accepted for interface compatibility but are unused.

        Args:
            frame: Image to inpaint; it is not modified.
            masks: Iterable of 2-D arrays matching the frame's height/width;
                any non-zero entry marks a pixel to be removed.
            window_size: Unused placeholder.
            stride: Unused placeholder.

        Returns:
            Tuple ``(inpainted_frame, combined_mask)`` where ``combined_mask``
            is a ``uint8`` array holding 255 on masked pixels and 0 elsewhere.
        """
        h, w = frame.shape[:2]

        # Combine all masks
        union = np.zeros((h, w), dtype=bool)
        for mask in masks:
            union |= np.asarray(mask, dtype=bool)

        combined_mask = union.astype(np.uint8) * np.uint8(255)

        # Nothing to remove: skip the inpainting call and return a copy so the
        # caller still receives an array independent of the input frame.
        if not union.any():
            return frame.copy(), combined_mask

        # Simple inpainting using OpenCV (replace with better model)
        result = cv2.inpaint(frame, combined_mask, 3, cv2.INPAINT_TELEA)

        return result, combined_mask

    def _draw_overlay(self, frame, masks, car_boxes):
        """Return a copy of ``frame`` with masks tinted red and boxes drawn in green."""
        vis_frame = frame.copy()
        for mask in masks:
            # Overlay mask in red (BGR order)
            mask_overlay = np.zeros_like(frame)
            mask_overlay[mask > 0] = [0, 0, 255]
            vis_frame = cv2.addWeighted(vis_frame, 1, mask_overlay, 0.5, 0)

        for box in car_boxes:
            x1, y1, x2, y2 = map(int, box)
            cv2.rectangle(vis_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        return vis_frame

    def process_video(self, video_path, output_path="output.avi", max_frames=None):
        """Process a video with car segmentation and inpainting.

        Every output frame is a side-by-side strip of
        ``[original | masks and boxes | inpainted]`` and is therefore three
        times as wide as the input. The output is written with the DIVX codec,
        which is why the default extension is ``.avi``.

        Args:
            video_path: Path of the input video.
            output_path: Path of the video file to write.
            max_frames: Optional upper bound on the number of frames to
                process; ``None`` processes the whole video.

        Returns:
            ``output_path``.

        Raises:
            IOError: If the input video cannot be opened or the output writer
                cannot be created.
        """
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            cap.release()
            raise IOError(f"Could not open input video: {video_path}")

        out = None
        frame_count = 0
        try:
            # Get video properties. FPS is kept as a float so rates such as
            # 29.97 are not truncated; `not fps > 0` also catches NaN.
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not fps > 0:
                fps = 30.0
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            if max_frames is not None:
                total_frames = min(total_frames, max_frames)

            # Video writer
            fourcc = cv2.VideoWriter_fourcc(*'DIVX')  # Using a common codec for .avi
            out = cv2.VideoWriter(output_path, fourcc, fps, (width * 3, height))
            if not out.isOpened():
                raise IOError(f"Could not open output video for writing: {output_path}")

            print(f"Processing {total_frames} frames...")

            while max_frames is None or frame_count < max_frames:
                ret, frame = cap.read()
                if not ret:
                    break

                # Detect cars
                car_boxes = self.detect_cars_yolo(frame)

                # Segment cars with SAM 2 (skipped when nothing was detected)
                masks = self.segment_cars(frame, car_boxes) if car_boxes else []

                # Create visualization
                vis_frame = self._draw_overlay(frame, masks, car_boxes)

                # Inpaint
                inpainted, _ = self.inpaint_sliding_window(frame, masks)

                # Create side-by-side comparison
                combined = np.hstack([frame, vis_frame, inpainted])
                out.write(combined)

                frame_count += 1
                if frame_count % 30 == 0:
                    print(f"Processed {frame_count}/{total_frames} frames")
        finally:
            # Release the handles even if a frame fails mid-way so the partial
            # output file is finalised and the input is not left open.
            cap.release()
            if out is not None:
                out.release()

        print(f"Video saved to {output_path}")

        return output_path

    # Gemini
    def display_video_in_colab(self, video_path):
        """Display a video inline in Google Colab.

        The file is embedded as a base64 ``video/mp4`` data URL, so it should
        be an MP4 file and is read into memory in full.

        Args:
            video_path: Path of the video file to embed.
        """
        with open(video_path, 'rb') as video_file:
            mp4 = video_file.read()
        data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

        display(HTML(f"""
        <video width=800 controls>
            <source src="{data_url}" type="video/mp4">
        </video>
        """))


# USAGE

# Initialize the segmentation and inpainting pipeline
segmenter = CarSegmentationInpainter()

# Process the video. Pass max_frames=<N> to limit the frame count while testing.
# OpenCV is error-prone when writing MP4 files directly, so the result is
# written as AVI here and converted to MP4 afterwards.
output_avi_path = segmenter.process_video(
    video_path,
    output_path="segmented_inpainted_output.avi",
)


In [None]:
# Convert the AVI file to MP4 using ffmpeg
output_avi_path = "/content/segmented_inpainted_output.avi"
output_mp4_path = "segmented_inpainted_output.mp4"

!ffmpeg -i "$output_avi_path" -vcodec libx264 -acodec aac "$output_mp4_path"
print(f"Converted {output_avi_path} to {output_mp4_path}")

# Display the converted MP4 video
segmenter.display_video_in_colab(output_mp4_path)