In [None]:
pip install segment_anything

In [None]:
pip install ultralytics

In [None]:
pip install dlib

In [None]:
import cv2
import numpy as np
import torch
import dlib
from segment_anything import sam_model_registry, SamPredictor
from ultralytics import YOLO  #

# Build the YOLO detector from the weights file named below.
yolo_weights_file = 'yolov8n.pt'
yolo_model = YOLO(yolo_weights_file)


In [None]:
# File path for the input video
input_video_path = "input_video.mp4"  # Change according to video_name

# Initialize video capture object and fail fast if it could not be opened.
# Without this check a bad path only surfaces later (0x0 frame size, failed
# first read) and the writer below is created with unusable parameters.
video_capture = cv2.VideoCapture(input_video_path)
if not video_capture.isOpened():
    raise OSError(f"Unable to open input video: {input_video_path!r}")

# Retrieve video properties
frames_per_second = video_capture.get(cv2.CAP_PROP_FPS)  # Frames per second of the video
frame_width = int(video_capture.get(cv2.CAP_PROP_FRAME_WIDTH))  # Width of the video frames
frame_height = int(video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT))  # Height of the video frames

# VideoCapture.get() reports 0 for properties the backend cannot provide;
# a non-positive frame rate would make the writer unusable, so fall back.
fallback_frames_per_second = 30.0
if not frames_per_second > 0:  # also catches NaN
    print(f"Warning: input FPS unavailable, falling back to {fallback_frames_per_second}.")
    frames_per_second = fallback_frames_per_second

# File path for the output video
output_video_path = "output_video.avi"

# Initialize video writer object
video_writer = cv2.VideoWriter(
    output_video_path,                          # Output video file path
    cv2.VideoWriter_fourcc(*'XVID'),           # Codec for the output video
    frames_per_second,                         # Frame rate for the output video
    (frame_width, frame_height)                # Frame dimensions for the output video
)

# A writer that failed to open silently drops every frame written to it,
# so surface the problem here and do not leave the capture open.
if not video_writer.isOpened():
    video_capture.release()
    raise OSError(f"Unable to open output video for writing: {output_video_path!r}")


In [None]:
# --- Segment Anything (SAM) setup ---

# Checkpoint file holding the SAM weights, and the registry key of the matching
# architecture. Both must be changed together when switching models.
sam_model_checkpoint_path = "sam_vit_b_01ec64.pth"  # Adjust path if using a different model or location
sam_model_type = "vit_b"  # Specify the model type (e.g., "vit_b", "vit_l", etc.)

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    compute_device = torch.device('cuda')
else:
    compute_device = torch.device('cpu')

# Look up the builder for the chosen model type, build the model from the
# checkpoint, then move it onto the selected device.
sam_model_builder = sam_model_registry[sam_model_type]
sam_model = sam_model_builder(checkpoint=sam_model_checkpoint_path).to(compute_device)

# Wrap the model in a predictor for prompt-based mask prediction.
sam_predictor = SamPredictor(sam_model)


In [None]:
# Import necessary libraries
import dlib
import cv2

# Initialize the dlib correlation tracker
object_tracker = dlib.correlation_tracker()

# Read the first frame from the video capture; it seeds the tracker below.
success, initial_frame = video_capture.read()
if not success:
    # Raise instead of calling exit(): exit() ends the interpreter/kernel
    # session rather than reporting a catchable error, and it would leave the
    # capture and writer handles open.
    video_capture.release()
    video_writer.release()
    raise RuntimeError("Error: Unable to read the video file.")

# Specify the bounding box for object tracking
# Coordinates for bounding box (x, y, width, height)
# Uncomment the appropriate bounding box for the video being processed

# Bounding box for Video 1
bounding_box = (652, 207, 265, 497)

# Bounding box for Video 2 or you can add respectively for the another video
# bounding_box = (349, 353, 98, 78)

# Optional: Uncomment to manually select the bounding box
# bounding_box = cv2.selectROI("Select Object", initial_frame, fromCenter=False)

# Initialize the tracker with the selected bounding box.
# dlib.rectangle takes corner coordinates (left, top, right, bottom), so the
# (x, y, width, height) box is converted here.
box_x, box_y, box_w, box_h = bounding_box
object_tracker.start_track(
    initial_frame,
    dlib.rectangle(
        int(box_x),
        int(box_y),
        int(box_x + box_w),
        int(box_y + box_h)
    )
)

# Variables to manage frame count and periodic reset
# NOTE(review): the processing loop re-runs YOLO whenever
# current_frame_count % reset_interval_frames == 0, which already holds for the
# starting value 0, so the box chosen above may be replaced on the very first
# loop iteration - confirm this is intended.
# NOTE(review): initial_frame is consumed here and is not written to the output
# by the processing loop - confirm dropping it is acceptable.
current_frame_count = 0
reset_interval_frames = 30  # Interval to re-run YOLO detection

# Variable to store the last computed mask (if applicable)
previous_mask = None


In [None]:
# Process video frames until the end of the video.
# For every frame: (periodically) re-seed the tracker from YOLO, update the
# tracker, prompt SAM with the tracked box, and write a frame that keeps color
# inside the mask and is grayscale elsewhere.
# The try/finally guarantees the capture and writer are released (so the output
# file is finalized) even if detection, tracking or segmentation raises mid-run.
try:
    while video_capture.isOpened():
        # Read the next frame from the video
        success, current_frame = video_capture.read()
        if not success:
            print("End of video reached.")
            break

        # Periodically reset the tracker using YOLO object detection
        if current_frame_count % reset_interval_frames == 0:
            detected_boxes = yolo_model(current_frame)[0].boxes

            # If any detections are found, reset the tracker with the best detection
            if len(detected_boxes) > 0:
                # Select the highest-confidence detection explicitly rather than
                # relying on the order in which results happen to be returned.
                best_detection_index = int(detected_boxes.conf.argmax())

                # Convert to plain Python ints (truncating, as before) so the
                # values are accepted by dlib.rectangle and print cleanly.
                x1, y1, x2, y2 = (
                    int(coordinate)
                    for coordinate in detected_boxes.xyxy[best_detection_index].tolist()
                )

                # Reset the tracker with the new bounding box
                object_tracker.start_track(current_frame, dlib.rectangle(x1, y1, x2, y2))
                print(f"Frame {current_frame_count}: Tracker reset using YOLO with bbox: {(x1, y1, x2 - x1, y2 - y1)}.")

        # Update the tracker with the current frame
        object_tracker.update(current_frame)

        # Get the updated bounding box from the tracker
        tracked_position = object_tracker.get_position()
        x1, y1, x2, y2 = int(tracked_position.left()), int(tracked_position.top()), int(tracked_position.right()), int(tracked_position.bottom())
        tracked_bbox = (x1, y1, x2 - x1, y2 - y1)  # (x, y, width, height)

        # Use the SAM predictor to predict segmentation masks.
        # Frames read by OpenCV are in BGR channel order, while
        # SamPredictor.set_image assumes RGB unless told otherwise.
        sam_predictor.set_image(current_frame, image_format="BGR")
        masks, scores, _ = sam_predictor.predict(
            box=np.array([[x1, y1, x2, y2]])  # box prompt in XYXY format
        )

        # Apply the segmentation mask to the current frame
        if masks is not None and masks.size > 0:
            # Several candidate masks may be returned; use the one SAM scores
            # highest instead of always taking the first.
            best_mask_index = int(np.argmax(scores))
            previous_mask = masks[best_mask_index].astype(bool)

            # Build a 3-channel grayscale copy of the frame for the masked-out areas
            grayscale_frame = cv2.cvtColor(current_frame, cv2.COLOR_BGR2GRAY)
            grayscale_frame_3d = cv2.cvtColor(grayscale_frame, cv2.COLOR_GRAY2BGR)

            # Apply the segmentation mask: keep color where mask is True, grayscale where False
            current_frame = np.where(previous_mask[..., None], current_frame, grayscale_frame_3d)

        # Write the processed frame to the output video
        video_writer.write(current_frame)
        print(f"Frame {current_frame_count} processed.")

        # Increment the frame counter
        current_frame_count += 1
finally:
    # Release video capture and writer resources
    video_capture.release()
    video_writer.release()

print("Video processing completed.")
