In [1]:
from PIL import Image
import torch
from torchvision.io.image import decode_image
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_320_fpn, FasterRCNN_MobileNet_V3_Large_320_FPN_Weights
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import to_pil_image, pil_to_tensor

# Step 1: Load the image using PIL
img_path = "C:/Users/nyok/Desktop/OpenCV/Photos/lady.jpg"
pil_img = Image.open(img_path).convert("RGB")

# Convert the PIL image to a tensor
img_tensor = pil_to_tensor(pil_img).unsqueeze(0)  # Add a batch dimension

# Step 2: Initialize model with the best available weights
weights = FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT
model = fasterrcnn_mobilenet_v3_large_320_fpn(weights=weights, box_score_thresh=0.9)
model.eval()

# Step 3: Initialize the inference transforms
preprocess = weights.transforms()

# Step 4: Apply inference preprocessing transforms
batch = [preprocess(img_tensor[0])]

# Step 5: Use the model and visualize the prediction
with torch.no_grad():
    prediction = model(batch)[0]

labels = [weights.meta["categories"][i] for i in prediction["labels"]]
box = draw_bounding_boxes(img_tensor[0], boxes=prediction["boxes"],
                          labels=labels,
                          colors="red", width=4, font_size=30)
im = to_pil_image(box.detach())
im.show()




In [4]:
import cv2
import torch
from PIL import Image
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_320_fpn, FasterRCNN_MobileNet_V3_Large_320_FPN_Weights
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import pil_to_tensor, to_pil_image
import numpy as np

# Step 1: Initialize model with the best available weights
weights = FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT
model = fasterrcnn_mobilenet_v3_large_320_fpn(weights=weights, box_score_thresh=0.9)
model.eval()

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Open video file
video_path = "C:/Users/nyok/Desktop/OpenCV/Videos/archery.MP4"
cap = cv2.VideoCapture(video_path)

output_size = (480, 480)

# Check if video opened successfully
if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

# Step 4: Loop over video frames
while True:
    ret, frame = cap.read()
    if not ret:
        break  # Exit the loop when video ends
    
    resized_frame = cv2.resize(frame, output_size)
    # Convert OpenCV BGR frame to PIL Image
    pil_img = Image.fromarray(cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB))

    # Convert PIL Image to tensor and add batch dimension
    img_tensor = pil_to_tensor(pil_img).unsqueeze(0)

    # Apply inference preprocessing transforms
    batch = [preprocess(img_tensor[0])]

    # Step 5: Perform object detection
    with torch.no_grad():
        prediction = model(batch)[0]

    # Extract labels and draw bounding boxes on the frame
    boxes = prediction["boxes"]
    box = draw_bounding_boxes(img_tensor[0], boxes=boxes, colors="red", width=4, font_size=30)

    # Convert tensor back to OpenCV format for display
    result_frame = cv2.cvtColor(np.array(to_pil_image(box.detach())), cv2.COLOR_RGB2BGR)

    # Step 6: Display the frame with bounding boxes
    cv2.imshow('Object Detection', result_frame)

    # Exit the video when 'q' is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release video capture and close windows
cap.release()
cv2.destroyAllWindows()


In [26]:
import cv2

# Step 1: Open webcam
cap = cv2.VideoCapture(0)  # 0 is usually the default camera, change if necessary

# Step 2: Check if the webcam is opened correctly
if not cap.isOpened():
    print("Error: Could not access the webcam.")
    exit()

# Step 3: Get the default resolution of the webcam
width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)

# Step 4: Print the resolution
print(f"Webcam resolution: {int(width)}x{int(height)}")

# Step 5: Release the webcam
cap.release()


Webcam resolution: 640x480


In [8]:
from torchvision.io.video import read_video
from torchvision.models.video import mc3_18, MC3_18_Weights
import torch
import torchvision.transforms as transforms

# Step 1: Load the video
vid, _, _ = read_video("C:/Users/nyok/Desktop/OpenCV/Videos/fixinghair.mp4")
vid = vid[:32]  # Optionally shorten duration to the first 32 frames

# Check the shape of the video
print(f"Original shape of video: {vid.shape}")  # [T, H, W, C]

# Step 2: Initialize the model with the best available weights
weights = MC3_18_Weights.DEFAULT
model = mc3_18(weights=weights)
model.eval()

# Step 3: Initialize the inference transforms
preprocess = weights.transforms()

# Step 4: Permute the video tensor to [T, C, H, W] from [T, H, W, C]
vid = vid.permute(0, 3, 1, 2)  # Change to [T, C, H, W]
print(f"Permute shape of video: {vid.shape}")

# Step 5: Resize each frame to 224x224 (if necessary)
resize_transform = transforms.Resize((224, 224))

# Apply resize transform to each frame in the video
vid_resized = torch.stack([resize_transform(frame) for frame in vid])

# Step 6: Apply preprocessing transforms
batch = preprocess(vid_resized).unsqueeze(0)  # Add batch dimension

# Step 7: Use the model and print the predicted category
with torch.no_grad():
    prediction = model(batch).squeeze(0).softmax(0)
    label = prediction.argmax().item()
    score = prediction[label].item()
    category_name = weights.meta["categories"][label]
    print(f"{category_name}: {100 * score:.2f}%")

MemoryError: Unable to allocate 16.2 GiB for an array with shape (700, 2160, 3840, 3) and data type uint8

In [2]:
import torch
torch.cuda.is_available()

False

In [1]:
import torch

# Check if CUDA is available
print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs available:", torch.cuda.device_count())
print("Current GPU:", torch.cuda.current_device())
print("GPU Name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA available: False
Number of GPUs available: 0


AssertionError: Torch not compiled with CUDA enabled