<a href="https://colab.research.google.com/github/haysnairpa/stairvision/blob/main/stairvision_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the packages imported by this notebook (ultralytics, OpenCV, NumPy).
# NOTE(review): versions are unpinned; the recorded run below resolved
# ultralytics 8.3.177 — consider pinning for reproducibility.
%pip install ultralytics opencv-python-headless numpy

Collecting ultralytics
  Downloading ultralytics-8.3.177-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.15-py3-none-any.whl.metadata (14 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading nv

In [None]:
# Optional, Google Colab only: uncomment the two lines below to mount Google
# Drive at /content/drive. Left disabled here.
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import cv2
import numpy as np
from ultralytics import YOLO

In [2]:
# Checkpoint locations, given relative to the notebook's working directory.
seg_model_path, pose_model_path = (
    "src/model/best_stair_handrail_model.pt",
    "src/model/best_pose_model.pt",
)

# One YOLO wrapper per checkpoint: `seg_model` is queried for masks and
# `pose_model` for keypoints in the processing loop further down.
seg_model = YOLO(seg_model_path)
pose_model = YOLO(pose_model_path)

In [3]:
import os

# Input clip and destination folder.
# NOTE(review): both are absolute, machine-specific Windows paths.
video_path = r"D:\Aldi\stairvision\src\dataset\east\videos\Copy of Copy of IMG_3086.MOV"
output_dir = r"D:\Aldi\stairvision\src\dataset\east\videos\output_videos"
output_prefix = "output_"

# The destination folder must exist before it is probed for name clashes.
os.makedirs(output_dir, exist_ok=True)

# Split the input file name into stem and extension so that a numeric suffix
# can be inserted in front of the extension.
base_name = os.path.basename(video_path)
base_name_no_ext, ext = os.path.splitext(base_name)

# First candidate: "<prefix><stem><ext>".
new_file_name = f"{output_prefix}{base_name_no_ext}{ext}"
output_path = os.path.join(output_dir, new_file_name)

# As long as the candidate already exists, move on to
# "<prefix><stem> (1)<ext>", "<prefix><stem> (2)<ext>", ...
counter = 1
while True:
    if not os.path.exists(output_path):
        break
    new_file_name = f"{output_prefix}{base_name_no_ext} ({counter}){ext}"
    output_path = os.path.join(output_dir, new_file_name)
    counter += 1

print(f"output path: {output_path}")

output path: D:\Aldi\stairvision\src\dataset\east\videos\output_videos\output_Copy of Copy of IMG_3086 (2).MOV


In [4]:
# Open the input video and fail fast, with a message that says what went
# wrong, when OpenCV cannot read it.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise RuntimeError(f"❌ cannot open video from the path: {video_path}")


In [5]:
# Open the source video again and validate this handle as well: it is a new
# VideoCapture object, so the check done in the previous cell does not cover it.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise RuntimeError(f"❌ cannot open video from the path: {video_path}")

# Stream metadata as reported by OpenCV; used to size the output video, the
# handrail mask, and the progress/ETA estimate.
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

In [6]:
# Create the output writer with the 'mp4v' FourCC, reusing the source frame
# rate and frame size.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

# A writer that failed to open does not raise on its own, so check explicitly
# instead of discovering an empty output file after the whole run.
if not out.isOpened():
    raise RuntimeError(f"❌ cannot open video writer for the path: {output_path}")

In [7]:
import time
import cv2
import numpy as np

# Per-frame pipeline: segment the handrail, estimate poses, test whether a
# wrist overlaps the handrail mask, annotate the frame and write it out.
start_time = time.time()

# --- Tunable parameters ---
DILATE_KERNEL_SIZE = 5    # side (px) of the square kernel that dilates the handrail mask
CLOSE_KERNEL_SIZE = 5     # side (px) of the square kernel used for morphological closing
KP_CONF_THRESH = 0.25     # wrists with a lower keypoint confidence are skipped
# Radius (px) of the disc placed on each wrist for the overlap test:
# 1.5% of the longer frame side, but never less than 5 px.
HAND_RADIUS = max(5, int(0.015 * max(width, height)))
HAND_OVERLAP_RATIO = 0.3  # min. share of the wrist disc that must lie on the mask

# Loop-invariant helpers, built once instead of per frame / per hand.
close_kernel = np.ones((CLOSE_KERNEL_SIZE, CLOSE_KERNEL_SIZE), np.uint8)
dilate_kernel = np.ones((DILATE_KERNEL_SIZE, DILATE_KERNEL_SIZE), np.uint8)
y_grid, x_grid = np.ogrid[:height, :width]

frame_num = 0
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read failure): stop processing.
            break

        frame_orig = frame.copy()

        # --- Handrail segmentation ---
        seg_results = seg_model.predict(frame, conf=0.4, verbose=False)
        handrail_mask = np.zeros((height, width), dtype=np.uint8)

        for r in seg_results:
            if r.masks is not None:
                for mask_poly, cls in zip(r.masks.xy, r.boxes.cls):
                    # Only class id 0 is rasterized into the handrail mask.
                    if int(cls) == 0:
                        poly = np.array(mask_poly, dtype=np.int32)
                        cv2.fillPoly(handrail_mask, [poly], 255)

        # Close small holes, then grow the mask slightly.
        handrail_mask = cv2.morphologyEx(handrail_mask, cv2.MORPH_CLOSE, close_kernel)
        handrail_mask = cv2.dilate(handrail_mask, dilate_kernel, iterations=1)

        # --- Pose estimation ---
        pose_results = pose_model.predict(frame_orig, conf=0.25, verbose=False)

        frame_vis = frame.copy()

        for r in pose_results:
            if r.keypoints is None:
                continue

            kpts_xy = r.keypoints.xy.cpu().numpy()
            # `conf` can be None when the model provides no per-keypoint
            # confidence; treat every keypoint as fully confident in that case.
            kpts_conf_raw = r.keypoints.conf
            if kpts_conf_raw is None:
                kpts_conf = np.ones((kpts_xy.shape[0], kpts_xy.shape[1]))
            else:
                kpts_conf = kpts_conf_raw.cpu().numpy()

            for person_idx in range(kpts_xy.shape[0]):
                person_kpts = kpts_xy[person_idx]
                person_conf = kpts_conf[person_idx]

                # Keypoints 9 and 10 are used as the hands (the wrists in the
                # COCO 17-point layout referenced later in this notebook).
                hands = []
                hands_conf = []

                if person_kpts.shape[0] > 9:
                    hands.append(person_kpts[9])
                    hands_conf.append(person_conf[9])
                if person_kpts.shape[0] > 10:
                    hands.append(person_kpts[10])
                    hands_conf.append(person_conf[10])

                holding = False
                for (hx, hy), conf_val in zip(hands, hands_conf):
                    if conf_val < KP_CONF_THRESH:
                        continue
                    x, y = int(hx), int(hy)
                    if not (0 <= x < width and 0 <= y < height):
                        continue

                    # Share of the wrist disc that falls on the handrail mask.
                    # The disc always contains its (in-bounds) centre pixel,
                    # so the denominator is never zero.
                    mask_circle = (x_grid - x) ** 2 + (y_grid - y) ** 2 <= HAND_RADIUS ** 2
                    inside_mask = np.logical_and(mask_circle, handrail_mask > 0)
                    overlap_ratio = inside_mask.sum() / mask_circle.sum()

                    if overlap_ratio >= HAND_OVERLAP_RATIO:
                        holding = True
                        cv2.circle(frame_vis, (x, y), 6, (0, 255, 0), -1)
                    else:
                        cv2.circle(frame_vis, (x, y), 6, (0, 0, 255), -1)

                # Label is anchored slightly above keypoint 0 of the person.
                label_pos = (int(person_kpts[0][0]), int(person_kpts[0][1]) - 10) if person_kpts.shape[0] > 0 else (10, 30)
                if holding:
                    cv2.putText(frame_vis, "Holding Handrail", label_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                else:
                    cv2.putText(frame_vis, "Not Holding", label_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

        out.write(frame_vis)
        frame_num += 1

        # Running throughput and a simple linear ETA.
        elapsed = time.time() - start_time
        fps_proc = frame_num / elapsed if elapsed > 0 else 0
        eta = (total_frames - frame_num) / fps_proc if fps_proc > 0 else 0
        print(f"Frame {frame_num}/{total_frames} | {fps_proc:.2f} FPS | ETA: {eta/60:.1f} min", flush=True)
finally:
    # Always release both handles, even if inference or drawing raised, so the
    # output file is finalized and the input file is not left open.
    cap.release()
    out.release()
print(f"video saved to: {output_path}")

Frame 1/303 | 0.23 FPS | ETA: 22.2 min
Frame 2/303 | 0.42 FPS | ETA: 11.9 min
Frame 3/303 | 0.59 FPS | ETA: 8.5 min
Frame 4/303 | 0.73 FPS | ETA: 6.8 min
Frame 5/303 | 0.86 FPS | ETA: 5.8 min
Frame 6/303 | 0.98 FPS | ETA: 5.0 min
Frame 7/303 | 1.08 FPS | ETA: 4.5 min
Frame 8/303 | 1.17 FPS | ETA: 4.2 min
Frame 9/303 | 1.26 FPS | ETA: 3.9 min
Frame 10/303 | 1.32 FPS | ETA: 3.7 min
Frame 11/303 | 1.39 FPS | ETA: 3.5 min
Frame 12/303 | 1.44 FPS | ETA: 3.4 min
Frame 13/303 | 1.49 FPS | ETA: 3.3 min
Frame 14/303 | 1.53 FPS | ETA: 3.2 min
Frame 15/303 | 1.57 FPS | ETA: 3.1 min
Frame 16/303 | 1.60 FPS | ETA: 3.0 min
Frame 17/303 | 1.64 FPS | ETA: 2.9 min
Frame 18/303 | 1.67 FPS | ETA: 2.8 min
Frame 19/303 | 1.70 FPS | ETA: 2.8 min
Frame 20/303 | 1.73 FPS | ETA: 2.7 min
Frame 21/303 | 1.76 FPS | ETA: 2.7 min
Frame 22/303 | 1.79 FPS | ETA: 2.6 min
Frame 23/303 | 1.81 FPS | ETA: 2.6 min
Frame 24/303 | 1.83 FPS | ETA: 2.5 min
Frame 25/303 | 1.85 FPS | ETA: 2.5 min
Frame 26/303 | 1.86 FPS | ETA: 2

### Process a video with the handrail-mask overlay, pose skeleton, and temporal smoothing

In [10]:
import cv2
import numpy as np
from collections import deque
from ultralytics import YOLO
import os
import time

# Checkpoint locations, given relative to the notebook's working directory.
seg_model_path, pose_model_path = (
    "src/model/best_stair_handrail_model.pt",
    "src/model/best_pose_model.pt",
)

# Load both networks up front: `seg_model` supplies the masks and
# `pose_model` the keypoints used by the loop below.
seg_model = YOLO(seg_model_path)
pose_model = YOLO(pose_model_path)

# Input clip and destination folder.
# NOTE(review): both are absolute, machine-specific Windows paths.
video_path = r"D:\Aldi\stairvision\src\dataset\west\videos\Copy of Copy of IMG_3093.MOV"
output_dir = r"D:\Aldi\stairvision\src\dataset\west\videos\output_videos"
output_prefix = "output_with_mask_"

# The destination folder must exist before it is probed for name clashes.
os.makedirs(output_dir, exist_ok=True)

# Stem and extension of the input file name; the suffix goes between them.
base_name = os.path.basename(video_path)
base_name_no_ext, ext = os.path.splitext(base_name)

# Start with "<prefix><stem><ext>" and, while that name is taken, fall back
# to "<prefix><stem> (1)<ext>", "<prefix><stem> (2)<ext>", ...
new_file_name = f"{output_prefix}{base_name_no_ext}{ext}"
output_path = os.path.join(output_dir, new_file_name)
counter = 1
while True:
    if not os.path.exists(output_path):
        break
    new_file_name = f"{output_prefix}{base_name_no_ext} ({counter}){ext}"
    output_path = os.path.join(output_dir, new_file_name)
    counter += 1

# Open the input video and fail fast when OpenCV cannot read it.
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise RuntimeError(f"❌ cannot open video from the path: {video_path}")

# Stream metadata as reported by OpenCV; used to size the output video, the
# handrail mask, and the progress/ETA estimate.
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Output writer with the 'mp4v' FourCC, same frame rate and size as the input.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

# A writer that failed to open does not raise on its own, so check explicitly
# and release the already-open capture before reporting the failure.
if not out.isOpened():
    cap.release()
    raise RuntimeError(f"❌ cannot open video writer for the path: {output_path}")

# --- Mask post-processing: side length (px) of each square structuring element ---
CLOSE_KERNEL_SIZE = 4
DILATE_KERNEL_SIZE = 4
ERODE_KERNEL_SIZE = 3

# --- Hand / handrail contact test ---
KP_CONF_THRESH = 0.25     # keypoints below this confidence are ignored
# Radius (px) of the disc placed on each wrist: 5% of the longer frame side,
# but never less than 5 px.
HAND_RADIUS = max(5, int(0.05 * max(width, height)))
HAND_OVERLAP_RATIO = 0.2  # min. share of the wrist disc that must lie on the mask
SMOOTH_FRAMES = 3         # window length for position and status smoothing

# Keypoint index pairs joined by a line when drawing the skeleton
# (COCO 17-point format).
SKELETON_CONNECTIONS = [
    (0, 1), (0, 2), (1, 3), (2, 4),
    (5, 6),
    (5, 7), (7, 9),
    (6, 8), (8, 10),
    (11, 12),
    (5, 11), (6, 12),
    (11, 13), (13, 15),
    (12, 14), (14, 16),
]

# Smoothing memory, filled lazily inside the processing loop:
# recent wrist positions and recent holding decisions per hand.
hand_history = dict()
status_memory = dict()

# Progress bookkeeping.
frame_num = 0
start_time = time.time()

# Per-frame pipeline: segment the handrail, estimate poses, draw the mask
# overlay and skeleton, decide per hand (with temporal smoothing) whether it
# is on the handrail, annotate the frame and write it out.

# Loop-invariant helpers, built once instead of per frame / per hand.
close_kernel = np.ones((CLOSE_KERNEL_SIZE, CLOSE_KERNEL_SIZE), np.uint8)
dilate_kernel = np.ones((DILATE_KERNEL_SIZE, DILATE_KERNEL_SIZE), np.uint8)
erode_kernel = np.ones((ERODE_KERNEL_SIZE, ERODE_KERNEL_SIZE), np.uint8)
y_grid, x_grid = np.ogrid[:height, :width]

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            # End of stream (or a read failure): stop processing.
            break

        frame_orig = frame.copy()

        # stair handrail segmentation
        seg_results = seg_model.predict(frame, conf=0.4, verbose=False)
        handrail_mask = np.zeros((height, width), dtype=np.uint8)

        for r in seg_results:
            if r.masks is not None:
                for mask_poly, cls in zip(r.masks.xy, r.boxes.cls):
                    # Only class id 0 is rasterized into the handrail mask.
                    if int(cls) == 0:
                        poly = np.array(mask_poly, dtype=np.int32)
                        cv2.fillPoly(handrail_mask, [poly], 255)

        # Morphological adjustments: close holes, grow, then shrink back a little.
        handrail_mask = cv2.morphologyEx(handrail_mask, cv2.MORPH_CLOSE, close_kernel)
        handrail_mask = cv2.dilate(handrail_mask, dilate_kernel, iterations=1)
        handrail_mask = cv2.erode(handrail_mask, erode_kernel, iterations=1)

        # person detection
        pose_results = pose_model.predict(frame_orig, conf=0.25, verbose=False)

        # Visualization setup
        frame_vis = frame.copy()

        # Semi-transparent overlay for the handrail mask (40% tint, 60% frame).
        mask_overlay = frame_vis.copy()
        mask_overlay[handrail_mask > 0] = (255, 255, 0)  # Cyan color for the mask
        frame_vis = cv2.addWeighted(mask_overlay, 0.4, frame_vis, 0.6, 0)

        for r in pose_results:
            if r.keypoints is None:
                continue

            kpts_xy = r.keypoints.xy.cpu().numpy()
            # `conf` can be None when the model provides no per-keypoint
            # confidence; treat every keypoint as fully confident in that case.
            kpts_conf_raw = r.keypoints.conf
            if kpts_conf_raw is None:
                kpts_conf = np.ones((kpts_xy.shape[0], kpts_xy.shape[1]))
            else:
                kpts_conf = kpts_conf_raw.cpu().numpy()

            for person_idx in range(kpts_xy.shape[0]):
                person_kpts = kpts_xy[person_idx]
                person_conf = kpts_conf[person_idx]

                # Skeleton: a dot per confident keypoint ...
                for i in range(person_kpts.shape[0]):
                    if person_conf[i] > KP_CONF_THRESH:
                        x, y = int(person_kpts[i][0]), int(person_kpts[i][1])
                        cv2.circle(frame_vis, (x, y), 3, (200, 200, 200), -1)

                # ... and a line per connection whose two ends are both confident.
                for start_idx, end_idx in SKELETON_CONNECTIONS:
                    if person_kpts.shape[0] > max(start_idx, end_idx):
                        if person_conf[start_idx] > KP_CONF_THRESH and person_conf[end_idx] > KP_CONF_THRESH:
                            start_point = tuple(np.array(person_kpts[start_idx], int))
                            end_point = tuple(np.array(person_kpts[end_idx], int))
                            cv2.line(frame_vis, start_point, end_point, (255, 255, 255), 2)

                # Hand holding logic (drawn on top of the skeleton).
                # NOTE(review): the smoothing buffers are keyed by the per-frame
                # detection index `person_idx`, not by a tracked identity, so
                # histories may mix people if detection order changes — confirm.
                holding_status_for_person = False
                for hid in [9, 10]:  # Left & right wrists
                    if person_kpts.shape[0] <= hid or person_conf[hid] < KP_CONF_THRESH:
                        continue

                    hx, hy = person_kpts[hid]

                    # Position smoothing: mean of the last SMOOTH_FRAMES positions.
                    hand_history.setdefault((person_idx, hid), deque(maxlen=SMOOTH_FRAMES))
                    hand_history[(person_idx, hid)].append((hx, hy))
                    avg_hx = int(np.mean([p[0] for p in hand_history[(person_idx, hid)]]))
                    avg_hy = int(np.mean([p[1] for p in hand_history[(person_idx, hid)]]))

                    if not (0 <= avg_hx < width and 0 <= avg_hy < height):
                        continue

                    # Overlap calculation: share of the wrist disc on the mask.
                    mask_circle = (x_grid - avg_hx) ** 2 + (y_grid - avg_hy) ** 2 <= HAND_RADIUS ** 2
                    inside_mask = np.logical_and(mask_circle, handrail_mask > 0)
                    overlap_ratio = inside_mask.sum() / (mask_circle.sum() + 1e-6)

                    # Store status for smoothing
                    status_memory.setdefault((person_idx, hid), deque(maxlen=SMOOTH_FRAMES))
                    status_memory[(person_idx, hid)].append(overlap_ratio >= HAND_OVERLAP_RATIO)

                    # Final decision for this hand: more than half of a full
                    # SMOOTH_FRAMES window must have been "holding".
                    is_hand_holding = sum(status_memory[(person_idx, hid)]) >= (SMOOTH_FRAMES // 2 + 1)

                    # The person counts as holding if at least one hand is.
                    if is_hand_holding:
                        holding_status_for_person = True

                    # circle on wrist (green for holding, red for not)
                    cv2.circle(frame_vis, (avg_hx, avg_hy), 8, (0, 255, 0) if is_hand_holding else (0, 0, 255), -1)

                # Text label for the person, anchored above keypoint 5.
                label_pos = (int(person_kpts[5][0]), int(person_kpts[5][1]) - 20) if person_kpts.shape[0] > 5 else (10, 30)
                text = "HOLDING" if holding_status_for_person else "NOT HOLDING"
                color = (0, 255, 0) if holding_status_for_person else (0, 0, 255)
                cv2.putText(frame_vis, text, label_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

        out.write(frame_vis)
        frame_num += 1

        # Running throughput and a simple linear ETA.
        elapsed = time.time() - start_time
        fps_proc = frame_num / elapsed if elapsed > 0 else 0
        eta = (total_frames - frame_num) / fps_proc if fps_proc > 0 else 0
        print(f"Frame {frame_num}/{total_frames} | {fps_proc:.2f} FPS | ETA: {eta/60:.1f} min", flush=True)
finally:
    # Always release both handles, even if inference or drawing raised, so the
    # output file is finalized and the input file is not left open.
    cap.release()
    out.release()
print(f"video saved to: {output_path}")

Frame 1/863 | 0.59 FPS | ETA: 24.4 min
Frame 2/863 | 1.03 FPS | ETA: 13.9 min
Frame 3/863 | 1.35 FPS | ETA: 10.6 min
Frame 4/863 | 1.66 FPS | ETA: 8.6 min
Frame 5/863 | 1.91 FPS | ETA: 7.5 min
Frame 6/863 | 2.12 FPS | ETA: 6.7 min
Frame 7/863 | 2.32 FPS | ETA: 6.1 min
Frame 8/863 | 2.46 FPS | ETA: 5.8 min
Frame 9/863 | 2.59 FPS | ETA: 5.5 min
Frame 10/863 | 2.71 FPS | ETA: 5.2 min
Frame 11/863 | 2.79 FPS | ETA: 5.1 min
Frame 12/863 | 2.86 FPS | ETA: 5.0 min
Frame 13/863 | 2.90 FPS | ETA: 4.9 min
Frame 14/863 | 2.93 FPS | ETA: 4.8 min
Frame 15/863 | 2.96 FPS | ETA: 4.8 min
Frame 16/863 | 2.99 FPS | ETA: 4.7 min
Frame 17/863 | 3.03 FPS | ETA: 4.7 min
Frame 18/863 | 3.06 FPS | ETA: 4.6 min
Frame 19/863 | 3.08 FPS | ETA: 4.6 min
Frame 20/863 | 3.11 FPS | ETA: 4.5 min
Frame 21/863 | 3.15 FPS | ETA: 4.4 min
Frame 22/863 | 3.18 FPS | ETA: 4.4 min
Frame 23/863 | 3.21 FPS | ETA: 4.4 min
Frame 24/863 | 3.23 FPS | ETA: 4.3 min
Frame 25/863 | 3.26 FPS | ETA: 4.3 min
Frame 26/863 | 3.27 FPS | ETA: 