In [1]:
!pip install matplotlib
!pip install -U people_segmentation
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121





Collecting torch (from people_segmentation)
  Using cached torch-2.3.0-cp311-cp311-win_amd64.whl.metadata (26 kB)
Collecting torchvision (from pytorch-toolbelt->people_segmentation)
  Using cached torchvision-0.18.0-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Using cached torch-2.3.0-cp311-cp311-win_amd64.whl (159.8 MB)
Using cached torchvision-0.18.0-cp311-cp311-win_amd64.whl (1.2 MB)
Installing collected packages: torch, torchvision
Successfully installed torch-2.3.0 torchvision-0.18.0




Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.3.0%2Bcu121-cp311-cp311-win_amd64.whl (4.1 MB)
Collecting torch
  Using cached https://download.pytorch.org/whl/cu121/torch-2.3.0%2Bcu121-cp311-cp311-win_amd64.whl (2413.3 MB)
Installing collected packages: torch, torchaudio
  Attempting uninstall: torch
    Found existing installation: torch 2.3.0
    Uninstalling torch-2.3.0:
      Successfully uninstalled torch-2.3.0
Successfully installed torch-2.3.0+cu121 torchaudio-2.3.0+cu121


In [2]:
from pylab import imshow
import numpy as np
import cv2
import torch
import albumentations as albu
from people_segmentation.pre_trained_models import create_model
import mediapipe as mp

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Util

from typing import Tuple

def pad(
    image: np.ndarray, factor: int = 32, border: int = cv2.BORDER_REFLECT_101
) -> Tuple[np.ndarray, Tuple[int, int, int, int]]:
    """Pad an image so that its height and width are multiples of ``factor``.

    The padding needed on each axis is split between both sides; when the
    amount is odd, the bottom/right side receives the extra pixel.

    Args:
        image: Array whose first two axes are (height, width).
        factor: Value the padded height and width must be divisible by.
            Must be positive.
        border: Border mode forwarded unchanged to ``cv2.copyMakeBorder``.

    Returns:
        A pair ``(padded_image, (x_min_pad, y_min_pad, x_max_pad, y_max_pad))``.
        The second element has the same ordering that ``unpad`` unpacks.

    Raises:
        ValueError: If ``factor`` is not a positive number.
    """
    if factor <= 0:
        raise ValueError(f"factor must be positive, got {factor}")

    height, width = image.shape[:2]

    # (-n) % factor is the distance from n up to the next multiple of factor,
    # and is 0 when n is already aligned, so no special-casing is required.
    y_pad = (-height) % factor
    x_pad = (-width) % factor

    y_min_pad = y_pad // 2
    y_max_pad = y_pad - y_min_pad
    x_min_pad = x_pad // 2
    x_max_pad = x_pad - x_min_pad

    # copyMakeBorder takes its amounts in the order top, bottom, left, right.
    padded_image = cv2.copyMakeBorder(image, y_min_pad, y_max_pad, x_min_pad, x_max_pad, border)

    return padded_image, (x_min_pad, y_min_pad, x_max_pad, y_max_pad)

def unpad(image: np.ndarray, pads: Tuple[int, int, int, int]) -> np.ndarray:
    """Crop previously added padding off the first two axes of ``image``.

    Args:
        image: Array whose first two axes are (height, width).
        pads: ``(x_min_pad, y_min_pad, x_max_pad, y_max_pad)``, i.e. the
            number of columns/rows to drop from the left, top, right and
            bottom respectively.

    Returns:
        The slice of ``image`` with the given margins removed. This is a
        basic NumPy slice, so no data is copied.
    """
    x_min_pad, y_min_pad, x_max_pad, y_max_pad = pads
    height, width = image.shape[:2]

    # Slice ends are computed from the full size rather than written as
    # negative indices, so a zero pad keeps the whole axis.
    return image[y_min_pad : height - y_max_pad, x_min_pad : width - x_max_pad]

def tensor_from_rgb_image(image: np.ndarray) -> torch.Tensor:
    """Turn a three-axis image array into a tensor with the last axis first.

    The axes are reordered from (0, 1, 2) to (2, 0, 1), the result is made
    C-contiguous, and the array is then wrapped with ``torch.from_numpy``.
    """
    channels_first = np.transpose(image, (2, 0, 1))
    # The transpose only yields a strided view, so lay it out contiguously
    # before wrapping it in a tensor.
    contiguous = np.ascontiguousarray(channels_first)
    return torch.from_numpy(contiguous)

In [5]:
# Short aliases for the MediaPipe solution modules used by the video cells.
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

# One pose estimator instance, reused for every frame processed below.
pose = mp_pose.Pose(
    min_detection_confidence=0.74,
    min_tracking_confidence=0.3,
)

In [3]:
# Show whether PyTorch can see a CUDA device from this kernel.
torch.cuda.is_available()

True

In [8]:
# Build the segmentation model registered as "Unet_2020-07-20" and place it
# on the CPU.
model = create_model("Unet_2020-07-20").to('cpu')
# Put the network in evaluation mode. Left as the last expression of the
# cell, so the returned module is also displayed.
model.eval()

Unet(
  (encoder): EfficientNetEncoder(
    (conv_stem): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNormAct2d(
      40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
      (drop): Identity()
      (act): Swish()
    )
    (blocks): Sequential(
      (0): Sequential(
        (0): DepthwiseSeparableConv(
          (conv_dw): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
          (bn1): BatchNormAct2d(
            40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True
            (drop): Identity()
            (act): Swish()
          )
          (se): SqueezeExcite(
            (conv_reduce): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
            (act1): Swish()
            (conv_expand): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
            (gate): Sigmoid()
          )
          (conv_pw): Conv2d(40, 24, kernel_size=(1, 1), stride=(1, 1), bias=False

In [14]:
# Live preview: outline the segmented people and draw pose landmarks on every
# frame of the video. Press "q" in the preview window to stop early.
cap = cv2.VideoCapture("basketball_video.mp4")

# Loop-invariant setup, built once instead of once per frame.
transform = albu.Compose([albu.Normalize(p=1)], p=1)
# Send inputs to whatever device the model currently lives on, so this cell
# keeps working if the model is moved off the CPU.
device = next(model.parameters()).device

try:
    while True:
        ret, img = cap.read()
        if not ret:
            print("Cannot receive frame")
            break
        # cv2.resize takes the target size as (width, height).
        img = cv2.resize(img, (520, 300))
        # OpenCV decodes frames as BGR; the RGB copy is what gets fed to the
        # segmentation model and to the pose estimator.
        img2 = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Pad height/width up to multiples of 32 before inference
        # (presumably required by the network's downsampling - TODO confirm).
        padded_image, pads = pad(img2, factor=32, border=cv2.BORDER_CONSTANT)
        x = transform(image=padded_image)["image"]
        x = torch.unsqueeze(tensor_from_rgb_image(x), 0).to(device)

        with torch.no_grad():
            # First item of the batch, first output channel.
            prediction = model(x)[0][0]

        # Binarise at 0 and crop the mask back to the size of the resized frame.
        mask = (prediction > 0).cpu().numpy().astype(np.uint8)
        mask = unpad(mask, pads)

        # All drawing happens in place on the BGR frame.
        dst = img

        results = pose.process(img2)
        mp_drawing.draw_landmarks(
            dst,
            results.pose_landmarks,
            mp_pose.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style())

        contours = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[0]
        cv2.drawContours(dst, contours, -1, (60, 200, 60), 3)

        cv2.imshow('basketball', dst)
        # Mask to the low byte so the comparison only sees the key code.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture and close the window, even if a frame raised.
    cap.release()
    cv2.destroyAllWindows()

In [26]:
from datetime import datetime
# Offline export: run segmentation and pose estimation on every frame of the
# input video at full resolution and write the annotated frames to output.mov.
start_time = datetime.now()

cap = cv2.VideoCapture("basketball_video.mp4")
# Fail before anything else is created. Raising (instead of exit()) keeps the
# notebook kernel alive and reports the real cause.
if not cap.isOpened():
    raise RuntimeError('Cannot open video file "basketball_video.mp4"')

out = None
try:
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    # Returned as a float by cap.get().
    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    out = cv2.VideoWriter('output.mov', fourcc, fps, (width, height))
    # Without this check every out.write() would be a silent no-op.
    if not out.isOpened():
        raise RuntimeError('Cannot open "output.mov" for writing')

    # Loop-invariant setup, built once instead of once per frame.
    transform = albu.Compose([albu.Normalize(p=1)], p=1)
    # Send inputs to whatever device the model currently lives on.
    device = next(model.parameters()).device

    frame = 1  # 1-based index of the frame being processed
    while True:
        ret, img = cap.read()
        if not ret:
            print("Cannot receive frame")
            break
        # OpenCV decodes frames as BGR; the RGB copy is what gets fed to the
        # segmentation model and to the pose estimator.
        img2 = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Pad height/width up to multiples of 32 before inference
        # (presumably required by the network's downsampling - TODO confirm).
        padded_image, pads = pad(img2, factor=32, border=cv2.BORDER_CONSTANT)
        x = transform(image=padded_image)["image"]
        x = torch.unsqueeze(tensor_from_rgb_image(x), 0).to(device)

        with torch.no_grad():
            # First item of the batch, first output channel.
            prediction = model(x)[0][0]

        # Binarise at 0 and crop the mask back to the original frame size.
        mask = (prediction > 0).cpu().numpy().astype(np.uint8)
        mask = unpad(mask, pads)

        # All drawing happens in place on the BGR frame that is written out.
        dst = img

        results = pose.process(img2)
        mp_drawing.draw_landmarks(
            dst,
            results.pose_landmarks,
            mp_pose.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style())

        contours = cv2.findContours(mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[0]
        cv2.drawContours(dst, contours, -1, (60, 200, 60), 5)
        out.write(dst)

        if frame % 10 == 0:
            delta = datetime.now() - start_time
            # str(timedelta) only has a ".ffffff" part when microseconds are
            # non-zero; split() drops it when present and is a no-op otherwise.
            delta_str = str(delta).split(".")[0]
            if total_frames > 0:
                # Clamp so an under-reported frame count cannot give a negative ETA.
                eta = delta / frame * max(total_frames - frame, 0)
                eta_str = str(eta).split(".")[0]
                print(f"frame {frame}/{int(total_frames)}, {frame/total_frames*100:.2f}% ({delta_str} ETA {eta_str})")
            else:
                # No usable frame count reported: show progress without
                # percentage or ETA rather than dividing by zero.
                print(f"frame {frame} ({delta_str})")
        frame += 1
finally:
    # Always release the reader and finalise the output file, even on error.
    cap.release()
    if out is not None:
        out.release()
    cv2.destroyAllWindows()

frame 10/750, 1.33% (0:00:15 ETA 0:18:43)
frame 20/750, 2.67% (0:00:30 ETA 0:18:22)
frame 30/750, 4.00% (0:00:45 ETA 0:18:06)
frame 40/750, 5.33% (0:01:00 ETA 0:17:50)
frame 50/750, 6.67% (0:01:15 ETA 0:17:32)
frame 60/750, 8.00% (0:01:30 ETA 0:17:21)
frame 70/750, 9.33% (0:01:45 ETA 0:17:06)
frame 80/750, 10.67% (0:02:00 ETA 0:16:53)
frame 90/750, 12.00% (0:02:16 ETA 0:16:37)
frame 100/750, 13.33% (0:02:31 ETA 0:16:21)
frame 110/750, 14.67% (0:02:46 ETA 0:16:07)
frame 120/750, 16.00% (0:03:01 ETA 0:15:54)
frame 130/750, 17.33% (0:03:17 ETA 0:15:42)
