# 0. Setup

In [1]:
!nvidia-smi

Mon Dec  4 23:39:50 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os
import sys

# make current working directory base directory for all operations in project
HOME = os.getcwd()

# clone yolov7 repository and install requirements
%cd {HOME}
!git clone https://github.com/WongKinYiu/yolov7
%cd {HOME}/yolov7
!pip install -r requirements.txt

# append yolov7 to the path
sys.path.append(f"{HOME}/yolov7/")

/content
Cloning into 'yolov7'...
remote: Enumerating objects: 1197, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 1197 (delta 2), reused 3 (delta 1), pack-reused 1191[K
Receiving objects: 100% (1197/1197), 74.23 MiB | 24.56 MiB/s, done.
Resolving deltas: 100% (517/517), done.
/content/yolov7
Collecting thop (from -r requirements.txt (line 36))
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Collecting jedi>=0.16 (from ipython->-r requirements.txt (line 34))
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, thop
Successfully installed jedi-0.19.1 thop-0.1.1.post2209072238


In [3]:
%cd {HOME}/yolov7

# wget pose model
!wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-w6-pose.pt --quiet

# path to model
POSE_MODEL_WEIGHTS_PATH = f"{HOME}/yolov7/yolov7-w6-pose.pt"

/content/yolov7


In [4]:
import torch

# select device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
from models.experimental import attempt_load

# load in pose model
weights = torch.load(POSE_MODEL_WEIGHTS_PATH, map_location=device)
pose_model = weights["model"]
_ = pose_model.float().eval()

if torch.cuda.is_available():
    pose_model.half().to(device)

# 1. Utility Functions

In [6]:
from typing import Tuple
from dataclasses import dataclass

@dataclass(frozen=True)
class Color:
    r: int
    g: int
    b: int

    @property
    def bgr_tuple(self) -> Tuple[int, int, int]:
        return self.b, self.g, self.r

In [7]:
from typing import Generator
import numpy as np

def generate_frames(video_file: str) -> Generator[np.ndarray, None, None]:
    video = cv2.VideoCapture(video_file)

    while video.isOpened():
        success, frame = video.read()
        if not success:
            break
        yield frame

    video.release()

# 2. Pose Detection

In [8]:
# some parameters needed for use of yolov7 models
POSE_IMAGE_SIZE = 960
STRIDE = 64
CONFIDENCE_TRESHOLD = 0.25
IOU_TRESHOLD = 0.65

In [9]:
from utils.general import non_max_suppression_kpt, non_max_suppression
from utils.plots import output_to_keypoint
from utils.datasets import letterbox
from torchvision import transforms
from typing import Tuple

# preprocess video to use with yolov7 model
def pose_pre_process_frame(frame: np.ndarray, device: torch.device) -> torch.Tensor:
    image = letterbox(frame, POSE_IMAGE_SIZE, stride=STRIDE, auto=True)[0]
    image = transforms.ToTensor()(image)
    image = torch.tensor(np.array([image.numpy()]))

    if torch.cuda.is_available():
        image = image.half().to(device)

    return image

# postprocessing of outputs
def post_process_pose(pose: np.ndarray, image_size: Tuple, scaled_image_size: Tuple) -> np.ndarray:
    height, width = image_size
    scaled_height, scaled_width = scaled_image_size
    vertical_factor = height / scaled_height
    horizontal_factor = width / scaled_width
    result = pose.copy()
    for i in range(17):
        result[i * 3] = horizontal_factor * result[i * 3]
        result[i * 3 + 1] = vertical_factor * result[i * 3 + 1]
    return result

def pose_post_process_output(output: torch.tensor, confidence_trashold: float, iou_trashold: float, image_size: Tuple[int, int], scaled_image_size: Tuple[int, int]) -> np.ndarray:
    output = non_max_suppression_kpt(prediction=output, conf_thres=confidence_trashold, iou_thres=iou_trashold, nc=pose_model.yaml['nc'], nkpt=pose_model.yaml['nkpt'], kpt_label=True)

    with torch.no_grad():
        output = output_to_keypoint(output)

        for idx in range(output.shape[0]):
            output[idx, 7:] = post_process_pose(output[idx, 7:], image_size=image_size, scaled_image_size=scaled_image_size)

    return output

# 3. Angle Extraction

In [10]:
import math

def detect_angles(kpts, angle_pts):
    p1, p2, p3 = angle_pts
    coordinates = []
    num_kpts = len(kpts)//3

    for i in range(num_kpts):
        x, y = kpts[3*i], kpts[3*i + 1]
        confidence = kpts[3*i + 2]
        coordinates.append([i, x, y, confidence])

    x1, y1 = coordinates[p1][1:3]
    x2, y2 = coordinates[p2][1:3]
    x3, y3 = coordinates[p3][1:3]

    angle = math.degrees(math.atan2(y3 - y2, x3 - x2) - math.atan2(y1 - y2, x1 - x2))

    if angle < 0:
          angle += 360

    # all joints have range of 180 degrees
    # Also pre-emptively normalizing data in range 0 - 1
    return (angle % 180) / 180

# left leg kpts = (11, 13, 15), right leg kpts = (12, 14, 16)
# left hip kpts = (1, 11, 12), right hip kpts = (1, 8, 9)
# left shoulder kpts = (1, 5, 6), right shoulder kpts = (1, 2, 3)
# left arm kpts = (5, 7, 9), right arm kpts = (6, 8, 10)
left_leg = (11, 13, 15)
left_hip = (1, 11, 12)
left_shoulder = (1, 5, 6)
left_arm = (5, 7, 9)

right_leg = (12, 14, 16)
right_hip = (1, 8, 9)
right_shoulder = (1, 2, 3)
right_arm = (6, 8, 10)

# total number of angles
num_angles = 8

In [11]:
def extract_all_angles(detections: np.ndarray) -> np.ndarray:
    # Extract poses for the angle calculation
    poses = [detections[idx, 7:].T for idx in range(detections.shape[0])]

    angles = np.empty((0, num_angles))
    for pose in poses:
        curr_angles = np.array([detect_angles(pose, left_leg),
                               detect_angles(pose, right_leg),
                               detect_angles(pose, left_hip),
                               detect_angles(pose, right_hip),
                               detect_angles(pose, left_shoulder),
                               detect_angles(pose, right_shoulder),
                               detect_angles(pose, left_arm),
                               detect_angles(pose, right_arm)])

        angles = np.append(angles, [curr_angles], axis=0)

    return angles

# 4. Data Extraction

In [12]:
import cv2

# stores information about output video file, width and height of the frame must be equal to input video
@dataclass(frozen=True)
class VideoConfig:
    fps: float
    width: int
    height: int

# create cv2.VideoWriter object that we can use to save output video
def get_video_writer(target_video_path: str, video_config: VideoConfig) -> cv2.VideoWriter:
    video_target_dir = os.path.dirname(os.path.abspath(target_video_path))
    os.makedirs(video_target_dir, exist_ok=True)
    return cv2.VideoWriter(target_video_path, fourcc=cv2.VideoWriter_fourcc(*"mp4v"), fps=video_config.fps, frameSize=(video_config.width, video_config.height), isColor=True)

def get_frame_count(path: str) -> int:
    cap = cv2.VideoCapture(path)
    return int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

In [13]:
# paths to data directories
SKI_DATA_DIR_PATH = f"{HOME}/input/ski/"
SNOWBOARD_DATA_DIR_PATH = f"{HOME}/input/snow/"

data_dirs = [SNOWBOARD_DATA_DIR_PATH, SKI_DATA_DIR_PATH]

In [16]:
from tqdm.notebook import tqdm

# angle data np array, add one to num_angles for column number to have class column
angle_data = np.empty((0, num_angles + 1))

for i, dir in enumerate(data_dirs):
  for entry in os.listdir(dir):
      # Create full path
      SOURCE_VIDEO_PATH = os.path.join(dir, entry)

      # get fresh video frame generator
      frame_iterator = iter(generate_frames(video_file=SOURCE_VIDEO_PATH))
      total = get_frame_count(SOURCE_VIDEO_PATH)

      for frame in tqdm(frame_iterator, total=total):
          with torch.no_grad():
              image_size = frame.shape[:2]

              # pose extraction
              pose_pre_processed_frame = pose_pre_process_frame(frame=frame, device=device)
              pose_scaled_image_size = tuple(pose_pre_processed_frame.size())[2:]

              pose_output = pose_model(pose_pre_processed_frame)[0].detach().cpu()
              pose_output = pose_post_process_output(output=pose_output, confidence_trashold=CONFIDENCE_TRESHOLD, iou_trashold=IOU_TRESHOLD, image_size=image_size, scaled_image_size=pose_scaled_image_size)

              # angle extraction
              angles = extract_all_angles(pose_output)
              classes = np.full((angles.shape[0], 1), i)
              angles = np.append(angles, classes, axis=1)
              angle_data = np.append(angle_data, angles, axis=0)

  0%|          | 0/624 [00:00<?, ?it/s]

  0%|          | 0/325 [00:00<?, ?it/s]

  0%|          | 0/354 [00:00<?, ?it/s]

  0%|          | 0/874 [00:00<?, ?it/s]

  0%|          | 0/289 [00:00<?, ?it/s]

  0%|          | 0/988 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

  0%|          | 0/750 [00:00<?, ?it/s]

  0%|          | 0/516 [00:00<?, ?it/s]

  0%|          | 0/395 [00:00<?, ?it/s]

In [17]:
import pandas as pd

# make data directory
!mkdir {HOME}/data

# path for where to write extracted data
EXTRACTED_DATA_PATH = f"{HOME}/data/data.csv"

# convert angle data to pandas dataframe
angle_df = pd.DataFrame(angle_data)

# write to a csv file with extracted data
angle_df.to_csv(EXTRACTED_DATA_PATH, index=False)

# 5. Train Model

In [47]:
# in case don't want to extract data from videos again
EXTRACTED_DATA_PATH = f"{HOME}/data/data.csv"
angle_df = pd.read_csv(EXTRACTED_DATA_PATH)

# shuffle the data
# by the nature of our extraction, all snowboarders are first, then all skiers
angle_df = angle_df.sample(frac=1).reset_index(drop=True)

# extract features and labels
features = angle_df.iloc[:, :-1]
labels = angle_df.iloc[:, -1]

In [48]:
import torch

# create feature / label tensors
feature_tensor = torch.tensor(features.values).float()
label_tensor = torch.tensor(labels.values).long()

In [54]:
from torch.utils.data import TensorDataset, DataLoader

# tensor dataset
dataset = TensorDataset(feature_tensor, label_tensor)

# data loader
dataloader = DataLoader(dataset, batch_size=100, pin_memory=True)

In [62]:
import torch.nn as nn
import torch.nn.functional as F

class SkiClassifier(nn.Module):
    def __init__(self, num_features):
        super(SkiClassifier, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x

# we have 8 angle features
num_features = 8

# instantiate model
model = SkiClassifier(num_features)

In [63]:
# choose criterion and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

# learning rate scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=200, gamma=0.1)

In [64]:
# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("Using CPU")

# move model to GPU (if available)
model = model.to(device)

Using GPU: Tesla T4


In [65]:
# train model
num_epochs = 1000

# accuracy calculation:
def calculate_accuracy(outputs, labels):
  probabilities = torch.softmax(outputs, dim=1)

  _, predicted = torch.max(probabilities, 1)

  total = labels.size(0)
  correct = (predicted == labels).sum().item()
  return 100 * correct / total

for epoch in range(num_epochs):
    running_loss = 0.0
    total_batches = 0
    train_accumulate_sum = 0
    train_total = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        total_batches += 1

        # increment total accuracy and iterations counts
        train_accumulate_sum += calculate_accuracy(outputs, labels)
        train_total += 1

    train_accuracy = train_accumulate_sum / train_total

    # Step the scheduler
    scheduler.step()

    average_loss = running_loss / total_batches
    if (epoch % 50) == 49:
      print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}, Accuracy: {train_accuracy:.3}%')

Epoch [50/1000], Loss: 0.2719, Accuracy: 90.0%
Epoch [100/1000], Loss: 0.2632, Accuracy: 90.7%
Epoch [150/1000], Loss: 0.2642, Accuracy: 89.7%
Epoch [200/1000], Loss: 0.2557, Accuracy: 90.9%
Epoch [250/1000], Loss: 0.2163, Accuracy: 92.1%
Epoch [300/1000], Loss: 0.2115, Accuracy: 92.2%
Epoch [350/1000], Loss: 0.2110, Accuracy: 92.2%
Epoch [400/1000], Loss: 0.2086, Accuracy: 92.4%
Epoch [450/1000], Loss: 0.2052, Accuracy: 92.6%
Epoch [500/1000], Loss: 0.2050, Accuracy: 92.5%
Epoch [550/1000], Loss: 0.2049, Accuracy: 92.6%
Epoch [600/1000], Loss: 0.2048, Accuracy: 92.6%
Epoch [650/1000], Loss: 0.2045, Accuracy: 92.6%
Epoch [700/1000], Loss: 0.2045, Accuracy: 92.6%
Epoch [750/1000], Loss: 0.2044, Accuracy: 92.6%
Epoch [800/1000], Loss: 0.2044, Accuracy: 92.6%
Epoch [850/1000], Loss: 0.2044, Accuracy: 92.6%
Epoch [900/1000], Loss: 0.2044, Accuracy: 92.6%
Epoch [950/1000], Loss: 0.2044, Accuracy: 92.6%
Epoch [1000/1000], Loss: 0.2044, Accuracy: 92.6%


In [66]:
# save model
torch.save(model.state_dict(), f'{HOME}/model.pth')

# 6. Test Video and Annotate

In [67]:
class SkeletonTracker:
    def __init__(self, max_distance=30):
        self.max_distance = max_distance
        self.trackers = {}
        self.next_id = 0

    def update(self, poses):
        # Initialize current_ids at the beginning of the update method
        current_ids = list(self.trackers.keys())

        # If it's the first frame or no skeletons are currently tracked, assign new IDs to all poses
        if not self.trackers or all(len(t[0]) == 0 for t in self.trackers.values()):
            for pose in poses:
                pose_centroid = self.calculate_centroid(pose)

                # calculate angles
                curr_angles = np.array([detect_angles(pose, left_leg),
                               detect_angles(pose, right_leg),
                               detect_angles(pose, left_hip),
                               detect_angles(pose, right_hip),
                               detect_angles(pose, left_shoulder),
                               detect_angles(pose, right_shoulder),
                               detect_angles(pose, left_arm),
                               detect_angles(pose, right_arm)])

                # angle tensor and put it on device
                angles_tensor = torch.tensor(curr_angles, dtype=torch.float)
                angles_tensor = angles_tensor.to(device)

                # make prediction using model
                with torch.no_grad():
                    prediction = model(angles_tensor)

                # get predicted class
                probabilities = F.softmax(prediction, dim=0)
                prob_array = probabilities.cpu().numpy()

                counts = [0, 0]
                counts += prob_array

                self.trackers[self.next_id] = (pose, self.calculate_centroid(pose), counts)
                self.next_id += 1
        else:
            # Update trackers with new poses
            for pose in poses:
                pose_centroid = self.calculate_centroid(pose)

                # calculate angles
                curr_angles = np.array([detect_angles(pose, left_leg),
                               detect_angles(pose, right_leg),
                               detect_angles(pose, left_hip),
                               detect_angles(pose, right_hip),
                               detect_angles(pose, left_shoulder),
                               detect_angles(pose, right_shoulder),
                               detect_angles(pose, left_arm),
                               detect_angles(pose, right_arm)])

                # angle tensor and put it on device
                angles_tensor = torch.tensor(curr_angles, dtype=torch.float)
                angles_tensor = angles_tensor.to(device)

                # make prediction using model
                with torch.no_grad():
                    prediction = model(angles_tensor)

                # get probabilities
                probabilities = F.softmax(prediction, dim=0)
                prob_array = probabilities.cpu().numpy()

                # Find the closest tracker to each pose
                distances = {tid: np.linalg.norm(pose_centroid - self.trackers[tid][1])
                             for tid in current_ids}
                if current_ids:  # Make sure there are still IDs to be compared
                    closest_tid, closest_dist = min(distances.items(), key=lambda item: item[1])
                    if closest_dist < self.max_distance:
                    # If the closest tracker is near enough, update it with the new pose
                        counts = self.trackers[closest_tid][2]
                        counts += prob_array

                        self.trackers[closest_tid] = (pose, pose_centroid, counts)
                        current_ids.remove(closest_tid)
                    else:
                    # If no tracker is close enough, start a new one
                        counts = [0, 0]
                        counts += prob_array
                        self.trackers[self.next_id] = (pose, pose_centroid, counts)
                        self.next_id += 1
                else:
                # If all current IDs have been matched, start a new tracker for the remaining poses
                    counts = [0, 0]
                    counts += prob_array
                    self.trackers[self.next_id] = (pose, pose_centroid, counts)
                    self.next_id += 1

        # Clear trackers without poses
        for tid in current_ids:
            del self.trackers[tid]

        return self.trackers

    def calculate_centroid(self, pose):
        x = pose[::3]  # Extract every third element starting at index 0
        y = pose[1::3] # Extract every third element starting at index 1
        x_mean = np.mean(x[x != 0])  # Calculate the mean, excluding zeros (assuming zero is not a valid coordinate)
        y_mean = np.mean(y[y != 0])  # Same for y
        return np.array([x_mean, y_mean])

tracker = SkeletonTracker(max_distance=30)
def class_annotate(image: np.ndarray, detections:np.ndarray) -> np.ndarray:
    annotated_frame = image.copy()

    # Extract poses for the tracker
    poses = [detections[idx, 7:].T for idx in range(detections.shape[0])]

    # update the tracker with new poses
    tracked_poses = tracker.update(poses)

    for pose, pose_centroid, classes in tracked_poses.values():
        predicted_class = "Skier" if classes[1] > classes[0] else "Snowboarder"

        # annotate text
        x, y = pose_centroid
        cv2.putText(annotated_frame, f'{predicted_class}', (int(x), int(y)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

    return annotated_frame

In [68]:
from tqdm.notebook import tqdm

COLOR = Color(r=255, g=255, b=255)
SOURCE_VIDEO_PATH = f"{HOME}/test/test.mp4"
TARGET_VIDEO_PATH = f"{HOME}/test/test_annotated.mp4"

# initiate video writer
video_config = VideoConfig(fps=30, width=1920, height=1080)
video_writer = get_video_writer(target_video_path=TARGET_VIDEO_PATH, video_config=video_config)

# get fresh video frame generator
frame_iterator = iter(generate_frames(video_file=SOURCE_VIDEO_PATH))

# get frame count for video
total = get_frame_count(SOURCE_VIDEO_PATH)

# set model to evaluation mode
model.eval()

for frame in tqdm(frame_iterator, total=total):
    annotated_frame = frame.copy()

    with torch.no_grad():
        image_size = frame.shape[:2]

        # pose extraction
        pose_pre_processed_frame = pose_pre_process_frame(frame=frame, device=device)
        pose_scaled_image_size = tuple(pose_pre_processed_frame.size())[2:]

        pose_output = pose_model(pose_pre_processed_frame)[0].detach().cpu()
        pose_output = pose_post_process_output(output=pose_output, confidence_trashold=CONFIDENCE_TRESHOLD, iou_trashold=IOU_TRESHOLD, image_size=image_size, scaled_image_size=pose_scaled_image_size)

        # annotate frame
        annotated_frame = class_annotate(image=annotated_frame, detections=pose_output)
        # save video frame
        video_writer.write(annotated_frame)

        poses = [pose_output[idx, 7:].T for idx in range(pose_output.shape[0])]

# close output video
video_writer.release()

  0%|          | 0/422 [00:00<?, ?it/s]