In [1]:
import os
import torch
import imageio.v3 as iio
import numpy as np

from base64 import b64encode
from cotracker.utils.visualizer import Visualizer, read_video_from_path
from IPython.display import HTML
from cotracker.predictor import CoTrackerPredictor, CoTrackerOnlinePredictor
import PoseEstimation.customutils as customutils

# Alternative (not used here): fetch the online model from torch.hub instead of a local checkpoint:
#   torch.hub.load("facebookresearch/co-tracker", "cotracker2_online").to("cuda")

# Number of key points per tracked person; used to split tracks into per-person groups.
NUM_KEYPOINTS = 26

# Device on which video chunks are materialised.
DEFAULT_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Predictor built from the local checkpoint, moved to the GPU when one is present.
model = CoTrackerPredictor(checkpoint='./co-tracker/checkpoints/cotracker2.pth')
if torch.cuda.is_available():
    model = model.cuda()

# Image index (one row per image/frame/person); displayed as the cell output.
df_running_annotations = customutils.load_images_dataframe()
df_running_annotations

Unnamed: 0,image_id,pose_id,file_name,frame,person
0,1000,1000,Athletics_Mixed_Tokyo_2020_20_1.mp4,0,0
1,1001,1001,Athletics_Mixed_Tokyo_2020_20_1.mp4,1,0
2,1002,1002,Athletics_Mixed_Tokyo_2020_20_1.mp4,2,0
3,1003,1003,Athletics_Mixed_Tokyo_2020_20_1.mp4,3,0
4,1004,1004,Athletics_Mixed_Tokyo_2020_20_1.mp4,4,0
...,...,...,...,...,...
12198,46160,146160,World_Athletics_Women_Marathon_Oregon_2022_8.mp4,160,1
12199,46161,146161,World_Athletics_Women_Marathon_Oregon_2022_8.mp4,161,1
12200,46162,146162,World_Athletics_Women_Marathon_Oregon_2022_8.mp4,162,1
12201,46163,146163,World_Athletics_Women_Marathon_Oregon_2022_8.mp4,163,1


In [2]:
def _process_step(window_frames, is_first_step, queries = None):
    """Run the global ``model`` on the most recent frames of a sliding window.

    Args:
        window_frames: sequence of frames; only the last ``model.step * 2``
            entries are used. Frames are stacked and permuted from
            channel-last to channel-first.
        is_first_step: forwarded to the model unchanged.
        queries: optional query tensor. When it is ``None`` or empty the
            model is called without a ``queries`` argument.

    Returns:
        Whatever the model call returns.
    """
    recent_frames = window_frames[-model.step * 2 :]
    stacked = torch.tensor(np.stack(recent_frames), device=DEFAULT_DEVICE)
    # (T, H, W, 3) -> (T, 3, H, W), then add a leading batch axis: (1, T, 3, H, W)
    video_chunk = stacked.float().permute(0, 3, 1, 2)[None]

    has_queries = queries is not None and len(queries) != 0
    if not has_queries:
        return model(video_chunk, is_first_step=is_first_step)

    if torch.cuda.is_available():
        queries = queries.cuda()

    # The model receives the queries with a leading batch axis as well.
    return model(video_chunk, is_first_step=is_first_step, queries=queries[None])

In [3]:
import cv2

def crop_video(input_path, output_path, x_start, y_start, width, height, fps=30.0):
    """Write a cropped copy of a video.

    Every frame of ``input_path`` is cropped to the rectangle whose top-left
    corner is ``(x_start, y_start)`` and whose size is ``width`` x ``height``,
    then written to ``output_path`` with the ``mp4v`` codec.

    Args:
        input_path: path of the video to read.
        output_path: path of the video to write.
        x_start, y_start: top-left corner of the crop, in pixels.
        width, height: size of the crop and of the output frames, in pixels.
        fps: frame rate of the output file (defaults to the previously
            hard-coded 30.0).

    Raises:
        IOError: if the input video cannot be opened. (The previous
            implementation called ``exit()``, which terminates the notebook
            kernel instead of reporting an error.)
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        raise IOError("Error: Could not open video file.")

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a decode failure): stop writing.
                break

            cropped_frame = frame[y_start:y_start + height, x_start:x_start + width]

            # Slicing clips silently at the frame border, so the crop can be
            # smaller than requested; resizing keeps every written frame at
            # the size the writer was opened with.
            resized_frame = cv2.resize(cropped_frame, (width, height))
            out.write(resized_frame)
    finally:
        # Always release the handles, even when a frame fails to process.
        cap.release()
        out.release()
        cv2.destroyAllWindows()


In [4]:
import cv2
import csv
import time
import peakutils
from KeyFrameDetector.utils import convert_frame_to_grayscale

def _select_keyframe_tracks(pred_tracks, pred_visibility, keyframes, group_size):
    """For each frame keep only the track group belonging to the current key frame.

    Tracks are assumed to be laid out as consecutive groups of ``group_size``
    points, one group per entry of ``keyframes`` (in the same order).

    NOTE(review): that layout only holds when every key frame contributed
    exactly ``group_size`` query points. When a group is incomplete the whole
    frame is used instead (as before), and stacking fails if that makes frame
    sizes differ -- confirm the intended handling of partially visible poses.

    Returns:
        ``(tracks, visibility)`` stacked along a new leading frame axis.
    """
    tracks_per_frame = []
    vis_per_frame = []
    keyframe_i = 0

    for i in range(len(pred_tracks[0])):
        frame = pred_tracks[0][i]
        frame_vis = pred_visibility[0][i]

        start = keyframe_i * group_size
        stop = (keyframe_i + 1) * group_size
        if stop > frame.shape[0]:
            # Not enough tracks for a full group: fall back to every track of
            # this frame and do not advance the key-frame pointer.
            tracks_per_frame.append(frame)
            vis_per_frame.append(frame_vis)
            continue

        tracks_per_frame.append(frame[start:stop])
        vis_per_frame.append(frame_vis[start:stop])

        # Switch to the next group once its key frame has been reached.
        if keyframe_i + 1 < len(keyframes) and keyframes[keyframe_i + 1] <= i + 1:
            keyframe_i += 1

    return torch.stack(tracks_per_frame, dim=0), torch.stack(vis_per_frame, dim=0)


def cotracker_model(video, filename, model_name, person_count, queries, queries_dict, visualize = True):
    """Track query points seeded at frame 0 and at detected key frames.

    The video file is scanned once to measure how much each blurred grayscale
    frame differs from the previous one; peaks of that signal (after baseline
    removal) are taken as key frames. The query points stored in
    ``queries_dict`` for frame 0 and for every key frame are stacked and
    passed to the global ``model`` together with ``video``.

    Args:
        video: video tensor passed to the model and to the visualizer.
        filename: file name inside ``./data/videos/``; also names the
            visualisation output.
        model_name: unused; kept for interface compatibility.
        person_count: number of tracked persons; one query group holds
            ``NUM_KEYPOINTS * person_count`` points.
        queries: unused; the queries are rebuilt from ``queries_dict``.
        queries_dict: mapping ``frame index -> query tensor`` (``None`` or
            empty when the frame has no queries).
        visualize: when true, render the tracks with ``Visualizer``.

    Returns:
        ``(pred_tracks, pred_visibility)`` exactly as returned by the model.

    Raises:
        ValueError: if no frame can be read from the video file or if
            ``queries_dict`` holds no queries for frame 0.
    """
    cap = cv2.VideoCapture('./data/videos/' + filename)
    diff_magnitudes = []
    last_blur = None

    try:
        # The container's frame count can overestimate what can be decoded,
        # so the result of every read is checked as well.
        frame_total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        for _ in range(frame_total):
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            _, blur_gray = convert_frame_to_grayscale(frame)

            if last_blur is None:
                # First frame: compare it with itself (difference of zero).
                last_blur = blur_gray

            diff = cv2.subtract(blur_gray, last_blur)
            diff_magnitudes.append(cv2.countNonZero(diff))
            last_blur = blur_gray
    finally:
        # Only the difference signal is needed from here on; frames are not
        # kept in memory and the capture is released before inference.
        cap.release()
        cv2.destroyAllWindows()

    if not diff_magnitudes:
        raise ValueError("No frames could be read from ./data/videos/" + filename)

    y = np.array(diff_magnitudes)
    base = peakutils.baseline(y, 2)
    indices = peakutils.indexes(y-base, float(0.3), min_dist=1)

    first_queries = queries_dict.get(0)
    if first_queries is None or len(first_queries) == 0:
        raise ValueError("queries_dict has no query points for frame 0")

    # Keep only key frames that actually have queries, so the groups of
    # stacked queries and the list of key frames stay aligned one-to-one.
    query_groups = [first_queries]
    keyframes = [0]
    for ind in indices:
        group = queries_dict.get(int(ind))
        if group is None or len(group) == 0:
            continue
        query_groups.append(group)
        keyframes.append(int(ind))

    queries = torch.vstack(query_groups)
    if torch.cuda.is_available():
        queries = queries.cuda()

    pred_tracks, pred_visibility = model(
        video,
        queries=queries[None]
    )

    if visualize:
        total_kpts_count = NUM_KEYPOINTS * person_count
        new_pred_tracks, new_pred_visibility = _select_keyframe_tracks(
            pred_tracks, pred_visibility, keyframes, total_kpts_count
        )
        vis = Visualizer(save_dir="./videos/results/cotracker/", linewidth=3)
        vis.visualize(
            video,
            torch.unsqueeze(new_pred_tracks, 0),
            torch.unsqueeze(new_pred_visibility, 0),
            filename=filename.replace('.mp4', ''),
            query_frame=0,
        )

    return pred_tracks, pred_visibility

In [5]:

def cotracker_model_2(filename, queries, queries_dict):
    """Track ``queries`` through a video window by window and render the result.

    Frames are read one at a time; every ``model.step`` frames the
    accumulated window is handed to ``_process_step``. A final call handles
    the frames left over when the video length is not a multiple of
    ``model.step``.

    NOTE(review): this relies on ``model.step`` and on the model accepting
    ``is_first_step``; confirm that the global ``model`` is a predictor that
    supports both before using this function.

    Args:
        filename: file name inside ``./data/videos/``.
        queries: query tensor forwarded to every ``_process_step`` call.
        queries_dict: unused; kept for interface compatibility.

    Returns:
        ``(pred_tracks, pred_visibility)`` from the last processed window.

    Raises:
        ValueError: if the video yields no frames.
    """
    is_first_step = True
    window_frames = []

    for i, frame in enumerate(
        iio.imiter(
            './data/videos/' + filename,
            plugin="FFMPEG",
            fps = 30
        )
    ):
        if i % model.step == 0 and i != 0:
            pred_tracks, pred_visibility = _process_step(
                window_frames,
                is_first_step,
                queries=queries
            )
            is_first_step = False
        window_frames.append(frame)

    if not window_frames:
        # Without this guard the loop index below would be undefined.
        raise ValueError("No frames could be read from ./data/videos/" + filename)

    # Process the remaining frames. The queries are passed here too: for a
    # video shorter than one step this is the only (and therefore first) call.
    pred_tracks, pred_visibility = _process_step(
        window_frames[-(i % model.step) - model.step - 1 :],
        is_first_step,
        queries=queries
    )

    video = torch.tensor(np.stack(window_frames), device=DEFAULT_DEVICE).permute(0, 3, 1, 2)[None]
    vis = Visualizer(save_dir="./videos/results/cotracker/", linewidth=3)
    vis.visualize(video, pred_tracks, pred_visibility, filename=filename.replace('.mp4', ''), query_frame=0)

    return pred_tracks, pred_visibility

In [6]:
import PoseEstimation.customutils as customutils

# Reload the image index and collect the distinct video file names it references.
df_running_annotations = customutils.load_images_dataframe()
filenames = df_running_annotations['file_name'].unique()
# Display the array of file names as the cell output.
filenames

array(['Athletics_Mixed_Tokyo_2020_20_1.mp4',
       'Athletics_Mixed_Tokyo_2020_39.mp4',
       'Athletics_Mixed_Tokyo_2020_5.mp4',
       'Athletics_Mixed_Tokyo_2020_64.mp4',
       'Athletics_Mixed_Tokyo_2020_8.mp4',
       'Marathon_Men_Tokyo_2020_14.mp4', 'Marathon_Men_Tokyo_2020_16.mp4',
       'Marathon_Men_Tokyo_2020_28.mp4', 'Marathon_Men_Tokyo_2020_47.mp4',
       'Marathon_Men_Tokyo_2020_52.mp4',
       'Triathlon_Men_Tokyo_2020_12_1.mp4',
       'Triathlon_Men_Tokyo_2020_19_1.mp4',
       'Triathlon_Men_Tokyo_2020_21_1.mp4',
       'Triathlon_Men_Tokyo_2020_23.mp4',
       'Triathlon_Men_Tokyo_2020_28.mp4',
       'Triathlon_Women_Tokyo_2020_29.mp4',
       'Triathlon_Women_Tokyo_2020_2_1.mp4',
       'Triathlon_Women_Tokyo_2020_33_1.mp4',
       'World_Athletics_Men_10000m_Oregon_2022_1.mp4',
       'World_Athletics_Men_10000m_Oregon_2022_2.mp4',
       'World_Athletics_Men_10000m_Oregon_2022_23.mp4',
       'World_Athletics_Men_10000m_Oregon_2022_25.mp4',
       'World_At

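The `CoTrackerPredictor` instance that estimates the tracks was created in the first cell. The helper below loads a video from `./data/videos/` into a batched float tensor that is later passed to it: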

In [7]:

def load_video(filename):
    """Read ``./data/videos/<filename>`` into a batched float tensor.

    The array returned by ``read_video_from_path`` has its last axis moved to
    position 1 and gains a leading batch axis. The tensor is placed on the
    GPU when CUDA is available.
    """
    raw_frames = read_video_from_path('./data/videos/' + filename)
    clip = torch.from_numpy(raw_frames).permute(0, 3, 1, 2)
    clip = clip.unsqueeze(0).float()

    return clip.cuda() if torch.cuda.is_available() else clip

Tracking manually selected points

In [8]:
import os, argparse, json, re
from collections import defaultdict
import pandas as pd

def load_annotations(file_path):
    """Load per-frame skeleton points from an annotation JSON file.

    The file is expected to contain ``{"annotations": [person, ...]}`` where
    each person has ``frames`` keyed by the frame index (as a string) and each
    frame has ``skeleton.nodes`` entries with ``name``, ``x`` and ``y``.

    Args:
        file_path: path of the JSON file.

    Returns:
        ``(frames, person_count)``. ``frames`` is a ``defaultdict(list)``
        mapping every frame index from 0 up to the last frame that has points
        to a list of ``{'person': {'points': [{'id', 'x', 'y'}, ...]}}``
        entries (one per person with points in that frame; an empty list for
        frames without points). ``person_count`` is the number of annotated
        persons. A missing file yields an empty mapping and a count of 0.
    """
    frames = defaultdict(list)
    if not os.path.exists(file_path):
        return frames, 0

    with open(file_path, 'r') as f:
        annotations = json.load(f)

    people = annotations['annotations']
    person_count = len(people)

    keypoints = defaultdict(list)
    for person in people:
        # Iterate over the frames that are actually present instead of
        # assuming the keys are exactly "0" .. "n-1".
        for frame_key, frame in person['frames'].items():
            points = [
                {'id': node['name'], 'x': node['x'], 'y': node['y']}
                for node in frame['skeleton']['nodes']
            ]
            if points:
                keypoints[int(frame_key)].append({'person': {'points': points}})

    if keypoints:
        # Fill every index up to the last annotated frame. Copying
        # range(len(keypoints)) would drop later frames whenever an earlier
        # frame has no points.
        for frame_index in range(max(keypoints) + 1):
            frames[frame_index] = keypoints.get(frame_index, [])

    return frames, person_count


def get_queries_for_frame(frame_number, annotations):
    """Build the query tensor for one annotated frame.

    Args:
        frame_number: index of the frame to read from ``annotations``.
        annotations: mapping ``frame index -> list of persons`` as produced by
            ``load_annotations``.

    Returns:
        A tensor of shape ``(N, 3)`` with one ``[frame_number, x, y]`` row per
        annotated point (also for a single point), or ``None`` when the frame
        has no points. Previously an empty frame raised ``UnboundLocalError``
        and a single point produced a 1-D tensor.
    """
    rows = [
        torch.tensor([float(frame_number), point['x'], point['y']])
        for person in annotations[frame_number]
        for point in person['person']['points']
    ]
    if not rows:
        return None

    # Stack once instead of growing the tensor row by row.
    return torch.vstack(rows)


def get_queries_for_frames(start_frame, end_frame, annotations, step = 4):
    """Build query tensors for a range of annotated frames.

    Args:
        start_frame: first frame index (inclusive).
        end_frame: last frame index (exclusive); ``-1`` means
            ``len(annotations)``.
        annotations: mapping ``frame index -> list of persons`` as produced by
            ``load_annotations``.
        step: stride between sampled frames.

    Returns:
        ``(all_queries, frames_dict)``. ``all_queries`` is a tensor of shape
        ``(N, 3)`` with one ``[frame, x, y]`` row per point over all sampled
        frames (``None`` when there are no points). ``frames_dict`` maps each
        sampled frame to its own ``(n, 3)`` tensor, or ``None`` when the frame
        contributed no points.
    """
    frames_dict = defaultdict(list)
    all_rows = []

    if end_frame == -1:
        end_frame = len(annotations)

    for frame in range(start_frame, end_frame, step):
        frame_rows = []
        for person in annotations[frame]:
            for point in person['person']['points']:
                # NOTE(review): this skips points whose 'occluded' field is the
                # string 'false', i.e. it keeps occluded points and drops
                # visible ones. That looks inverted -- confirm against the
                # annotation schema before relying on it.
                if 'occluded' in point and point['occluded'] == 'false':
                    continue
                row = torch.tensor([float(frame), point['x'], point['y']])
                # Each point is recorded once per collection. The previous
                # chained conditions stacked the first point of every later
                # frame onto itself in the per-frame tensor.
                frame_rows.append(row)
                all_rows.append(row)

        frames_dict[frame] = torch.vstack(frame_rows) if frame_rows else None

    all_queries_for_frames = torch.vstack(all_rows) if all_rows else None
    return all_queries_for_frames, frames_dict


def get_queries_from_pe(start_frame, end_frame, df, df_annotations, step = 4):
    """Build query tensors from pose-estimation results for one tracked pose.

    The pose to follow is identified by its ``idx`` value. When
    ``df_annotations`` is given, the frame-0 pose (with ``score > 0.8``)
    whose key points are closest to one of the reference poses for the same
    ``image_id`` is selected; the distance ignores parts that are not visible
    in either pose and is divided by the reference head size. Otherwise, or
    when no pose qualifies, the first pose encountered is followed.

    Args:
        start_frame: first frame index (inclusive).
        end_frame: last frame index (exclusive); ``-1`` means ``len(df)``.
        df: pose-estimation rows with ``frame``, ``image_id``, ``idx``,
            ``score`` and ``keypoints`` columns.
        df_annotations: reference poses with ``image_id`` and ``keypoints``
            columns, or ``None``/empty to skip the matching step.
        step: stride between sampled frames.

    Returns:
        ``(all_queries, frames_dict)``. ``all_queries`` is a tensor of shape
        ``(N, 3)`` with one ``[frame, x, y]`` row per visible key point over
        all sampled frames (``None`` when there are none). ``frames_dict``
        maps each sampled frame to its own ``(n, 3)`` tensor, or ``None`` when
        the frame contributed no points.
    """
    frames_dict = defaultdict(list)
    all_rows = []

    if end_frame == -1:
        # NOTE(review): this is the number of rows in df, not the number of
        # frames; with several poses per frame it overshoots and the extra
        # frames simply map to None.
        end_frame = len(df)

    # -1 means "no pose selected yet". It must exist even when the matching
    # step below is skipped, because the frame loop tests it.
    idx = -1

    if df_annotations is not None and len(df_annotations) > 0:
        # Find the closest matching pose for the first frame.
        poses = df[df['frame'] == 0]
        if len(poses) > 0:
            gts = df_annotations[df_annotations['image_id'] == poses['image_id'].iloc[0]]
            min_dist = np.inf

            for pose_id in gts.index:
                gt = gts['keypoints'][pose_id]
                kpts2, vi2 = customutils.edit_keypoints(gt)

                # NOTE(review): the area is computed but never used.
                area = customutils.compute_area_keypoints(gt)

                # Head size is used to normalise the distance.
                head = customutils.get_keypoint(gt,"head")
                neck = customutils.get_keypoint(gt,"neck")

                headSize = 1
                if (len(head) > 0 and len(neck) > 0):
                    headSize = customutils.get_head_size(head[0], head[1], neck[0], neck[1])

                for row_pos in range(len(poses)):
                    pose = poses.iloc[row_pos]
                    if pose['score'] > .8:
                        dt = pose['keypoints']
                        kpts1, vi1 = customutils.edit_keypoints(dt)
                        d = np.linalg.norm(kpts1 - kpts2, ord=2, axis=1)

                        # Parts missing from either pose do not contribute.
                        for part in range(len(d)):
                            if vi1[part] == 0 or vi2[part] == 0:
                                d[part] = 0

                        dNorm = np.sum(d)/headSize

                        if dNorm < min_dist:
                            min_dist = dNorm
                            idx = pose['idx']

    for frame in range(start_frame, end_frame, step):
        frame_rows = []
        poses = df[df['frame'] == frame]
        for row_pos in range(len(poses)):
            pose = poses.iloc[row_pos]

            if idx == -1:
                # No pose selected yet: follow the first one encountered.
                idx = pose['idx']

            if idx != pose['idx']:
                continue

            kpts = pose['keypoints']
            x, y, vi = customutils.get_x_y_v_keypoints(kpts)
            for point in range(len(x)):
                if vi[point] == 0:
                    continue
                row = torch.tensor([float(frame), float(x[point]), float(y[point])])
                # Each point is recorded once per collection. The previous
                # chained conditions stacked the first point of every later
                # frame onto itself in the per-frame tensor.
                frame_rows.append(row)
                all_rows.append(row)

            # Only the first row of the followed pose is used per frame.
            break

        frames_dict[frame] = torch.vstack(frame_rows) if frame_rows else None

    all_queries_for_frames = torch.vstack(all_rows) if all_rows else None
    return all_queries_for_frames, frames_dict

In [9]:
# Keypoint annotations; run_cotracker passes this frame to get_queries_from_pe,
# which uses it as the reference poses when choosing which detected pose to follow.
df_annotations = customutils.load_keypoints_dataframe()

In [10]:
def get_image_id(filename):
    """Return the ``image_id`` of frame 0 of ``filename`` in the image index.

    Reads the global ``df_running_annotations``; the first matching row is used.
    """
    is_first_frame_of_file = (
        (df_running_annotations['file_name'] == filename)
        & (df_running_annotations['frame'] == 0)
    )
    first_frame_rows = df_running_annotations.loc[is_first_frame_of_file]
    return first_frame_rows['image_id'].iloc[0]
    
def run_cotracker(filename, video, model_name, df_pe):
    """Track the selected pose through one video and export the tracks as JSON.

    Query points are taken from the pose-estimation rows of ``df_pe`` that
    belong to ``filename``, one query set per video frame. The predicted
    tracks are split into groups of ``NUM_KEYPOINTS`` points and each group
    becomes one record ``{'image_id', 'category_id', 'keypoints'}`` where
    ``keypoints`` is a flat ``[x, y, 2, x, y, 2, ...]`` list.

    Args:
        filename: video file name as listed in ``df_running_annotations``.
        video: video tensor with the frame axis at position 1, as returned by
            ``load_video``.
        model_name: forwarded to ``cotracker_model``.
        df_pe: pose-estimation results with an ``image_id`` column.

    Returns:
        The list of records, which is also written to
        ``videos/results/cotracker/<name>.json``.
    """
    df_video_imageids = df_running_annotations[df_running_annotations['file_name'] == filename]

    # Restrict the pose rows to this video *before* merging; previously the
    # filtered frame was discarded and every video's poses were merged.
    df = df_pe[df_pe['image_id'].isin(df_video_imageids['image_id'])]
    df = df.merge(df_video_imageids, how='left', left_on='image_id', right_on='image_id')

    # One query set per frame of the video, so any frame can be looked up later.
    queries, queries_dict = get_queries_from_pe(0, video.shape[1], df, df_annotations, 1)

    pred_tracks, pred_visibility = cotracker_model(video, filename, model_name, 1, queries, queries_dict)

    np_pred = pred_tracks.cpu().numpy()
    num_frames = np_pred.shape[1]
    # Image ids are taken to be consecutive, starting at the id of frame 0.
    first_image_id = int(get_image_id(filename))

    frames = []
    for i in range(num_frames):
        points = np_pred[0][i]
        # Emit one independent record per complete (or trailing partial) group
        # of NUM_KEYPOINTS points. The previous loop flushed one point early
        # and appended the same dict repeatedly, so records overwrote each other.
        for start in range(0, len(points), NUM_KEYPOINTS):
            keypoints = []
            for point in points[start:start + NUM_KEYPOINTS]:
                keypoints.append(float(point[0]))
                keypoints.append(float(point[1]))
                # NOTE(review): the third value is always 2; pred_visibility
                # is not used for the export.
                keypoints.append(2)
            frames.append({
                'image_id': first_image_id + i,
                'category_id': 1,
                'keypoints': keypoints,
            })

    customutils.writeJson(frames,'videos/results/cotracker/' + filename.replace('.mp4', '') + '.json')

    return frames

In [11]:

# Load the pose-estimation results of each estimator, in this order.
df_pe_alphapose, df_pe_openpose, df_pe_ViTPose = (
    customutils.load_pe_dataframe(estimator)
    for estimator in ('alphapose', 'openpose', 'ViTPose')
)

# Display the alphapose results as the cell output.
df_pe_alphapose

Unnamed: 0,image_id,keypoints,box,idx,score,category_id
0,1000,"[672.5870361328125, 397.4309997558594, 2, 692....","[455.3162841796875, 340.884765625, 402.0437011...",1,0.898572,1
1,1000,"[1136.07421875, 266.266845703125, 2, 1155.6085...","[932.1016845703125, 190.0831756591797, 310.273...",2,0.877560,1
2,1001,"[682.8905029296875, 407.08642578125, 2, 702.61...","[456.44172066815923, 337.55727539062497, 405.4...",1,0.891666,1
3,1001,"[1134.0335693359375, 257.7659912109375, 2, 115...","[938.4007471361167, 194.7648895263672, 306.255...",2,0.867543,1
4,1002,"[691.739990234375, 412.57452392578125, 2, 713....","[443.5269343344591, 336.3762005969505, 431.860...",1,0.859866,1
...,...,...,...,...,...,...
45401,46163,"[704.8864135742188, 428.4450988769531, 2, 708....","[604.3679742712545, 384.44794226840435, 129.43...",29,0.775203,1
45402,46163,"[242.5023956298828, 475.6257019042969, 2, 245....","[219.46201027790286, 441.22925448208105, 108.8...",12,0.555517,1
45403,46163,"[1689.3052978515625, 480.3915100097656, 2, 168...","[1669.4820232953628, 467.83948263581397, 20.96...",37,0.275808,1
45404,46163,"[1490.57373046875, 472.3777160644531, 2, 1493....","[1481.6382385788675, 450.34171338820823, 32.94...",54,0.676191,1


In [12]:
# Run the tracker on every video that has no result yet and collect the records.
frames = []

for filename in filenames:
    # These two videos are excluded from the batch run.
    if filename in ("Marathon_Men_Tokyo_2020_16.mp4", "Triathlon_Men_Tokyo_2020_19_1.mp4"):
        continue

    isExist = os.path.exists("./videos/results/cotracker/" + filename)
    if isExist:
        continue

    print("Processing " + filename)
    video = load_video(filename)
    try:
        frames += run_cotracker(filename, video, 'alphapose', df_pe_alphapose)
    finally:
        # Drop this video's tensor before the next one is loaded; otherwise
        # the previous video is still referenced while the new one is read,
        # so two videos are resident at once.
        del video
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    
# customutils.writeJson(frames,'videos/results/cotracker/person_keypoints_running.json')

Processing Triathlon_Men_Tokyo_2020_19_1.mp4


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.22 GiB. GPU 0 has a total capacty of 11.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 22.61 GiB is allocated by PyTorch, and 18.08 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF