In [1]:
import os

import cv2
from tqdm import tqdm

from pipeline.capture_video import CaptureVideo
from pipeline.utils import detectron

In [2]:
# --- Input footage -----------------------------------------------------------
base_dir = "/home/inutard/remote-disk/badminton-vids"
player_name = "kento"
video_name = "best-rally-and-highlights-kento-momota-vs-lee-chong-wei-bac-2018-shuttle-amazing-gzvaa5-j-8.mp4"

# Full path to the clip: <base_dir>/<player_name>/<video_name>
input_file = "/".join((base_dir, player_name, video_name))

# --- Output location ---------------------------------------------------------
output_dir, output_file = "output", "output.mp4"

# --- Detector settings -------------------------------------------------------
# Model config file and the confidence threshold handed to detectron.setup_cfg.
config_file = "configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml"
threshold = 0.8

# Pose-tracking parameters, currently disabled (only referenced by the
# commented-out TrackPose pipeline).
# track_link_len = 100
# track_num = 7
# track_mag= 30
# track_match = 0.2
# track_orb_features = 1000

In [3]:
# Make sure the destination folder exists before anything tries to write to it.
os.makedirs(output_dir, exist_ok=True)

# Open the source clip.
capture_video = CaptureVideo(input_file)

# Build the detectron2 config; cpu=False asks the helper for a non-CPU run.
cfg_options = {
    "config_file": config_file,
    "confidence_threshold": threshold,
    "cpu": False,
}
cfg = detectron.setup_cfg(**cfg_options)

# Show the input-preprocessing section of the config for reference.
print(cfg.INPUT)

CROP:
  ENABLED: False
  SIZE: [0.9, 0.9]
  TYPE: relative_range
FORMAT: BGR
MASK_FORMAT: polygon
MAX_SIZE_TEST: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TEST: 600
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
MIN_SIZE_TRAIN_SAMPLING: choice
RANDOM_FLIP: horizontal


In [4]:
# NOTE: everything in this cell is commented out and does not run.
# It is the earlier pipeline-style flow (predict -> track -> separate
# background -> annotate -> save), kept for reference only. Predict,
# TrackPose, AnnotateVideo and SaveVideo are not imported in any cell visible
# in this notebook, so those imports would have to be restored before
# re-enabling it.

# predict = Predict(cfg)
# #track_pose = TrackPose(link_len=track_link_len, num=track_num, mag=track_mag,
# #                       match=track_match, orb_features=track_orb_features)
# track_pose = None

# separate_background = None
# metadata_name = cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
# # annotate_video = AnnotateVideo("vis_image", metadata_name,
# #                                predictions=track_pose is None,
# #                                pose_flows=track_pose is not None)
# annotate_video = AnnotateVideo("vis_image", metadata_name,
#                                pose_flows=True)

# save_video = SaveVideo("vis_image", os.path.join(output_dir, output_file), capture_video.fps)

# # Create image processing pipeline
# pipeline = (capture_video |
#             predict |
#             track_pose |
#             separate_background |
#             annotate_video |
#             save_video)

# # Iterate through pipeline
# num_frames = capture_video.frame_count
# num_frames = 300
# count = 0
# for _ in tqdm(pipeline, total=num_frames):
#     count += 1
#     if count > num_frames:
#         break

# # Pipeline cleanup
# capture_video.cleanup()
# save_video.cleanup()

In [5]:
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.modeling import build_model

# Instantiate the network described by cfg, restore the weights named in
# cfg.MODEL.WEIGHTS, then switch the model to evaluation mode.
model = build_model(cfg)
checkpointer = DetectionCheckpointer(model)
checkpointer.load(cfg.MODEL.WEIGHTS)
model.eval();  # trailing ';' keeps the module repr out of the cell output

In [6]:
import torch
import detectron2.data.transforms as T

# Test-time resize: the [min, max] range for the shortest edge is collapsed to
# a single value (MIN_SIZE_TEST), with MAX_SIZE_TEST as the second argument.
shortest_edge = cfg.INPUT.MIN_SIZE_TEST
longest_edge = cfg.INPUT.MAX_SIZE_TEST
aug = T.ResizeShortestEdge([shortest_edge, shortest_edge], longest_edge)

# Channel order named by the config ("BGR" in the config printed above).
input_format = cfg.INPUT.FORMAT

In [None]:
# Batched inference over the clip, with a simple two-player identity match.
#
# For each frame this cell:
#   1. runs the model on a batch of resized frames,
#   2. records every detected box center in `all_centers`,
#   3. takes the first two detections, orders them so that index 0 is the one
#      closest to the previous frame's index-0 center, and draws their boxes
#      and centers onto the original frame,
#   4. appends the (possibly annotated) frame to `final_images`.
#
# Frames with fewer than two detections are kept un-annotated and do not
# update `centers`.
batch_size = 12
num_frames = 45 * 30  # frame budget; presumably 45 s at 30 fps -- TODO confirm
final_images = []
# One [center_a, center_b] entry per annotated frame. Seeded with a dummy
# entry so the first annotated frame has something to be matched against.
centers = [[(0, 0), (0, 0)]]

# Centers of *all* detections for every processed frame (CPU tensors).
all_centers = []


def _squared_distance(a, b):
    """Return the squared Euclidean distance between two (x, y) points."""
    return (a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2


def _as_pixel(point):
    """Round an (x, y) pair to the integer tuple OpenCV drawing calls expect."""
    return tuple(int(round(value)) for value in point)


video_exhausted = False

# Note: floor division means a trailing partial batch (num_frames % batch_size
# frames) is not processed.
for _ in tqdm(range(num_frames // batch_size)):
    if video_exhausted:
        break

    inputs = []
    original = []
    for _frame_index in range(batch_size):
        # Assumes capture_video.cap is a cv2.VideoCapture, whose read()
        # returns (ok, frame) -- TODO confirm against CaptureVideo.
        ok, original_image = capture_video.cap.read()
        if not ok:
            # Ran out of frames: process what we have, then stop.
            video_exhausted = True
            break
        original.append(original_image)

        # Frames are kept as read for drawing; only the model input is
        # flipped when the config asks for RGB.
        model_image = original_image[:, :, ::-1] if input_format == "RGB" else original_image
        height, width = model_image.shape[:2]
        resized = aug.get_transform(model_image).apply_image(model_image)
        tensor = torch.as_tensor(resized.astype("float32").transpose(2, 0, 1))
        inputs.append({"image": tensor, "height": height, "width": width})

    if not inputs:
        break

    with torch.no_grad():
        predictions = model(inputs)

    for image, pred in zip(original, predictions):
        instances = pred["instances"]
        frame_centers = instances.pred_boxes.get_centers().cpu()
        # Store on the CPU so accumulated results do not pin device memory.
        all_centers.append(frame_centers)

        # Explicit check instead of a bare `except: pass`, so real errors
        # (and KeyboardInterrupt) are no longer silently swallowed.
        if len(instances) >= 2:
            bbox = instances.pred_boxes.tensor[:2].cpu().tolist()
            cen = [tuple(center) for center in frame_centers[:2].tolist()]

            # Keep index 0 attached to whichever detection is nearest to the
            # previous frame's index-0 center.
            previous_first = centers[-1][0]
            if _squared_distance(cen[0], previous_first) > _squared_distance(cen[1], previous_first):
                cen.reverse()
                bbox.reverse()

            color = (255, 255, 255)
            for box, center in zip(bbox, cen):
                # OpenCV needs plain integer pixel coordinates.
                image = cv2.rectangle(image, _as_pixel(box[:2]), _as_pixel(box[2:]), color, 10)
                image = cv2.circle(image, _as_pixel(center), 10, color, 10)

            centers.append(cen)

        final_images.append(image)
            
            

  0%|          | 0/112 [00:00<?, ?it/s]

In [None]:
# Write the annotated frames to <output_dir>/<output_file>.
# The path is resolved against the working directory, i.e. the same folder
# that os.makedirs(output_dir) created in the setup cell, instead of a
# hard-coded home-directory prefix.
output_path = os.path.abspath(os.path.join(output_dir, output_file))

writer = cv2.VideoWriter(
    filename=output_path,
    fourcc=cv2.VideoWriter_fourcc(*'MJPG'),
    fps=capture_video.fps,
    frameSize=capture_video.frame_size,
    isColor=True)

# A VideoWriter that failed to open silently ignores write() calls, so check
# explicitly rather than ending up with a missing or empty file.
if not writer.isOpened():
    writer.release()
    raise RuntimeError("Could not open video writer for {}".format(output_path))

try:
    for image in final_images:
        writer.write(image)
finally:
    # Always finalize the file, even if a write fails part-way through.
    writer.release()

In [None]:
# Number of detected box centers recorded for each processed frame.
# TODO: the greedy matching between adjacent frames is not implemented yet;
# this only inspects how many detections there are to match.
list(map(len, all_centers))