In [1]:
import argparse
import os
import os.path as osp
import time
import cv2
import torch

from loguru import logger

from yolox.data.data_augment import preproc
from yolox.exp import get_exp
from yolox.utils import fuse_model, get_model_info, postprocess
from yolox.utils.visualize import plot_tracking
from yolox.tracker.byte_tracker import BYTETracker
from yolox.tracking_utils.timer import Timer
import torch

In [2]:
IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"]

In [4]:
def get_image_list(path):
    image_names = []
    for maindir, subdir, file_name_list in os.walk(path):
        for filename in file_name_list:
            apath = osp.join(maindir, filename)
            ext = osp.splitext(apath)[1]
            if ext in IMAGE_EXT:
                image_names.append(apath)
    return image_names

In [49]:
class Predictor(object):
    def __init__(
        self,
        model,
        exp,
        trt_file=None,
        decoder=None,
        device=torch.device("cpu"),
        fp16=False
    ):
        self.model = model
        self.decoder = decoder
        self.num_classes = exp.num_classes
        self.confthre = exp.test_conf
        self.nmsthre = exp.nmsthre
        self.test_size = exp.test_size
        self.device = device
        self.fp16 = fp16
        if trt_file is not None:
            from torch2trt import TRTModule

            model_trt = TRTModule()
            model_trt.load_state_dict(torch.load(trt_file))

            x = torch.ones((1, 3, exp.test_size[0], exp.test_size[1]), device=device)
            self.model(x)
            self.model = model_trt
        self.rgb_means = (0.485, 0.456, 0.406)
        self.std = (0.229, 0.224, 0.225)

    def inference(self, img, timer):
        img_info = {"id": 0}
        if isinstance(img, str):
            img_info["file_name"] = osp.basename(img)
            img = cv2.imread(img)
        else:
            img_info["file_name"] = None

        height, width = img.shape[:2]
        img_info["height"] = height
        img_info["width"] = width
        img_info["raw_img"] = img

        img, ratio = preproc(img, self.test_size, self.rgb_means, self.std)
        img_info["ratio"] = ratio
        img = torch.from_numpy(img).unsqueeze(0).float().to(self.device)
        if self.fp16:
            img = img.half()  # to FP16

        with torch.no_grad():
            timer.tic()
            outputs = self.model(img)
            print(outputs.shape)
            if self.decoder is not None:
                outputs = self.decoder(outputs, dtype=outputs.type())
            outputs = postprocess(
                outputs, self.num_classes, self.confthre, self.nmsthre
            )
            #logger.info("Infer time: {:.4f}s".format(time.time() - t0))
        return outputs, img_info

In [6]:
def make_parser():
    parser = argparse.ArgumentParser("ByteTrack SoccerNet!")
    parser.add_argument(
        "demo", default="image", help="demo type - only image"
    )
    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
    parser.add_argument("-n", "--name", type=str, default=None, help="model name")

    parser.add_argument(
        "--dataset", default="test", help="type of dataset")
    parser.add_argument(
        "--save_result",
        action="store_true",
        help="whether to save the inference result of image/video",
    )
    # exp file
    parser.add_argument(
        "-f",
        "--exp_file",
        default=None,
        type=str,
        help="pls input your expriment description file",
    )
    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
    parser.add_argument(
        "--device",
        default="gpu",
        type=str,
        help="device to run our model, can either be cpu or gpu",
    )
    parser.add_argument("--conf", default=None, type=float, help="test conf")
    parser.add_argument("--nms", default=None, type=float, help="test nms threshold")
    parser.add_argument("--tsize", default=None, type=int, help="test img size")
    parser.add_argument("--fps", default=30, type=int, help="frame rate (fps)")
    parser.add_argument(
        "--fp16",
        dest="fp16",
        default=False,
        action="store_true",
        help="Adopting mix precision evaluating.",
    )
    parser.add_argument(
        "--fuse",
        dest="fuse",
        default=False,
        action="store_true",
        help="Fuse conv and bn for testing.",
    )
    parser.add_argument(
        "--trt",
        dest="trt",
        default=False,
        action="store_true",
        help="Using TensorRT model for testing.",
    )
    # tracking args
    parser.add_argument("--track_thresh", type=float, default=0.5, help="tracking confidence threshold")
    parser.add_argument("--track_buffer", type=int, default=30, help="the frames for keep lost tracks")
    parser.add_argument("--match_thresh", type=float, default=0.8, help="matching threshold for tracking")
    parser.add_argument(
        "--aspect_ratio_thresh", type=float, default=1.6,
        help="threshold for filtering out boxes of which aspect ratio are above the given value."
    )
    parser.add_argument('--min_box_area', type=float, default=10, help='filter out tiny boxes')
    parser.add_argument("--mot20", dest="mot20", default=False, action="store_true", help="test mot20.")
    return parser


In [9]:
args_string = "image -f exps/yolox_x_10Ep_bytetrack_mot_17.py -c /scratch/hk3820/csgi2271_finalproject/YOLOX_outputs/yolox_x_10Ep_bytetrack_mot_17/latest_ckpt.pth.tar --fuse --save_result --fp16 --dataset test".split(" ")

In [27]:
args = make_parser().parse_args(args_string)
exp = get_exp(args.exp_file, args.name)
args.device = "gpu"

In [28]:
if not args.experiment_name:
    args.experiment_name = exp.exp_name

output_dir = osp.join(exp.output_dir, args.experiment_name)
os.makedirs(output_dir, exist_ok=True)

if args.save_result:
    vis_folder = osp.join(output_dir, "track_vis")
    os.makedirs(vis_folder, exist_ok=True)

if args.trt:
    args.device = "gpu"
args.device = torch.device("cuda" if args.device == "gpu" else "cpu")

logger.info("Args: {}".format(args))

if args.conf is not None:
    exp.test_conf = args.conf
if args.nms is not None:
    exp.nmsthre = args.nms
if args.tsize is not None:
    exp.test_size = (args.tsize, args.tsize)
    
model = exp.get_model().to(args.device)
logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
model.eval()

[32m2023-12-08 18:54:43.129[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mArgs: Namespace(aspect_ratio_thresh=1.6, ckpt='/scratch/hk3820/csgi2271_finalproject/YOLOX_outputs/yolox_x_10Ep_bytetrack_mot_17/latest_ckpt.pth.tar', conf=None, dataset='test', demo='image', device=device(type='cuda'), exp_file='exps/yolox_x_10Ep_bytetrack_mot_17.py', experiment_name='yolox_x_10Ep_bytetrack_mot_17', fp16=True, fps=30, fuse=True, match_thresh=0.8, min_box_area=10, mot20=False, name=None, nms=None, save_result=True, track_buffer=30, track_thresh=0.5, trt=False, tsize=None)[0m
[32m2023-12-08 18:55:01.202[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m25[0m - [1mModel Summary: Params: 99.00M, Gflops: 987.11[0m


YOLOX(
  (backbone): YOLOPAFPN(
    (backbone): CSPDarknet(
      (stem): Focus(
        (conv): BaseConv(
          (conv): Conv2d(12, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
      )
      (dark2): Sequential(
        (0): BaseConv(
          (conv): Conv2d(80, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
          (bn): BatchNorm2d(160, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (1): CSPLayer(
          (conv1): BaseConv(
            (conv): Conv2d(160, 80, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn): BatchNorm2d(80, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
            (act): SiLU(inplace=True)
          )
          (conv2): BaseConv(
            (conv): Conv2d(160, 80, kernel

In [29]:
if not args.trt:
    if args.ckpt is None:
        ckpt_file = osp.join(output_dir, "best_ckpt.pth.tar")
    else:
        ckpt_file = args.ckpt
    logger.info("loading checkpoint")
    ckpt = torch.load(ckpt_file, map_location="cpu")
    # load the model state dict
    model.load_state_dict(ckpt["model"])
    logger.info("loaded checkpoint done.")

if args.fuse:
    logger.info("\tFusing model...")
    model = fuse_model(model)

if args.fp16:
    model = model.half()  # to FP16

if args.trt:
    assert not args.fuse, "TensorRT model is not support model fusing!"
    trt_file = osp.join(output_dir, "model_trt.pth")
    assert osp.exists(
        trt_file
    ), "TensorRT model is not found!\n Run python3 tools/trt.py first!"
    model.head.decode_in_inference = False
    decoder = model.head.decode_outputs
    logger.info("Using TensorRT to inference")
else:
    trt_file = None
    decoder = None

[32m2023-12-08 18:55:01.220[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1mloading checkpoint[0m
[32m2023-12-08 18:55:12.596[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mloaded checkpoint done.[0m
[32m2023-12-08 18:55:12.599[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1m	Fusing model...[0m
  if param.grad is not None:


In [50]:
predictor = Predictor(model, exp, trt_file, decoder, args.device, args.fp16)

In [31]:
dataset_name = args.dataset
seq_names = sorted(os.listdir(f"./datasets/SoccerNet/{dataset_name}"))
print(seq_names)

['SNMOT-116', 'SNMOT-117', 'SNMOT-118', 'SNMOT-119', 'SNMOT-120', 'SNMOT-121', 'SNMOT-122', 'SNMOT-123', 'SNMOT-124', 'SNMOT-125', 'SNMOT-126', 'SNMOT-127', 'SNMOT-128', 'SNMOT-129', 'SNMOT-130', 'SNMOT-131', 'SNMOT-132', 'SNMOT-133', 'SNMOT-134', 'SNMOT-135', 'SNMOT-136', 'SNMOT-137', 'SNMOT-138', 'SNMOT-139', 'SNMOT-140', 'SNMOT-141', 'SNMOT-142', 'SNMOT-143', 'SNMOT-144', 'SNMOT-145', 'SNMOT-146', 'SNMOT-147', 'SNMOT-148', 'SNMOT-149', 'SNMOT-150', 'SNMOT-187', 'SNMOT-188', 'SNMOT-189', 'SNMOT-190', 'SNMOT-191', 'SNMOT-192', 'SNMOT-193', 'SNMOT-194', 'SNMOT-195', 'SNMOT-196', 'SNMOT-197', 'SNMOT-198', 'SNMOT-199', 'SNMOT-200']


In [32]:
seq_name = 'SNMOT-116'
path = f"./datasets/SoccerNet/{dataset_name}/{seq_name}/img1/"
if osp.isdir(path):
    files = get_image_list(path)
else:
    files = [path]
files.sort()

In [51]:
frame_id = 3
img_path = files[2]
timer = Timer()

outputs, img_info = predictor.inference(img_path, timer)

torch.Size([1, 29400, 6])


In [60]:
outputs[0]

tensor([[9.1000e+02, 4.0400e+02, 9.6100e+02, 5.0350e+02, 9.9707e-01, 9.2529e-01,
         0.0000e+00],
        [1.1480e+03, 3.5900e+02, 1.1760e+03, 4.4900e+02, 9.9609e-01, 9.1162e-01,
         0.0000e+00],
        [4.6800e+02, 4.2975e+02, 5.0950e+02, 5.3350e+02, 9.9756e-01, 9.0723e-01,
         0.0000e+00],
        [1.0700e+03, 3.7950e+02, 1.1100e+03, 4.7200e+02, 9.9414e-01, 9.0576e-01,
         0.0000e+00],
        [3.9450e+02, 4.6825e+02, 4.4000e+02, 5.8550e+02, 9.9414e-01, 9.0527e-01,
         0.0000e+00],
        [8.1900e+02, 4.4900e+02, 8.7000e+02, 5.6300e+02, 9.9609e-01, 8.9941e-01,
         0.0000e+00],
        [8.8300e+02, 4.0575e+02, 9.1600e+02, 5.0525e+02, 9.9268e-01, 8.9648e-01,
         0.0000e+00],
        [1.1669e+02, 4.3350e+02, 1.6000e+02, 5.4400e+02, 9.9658e-01, 8.8281e-01,
         0.0000e+00],
        [7.8300e+02, 4.5525e+02, 8.2400e+02, 5.6800e+02, 9.9414e-01, 8.6816e-01,
         0.0000e+00],
        [1.2660e+03, 3.7075e+02, 1.3040e+03, 4.6325e+02, 9.8047e-01, 8.78

In [54]:
exp.test_conf

0.1