In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torchvision.models.video as models
import torchvision.models.detection as detection_models
import cv2
import os
import numpy as np
import pandas as pd
import glob
from tqdm import tqdm
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from ultralytics import YOLO

# Hyperparameters
BATCH_SIZE = 8
EPOCHS = 2
LEARNING_RATE = 0.001
NUM_FRAMES = 16  # Number of frames per video clip
FRAME_SIZE = (112, 112)  # Resize frames

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset Class
class VideoDataset(Dataset):
    def __init__(self, video_paths, labels, transform=None):
        self.video_paths = video_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        frames = self.load_video(video_path)
        if self.transform:
            frames = torch.stack([self.transform(frame) for frame in frames])
        return frames, torch.tensor(label, dtype=torch.long), video_path
    
    def load_video(self, path):
        cap = cv2.VideoCapture(path)
        frames = []
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames == 0:
            cap.release()
            return [torch.zeros((3, *FRAME_SIZE), dtype=torch.float32)] * NUM_FRAMES
        
        frame_indices = np.linspace(0, total_frames-1, NUM_FRAMES, dtype=int)
        
        for i in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.resize(frame, FRAME_SIZE)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = torch.tensor(frame, dtype=torch.float32).permute(2, 0, 1) / 255.0
            frames.append(frame)
        cap.release()
        
        if len(frames) == 0:
            return [torch.zeros((3, *FRAME_SIZE), dtype=torch.float32)] * NUM_FRAMES
        
        return frames if len(frames) == NUM_FRAMES else frames + [frames[-1]] * (NUM_FRAMES - len(frames))

# Load dataset dynamically
dataset_path = "Dataset"
brawl_videos = sorted(glob.glob(os.path.join(dataset_path, "Brawl", "Brawl_*.mp4")))
peace_videos = sorted(glob.glob(os.path.join(dataset_path, "Peace", "Peace_*.mp4")))

video_paths = brawl_videos + peace_videos
labels = [1] * len(brawl_videos) + [0] * len(peace_videos)

transform = transforms.Compose([transforms.Normalize((0.5,), (0.5,))])
dataset = VideoDataset(video_paths, labels, transform=transform)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=True)

# Define Brawl Detection Model
class VideoClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super(VideoClassifier, self).__init__()
        self.model = models.mc3_18(weights=models.MC3_18_Weights.DEFAULT)
        self.model.fc = nn.Linear(512, num_classes)
    
    def forward(self, x):
        return self.model(x)

# Define People Counting Model using YOLOv8
class PeopleCounter:
    def __init__(self):
        self.model = YOLO("yolov8n.pt")  # Load YOLOv8 pre-trained model

    def count_people_in_video(self, video_path):
        cap = cv2.VideoCapture(video_path)
        max_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            results = self.model(frame)
            count = sum(1 for det in results[0].boxes.cls if det == 0)  # Class 0 is 'person'
            max_count = max(max_count, count)
        cap.release()
        return max_count

people_counter = PeopleCounter()

# Training
model = VideoClassifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scaler = GradScaler()

def train():
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for videos, labels, _ in tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
            videos, labels = videos.to(device), labels.to(device)
            videos = videos.permute(0, 2, 1, 3, 4)
            optimizer.zero_grad()
            with autocast():
                outputs = model(videos)
                loss = criterion(outputs, labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {total_loss/len(dataloader):.4f}")
        
    # Save trained model
    torch.save(model.state_dict(), "brawl_detection_model.pth")
    print("Model saved successfully!")

if __name__ == "__main__":
    train()



  scaler = GradScaler()


'\nif __name__ == "__main__":\n    train()\n'

In [5]:
# Load trained model
model = VideoClassifier().to(device)
model.load_state_dict(torch.load("brawl_detection_model.pth", map_location=device))
model.eval()

people_counter = PeopleCounter()


test_dataset_path = "Test"
test_video_paths = sorted(glob.glob(os.path.join(test_dataset_path, "Video*.mp4")))
test_labels = [0] * len(test_video_paths)

test_dataset = VideoDataset(test_video_paths, test_labels, transform=transform)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True)

def predict_on_test_dataset():
    model.eval()
    predictions = []
    
    with torch.no_grad():
        for videos, _, paths in tqdm(test_dataloader, desc="Predicting"):
            videos = videos.to(device).permute(0, 2, 1, 3, 4)
            outputs = model(videos)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            
            for path, pred in zip(paths, preds):
                person_count = people_counter.count_people_in_video(path) if pred == 1 else 0
                predictions.append((os.path.basename(path), int(pred), person_count))

    df = pd.DataFrame(predictions, columns=["videoID", "predicted_label", "person_count"])
    return df

test_results = predict_on_test_dataset()

Predicting:   0%|                                                                               | 0/50 [00:00<?, ?it/s]


0: 640x640 2 persons, 1 skateboard, 161.0ms
Speed: 8.0ms preprocess, 161.0ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 1 skateboard, 158.7ms
Speed: 6.3ms preprocess, 158.7ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 1 skateboard, 144.8ms
Speed: 6.7ms preprocess, 144.8ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 1 skateboard, 157.9ms
Speed: 7.9ms preprocess, 157.9ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 143.6ms
Speed: 6.8ms preprocess, 143.6ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 140.9ms
Speed: 6.7ms preprocess, 140.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 154.7ms
Speed: 6.8ms preprocess, 154.7ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 1 skateboard, 1 bott

Predicting:   2%|█▎                                                                  | 1/50 [01:50<1:29:57, 110.16s/it]


0: 640x512 1 person, 139.5ms
Speed: 5.4ms preprocess, 139.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 117.6ms
Speed: 5.0ms preprocess, 117.6ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 1 umbrella, 113.0ms
Speed: 3.5ms preprocess, 113.0ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 113.3ms
Speed: 3.6ms preprocess, 113.3ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 112.7ms
Speed: 3.3ms preprocess, 112.7ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 121.2ms
Speed: 4.2ms preprocess, 121.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 114.4ms
Speed: 3.4ms preprocess, 114.4ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 110.7ms
Speed: 3.7ms preprocess, 110.7ms inference, 1.4ms postprocess

Predicting:   4%|██▋                                                                 | 2/50 [03:30<1:23:22, 104.22s/it]


0: 544x640 6 persons, 1 car, 159.4ms
Speed: 6.5ms preprocess, 159.4ms inference, 1.4ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 3 persons, 111.5ms
Speed: 5.8ms preprocess, 111.5ms inference, 1.2ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 3 persons, 2 cars, 142.9ms
Speed: 5.9ms preprocess, 142.9ms inference, 1.4ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 4 persons, 1 car, 110.3ms
Speed: 4.6ms preprocess, 110.3ms inference, 1.3ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 4 persons, 1 car, 113.9ms
Speed: 4.3ms preprocess, 113.9ms inference, 1.3ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 3 persons, 1 car, 154.1ms
Speed: 5.5ms preprocess, 154.1ms inference, 1.3ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 4 persons, 1 car, 125.8ms
Speed: 6.6ms preprocess, 125.8ms inference, 1.3ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 4 persons, 2 cars, 109.2ms
Speed: 4.8ms pr

Predicting:   6%|████▏                                                                | 3/50 [05:02<1:17:14, 98.61s/it]


0: 608x640 2 persons, 165.4ms
Speed: 8.2ms preprocess, 165.4ms inference, 1.4ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 156.6ms
Speed: 6.3ms preprocess, 156.6ms inference, 1.6ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 154.2ms
Speed: 6.1ms preprocess, 154.2ms inference, 1.4ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 163.5ms
Speed: 6.6ms preprocess, 163.5ms inference, 2.0ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 147.4ms
Speed: 8.8ms preprocess, 147.4ms inference, 1.4ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 126.3ms
Speed: 6.7ms preprocess, 126.3ms inference, 1.4ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 128.3ms
Speed: 6.7ms preprocess, 128.3ms inference, 1.4ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 120.9ms
Speed: 6.1ms preprocess, 120.9ms inference, 1.5ms postprocess per 

Predicting:   8%|█████▍                                                              | 4/50 [06:57<1:20:33, 105.08s/it]


0: 640x544 (no detections), 123.9ms
Speed: 5.1ms preprocess, 123.9ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 chair, 131.2ms
Speed: 4.2ms preprocess, 131.2ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 chair, 125.9ms
Speed: 5.0ms preprocess, 125.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 (no detections), 113.8ms
Speed: 5.3ms preprocess, 113.8ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 113.6ms
Speed: 4.2ms preprocess, 113.6ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 132.8ms
Speed: 5.7ms preprocess, 132.8ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 chair, 116.9ms
Speed: 4.9ms preprocess, 116.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 1 chair, 112.8ms
Speed: 4.7ms preprocess, 112.8ms inference, 1.2ms post

Predicting:  10%|██████▊                                                             | 5/50 [08:39<1:18:02, 104.05s/it]


0: 640x640 2 persons, 206.1ms
Speed: 7.9ms preprocess, 206.1ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 162.9ms
Speed: 5.7ms preprocess, 162.9ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 elephant, 1 giraffe, 195.6ms
Speed: 6.1ms preprocess, 195.6ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 elephant, 1 giraffe, 158.4ms
Speed: 6.3ms preprocess, 158.4ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 1 giraffe, 180.6ms
Speed: 6.2ms preprocess, 180.6ms inference, 6.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 158.4ms
Speed: 6.2ms preprocess, 158.4ms inference, 3.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 185.9ms
Speed: 5.3ms preprocess, 185.9ms inference, 3.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 188.6ms
Speed: 5.4ms preprocess, 188.6ms i

Predicting:  12%|████████▏                                                           | 6/50 [10:20<1:15:41, 103.21s/it]


0: 640x384 3 persons, 2 refrigerators, 92.2ms
Speed: 3.4ms preprocess, 92.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 2 refrigerators, 88.6ms
Speed: 3.8ms preprocess, 88.6ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 2 refrigerators, 92.3ms
Speed: 3.2ms preprocess, 92.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 2 refrigerators, 95.1ms
Speed: 3.5ms preprocess, 95.1ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 2 refrigerators, 101.6ms
Speed: 4.0ms preprocess, 101.6ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 5 persons, 1 refrigerator, 92.6ms
Speed: 3.1ms preprocess, 92.6ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 1 refrigerator, 90.8ms
Speed: 3.2ms preprocess, 90.8ms inference, 1.3ms postprocess per image at shape (1, 3, 64

Predicting:  14%|█████████▋                                                           | 7/50 [11:36<1:07:24, 94.06s/it]


0: 640x384 3 persons, 1 backpack, 94.0ms
Speed: 3.7ms preprocess, 94.0ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 97.4ms
Speed: 3.8ms preprocess, 97.4ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 107.3ms
Speed: 3.4ms preprocess, 107.3ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 102.9ms
Speed: 3.0ms preprocess, 102.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 100.8ms
Speed: 3.3ms preprocess, 100.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 100.0ms
Speed: 3.0ms preprocess, 100.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 92.8ms
Speed: 3.7ms preprocess, 92.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 90.7ms
Speed: 3.0ms preprocess, 90.7ms inference, 1.2ms postprocess 

Predicting:  16%|██████████▉                                                         | 8/50 [14:00<1:16:58, 109.95s/it]

0: 640x640 1 person, 488.0ms
Speed: 16.2ms preprocess, 488.0ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 498.4ms
Speed: 16.3ms preprocess, 498.4ms inference, 4.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 501.5ms
Speed: 11.4ms preprocess, 501.5ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 530.4ms
Speed: 14.7ms preprocess, 530.4ms inference, 2.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 538.4ms
Speed: 12.2ms preprocess, 538.4ms inference, 3.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 670.8ms
Speed: 12.6ms preprocess, 670.8ms inference, 5.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 508.1ms
Speed: 10.5ms preprocess, 508.1ms inference, 5.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 515.5ms
Speed: 11.8ms preprocess, 515.5m

Predicting:  18%|████████████▏                                                       | 9/50 [17:20<1:34:29, 138.29s/it]


0: 640x640 (no detections), 161.0ms
Speed: 6.6ms preprocess, 161.0ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 1 vase, 162.8ms
Speed: 5.8ms preprocess, 162.8ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 vases, 163.5ms
Speed: 5.7ms preprocess, 163.5ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 165.9ms
Speed: 5.6ms preprocess, 165.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 167.2ms
Speed: 6.2ms preprocess, 167.2ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 180.6ms
Speed: 6.5ms preprocess, 180.6ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 172.7ms
Speed: 6.2ms preprocess, 172.7ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 157.7ms
Speed: 6.2ms preprocess, 157.7ms inference, 1.6ms pos

Predicting:  20%|█████████████▍                                                     | 10/50 [19:02<1:24:40, 127.02s/it]


0: 640x512 1 person, 113.3ms
Speed: 4.8ms preprocess, 113.3ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 116.5ms
Speed: 3.6ms preprocess, 116.5ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 142.4ms
Speed: 5.1ms preprocess, 142.4ms inference, 7.1ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 128.0ms
Speed: 5.9ms preprocess, 128.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 119.4ms
Speed: 4.1ms preprocess, 119.4ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 114.4ms
Speed: 3.9ms preprocess, 114.4ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 116.8ms
Speed: 3.5ms preprocess, 116.8ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 114.4ms
Speed: 3.4ms preprocess, 114.4ms inference, 2.2ms postprocess per ima

Predicting:  22%|██████████████▋                                                    | 11/50 [21:00<1:20:47, 124.30s/it]


0: 640x640 5 persons, 3 pizzas, 1 potted plant, 130.4ms
Speed: 6.1ms preprocess, 130.4ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 persons, 3 pizzas, 2 potted plants, 133.2ms
Speed: 5.2ms preprocess, 133.2ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 5 persons, 2 pizzas, 2 potted plants, 123.3ms
Speed: 5.1ms preprocess, 123.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 5 persons, 2 pizzas, 2 potted plants, 133.5ms
Speed: 6.0ms preprocess, 133.5ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 persons, 3 pizzas, 2 potted plants, 1 dining table, 126.7ms
Speed: 5.2ms preprocess, 126.7ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 persons, 2 pizzas, 2 potted plants, 2 dining tables, 127.1ms
Speed: 5.4ms preprocess, 127.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 5 persons, 3 pizzas,

Predicting:  24%|████████████████                                                   | 12/50 [22:52<1:16:22, 120.58s/it]


0: 576x640 3 persons, 2 cars, 135.0ms
Speed: 5.7ms preprocess, 135.0ms inference, 1.5ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 1 car, 125.8ms
Speed: 5.6ms preprocess, 125.8ms inference, 1.6ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 1 car, 125.9ms
Speed: 5.1ms preprocess, 125.9ms inference, 1.3ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 1 car, 116.9ms
Speed: 5.5ms preprocess, 116.9ms inference, 1.4ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 1 car, 117.3ms
Speed: 5.5ms preprocess, 117.3ms inference, 1.6ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 1 car, 118.6ms
Speed: 5.2ms preprocess, 118.6ms inference, 1.5ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 1 car, 117.4ms
Speed: 6.3ms preprocess, 117.4ms inference, 1.4ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 1 person, 1 car, 1 skateboard, 134.

Predicting:  26%|█████████████████▍                                                 | 13/50 [24:42<1:12:23, 117.38s/it]


0: 640x544 3 persons, 1 cell phone, 178.3ms
Speed: 9.6ms preprocess, 178.3ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 1 cell phone, 170.5ms
Speed: 6.9ms preprocess, 170.5ms inference, 2.7ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 2 persons, 1 cell phone, 199.9ms
Speed: 5.3ms preprocess, 199.9ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 1 cell phone, 197.7ms
Speed: 6.0ms preprocess, 197.7ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 1 cell phone, 222.4ms
Speed: 6.1ms preprocess, 222.4ms inference, 2.6ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 1 tv, 1 cell phone, 192.6ms
Speed: 8.5ms preprocess, 192.6ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 tv, 1 cell phone, 165.7ms
Speed: 5.5ms preprocess, 165.7ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 544)

0

Predicting:  28%|██████████████████▊                                                | 14/50 [26:47<1:11:49, 119.72s/it]


0: 384x640 5 persons, 1 bottle, 3 cups, 1 pizza, 1 dining table, 99.4ms
Speed: 3.9ms preprocess, 99.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 bottle, 2 cups, 2 pizzas, 1 dining table, 105.0ms
Speed: 3.6ms preprocess, 105.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 bottle, 3 cups, 2 pizzas, 1 dining table, 104.2ms
Speed: 3.9ms preprocess, 104.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bottle, 2 cups, 2 pizzas, 1 dining table, 99.7ms
Speed: 3.7ms preprocess, 99.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bottle, 2 cups, 2 pizzas, 1 dining table, 151.5ms
Speed: 5.0ms preprocess, 151.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 bottle, 2 cups, 2 pizzas, 1 dining table, 94.8ms
Speed: 3.8ms preprocess, 94.8ms inference, 1.8ms postprocess per i

Predicting:  30%|█████████████████████                                                 | 15/50 [27:41<58:16, 99.91s/it]


0: 640x416 1 person, 1 car, 1 tv, 87.8ms
Speed: 3.7ms preprocess, 87.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 88.7ms
Speed: 3.2ms preprocess, 88.7ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 90.8ms
Speed: 3.2ms preprocess, 90.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 88.8ms
Speed: 3.4ms preprocess, 88.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 2 handbags, 1 tv, 87.8ms
Speed: 3.3ms preprocess, 87.8ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 2 handbags, 89.7ms
Speed: 3.2ms preprocess, 89.7ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 1 handbag, 88.7ms
Speed: 3.2ms preprocess, 88.7ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 

Predicting:  32%|██████████████████████▍                                               | 16/50 [29:05<53:50, 95.02s/it]


0: 640x512 2 persons, 108.8ms
Speed: 4.9ms preprocess, 108.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 108.3ms
Speed: 3.8ms preprocess, 108.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 3 persons, 1 horse, 107.8ms
Speed: 3.9ms preprocess, 107.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 1 dog, 110.6ms
Speed: 4.0ms preprocess, 110.6ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 horse, 110.5ms
Speed: 3.3ms preprocess, 110.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 1 horse, 109.0ms
Speed: 4.0ms preprocess, 109.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 109.0ms
Speed: 3.9ms preprocess, 109.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 110.4ms
Speed: 4.0ms preprocess, 110.4ms inference, 1.2

Predicting:  34%|███████████████████████▍                                             | 17/50 [31:05<56:26, 102.62s/it]


0: 640x640 (no detections), 129.3ms
Speed: 6.6ms preprocess, 129.3ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 139.3ms
Speed: 5.5ms preprocess, 139.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 148.3ms
Speed: 5.2ms preprocess, 148.3ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 persons, 1 cup, 129.4ms
Speed: 6.1ms preprocess, 129.4ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 126.0ms
Speed: 5.8ms preprocess, 126.0ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 1 cup, 1 chair, 130.3ms
Speed: 5.1ms preprocess, 130.3ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 138.6ms
Speed: 5.8ms preprocess, 138.6ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 1 wine glass, 1 chair, 126.9ms
Speed: 5.1ms pre

Predicting:  36%|████████████████████████▊                                            | 18/50 [32:50<54:59, 103.11s/it]


0: 640x384 2 persons, 109.6ms
Speed: 4.1ms preprocess, 109.6ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 101.1ms
Speed: 2.9ms preprocess, 101.1ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 103.5ms
Speed: 2.8ms preprocess, 103.5ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 102.4ms
Speed: 2.6ms preprocess, 102.4ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 103.4ms
Speed: 2.6ms preprocess, 103.4ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 102.2ms
Speed: 2.8ms preprocess, 102.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 84.9ms
Speed: 2.3ms preprocess, 84.9ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 96.5ms
Speed: 3.4ms preprocess, 96.5ms inference, 1.2ms postprocess per imag

Predicting:  38%|██████████████████████████▌                                           | 19/50 [34:06<49:04, 94.98s/it]


0: 640x384 4 persons, 91.7ms
Speed: 3.6ms preprocess, 91.7ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 83.0ms
Speed: 3.0ms preprocess, 83.0ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 1 car, 83.6ms
Speed: 3.0ms preprocess, 83.6ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 4 persons, 1 remote, 84.2ms
Speed: 2.9ms preprocess, 84.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 1 car, 82.7ms
Speed: 3.0ms preprocess, 82.7ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 82.2ms
Speed: 3.0ms preprocess, 82.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 3 persons, 94.0ms
Speed: 3.0ms preprocess, 94.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 persons, 1 car, 82.2ms
Speed: 3.1ms preprocess, 82.2ms inference, 1.3ms p

Predicting:  40%|████████████████████████████                                          | 20/50 [35:42<47:43, 95.46s/it]


0: 384x640 18 persons, 1 backpack, 1 handbag, 1 sports ball, 96.1ms
Speed: 4.0ms preprocess, 96.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 backpack, 1 handbag, 1 sports ball, 87.4ms
Speed: 3.1ms preprocess, 87.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 backpack, 1 sports ball, 92.4ms
Speed: 2.7ms preprocess, 92.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 15 persons, 1 backpack, 1 sports ball, 89.7ms
Speed: 2.8ms preprocess, 89.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 16 persons, 1 backpack, 87.3ms
Speed: 3.0ms preprocess, 87.3ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 17 persons, 1 backpack, 90.7ms
Speed: 2.8ms preprocess, 90.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 17 persons, 1 backpack, 90.4ms
Speed: 3.3ms preprocess, 90.4ms inf

Predicting:  42%|█████████████████████████████▍                                        | 21/50 [37:20<46:30, 96.22s/it]


0: 384x640 1 person, 1 chair, 101.2ms
Speed: 3.9ms preprocess, 101.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 chair, 102.8ms
Speed: 3.2ms preprocess, 102.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 102.6ms
Speed: 3.2ms preprocess, 102.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 99.1ms
Speed: 3.1ms preprocess, 99.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 102.7ms
Speed: 3.7ms preprocess, 102.7ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 106.3ms
Speed: 3.2ms preprocess, 106.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 93.2ms
Speed: 3.3ms preprocess, 93.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 87.5ms
Speed: 3.3ms preprocess, 87.5ms inference, 1.4ms postprocess 

Predicting:  44%|██████████████████████████████▎                                      | 22/50 [39:22<48:26, 103.80s/it]


0: 608x640 2 persons, 173.0ms
Speed: 6.7ms preprocess, 173.0ms inference, 1.4ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 150.5ms
Speed: 7.4ms preprocess, 150.5ms inference, 1.5ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 150.7ms
Speed: 6.0ms preprocess, 150.7ms inference, 1.4ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 156.6ms
Speed: 5.9ms preprocess, 156.6ms inference, 2.0ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 146.9ms
Speed: 6.5ms preprocess, 146.9ms inference, 1.3ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 131.7ms
Speed: 5.9ms preprocess, 131.7ms inference, 1.3ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 1 person, 125.6ms
Speed: 5.9ms preprocess, 125.6ms inference, 1.3ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 1 person, 122.2ms
Speed: 6.9ms preprocess, 122.2ms inference, 1.4ms postprocess per im

Predicting:  50%|█████████████████████████████████▌                                 | 25/50 [48:05<1:09:59, 167.97s/it]


0: 608x640 2 persons, 146.7ms
Speed: 6.4ms preprocess, 146.7ms inference, 1.3ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 145.9ms
Speed: 6.8ms preprocess, 145.9ms inference, 1.5ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 140.2ms
Speed: 6.8ms preprocess, 140.2ms inference, 1.8ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 151.3ms
Speed: 6.8ms preprocess, 151.3ms inference, 1.7ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 112.5ms
Speed: 5.4ms preprocess, 112.5ms inference, 1.1ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 122.9ms
Speed: 7.7ms preprocess, 122.9ms inference, 1.2ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 108.1ms
Speed: 4.5ms preprocess, 108.1ms inference, 1.2ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 113.0ms
Speed: 4.8ms preprocess, 113.0ms inference, 1.2ms postprocess per 

Predicting:  52%|███████████████████████████████████▉                                 | 26/50 [49:30<57:12, 143.02s/it]


0: 640x640 7 persons, 114.2ms
Speed: 5.3ms preprocess, 114.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 8 persons, 110.8ms
Speed: 4.5ms preprocess, 110.8ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 7 persons, 1 horse, 1 handbag, 112.2ms
Speed: 4.6ms preprocess, 112.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 7 persons, 1 horse, 111.5ms
Speed: 4.3ms preprocess, 111.5ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 7 persons, 108.7ms
Speed: 4.7ms preprocess, 108.7ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 5 persons, 1 horse, 122.0ms
Speed: 4.7ms preprocess, 122.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 6 persons, 112.3ms
Speed: 4.6ms preprocess, 112.3ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 6 persons, 113.6ms
Speed: 4.4ms preprocess, 113

Predicting:  54%|█████████████████████████████████████▎                               | 27/50 [50:42<46:39, 121.72s/it]


0: 640x544 3 persons, 125.3ms
Speed: 6.1ms preprocess, 125.3ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 4 persons, 120.6ms
Speed: 3.7ms preprocess, 120.6ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 4 persons, 122.6ms
Speed: 4.1ms preprocess, 122.6ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 3 persons, 121.6ms
Speed: 4.0ms preprocess, 121.6ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 1 chair, 123.2ms
Speed: 3.9ms preprocess, 123.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 4 persons, 1 chair, 114.7ms
Speed: 4.1ms preprocess, 114.7ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 5 persons, 1 chair, 100.9ms
Speed: 4.2ms preprocess, 100.9ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 1 chair, 95.4ms
Speed: 3.8ms preprocess, 95.4ms i

Predicting:  56%|██████████████████████████████████████▋                              | 28/50 [52:04<40:13, 109.71s/it]


0: 640x640 1 person, 115.1ms
Speed: 4.4ms preprocess, 115.1ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 1 car, 113.6ms
Speed: 4.5ms preprocess, 113.6ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 car, 110.2ms
Speed: 4.7ms preprocess, 110.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 109.1ms
Speed: 4.3ms preprocess, 109.1ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 112.8ms
Speed: 4.3ms preprocess, 112.8ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 109.5ms
Speed: 4.3ms preprocess, 109.5ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 car, 109.7ms
Speed: 4.5ms preprocess, 109.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 111.2ms
Speed: 4.2ms preprocess, 111.2ms inference, 0

Predicting:  58%|████████████████████████████████████████                             | 29/50 [54:05<39:39, 113.30s/it]


0: 384x640 1 person, 70.2ms
Speed: 2.0ms preprocess, 70.2ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 71.1ms
Speed: 1.9ms preprocess, 71.1ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 3 persons, 71.7ms
Speed: 1.8ms preprocess, 71.7ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 68.8ms
Speed: 1.8ms preprocess, 68.8ms inference, 0.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 72.2ms
Speed: 1.9ms preprocess, 72.2ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 72.2ms
Speed: 1.8ms preprocess, 72.2ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 75.7ms
Speed: 1.8ms preprocess, 75.7ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 70.4ms
Speed: 1.8ms preprocess, 70.4ms inference, 1.0ms postprocess per i

Predicting:  60%|██████████████████████████████████████████                            | 30/50 [55:14<33:18, 99.93s/it]


0: 640x640 3 persons, 1 refrigerator, 105.2ms
Speed: 4.6ms preprocess, 105.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 persons, 1 refrigerator, 108.4ms
Speed: 5.1ms preprocess, 108.4ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 6 persons, 1 refrigerator, 105.8ms
Speed: 5.3ms preprocess, 105.8ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 persons, 1 chair, 1 refrigerator, 111.8ms
Speed: 5.4ms preprocess, 111.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 persons, 108.3ms
Speed: 4.9ms preprocess, 108.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 108.6ms
Speed: 5.1ms preprocess, 108.6ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 persons, 1 tv, 111.2ms
Speed: 5.1ms preprocess, 111.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 pers

Predicting:  62%|███████████████████████████████████████████▍                          | 31/50 [56:22<28:33, 90.17s/it]


0: 544x640 5 persons, 91.1ms
Speed: 3.8ms preprocess, 91.1ms inference, 0.9ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 7 persons, 97.1ms
Speed: 4.2ms preprocess, 97.1ms inference, 1.1ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 5 persons, 98.5ms
Speed: 3.9ms preprocess, 98.5ms inference, 1.0ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 6 persons, 96.6ms
Speed: 3.7ms preprocess, 96.6ms inference, 1.0ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 5 persons, 94.3ms
Speed: 3.7ms preprocess, 94.3ms inference, 1.0ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 5 persons, 96.1ms
Speed: 4.2ms preprocess, 96.1ms inference, 1.1ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 5 persons, 98.4ms
Speed: 3.9ms preprocess, 98.4ms inference, 1.1ms postprocess per image at shape (1, 3, 544, 640)

0: 544x640 2 persons, 102.9ms
Speed: 3.9ms preprocess, 102.9ms inference, 1.3ms postprocess per image at shape

Predicting:  64%|████████████████████████████████████████████▊                         | 32/50 [57:09<23:11, 77.30s/it]


0: 640x512 2 persons, 88.7ms
Speed: 3.4ms preprocess, 88.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 90.3ms
Speed: 3.0ms preprocess, 90.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 4 persons, 92.4ms
Speed: 3.1ms preprocess, 92.4ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 92.4ms
Speed: 3.1ms preprocess, 92.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 5 persons, 1 skateboard, 91.4ms
Speed: 3.1ms preprocess, 91.4ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 3 persons, 91.8ms
Speed: 3.1ms preprocess, 91.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 1 person, 91.7ms
Speed: 3.0ms preprocess, 91.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 2 persons, 90.9ms
Speed: 3.2ms preprocess, 90.9ms inference, 1.2ms postprocess per ima

Predicting:  66%|██████████████████████████████████████████████▏                       | 33/50 [57:59<19:38, 69.31s/it]


0: 640x448 1 person, 1 car, 78.0ms
Speed: 2.5ms preprocess, 78.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 car, 81.0ms
Speed: 2.6ms preprocess, 81.0ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 car, 80.7ms
Speed: 2.6ms preprocess, 80.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 car, 81.2ms
Speed: 2.6ms preprocess, 81.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 car, 79.8ms
Speed: 2.6ms preprocess, 79.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 car, 79.4ms
Speed: 2.5ms preprocess, 79.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 2 persons, 1 car, 79.5ms
Speed: 2.6ms preprocess, 79.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 448)

0: 640x448 1 person, 1 car, 80.5ms
Speed: 2.7ms preprocess, 80.5ms 

Predicting:  68%|███████████████████████████████████████████████▌                      | 34/50 [59:16<19:02, 71.42s/it]


0: 640x640 1 person, 2 cars, 108.8ms
Speed: 5.6ms preprocess, 108.8ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 2 cars, 118.5ms
Speed: 5.0ms preprocess, 118.5ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 2 cars, 121.0ms
Speed: 5.1ms preprocess, 121.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 2 cars, 119.5ms
Speed: 5.1ms preprocess, 119.5ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 2 cars, 120.2ms
Speed: 5.2ms preprocess, 120.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 2 cars, 110.1ms
Speed: 4.9ms preprocess, 110.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 2 cars, 113.7ms
Speed: 4.9ms preprocess, 113.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 2 cars, 108.1ms
Speed: 5.0

Predicting:  70%|███████████████████████████████████████████████▌                    | 35/50 [1:00:39<18:45, 75.06s/it]


0: 384x640 (no detections), 69.7ms
Speed: 2.1ms preprocess, 69.7ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 71.4ms
Speed: 3.2ms preprocess, 71.4ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 71.7ms
Speed: 2.4ms preprocess, 71.7ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 surfboard, 72.9ms
Speed: 2.5ms preprocess, 72.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 surfboard, 71.7ms
Speed: 2.4ms preprocess, 71.7ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 73.4ms
Speed: 2.4ms preprocess, 73.4ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 71.5ms
Speed: 2.4ms preprocess, 71.5ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 72.4ms
Speed: 2.4ms preprocess, 72.4ms inference

Predicting:  72%|████████████████████████████████████████████████▉                   | 36/50 [1:02:22<19:26, 83.31s/it]


0: 640x544 (no detections), 91.3ms
Speed: 5.0ms preprocess, 91.3ms inference, 0.6ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 99.9ms
Speed: 3.9ms preprocess, 99.9ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 111.8ms
Speed: 4.4ms preprocess, 111.8ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 (no detections), 111.6ms
Speed: 5.5ms preprocess, 111.6ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 110.3ms
Speed: 4.3ms preprocess, 110.3ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 111.4ms
Speed: 4.3ms preprocess, 111.4ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 1 car, 1 tv, 105.4ms
Speed: 4.3ms preprocess, 105.4ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 544)

0: 640x544 1 person, 1 tv, 96.8ms
Speed: 3.7ms preprocess, 96.8ms inference, 1.0

Predicting:  74%|██████████████████████████████████████████████████▎                 | 37/50 [1:03:47<18:08, 83.74s/it]


0: 640x640 6 persons, 1 cup, 1 dining table, 123.3ms
Speed: 4.8ms preprocess, 123.3ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 6 persons, 1 cup, 1 dining table, 137.2ms
Speed: 5.1ms preprocess, 137.2ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 6 persons, 1 cup, 1 bowl, 1 dining table, 127.4ms
Speed: 5.2ms preprocess, 127.4ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 7 persons, 1 cup, 1 bowl, 1 dining table, 128.1ms
Speed: 5.0ms preprocess, 128.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 7 persons, 1 cup, 1 bowl, 1 dining table, 146.2ms
Speed: 7.9ms preprocess, 146.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 7 persons, 1 cup, 1 bowl, 129.6ms
Speed: 5.5ms preprocess, 129.6ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 8 persons, 1 cup, 1 bowl, 1 dining table, 115.5ms
Speed: 6.8

Predicting:  76%|███████████████████████████████████████████████████▋                | 38/50 [1:05:24<17:32, 87.72s/it]


0: 640x384 6 persons, 1 skateboard, 79.1ms
Speed: 3.4ms preprocess, 79.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 persons, 1 skateboard, 90.9ms
Speed: 3.0ms preprocess, 90.9ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 persons, 1 skateboard, 95.3ms
Speed: 2.6ms preprocess, 95.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 persons, 1 skis, 106.9ms
Speed: 3.8ms preprocess, 106.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 7 persons, 1 skis, 108.7ms
Speed: 2.8ms preprocess, 108.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 6 persons, 2 skiss, 103.9ms
Speed: 3.7ms preprocess, 103.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 7 persons, 1 skis, 86.0ms
Speed: 2.5ms preprocess, 86.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 8 persons, 2 skiss

Predicting:  78%|█████████████████████████████████████████████████████               | 39/50 [1:06:23<14:30, 79.11s/it]


0: 640x384 1 bench, 1 chair, 72.6ms
Speed: 3.0ms preprocess, 72.6ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 1 bench, 1 chair, 75.5ms
Speed: 2.6ms preprocess, 75.5ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 chair, 75.9ms
Speed: 2.7ms preprocess, 75.9ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 chair, 74.3ms
Speed: 2.9ms preprocess, 74.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 chair, 79.6ms
Speed: 3.5ms preprocess, 79.6ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 1 bench, 91.4ms
Speed: 3.0ms preprocess, 91.4ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 1 tv, 92.3ms
Speed: 3.2ms preprocess, 92.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 1 tv, 89.5ms
Speed: 2.8ms preprocess, 89.5ms inference, 1.0ms

Predicting:  80%|██████████████████████████████████████████████████████▍             | 40/50 [1:07:57<13:57, 83.76s/it]


0: 640x480 2 cars, 100.2ms
Speed: 3.8ms preprocess, 100.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 2 cars, 103.5ms
Speed: 3.0ms preprocess, 103.5ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 2 cars, 96.0ms
Speed: 3.1ms preprocess, 96.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 person, 2 cars, 96.2ms
Speed: 2.9ms preprocess, 96.2ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 2 cars, 96.8ms
Speed: 3.0ms preprocess, 96.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 person, 2 cars, 1 cell phone, 94.7ms
Speed: 3.2ms preprocess, 94.7ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 1 person, 2 cars, 87.1ms
Speed: 2.8ms preprocess, 87.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 480)

0: 640x480 2 cars, 89.2ms
Speed: 2.9ms preprocess, 89.2ms inference, 1.0ms postproc

Predicting:  82%|███████████████████████████████████████████████████████▊            | 41/50 [1:09:32<13:02, 86.93s/it]


0: 640x640 1 person, 1 pizza, 1 chair, 105.9ms
Speed: 5.3ms preprocess, 105.9ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 1 chair, 123.7ms
Speed: 4.6ms preprocess, 123.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 1 chair, 122.7ms
Speed: 4.7ms preprocess, 122.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 120.9ms
Speed: 5.0ms preprocess, 120.9ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 1 pizza, 128.4ms
Speed: 5.1ms preprocess, 128.4ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 109.9ms
Speed: 4.5ms preprocess, 109.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 108.7ms
Speed: 4.6ms preprocess, 108.7ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 person, 111.3ms
Speed: 4.9ms preprocess, 111.

Predicting:  84%|█████████████████████████████████████████████████████████           | 42/50 [1:11:14<12:12, 91.56s/it]


0: 640x640 2 persons, 101.5ms
Speed: 4.4ms preprocess, 101.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 4 persons, 108.9ms
Speed: 4.5ms preprocess, 108.9ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 107.5ms
Speed: 5.0ms preprocess, 107.5ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 109.0ms
Speed: 5.1ms preprocess, 109.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 112.9ms
Speed: 4.6ms preprocess, 112.9ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 108.5ms
Speed: 4.6ms preprocess, 108.5ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 1 skateboard, 110.1ms
Speed: 4.6ms preprocess, 110.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 1 skateboard, 110.6ms
Speed: 4.7ms preprocess, 110.6ms infer

Predicting:  86%|██████████████████████████████████████████████████████████▍         | 43/50 [1:12:33<10:14, 87.85s/it]


0: 576x640 1 person, 98.5ms
Speed: 5.5ms preprocess, 98.5ms inference, 1.1ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 1 person, 115.1ms
Speed: 5.8ms preprocess, 115.1ms inference, 1.2ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 117.3ms
Speed: 5.4ms preprocess, 117.3ms inference, 1.2ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 1 vase, 110.7ms
Speed: 4.9ms preprocess, 110.7ms inference, 1.1ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 113.5ms
Speed: 4.9ms preprocess, 113.5ms inference, 1.1ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 2 persons, 102.8ms
Speed: 4.9ms preprocess, 102.8ms inference, 1.2ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 1 person, 99.3ms
Speed: 4.6ms preprocess, 99.3ms inference, 1.1ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 (no detections), 102.1ms
Speed: 4.7ms preprocess, 102.1ms inference, 0.7ms postproce

Predicting:  88%|███████████████████████████████████████████████████████████▊        | 44/50 [1:13:40<08:09, 81.52s/it]


0: 576x640 4 persons, 100.9ms
Speed: 4.3ms preprocess, 100.9ms inference, 1.1ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 4 persons, 97.3ms
Speed: 4.5ms preprocess, 97.3ms inference, 1.2ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 4 persons, 102.2ms
Speed: 4.5ms preprocess, 102.2ms inference, 1.1ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 7 persons, 107.0ms
Speed: 5.7ms preprocess, 107.0ms inference, 1.0ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 8 persons, 96.5ms
Speed: 4.7ms preprocess, 96.5ms inference, 1.1ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 7 persons, 100.2ms
Speed: 4.6ms preprocess, 100.2ms inference, 1.2ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 7 persons, 105.4ms
Speed: 4.6ms preprocess, 105.4ms inference, 1.3ms postprocess per image at shape (1, 3, 576, 640)

0: 576x640 6 persons, 107.5ms
Speed: 4.8ms preprocess, 107.5ms inference, 1.3ms postprocess per imag

Predicting:  90%|█████████████████████████████████████████████████████████████▏      | 45/50 [1:14:25<05:53, 70.60s/it]


0: 640x416 1 person, 1 car, 1 tv, 86.0ms
Speed: 3.0ms preprocess, 86.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 82.3ms
Speed: 2.8ms preprocess, 82.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 81.0ms
Speed: 2.7ms preprocess, 81.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 88.4ms
Speed: 3.1ms preprocess, 88.4ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 2 handbags, 1 tv, 80.0ms
Speed: 3.0ms preprocess, 80.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 2 handbags, 81.3ms
Speed: 3.0ms preprocess, 81.3ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 1 car, 1 handbag, 79.8ms
Speed: 2.9ms preprocess, 79.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 416)

0: 640x416 1 person, 

Predicting:  92%|██████████████████████████████████████████████████████████████▌     | 46/50 [1:15:41<04:48, 72.10s/it]


0: 384x640 6 persons, 76.9ms
Speed: 2.0ms preprocess, 76.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 73.9ms
Speed: 1.9ms preprocess, 73.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 73.5ms
Speed: 1.8ms preprocess, 73.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 74.0ms
Speed: 2.4ms preprocess, 74.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 75.8ms
Speed: 1.8ms preprocess, 75.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 horse, 83.9ms
Speed: 2.4ms preprocess, 83.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 74.8ms
Speed: 1.8ms preprocess, 74.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 72.1ms
Speed: 1.8ms preprocess, 72.1ms inference, 1.0ms postprocess per image a

Predicting:  94%|███████████████████████████████████████████████████████████████▉    | 47/50 [1:17:18<03:59, 79.68s/it]


0: 384x640 5 persons, 2 bicycles, 1 backpack, 75.0ms
Speed: 2.2ms preprocess, 75.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 bicycles, 1 backpack, 72.8ms
Speed: 2.0ms preprocess, 72.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 bicycles, 2 backpacks, 92.1ms
Speed: 7.0ms preprocess, 92.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 2 bicycles, 2 backpacks, 74.8ms
Speed: 2.1ms preprocess, 74.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 2 backpacks, 73.1ms
Speed: 2.0ms preprocess, 73.1ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 bicycle, 77.9ms
Speed: 1.9ms preprocess, 77.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 7 persons, 1 backpack, 76.4ms
Speed: 1.9ms preprocess, 76.4ms inference, 1.0ms postprocess per image at 

Predicting:  96%|█████████████████████████████████████████████████████████████████▎  | 48/50 [1:18:34<02:37, 78.58s/it]


0: 608x640 2 persons, 121.1ms
Speed: 5.9ms preprocess, 121.1ms inference, 1.6ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 1 person, 164.1ms
Speed: 6.7ms preprocess, 164.1ms inference, 1.4ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 3 persons, 1 mouse, 135.5ms
Speed: 6.7ms preprocess, 135.5ms inference, 1.3ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 1 person, 145.0ms
Speed: 6.9ms preprocess, 145.0ms inference, 1.3ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 1 person, 117.9ms
Speed: 6.5ms preprocess, 117.9ms inference, 1.5ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 115.5ms
Speed: 5.7ms preprocess, 115.5ms inference, 1.5ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 116.5ms
Speed: 5.9ms preprocess, 116.5ms inference, 1.9ms postprocess per image at shape (1, 3, 608, 640)

0: 608x640 2 persons, 113.4ms
Speed: 5.6ms preprocess, 113.4ms inference, 1.4ms postproces

Predicting:  98%|██████████████████████████████████████████████████████████████████▋ | 49/50 [1:20:14<01:25, 85.01s/it]


0: 640x640 2 persons, 2 umbrellas, 1 tv, 1 laptop, 117.8ms
Speed: 4.7ms preprocess, 117.8ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 1 tv, 2 laptops, 113.4ms
Speed: 4.9ms preprocess, 113.4ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 1 laptop, 115.8ms
Speed: 5.5ms preprocess, 115.8ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 1 laptop, 116.8ms
Speed: 5.7ms preprocess, 116.8ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 3 persons, 1 laptop, 116.7ms
Speed: 5.1ms preprocess, 116.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 1 laptop, 118.5ms
Speed: 4.9ms preprocess, 118.5ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 2 persons, 1 laptop, 116.6ms
Speed: 5.0ms preprocess, 116.6ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)



Predicting: 100%|████████████████████████████████████████████████████████████████████| 50/50 [1:21:22<00:00, 97.65s/it]


In [8]:
test_results.to_csv('Prediction_result_final.csv',index=False)