This notebook loads the fine-tuned YOLOv8n-cls checkpoint (`best.pt`) and manually evaluates it on the test frames: it computes accuracy, the confusion matrix, per-class precision/recall/F1, and FP32 vs. FP16 inference speed.

In [1]:
import os
from ultralytics import YOLO
import torch

# --- Configuration -----------------------------------------------------------
# Root of the project checkout. The original absolute path is kept as the
# default so existing runs behave exactly as before; set the TAD_PROJECT_ROOT
# environment variable to run the notebook from another machine/checkout
# without editing this cell.
PROJECT_ROOT = os.environ.get(
    "TAD_PROJECT_ROOT",
    "/home/olzhas/programming/traffic-accident-edge",
)

# Checkpoint written by the classification training run ("train2").
YOLO_BEST = os.path.join(
    PROJECT_ROOT,
    "notebooks",
    "runs",
    "classify",
    "train2",
    "weights",
    "best.pt",
)

# Test split laid out as one sub-folder per class.
YOLO_DATA_ROOT = os.path.join(PROJECT_ROOT, "TAD-benchmark", "TAD-YOLO-CLS")
TEST_ACC_DIR = os.path.join(YOLO_DATA_ROOT, "test", "accident")
TEST_NORM_DIR = os.path.join(YOLO_DATA_ROOT, "test", "normal")

print("YOLO_BEST exists:", os.path.exists(YOLO_BEST))
print("TEST_ACC_DIR exists:", os.path.exists(TEST_ACC_DIR))
print("TEST_NORM_DIR exists:", os.path.exists(TEST_NORM_DIR))

# Loading the model and reading its size below cannot work without the
# checkpoint, so stop here with an actionable message instead of a less
# obvious error further down.
if not os.path.isfile(YOLO_BEST):
    raise FileNotFoundError(
        f"Checkpoint not found: {YOLO_BEST} "
        "(set TAD_PROJECT_ROOT if the project lives elsewhere)"
    )

# --- Model -------------------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

model = YOLO(YOLO_BEST)
model.to(device)

# On-disk size of the checkpoint file in MiB.
size_mb = os.path.getsize(YOLO_BEST) / (1024 * 1024)
print(f"Model size: {size_mb:.2f} MB")

YOLO_BEST exists: True
TEST_ACC_DIR exists: True
TEST_NORM_DIR exists: True
Using device: cuda
Model size: 2.83 MB


In [2]:
import glob
import time


# --- Collect test images -----------------------------------------------------
# Only *.jpg and *.png files directly inside each class folder are used.
acc_imgs = sorted(
    glob.glob(os.path.join(TEST_ACC_DIR, "*.jpg"))
    + glob.glob(os.path.join(TEST_ACC_DIR, "*.png"))
)
norm_imgs = sorted(
    glob.glob(os.path.join(TEST_NORM_DIR, "*.jpg"))
    + glob.glob(os.path.join(TEST_NORM_DIR, "*.png"))
)

print("Test accident images:", len(acc_imgs))
print("Test normal images:", len(norm_imgs))

# Set to an int to evaluate only the first N (sorted) images of each class
# for a quick run; None evaluates everything.
MAX_SAMPLES_PER_CLASS = None

if MAX_SAMPLES_PER_CLASS is not None:
    acc_imgs = acc_imgs[:MAX_SAMPLES_PER_CLASS]
    norm_imgs = norm_imgs[:MAX_SAMPLES_PER_CLASS]

# Ground-truth class ids used throughout the notebook.
# NOTE(review): these must match the class indices the checkpoint was trained
# with (presumably alphabetical folder order: accident=0, normal=1) — the
# check below only warns, it does not remap anything.
ACCIDENT_LABEL = 0
NORMAL_LABEL = 1

model_names = getattr(model, "names", None)
expected_names = {ACCIDENT_LABEL: "accident", NORMAL_LABEL: "normal"}
if isinstance(model_names, dict) and dict(model_names) != expected_names:
    print(
        "WARNING: model class names", dict(model_names),
        "do not match the hard-coded labels", expected_names,
    )

all_imgs = [(p, NORMAL_LABEL) for p in norm_imgs] + [(p, ACCIDENT_LABEL) for p in acc_imgs]

print("Total test samples used:", len(all_imgs))

# Without images the warm-up below would fail with a bare IndexError.
if not all_imgs:
    raise RuntimeError(
        f"No .jpg/.png test images found in {TEST_ACC_DIR} or {TEST_NORM_DIR}"
    )

# --- Warm-up -----------------------------------------------------------------
# One untimed call so that one-off setup cost is not charged to the
# measurement below.
_ = model(all_imgs[0][0], imgsz=224, device=device, verbose=False)

# --- Timed evaluation (FP32) -------------------------------------------------
correct = 0
total = 0
model_ms_total = 0.0  # sum of the per-image "inference" time reported by Ultralytics

# Synchronise so pending GPU work is not attributed to the wrong side of the
# timer (same protocol as the FP16 cell, so the two numbers are comparable).
if device == "cuda":
    torch.cuda.synchronize()
# perf_counter is the monotonic, high-resolution clock meant for intervals.
start_time = time.perf_counter()

for img_path, label in all_imgs:
    results = model(img_path, imgsz=224, device=device, verbose=False)
    probs = results[0].probs
    pred = int(probs.top1)

    if pred == label:
        correct += 1
    total += 1

    # Results.speed holds per-stage timings in ms; guarded in case a stage
    # value is missing.
    stage_ms = getattr(results[0], "speed", None) or {}
    model_ms_total += stage_ms.get("inference") or 0.0

if device == "cuda":
    torch.cuda.synchronize()
end_time = time.perf_counter()

# Wall-clock numbers are end-to-end: they include reading and decoding each
# image from disk plus pre/post-processing, not just the forward pass.
total_time = end_time - start_time
avg_time = total_time / max(total, 1)
fps = 1.0 / avg_time if avg_time > 0 else 0.0

test_acc = correct / total if total > 0 else 0.0

print(f"\nYOLO test accuracy on these frames: {test_acc:.4f}")
print(f"Total images: {total}")
print(f"Total time: {total_time:.2f} s")
print(f"Avg time per image: {avg_time*1000:.2f} ms")
print(f"FPS (images per second): {fps:.2f}")
print(f"Model-only inference time per image: {model_ms_total / max(total, 1):.2f} ms")

Test accident images: 991
Test normal images: 1069
Total test samples used: 2060

YOLO test accuracy on these frames: 0.8107
Total images: 2060
Total time: 55.35 s
Avg time per image: 26.87 ms
FPS (images per second): 37.22


In [3]:
import time
from ultralytics import YOLO
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# A separate YOLO wrapper is created for the half-precision run.
# NOTE(review): Ultralytics appears to configure its predictor (including
# precision) on the first predict call, so reusing the FP32 `model` object
# might silently stay in FP32 — confirm against the installed version.
model_fp16 = YOLO(YOLO_BEST)


print("Total test samples used (same as before):", len(all_imgs))

# Half precision is only requested on GPU; on CPU this cell repeats the FP32 run.
use_half = (device == "cuda")
print("Using half precision:", use_half)

# Without images the warm-up below would fail with a bare IndexError.
if not all_imgs:
    raise RuntimeError("all_imgs is empty - run the image-collection cell first")

# Untimed warm-up call so one-off setup cost is excluded from the timing.
_ = model_fp16(
    all_imgs[0][0],
    imgsz=224,
    device=device,
    half=use_half,
    verbose=False
)

correct = 0
total = 0
model_ms_total = 0.0  # sum of the per-image "inference" time reported by Ultralytics

# Drain pending GPU work before starting the clock.
if device == "cuda":
    torch.cuda.synchronize()
# perf_counter is the monotonic, high-resolution clock meant for intervals.
start_time = time.perf_counter()

for img_path, label in all_imgs:
    results = model_fp16(
        img_path,
        imgsz=224,
        device=device,
        half=use_half,
        verbose=False
    )
    probs = results[0].probs
    pred = int(probs.top1)

    if pred == label:
        correct += 1
    total += 1

    # Results.speed holds per-stage timings in ms; guarded in case a stage
    # value is missing.
    stage_ms = getattr(results[0], "speed", None) or {}
    model_ms_total += stage_ms.get("inference") or 0.0

# Make sure all GPU work is finished before stopping the clock.
if device == "cuda":
    torch.cuda.synchronize()
end_time = time.perf_counter()

# Wall-clock numbers are end-to-end (disk read, decode, pre/post-processing
# and forward pass), so they understate any speed-up of the forward pass
# itself; the model-only figure printed last isolates that part.
total_time = end_time - start_time
avg_time = total_time / max(total, 1)
fps = 1.0 / avg_time if avg_time > 0 else 0.0
test_acc = correct / total if total > 0 else 0.0

print(f"\n[FP16] YOLO test accuracy: {test_acc:.4f}")
print(f"[FP16] Total images: {total}")
print(f"[FP16] Total time: {total_time:.2f} s")
print(f"[FP16] Avg time per image: {avg_time*1000:.2f} ms")
print(f"[FP16] FPS (images per second): {fps:.2f}")
print(f"[FP16] Model-only inference time per image: {model_ms_total / max(total, 1):.2f} ms")

Using device: cuda
Total test samples used (same as before): 2060
Using half precision: True

[FP16] YOLO test accuracy: 0.8092
[FP16] Total images: 2060
[FP16] Total time: 53.06 s
[FP16] Avg time per image: 25.76 ms
[FP16] FPS (images per second): 38.83


In [5]:
import numpy as np
from collections import Counter
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Class ids as used when all_imgs was built (accident=0, normal=1).
ACCIDENT_LABEL = 0
NORMAL_LABEL = 1
CLASS_LABELS = [ACCIDENT_LABEL, NORMAL_LABEL]

# An empty sample list would otherwise surface as an obscure error in
# max() / the baseline division further down.
if not all_imgs:
    raise RuntimeError("all_imgs is empty - run the image-collection cell first")

# model.eval() is deliberately NOT called on the YOLO wrapper: the predict
# call below already runs the network for inference.
# NOTE(review): on Ultralytics releases where the wrapper subclasses
# nn.Module without overriding eval(), nn.Module.eval() forwards to
# self.train(False), i.e. the wrapper's training entry point — confirm
# against the installed version before re-adding it.

all_preds = []
all_labels = []

# no_grad is kept as a harmless safeguard around the prediction loop.
with torch.no_grad():
    for img_path, label in all_imgs:
        results = model(img_path, imgsz=224, device=device, verbose=False)
        probs = results[0].probs
        pred = int(probs.top1)
        all_preds.append(pred)
        all_labels.append(label)

y_true = np.array(all_labels)
y_pred = np.array(all_preds)

# Rows are true classes, columns predicted classes, both in CLASS_LABELS order.
cm = confusion_matrix(y_true, y_pred, labels=CLASS_LABELS)
print("Confusion matrix (rows=true, cols=pred) [accident, normal]:\n", cm)

# Per-class arrays, indexed in CLASS_LABELS order; zero_division=0 reports 0
# instead of warning when a class is never predicted.
prec, rec, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=CLASS_LABELS, zero_division=0
)

print(
    f"\nAccident class (0): precision={prec[ACCIDENT_LABEL]:.3f}, recall={rec[ACCIDENT_LABEL]:.3f}, f1={f1[ACCIDENT_LABEL]:.3f}")
print(
    f"Normal   class (1): precision={prec[NORMAL_LABEL]:.3f}, recall={rec[NORMAL_LABEL]:.3f}, f1={f1[NORMAL_LABEL]:.3f}")

# Majority-class baseline: accuracy of always predicting the most frequent
# true label. The denominator is the total sample count rather than a sum
# over hard-coded keys.
test_counts = Counter(all_labels)  # 0 and 1
majority_label = max(test_counts, key=test_counts.get)
baseline_acc = test_counts[majority_label] / sum(test_counts.values())

print("\nTest label counts:", test_counts)
print("Majority baseline accuracy:", baseline_acc)

Confusion matrix (rows=true, cols=pred) [accident, normal]:
 [[906  85]
 [305 764]]

Accident class (0): precision=0.748, recall=0.914, f1=0.823
Normal   class (1): precision=0.900, recall=0.715, f1=0.797

Test label counts: Counter({1: 1069, 0: 991})
Majority baseline accuracy: 0.5189320388349514
