In [1]:
import cv2
import datetime
from ultralytics import YOLO
from deep_sort_realtime.deepsort_tracker import DeepSort
import torchreid
import numpy as np
import torch
import torchvision.transforms as transforms
from scipy.spatial.distance import cosine



In [2]:
yolo_model = YOLO("yolov8n.pt")

In [None]:
osnet_model = torchreid.models.build_model(
    name='osnet_x0_25', 
    num_classes=1000, 
    pretrained=True
)
osnet_model.eval() 

Successfully loaded imagenet pretrained weights from "C:\Users\janar/.cache\torch\checkpoints\osnet_x0_25_imagenet.pth"


OSNet(
  (conv1): ConvLayer(
    (conv): Conv2d(3, 16, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
  )
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (conv2): Sequential(
    (0): OSBlock(
      (conv1): Conv1x1(
        (conv): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
      )
      (conv2a): LightConv3x3(
        (conv1): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
        (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
      )
      (conv2b): Sequential(
        (

In [4]:
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((256, 128)), 
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [5]:
tracker = DeepSort(
    max_age=200, 
    n_init=5, 
    nms_max_overlap=0.5
)

  import pkg_resources


In [6]:
video_path = "D:/cctv/16.avi"
video_cap = cv2.VideoCapture(video_path)

frame_width = int(video_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(video_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = video_cap.get(cv2.CAP_PROP_FPS)

out = cv2.VideoWriter("output_person_tracking_7_suspicious.mp4",
                      cv2.VideoWriter_fourcc(*'mp4v'),
                      fps, (frame_width, frame_height))


In [7]:
CONFIDENCE_THRESHOLD = 0.5
GREEN = (0, 255, 0)
WHITE = (255, 255, 255)
RED = (0, 0, 255)
track_embedding_history = {}  # store embeddings for re-ID
last_seen = {}                # last frame a track was seen
REIDENTIFY_DELAY_FRAMES = 30  # delay before re-ID


In [8]:
track_positions = {}  # store history of (x_center, y_center) per track
SUSPICIOUS_TIME_FRAMES = int(fps * 5)  # e.g., 5 seconds
STILLNESS_THRESHOLD = 10               # max movement in pixels to be considered "still"

def is_suspicious(pos_history):
    if len(pos_history) < SUSPICIOUS_TIME_FRAMES:
        return False
    x_coords, y_coords = zip(*pos_history)
    if max(x_coords) - min(x_coords) < STILLNESS_THRESHOLD and \
       max(y_coords) - min(y_coords) < STILLNESS_THRESHOLD:
        return True
    return False

In [9]:
while video_cap.isOpened():
    ret, frame = video_cap.read()
    if not ret:
        break

    start_time = datetime.datetime.now()

    # -----------------------------
    # 7a️⃣ Detect persons with YOLOv8
    # -----------------------------
    results = yolo_model(frame)[0]
    detections = []
    embeddings = []

    for data in results.boxes.data.tolist():
        confidence = data[4]
        if float(confidence) >= CONFIDENCE_THRESHOLD:
            xmin, ymin, xmax, ymax = map(int, data[:4])
            class_id = int(data[5])

            if class_id == 0:  # only track persons
                bbox = [xmin, ymin, xmax - xmin, ymax - ymin]  # DeepSORT format
                detections.append([bbox, confidence, class_id])

                # -----------------------------
                # 7b️⃣ Extract OSNet embedding for re-ID
                # -----------------------------
                person_img = frame[ymin:ymax, xmin:xmax]
                if person_img.size > 0:
                    person_img = transform(person_img).unsqueeze(0)
                    with torch.no_grad():
                        embedding = osnet_model(person_img)
                    embeddings.append(embedding.squeeze(0).cpu().numpy())

    # -----------------------------
    # 7c️⃣ Update DeepSORT tracker
    # -----------------------------
    tracks = tracker.update_tracks(detections, frame=frame)

    # -----------------------------
    # 7d️⃣ Apply re-identification logic and suspicious detection
    # -----------------------------
    for i, track in enumerate(tracks):
        if not track.is_confirmed() or track.det_class != 0:
            continue

        track_id = track.track_id
        ltrb = track.to_ltrb()
        xmin, ymin, xmax, ymax = map(int, ltrb)
        x_center = (xmin + xmax) // 2
        y_center = (ymin + ymax) // 2

        # Update position history
        if track_id not in track_positions:
            track_positions[track_id] = []
        track_positions[track_id].append((x_center, y_center))
        if len(track_positions[track_id]) > SUSPICIOUS_TIME_FRAMES:
            track_positions[track_id].pop(0)

        if i < len(embeddings):
            current_embedding = embeddings[i]

            if track_id not in track_embedding_history:
                # Check delay for re-identification
                if track_id in last_seen and (video_cap.get(cv2.CAP_PROP_POS_FRAMES) - last_seen[track_id]) < REIDENTIFY_DELAY_FRAMES:
                    continue

                # Compare with past embeddings
                for past_id, past_embedding in track_embedding_history.items():
                    distance = cosine(current_embedding, past_embedding)
                    if distance < 0.25:  # threshold
                        track_id = past_id
                        break

            # Update history
            track_embedding_history[track_id] = current_embedding
            last_seen[track_id] = video_cap.get(cv2.CAP_PROP_POS_FRAMES)

        # -----------------------------
        # 7e️⃣ Detect suspicious behavior
        # -----------------------------
        suspicious = is_suspicious(track_positions[track_id])

        # Box color
        color_box = RED if suspicious else GREEN

        # Draw bounding box and ID
        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color_box, 2)
        cv2.putText(frame, f"ID: {track_id}", (xmin, ymin - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, WHITE, 2)
        if suspicious:
            cv2.putText(frame, "SUSPICIOUS", (xmin, ymax + 20),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, RED, 2)

    # -----------------------------
    # 7f️⃣ Display FPS
    # -----------------------------
    end_time = datetime.datetime.now()
    fps_text = f"FPS: {1 / (end_time - start_time).total_seconds():.2f}"
    cv2.putText(frame, fps_text, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

    # -----------------------------
    # 7g️⃣ Show and save video
    # -----------------------------
    cv2.imshow("Person Detection & Tracking (OSNet + Suspicious)", frame)
    out.write(frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# -----------------------------
# 8️⃣ Release resources
# -----------------------------
video_cap.release()
out.release()
cv2.destroyAllWindows()


0: 384x640 21 persons, 2 suitcases, 122.2ms
Speed: 3.8ms preprocess, 122.2ms inference, 221.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 21 persons, 2 suitcases, 16.2ms
Speed: 1.5ms preprocess, 16.2ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 22 persons, 1 suitcase, 11.2ms
Speed: 1.5ms preprocess, 11.2ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 22 persons, 1 suitcase, 12.5ms
Speed: 1.2ms preprocess, 12.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 20 persons, 2 handbags, 14.3ms
Speed: 1.2ms preprocess, 14.3ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 20 persons, 2 handbags, 16.1ms
Speed: 1.2ms preprocess, 16.1ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 23 persons, 1 handbag, 12.2ms
Speed: 1.1ms preprocess, 12.2ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 22 