In [None]:
!pip install ultralytics



In [17]:
import os

def list_files(directory):
    try:
        # List all files in the given directory
        files = [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]
        return files
    except Exception as e:
        return str(e)  # Return the error message as a string

def list_directories(path='.'):
    return [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]

path_mot_dataset = "/content/drive/MyDrive/UpalaAgrícola2024/Gira1-8Mar24/Mavic4/Formato .mp4 split/MOT_format/DJI_20240308111117_0010_V_1/images/DJI_20240308111117_0010_V_1"
files_in_directory = [os.path.join(path_mot_dataset, file) for file in list_files(path_mot_dataset)]

In [57]:
files_in_directory

['/content/drive/MyDrive/UpalaAgrícola2024/Gira1-8Mar24/Mavic4/Formato .mp4 split/MOT_format/DJI_20240308111117_0010_V_1/images/DJI_20240308111117_0010_V_1/frame_00001.png',
 '/content/drive/MyDrive/UpalaAgrícola2024/Gira1-8Mar24/Mavic4/Formato .mp4 split/MOT_format/DJI_20240308111117_0010_V_1/images/DJI_20240308111117_0010_V_1/frame_00002.png',
 '/content/drive/MyDrive/UpalaAgrícola2024/Gira1-8Mar24/Mavic4/Formato .mp4 split/MOT_format/DJI_20240308111117_0010_V_1/images/DJI_20240308111117_0010_V_1/frame_00003.png',
 '/content/drive/MyDrive/UpalaAgrícola2024/Gira1-8Mar24/Mavic4/Formato .mp4 split/MOT_format/DJI_20240308111117_0010_V_1/images/DJI_20240308111117_0010_V_1/frame_00004.png',
 '/content/drive/MyDrive/UpalaAgrícola2024/Gira1-8Mar24/Mavic4/Formato .mp4 split/MOT_format/DJI_20240308111117_0010_V_1/images/DJI_20240308111117_0010_V_1/frame_00005.png',
 '/content/drive/MyDrive/UpalaAgrícola2024/Gira1-8Mar24/Mavic4/Formato .mp4 split/MOT_format/DJI_20240308111117_0010_V_1/ima

In [None]:
from ultralytics import YOLO, RTDETR
model = YOLO("C:/Users/dnnxl/Documents/GitHub/drone-sort/weights/yolo11nCombined@fine-tuning/best.pt")

In [7]:
model

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(96, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(96, 192, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(192, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C3k2(
        (cv1): Conv(
          (conv): Conv2d(192, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(192, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(384, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(384, eps=0.001, momentum=0.03, affine=True, track_

In [8]:
import cv2 
import os 

from collections import defaultdict
from typing import List, Dict, Union  

def get_video_duration(path):
    if not os.path.exists(path):
        raise ValueError(f"Video file not found: %s" % path)
    video = cv2.VideoCapture(path)
    fps = video.get(cv2.CAP_PROP_FPS)
    frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    duration = frame_count / fps 
    return frame_count, duration 

def predict_regions(path) -> List[Dict]:
    conf = 0.50 
    iou = 0.70 
    tracker_name = "C:/Users/dnnxl/Documents/GitHub/drone-sort/scripts/botsort.yaml" 
    results = model.track(
        path, conf=conf, iou=iou, tracker=tracker_name, stream=True,
    )
    return create_video_rectangles(results, path)

def create_video_rectangles(results, path):
    label_map = ["Pineapple"]
    frames_count, duration = get_video_duration(path)
    model_names = "model_v1"
    tracks = defaultdict(list)
    track_labels = dict() 
    frame = -1 
    for result in results:
        frame += 1
        data = result.boxes 
        if not data.is_track:
            continue 

        for i, track_id in enumerate(data.id.tolist()):
            score = float(data.conf[i])
            x, y, w, h = data.xywhn[i].tolist() 
            model_label = "Pineapple"
            if model_label not in label_map:
                continue 
            track_labels[track_id] = model_label 

            box = {
                "frame": frame + 1,
                "enabled": True,
                "rotation": 0,
                "x": (x-w/2)*100,
                "y": (y-h/2)*100,
                "width": w*100,
                "height": h*100,
                "time": (frame+1)*(duration/frames_count),
                "score": score 
            }
            tracks[track_id].append(box)
    regions = []
    for track_id in tracks:
        sequence = tracks[track_id]
        sequence = process_lifespans_enabled(sequence)

        label = "Pineapple"
        region = {
            "from_name": "box",
            "to_name": "video",
            "type": "videorectangle",
            "value": {
                "frameCount": frames_count,
                "duration": duration,
                "sequence": sequence,
                "labels": [label]
            },
            "score": max([frame_info["score"] for frame_info in sequence]),
            "origin": "manual"
        }
        regions.append(region)
    return regions 

def process_lifespans_enabled(sequence: List[Dict]) -> List[Dict]:
    """This function detects gaps in the sequence of boxes 
    and disables lifespan line for the gaps assigning "enabled":False 
    to the last bboxes in the whole span sequence.
    """     
    prev = None 
    for i, box in enumerate(sequence):
        if prev is None:
            prev = sequence[i]
            continue 
        if box["frame"] - prev["frame"] > 1:
            sequence[i-1]["enabled"] = False 
        prev = sequence[i] 
    sequence[-1]["enabled"] = False 
    return sequence  

In [11]:
path_video = "C:/Users/dnnxl/Downloads/DJI_20240308105544_0002_V_1.mp4"
predictions = predict_regions(path_video)


video 1/1 (frame 1/392) C:\Users\dnnxl\Downloads\DJI_20240308105544_0002_V_1.mp4: 384x640 5 pineapples, 1803.4ms
video 1/1 (frame 2/392) C:\Users\dnnxl\Downloads\DJI_20240308105544_0002_V_1.mp4: 384x640 4 pineapples, 1328.6ms
video 1/1 (frame 3/392) C:\Users\dnnxl\Downloads\DJI_20240308105544_0002_V_1.mp4: 384x640 4 pineapples, 1445.2ms
video 1/1 (frame 4/392) C:\Users\dnnxl\Downloads\DJI_20240308105544_0002_V_1.mp4: 384x640 4 pineapples, 1310.0ms
video 1/1 (frame 5/392) C:\Users\dnnxl\Downloads\DJI_20240308105544_0002_V_1.mp4: 384x640 4 pineapples, 1247.0ms
video 1/1 (frame 6/392) C:\Users\dnnxl\Downloads\DJI_20240308105544_0002_V_1.mp4: 384x640 4 pineapples, 1411.3ms
video 1/1 (frame 7/392) C:\Users\dnnxl\Downloads\DJI_20240308105544_0002_V_1.mp4: 384x640 4 pineapples, 1166.6ms
video 1/1 (frame 8/392) C:\Users\dnnxl\Downloads\DJI_20240308105544_0002_V_1.mp4: 384x640 4 pineapples, 1056.3ms
video 1/1 (frame 9/392) C:\Users\dnnxl\Downloads\DJI_20240308105544_0002_V_1.mp4: 384x640 4 pin

In [10]:
import json 

json_to_save = {
    "predictions": [{
        "model_version": "v1",
        "result": predictions
    }]
}
filename = "DJI_20240308101753_0002_V.json"
with open(filename, "w") as json_file:
    json.dump(json_to_save, json_file, indent=4)

In [73]:
import cv2
import re
from collections import defaultdict

import numpy as np
#track_history = defaultdict(lambda: [])
track_history = []

for file in files_in_directory:
    frame = cv2.imread(file)
    results = model.track(frame, persist=True, tracker="botsort.yaml")

    if results[0].boxes.id == None:
        continue
    else:
        match = re.search(r'frame_\d+\.png', file)
        if match:
            frame_file = match.group(0)
            frame_id = int(frame_file.split(".")[0].split("_")[1])
        for _, result in enumerate(results):
            for box in result.boxes:
                bbox = box.xyxy[0].tolist()  # Convert from tensor to list
                track_id = box.id.item()  # Get track id
                conf = box.conf.item()  # Get confidence score
                track_history.append((frame_id, track_id, bbox[0], bbox[1], bbox[2]-bbox[0], bbox[3]-bbox[1], conf, -1,-1,-1))


0: 384x640 (no detections), 114.4ms
Speed: 6.7ms preprocess, 114.4ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 118.6ms
Speed: 4.6ms preprocess, 118.6ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 113.1ms
Speed: 4.5ms preprocess, 113.1ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 108.5ms
Speed: 3.9ms preprocess, 108.5ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 117.6ms
Speed: 5.2ms preprocess, 117.6ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 106.0ms
Speed: 4.8ms preprocess, 106.0ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 116.7ms
Speed: 4.3ms preprocess, 116.7ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 108.1ms
Speed: 5.2ms prepr

In [74]:
track_history

[(155,
  807.0,
  853.1474609375,
  0.3236236572265625,
  521.2130126953125,
  402.1769561767578,
  0.9280760884284973,
  -1,
  -1,
  -1),
 (156,
  807.0,
  803.0896606445312,
  0.11206202954053879,
  622.8515014648438,
  484.91418308764696,
  0.8946263790130615,
  -1,
  -1,
  -1),
 (157,
  807.0,
  757.8284301757812,
  31.874706268310547,
  708.3101196289062,
  559.2811164855957,
  0.906868577003479,
  -1,
  -1,
  -1),
 (157,
  808.0,
  1731.7320556640625,
  231.7484130859375,
  188.2679443359375,
  388.136474609375,
  0.8915837407112122,
  -1,
  -1,
  -1),
 (157,
  809.0,
  358.1712951660156,
  0.08782272040843964,
  235.45352172851562,
  202.3903877288103,
  0.9127814173698425,
  -1,
  -1,
  -1),
 (158,
  807.0,
  737.8488159179688,
  94.0468521118164,
  738.6525268554688,
  593.9056625366211,
  0.9000979065895081,
  -1,
  -1,
  -1),
 (158,
  808.0,
  1730.3121337890625,
  318.79205322265625,
  188.5380859375,
  386.581787109375,
  0.8944559693336487,
  -1,
  -1,
  -1),
 (158,
  809

In [75]:
# Function to write MOT data to a text file
def write_mot_format(data, filename="botsort_mot_.txt"):
    with open(filename, "w") as f:
        for entry in data:
            # Formatting each entry as per MOT challenge format (one line per object)
            line = "{},{},{},{},{},{},{},{},{},{}\n".format(*entry)
            f.write(line)

# Call the function to write to file
write_mot_format(track_history)

In [None]:
import cv2
import re
from collections import defaultdict

import numpy as np
#track_history = defaultdict(lambda: [])
track_history = []

for file in files_in_directory:
    frame = cv2.imread(file)
    results = model.track(frame, persist=True, tracker="botsort.yaml")

    if results[0].boxes.id == None:
        continue
    else:
        boxes = results[0].boxes.xywh.cpu()
        track_ids = results[0].boxes.id.int().cpu().tolist()
        # Visualize the results on the frame
        annotated_frame = results[0].plot()
        for box, track_id in zip(boxes, track_ids):
            x, y, w, h = box

            match = re.search(r'frame_\d+\.png', file)
            if match:
                frame_file = match.group(0)
                frame_id = int(frame_file.split(".")[0].split("_")[1])
            #track = track_history[track_id]
            track_history.append((frame_id, track_id, float(x), float(y), float(w), float(h), 1, -1, -1, -1))  # x, y center point
            #if len(track) > 30:  # retain 90 tracks for 90 frames
            #    track.pop(0)


0: 384x640 (no detections), 332.0ms
Speed: 4.5ms preprocess, 332.0ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 141.2ms
Speed: 4.5ms preprocess, 141.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 142.8ms
Speed: 5.2ms preprocess, 142.8ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 134.2ms
Speed: 4.6ms preprocess, 134.2ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 133.7ms
Speed: 5.1ms preprocess, 133.7ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 130.8ms
Speed: 4.3ms preprocess, 130.8ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 133.1ms
Speed: 5.4ms preprocess, 133.1ms inference, 0.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 (no detections), 146.3ms
Speed: 4.7ms prepr

In [None]:
# Function to write MOT data to a text file
def write_mot_format(data, filename="botsort_mot.txt"):
    with open(filename, "w") as f:
        for entry in data:
            # Formatting each entry as per MOT challenge format (one line per object)
            line = "{},{},{},{},{},{},{},{},{},{}\n".format(*entry)
            f.write(line)

# Call the function to write to file
write_mot_format(track_history)

In [None]:
track_history

[(155,
  318,
  1112.4827880859375,
  225.37075805664062,
  359.488525390625,
  402.18023681640625,
  1,
  -1,
  -1,
  -1),
 (156,
  318,
  1113.3023681640625,
  272.1632080078125,
  369.444580078125,
  485.0265197753906,
  1,
  -1,
  -1,
  -1),
 (157,
  318,
  1110.65380859375,
  337.2982177734375,
  381.8323974609375,
  559.4500732421875,
  1,
  -1,
  -1,
  -1),
 (157,
  319,
  1825.52880859375,
  438.2492980957031,
  187.126953125,
  388.18560791015625,
  1,
  -1,
  -1,
  -1),
 (157,
  320,
  475.064697265625,
  113.46133422851562,
  149.77706909179688,
  202.39236450195312,
  1,
  -1,
  -1,
  -1),
 (158,
  318,
  1105.926025390625,
  413.1253967285156,
  388.18505859375,
  594.2643432617188,
  1,
  -1,
  -1,
  -1),
 (158,
  319,
  1823.503662109375,
  530.5972900390625,
  190.547119140625,
  386.6865234375,
  1,
  -1,
  -1,
  -1),
 (158,
  320,
  478.08355712890625,
  144.63058471679688,
  165.87405395507812,
  238.31463623046875,
  1,
  -1,
  -1,
  -1),
 (159,
  318,
  1099.576049