In [1]:
import cv2
import torch
import numpy as np

# Load the pretrained 'yolov5m' model through torch.hub from the ultralytics/yolov5 repository
model = torch.hub.load('ultralytics/yolov5', 'yolov5m', pretrained =True)

# Path of the recording that the cells below analyse
video_path = "data/session5_center/video.avi"

Using cache found in /home/kasvoy/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-3-26 Python-3.11.2 torch-2.0.0+cu117 CUDA:0 (NVIDIA GeForce GTX 1060 6GB, 6070MiB)

Fusing layers... 
YOLOv5m summary: 290 layers, 21172173 parameters, 0 gradients
Adding AutoShape... 


In [2]:
# Corners of the source quadrilateral in the original frame, as [x, y] pixel
# coordinates (names read as bottom-right, bottom-left, top-left, top-right)
br = [1598, 967]
bl = [470, 924]
tl = [911, 85]
tr = [1273, 85]

# Where each corner should land after warping: an axis-aligned rectangle
# spanning x 1000..1400 and y 60..959
br_new = [1400, 959]
tr_new = [1400, 60]
tl_new = [1000 ,60]
bl_new = [1000, 959]

# Both arrays list the corners in the same order (tl, tr, br, bl) so that
# src[i] is mapped onto dst[i]
src = np.float32([tl, tr, br, bl])
dst = np.float32([tl_new, tr_new, br_new, bl_new])

# Perspective matrix that takes the src points to the dst points
transform_matrix = cv2.getPerspectiveTransform(src, dst)

In [3]:
# Display the computed perspective matrix for inspection
transform_matrix

array([[     1.5617,      4.1251,     -435.51],
       [  0.0035754,      4.6042,     -314.34],
       [  5.959e-05,   0.0033356,           1]])

In [4]:
"""
Original points (leftmost points starting from up):

p1: (774, 85)
p2: (706, 163)
p3: (601, 284)
p4: (423, 491)
p5: (79, 909)


Transformed:

p1: (845, 60)
p2: (845, 277)
p3: (845, 502)
p4: (845, 731)
p5: (851, 956)
"""

'\nOriginal points (leftmost points starting from up):\n\np1: (774, 85)\np2: (706, 163)\np3: (601, 284)\np4: (423, 491)\np5: (79, 909)\n\n\nTransformed:\n\np1: (845, 60)\np2: (845, 277)\np3: (845, 502)\np4: (845, 731)\np5: (851, 956)\n'

In [5]:
#ROI includes only the lanes going in direction of the camera
def extract_roi(frame, max_x=1450):
    """Crop a frame to the region of interest by dropping its right-hand columns.

    Args:
        frame: Image array indexed as ``frame[row, column, ...]``.
        max_x: Exclusive column index where the ROI ends. Defaults to 1450,
            the value previously hard-coded here.

    Returns:
        ``frame[:, :max_x]`` - every row, columns ``0 .. max_x - 1``. For a NumPy
        array this basic slice is a view, so drawing on the result also draws
        on ``frame``.
    """
    return frame[:, :max_x]

In [6]:
def get_point_under_transform(pt, transform_matrix):
    """Map a single 2D point through a perspective transform.

    Args:
        pt: The point as an ``np.array`` with ``dtype=np.float32`` (two values).
        transform_matrix: Matrix handed to ``cv2.perspectiveTransform``.

    Returns:
        The transformed point, reshaped to shape ``(2,)``.
    """
    # cv2.perspectiveTransform is given the point laid out as (N, 1, 2)
    as_point_list = pt.reshape(-1, 1, 2)
    warped = cv2.perspectiveTransform(as_point_list, transform_matrix)
    return warped.reshape(2,)

In [7]:
#function that converts yolov5's bounding box format to ltwh format for deepsort
def xyxy_to_bb(result_tensor):
    """Turn YOLOv5 ``xyxy`` detections into DeepSORT-style detection tuples.

    Args:
        result_tensor: Iterable of detection rows; each row must offer ``.tolist()``
            yielding ``[x_min, y_min, x_max, y_max, confidence, class_id]``.

    Returns:
        A list of ``([left, top, width, height], confidence, class_id)`` tuples,
        restricted to rows whose class id is 2, 3, 5 or 7.
    """
    #consider only vehicles
    # (presumably the COCO ids for car, motorcycle, bus and truck - confirm)
    vehicle_classes = {2, 3, 5, 7}
    detections = []

    for row in result_tensor:
        left, top, right, bottom, score, class_id = row.tolist()[:6]
        if class_id not in vehicle_classes:
            continue
        detections.append(([left, top, right - left, bottom - top], score, class_id))

    return detections

In [21]:
def play_transformed_vid(video_path, transform_matrix, output_size=(1920, 1080)):
    """Play a video with every frame warped by a perspective transform.

    Playback runs in a window titled 't' until the stream ends, a frame cannot
    be read, or the user presses 'q'.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        transform_matrix: Perspective matrix passed to ``cv2.warpPerspective``.
        output_size: ``(width, height)`` of the warped image. Defaults to
            ``(1920, 1080)``, the size previously hard-coded here.
    """
    cap = cv2.VideoCapture(video_path)

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # end of stream or read failure
                break

            cv2.imshow('t', cv2.warpPerspective(frame, transform_matrix, output_size))

            # ~10 ms per frame; 'q' stops playback
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
    finally:
        # release the capture and close the window even if a frame raised
        cap.release()
        cv2.destroyAllWindows()
    

In [20]:
#play_transformed_vid(video_path, transform_matrix)

100.0


In [10]:
#show model detections
def show_dets(video_path):
    """Play a video with the model's detections rendered on each ROI frame.

    Uses the notebook-level ``model`` and ``extract_roi``. Playback stops at the
    end of the stream, on a failed read, or when 'q' is pressed.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
    """
    window_name = "Resized_Window"
    video = cv2.VideoCapture(video_path)

    try:
        # The window only needs to be created and sized once, not per frame
        if video.isOpened():
            cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
            cv2.resizeWindow(window_name, 1920, 1080)

        while video.isOpened():
            ret, frame = video.read()
            if not ret:
                break

            # NOTE(review): the YOLOv5 hub examples feed OpenCV images as RGB
            # (img[..., ::-1]); the BGR frame is passed unchanged here - confirm
            # this is intended.
            result = model(extract_roi(frame))

            cv2.imshow(window_name, np.squeeze(result.render()))

            if cv2.waitKey(10) & 0xFF == ord('q'):
                break
    finally:
        # always free the capture and close the window
        video.release()
        cv2.destroyAllWindows()

In [66]:
def show_nth_frame(video_path, n, transformed=True):
    """Show a single frame of a video, selected by frame index.

    Blocks until a key is pressed in the window titled 'frame'.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        n: Value written to ``cv2.CAP_PROP_POS_FRAMES`` before reading.
        transformed: When true, the frame is warped with the notebook-level
            ``transform_matrix`` to 1920x1080 before display.

    Raises:
        ValueError: If no frame could be read at that position.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        cap.set(cv2.CAP_PROP_POS_FRAMES, n)
        ret, frame = cap.read()
    finally:
        # the decoded frame is all we need; free the capture right away
        cap.release()

    if not ret:
        # previously a failed read passed None on to OpenCV and leaked the capture
        raise ValueError(f"Could not read frame {n} from {video_path!r}")

    if transformed:
        frame = cv2.warpPerspective(frame, transform_matrix, (1920, 1080))

    try:
        cv2.imshow("frame", frame)
        cv2.waitKey(0)
    finally:
        cv2.destroyAllWindows()

In [67]:
def show_time_frame(video_path, video_time = 25, transformed=True):
    """Show a single frame of a video, selected by timestamp.

    Blocks until a key is pressed in the window titled 'frame'.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        video_time: Position in seconds; written to ``cv2.CAP_PROP_POS_MSEC``
            as ``video_time * 1000`` before reading.
        transformed: When true, the frame is warped with the notebook-level
            ``transform_matrix`` to 1920x1080 before display.

    Raises:
        ValueError: If no frame could be read at that position.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        cap.set(cv2.CAP_PROP_POS_MSEC, video_time * 1000)
        ret, frame = cap.read()
    finally:
        # the decoded frame is all we need; free the capture right away
        cap.release()

    if not ret:
        # previously a failed read passed None on to OpenCV and leaked the capture
        raise ValueError(f"Could not read a frame at {video_time} s from {video_path!r}")

    if transformed:
        frame = cv2.warpPerspective(frame, transform_matrix, (1920, 1080))

    try:
        cv2.imshow("frame", frame)
        cv2.waitKey(0)
    finally:
        cv2.destroyAllWindows()

In [68]:
def display_video_time(current_time, frame):
    """Write the video timestamp onto ``frame``.

    Args:
        current_time: Time value to show; it is formatted to two decimals and
            converted back to ``float`` before being rendered.
        frame: Image passed to ``cv2.putText`` as the drawing target.

    Returns:
        None.
    """
    seconds = float(f"{current_time:.2f}")
    label = "Video time: " + str(seconds)

    # white text anchored at (200, 200)
    cv2.putText(frame, label, (200, 200), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, 2)

In [69]:
#add an entry to the dictionary which cars crossed which points at what times - changes state of dictionary
def assign_point_time(speeds_dict, frame_dict, points, br_transformed, track_id, cap, tolerance):
    """Record when, and at which frame position, a track reaches each marked line.

    Both dictionaries are mutated in place; nothing is returned.

    Args:
        speeds_dict: ``{track_id: {...}}``; receives the crossing time in seconds
            under the point name, and time differences under 'delta23',
            'delta34' and 'delta45'.
        frame_dict: ``{track_id: {...}}``; receives ``CAP_PROP_POS_FRAMES`` under
            the point name. Its delta entries are not written here.
        points: Mapping of point name ('P1', 'P2', ...) to a coordinate pair;
            only index 1 of each pair is used.
        br_transformed: Point whose index 1 is compared against each line.
        track_id: Key of the track; must already exist in both dictionaries.
        cap: Object queried through ``cap.get`` for time and frame position.
        tolerance: Size of the window past each line's value within which the
            point counts as crossed.
    """
    # a delta is stored only when the preceding point is one of these
    delta_keys = {'P2': 'delta23', 'P3': 'delta34', 'P4': 'delta45'}

    for name, coords in points.items():
        # 'P1' has no predecessor; otherwise it is the point numbered one lower
        prev_name = '' if name == 'P1' else 'P' + str(int(name[1]) - 1)

        line_y = coords[1]
        if not (line_y <= br_transformed[1] <= line_y + tolerance):
            continue

        crossed_at = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000
        speeds_dict[track_id][name] = crossed_at
        frame_dict[track_id][name] = cap.get(cv2.CAP_PROP_POS_FRAMES)

        delta_key = delta_keys.get(prev_name)
        if delta_key is None:
            continue

        prev_time = speeds_dict[track_id][prev_name]
        # 0 is the "not crossed yet" placeholder
        if prev_time != 0:
            speeds_dict[track_id][delta_key] = crossed_at - prev_time

In [57]:
from deep_sort_realtime.deepsort_tracker import DeepSort

def play_tracker_video(video_path, transform_matrix=None, video_time=30):
    """Detect and track vehicles in a video and log when they reach the marked lines.

    Each frame is cropped with ``extract_roi``, run through the notebook-level
    ``model``, and the vehicle detections are fed to a DeepSort tracker. For
    every confirmed track the bottom-right box corner is mapped through
    ``transform_matrix`` and handed to ``assign_point_time``. The annotated,
    warped frame is shown in a window titled 'Video'; 'q' quits and 'p' pauses
    until another key is pressed. On normal exit the per-track frame
    dictionary is printed; the time dictionary is neither printed nor returned.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        transform_matrix: Perspective matrix used both for the corner point and
            for the displayed frame. Required despite the ``None`` default.
        video_time: Start position in seconds.

    Raises:
        ValueError: If ``transform_matrix`` is None.
    """
    if transform_matrix is None:
        # fail before opening the video instead of deep inside OpenCV
        raise ValueError("transform_matrix is required: the marked lines are defined in transformed space")

    cap = cv2.VideoCapture(video_path)
    cap.set(cv2.CAP_PROP_POS_MSEC, video_time * 1000)

    tracker = DeepSort(max_age=1, n_init=2, nms_max_overlap=1.0, embedder_gpu=True)

    #coordinates of the 5 marked lines in transformed space
    #points are roughly equidistant - 7 meters real world space
    points = {
        'P1': [845, 60],
        'P2': [845, 277],
        'P3': [845, 502],
        'P4': [845, 731],
        'P5': [851, 956]
    }

    # per-track record; 0 means "nothing recorded yet"
    empty_record = {'P1': 0, 'P2': 0, 'P3': 0, 'P4': 0, 'P5': 0, 'delta23': 0, 'delta34': 0, 'delta45': 0}

    speeds_dict = dict()
    frame_dict = dict()

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # frame is None here, so it must not reach extract_roi
                break

            frame = extract_roi(frame)
            current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

            results = model(frame)
            bbs = xyxy_to_bb(results.xyxy[0])
            tracks = tracker.update_tracks(bbs, frame=frame)

            # Overlay the time only after detection and tracking, so the text
            # is not part of the image those steps see
            display_video_time(current_time, frame)

            for track in tracks:
                if not track.is_confirmed():
                    continue
                track_id = track.track_id

                #(left, top) <-- TOP LEFT, (right, bottom) <-- BOTTOM RIGHT
                left, top, right, bottom = (int(v) for v in track.to_ltrb())

                br = np.array([right, bottom], dtype=np.float32)
                br_transformed = get_point_under_transform(br, transform_matrix)

                if track_id not in speeds_dict:
                    speeds_dict[track_id] = dict(empty_record)
                    frame_dict[track_id] = dict(empty_record)

                assign_point_time(speeds_dict, frame_dict, points, br_transformed, track_id, cap, tolerance=10)

                txt = 'id:' + str(track_id)

                # only the baseline is needed, to lift the label above the box
                _, baseline = cv2.getTextSize(txt, cv2.FONT_HERSHEY_SIMPLEX, 1, 1)

                cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 1)
                cv2.putText(frame, txt, (left, top - baseline), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 1)

            cv2.imshow('Video', cv2.warpPerspective(frame, transform_matrix, (1920, 1080)))
            key = cv2.waitKey(1) & 0xFF

            if key == ord('q'):
                break
            if key == ord('p'):
                # pause until any key is pressed
                cv2.waitKey(-1)
    finally:
        # always free the capture and close the window
        cap.release()
        cv2.destroyAllWindows()

    print('frames')
    print(frame_dict)

In [62]:
# Run detection + tracking starting 50 s into the recording; prints the per-track frame dictionary on exit
play_tracker_video(video_path, transform_matrix, video_time=50)

frames
{'1': {'P1': 0, 'P2': 5012.0, 'P3': 5030.0, 'P4': 5047.0, 'P5': 5064.0, 'delta23': 0, 'delta34': 0, 'delta45': 0}, '2': {'P1': 0, 'P2': 0, 'P3': 0, 'P4': 0, 'P5': 5011.0, 'delta23': 0, 'delta34': 0, 'delta45': 0}, '3': {'P1': 5043.0, 'P2': 0, 'P3': 5070.0, 'P4': 0, 'P5': 0, 'delta23': 0, 'delta34': 0, 'delta45': 0}, '4': {'P1': 5097.0, 'P2': 5113.0, 'P3': 5129.0, 'P4': 5145.0, 'P5': 5160.0, 'delta23': 0, 'delta34': 0, 'delta45': 0}, '5': {'P1': 0, 'P2': 0, 'P3': 0, 'P4': 0, 'P5': 0, 'delta23': 0, 'delta34': 0, 'delta45': 0}, '6': {'P1': 0, 'P2': 0, 'P3': 0, 'P4': 0, 'P5': 0, 'delta23': 0, 'delta34': 0, 'delta45': 0}}


In [59]:
# Show the warped frame at 30.38 s of the recording
show_time_frame(video_path, video_time=30.38)

In [71]:
# Show the warped frame at position 5030 (the value recorded for track '1' at P3 in the tracker output)
show_nth_frame(video_path, n=5030.0, transformed=True)