In [None]:
import os
import subprocess
from base64 import b64encode

import cv2
import numpy as np
import torch
from google.colab import files
from google.colab.patches import cv2_imshow
from IPython.display import HTML
from PIL import Image
from tqdm.notebook import tqdm

In [None]:
# It may take a while...
# Alternative to uploading: uncomment the next line to download the sample video.
#!wget "https://documents.epfl.ch/groups/c/cv/cvlab-pom-video1/www/campus4-c0.avi"
# `files` comes from the import cell at the top of the notebook, so it is not
# imported a second time here.
uploaded = files.upload()

In [None]:
def display_video(path):
    '''Display a video inline in Colab.

    The video is re-encoded to H.264/MP4 with ffmpeg and then embedded in the
    page as a base64 data URL. The re-encoded copy is written next to the
    input as ``compressed_<name>.mp4``; an existing copy is replaced.

    Args:
        path: Path of the video file to show.

    Returns:
        An ``IPython.display.HTML`` object containing a ``<video>`` player.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ffmpeg executable cannot be found.
    '''
    # Strip only the final extension of the file name, so names containing
    # several dots or directories containing dots are handled correctly.
    directory, basename = os.path.split(path)
    stem = os.path.splitext(basename)[0]
    compressed_path = os.path.join(directory, 'compressed_' + stem + '.mp4')

    if os.path.exists(compressed_path):
        os.remove(compressed_path)

    # Convert video. Arguments are passed as a list (no shell), so paths with
    # spaces or shell metacharacters are safe; check=True surfaces failures.
    subprocess.run(
        ['ffmpeg', '-i', path, '-vcodec', 'libx264', compressed_path],
        check=True)

    # Show video
    with open(compressed_path, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    return HTML("""
    <video width=400 controls>
        <source src="%s" type="video/mp4">
    </video>
    """ % data_url)

In [None]:
# Input video file; it is passed as-is to ffmpeg and to cv2.VideoCapture.
filename = 'campus4-c0.avi'
# Last expression of the cell, so the HTML player it returns is rendered.
display_video(filename)

In [None]:
#Model
# The model used is YOLOv5x, the largest and most accurate of the standard YOLOv5 models. We load it with Torch Hub.

# When we pass an image to this model, it returns where each detected object is in the image, which class it belongs to, and the model's confidence in that detection.

In [None]:
# Load model
model = torch.hub.load('ultralytics/yolov5', 'yolov5x',
                       pretrained=True, verbose=False)
# Use the first GPU when one is available; otherwise fall back to the CPU
# instead of failing on a runtime without CUDA.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(DEVICE);

In [None]:
## People detection

# A simple way to estimate the distance between people is to measure the distance between the rectangles (boxes) returned by the model, more precisely the distance between their centers.

In [None]:
def center_distance(xyxy1, xyxy2):
    '''Measure how far apart the centers of two boxes are.

    Each box is a 4-item sequence laid out as (x, y, x, y): items 0 and 2 are
    averaged to get the center's x, items 1 and 3 to get its y. Both center
    coordinates are truncated to ``int`` before the distance is computed.

    Returns:
        A tuple ``(dist, x1, y1, x2, y2)``: the Euclidean distance between the
        two integer centers, followed by the first and second center.
    '''
    def _center(box):
        # Midpoint of the two x values and of the two y values, as ints.
        xa, ya, xb, yb = box
        return int(np.mean([xa, xb])), int(np.mean([ya, yb]))

    x1, y1 = _center(xyxy1)
    x2, y2 = _center(xyxy2)

    offset = [x1 - x2, y1 - y2]
    return np.linalg.norm(offset), x1, y1, x2, y2
# Given a frame of a video, we can detect the people in it using YOLOv5x and draw a rectangle around each one. The color of a rectangle indicates whether that person is too close to another person.

In [None]:
def detect_people_on_frame(img, confidence, distance):
    '''Detect people on a frame and draw the rectangles and lines.

    Uses the notebook-level ``model`` to find objects, keeps the detections of
    class 0 (people) whose confidence is at least ``confidence``, and draws on
    the frame: a red line between the centers of every pair of boxes closer
    than ``distance``, a red rectangle around every person in such a pair, and
    a green rectangle around everyone else.

    Args:
        img: Frame as a height x width x channels array. The channel axis is
            reversed before inference, and the colors drawn assume OpenCV's
            BGR order.
        confidence: Minimum detection confidence to keep a box.
        distance: Center-to-center distance, in pixels, below which two people
            are considered too close.

    Returns:
        The frame with the rectangles and lines drawn on it.
    '''
    results = model([img[:, :, ::-1]])  # Pass the frame through the model and get the boxes

    xyxy = results.xyxy[0].cpu().numpy()  # xyxy are the box coordinates
    #          x1 (pixels)  y1 (pixels)  x2 (pixels)  y2 (pixels)   confidence        class
    # tensor([[7.47613e+02, 4.01168e+01, 1.14978e+03, 7.12016e+02, 8.71210e-01, 0.00000e+00],
    #         [1.17464e+02, 1.96875e+02, 1.00145e+03, 7.11802e+02, 8.08795e-01, 0.00000e+00],
    #         [4.23969e+02, 4.30401e+02, 5.16833e+02, 7.20000e+02, 7.77376e-01, 2.70000e+01],
    #         [9.81310e+02, 3.10712e+02, 1.03111e+03, 4.19273e+02, 2.86850e-01, 2.70000e+01]])

    # Keep confident detections of people (class 0) and drop the extra columns
    keep = (xyxy[:, 4] >= confidence) & (xyxy[:, 5] == 0)
    boxes = xyxy[keep, :4]

    too_close = [False] * len(boxes)
    for i in range(len(boxes)):
        for j in range(i + 1, len(boxes)):
            # Calculate distance of the centers
            dist, x1, y1, x2, y2 = center_distance(boxes[i], boxes[j])
            if dist < distance:
                # If dist < distance, boxes are red and a line is drawn
                too_close[i] = True
                too_close[j] = True
                img = cv2.line(img, (x1, y1), (x2, y2), (0, 0, 255), 2)

    for box, is_close in zip(boxes, too_close):
        # Draw the boxes. The model returns float coordinates, but OpenCV's
        # drawing functions need integer pixel positions, so cast them first.
        x1, y1, x2, y2 = (int(v) for v in box)
        color = (0, 0, 255) if is_close else (0, 255, 0)
        img = cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
    return img

In [None]:
# To detect people in a video, we iterate over every frame of the video and save a new video with the rectangles drawn.

In [None]:
def detect_people_on_video(filename, confidence=0.9, distance=60,
                           output_path='output.avi'):
    '''Detect people on a video and draw the rectangles and lines.

    Every frame of the input is passed through ``detect_people_on_frame`` and
    written to a new XVID-encoded video with the same frame rate and size.

    Args:
        filename: Path of the input video.
        confidence: Minimum detection confidence to keep a box.
        distance: Center-to-center distance, in pixels, below which two people
            are considered too close.
        output_path: Path of the annotated video to write. An existing file at
            this path is replaced.

    Raises:
        IOError: If the input video cannot be opened.
    '''
    # Capture video
    cap = cv2.VideoCapture(filename)
    if not cap.isOpened():
        # Fail loudly instead of silently writing an empty output file
        cap.release()
        raise IOError(f'Could not open video: {filename}')

    out = None
    try:
        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Define the codec and create VideoWriter object
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        if os.path.exists(output_path):
            os.remove(output_path)
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # Iterate through frames and detect people
        vidlen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A non-positive frame count gives the progress bar no fixed total
        with tqdm(total=vidlen if vidlen > 0 else None) as pbar:
            while True:
                # Read a frame; ret is False once no frame could be read
                ret, frame = cap.read()
                if not ret:
                    break
                frame = detect_people_on_frame(frame, confidence, distance)
                # Write new video
                out.write(frame)
                pbar.update(1)
    finally:
        # Release everything, even if a frame failed part-way through, so the
        # output file is closed. No GUI windows are created here, so there is
        # nothing else to clean up.
        cap.release()
        if out is not None:
            out.release()

In [None]:
#Let's apply the detection tool.

In [None]:
# Lower the confidence threshold to 0.5; the distance threshold keeps its default.
detect_people_on_video(filename, confidence=0.5)

In [None]:
#Let's watch the result.

In [None]:
# 'output.avi' is the annotated video written by detect_people_on_video.
display_video('output.avi')