In [None]:

import matplotlib.pyplot as plt
import torch
import cv2
import numpy as np
import time
import pandas as pd
from boxes import draw_border
from torchvision import transforms
from utils.datasets import letterbox
from utils.general import non_max_suppression_kpt
from utils.plots import output_to_keypoint, plot_skeleton_kpts
# Notebook-wide compute device: the first CUDA GPU when torch reports one is
# available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Test MiDaS model predictions on an image

In [None]:
midas = torch.hub.load('intel-isl/MiDaS', 'MiDaS_small')
midas.to('cuda')
midas.eval()
transformss = torch.hub.load('intel-isl/MiDaS', 'transforms')
transform = transformss.small_transform
cap = cv2.imread('snip.jpg')

image = cv2.cvtColor(cap, cv2.COLOR_BGR2RGB)

image = cv2.resize(image,(600,500))
image_batch = transform(image).to('cuda')
%matplotlib widget
with torch.no_grad():
        prediction = midas(image_batch)
        prediction=torch.nn.functional.interpolate(
                    prediction.unsqueeze(1),
                    size = image.shape[:2],
                    mode = 'bicubic',
                    align_corners=False
                    ).squeeze()

        output = prediction.cpu().numpy()

plt.subplot(1, 2,1)
plt.imshow(image)
plt.tight_layout(pad=0.0)
plt.axis("off") 
plt.subplot(1, 2,2)
plt.imshow(output)
plt.axis("off") 

Functions to use yolov7-pose model on an image frame

In [None]:
from utils.plots import output_to_keypoint, plot_skeleton_kpts
def image_view(imagefile, w=20, h=14):
    """
    Displaying an image from an image file
    """
    %matplotlib inline
    plt.figure(figsize=(w, h))
    plt.axis('off')
    plt.imshow(cv2.cvtColor(cv2.imread(imagefile), 
                            cv2.COLOR_BGR2RGB))

def loading_yolov7_model(yolomodel):
    """
    Load a yolov7 checkpoint and prepare it for inference.

    Args:
        yolomodel: Path of the checkpoint file; its ``'model'`` entry is used.

    Returns:
        Tuple ``(model, yolomodel)``: the model in eval mode (converted to
        float16 when CUDA is available) and the path that was passed in.
    """
    print("Loading model:", yolomodel)

    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    checkpoint = torch.load(yolomodel, map_location=device)
    model = checkpoint['model']
    model = model.float()
    model.eval()

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        # float16 weights on the GPU: lower inference time.
        model = model.half()
        model.to(device)

    return model, yolomodel


def running_inference(image):
    """
    Run the module-level yolov7 ``model`` on one image.

    Args:
        image: Image array as accepted by ``letterbox`` (e.g. from cv2.imread).

    Returns:
        Tuple ``(output, batch)``: the first element of the model's result moved
        to the CPU, and the preprocessed input batch that was fed to the model.
    """
    # letterbox(...)[0] is the resized image (target size 960, stride 64).
    resized = letterbox(image, 960, stride=64, auto=True)[0]

    # HWC array -> CHW tensor.
    tensor = transforms.ToTensor()(resized)
    if torch.cuda.is_available():
        # Match the float16 model produced by loading_yolov7_model on CUDA.
        tensor = tensor.half().to(device)

    # Add the leading batch dimension: (C, H, W) -> (1, C, H, W).
    batch = tensor.unsqueeze(0)

    with torch.no_grad():
        output, _ = model(batch)

    return output.cpu(), batch

def draw_keypoints(output, image, confidence=0.25, threshold=0.65):
    """
    Draw the detected skeletons and bounding boxes onto the inferred frame.

    The raw model output goes through ``non_max_suppression_kpt`` and
    ``output_to_keypoint``; the class and keypoint counts are read from the
    module-level ``model``.

    Args:
        output: Raw model output, as returned by ``running_inference``.
        image: Input batch returned by ``running_inference``; only ``image[0]``
            is drawn on.
        confidence: Confidence threshold for non-max suppression.
        threshold: IoU threshold for non-max suppression.

    Returns:
        Tuple ``(nimg, op_pt, output)``:
          * ``nimg``: uint8 image with the drawings, channel order swapped
            with ``cv2.COLOR_RGB2BGR``;
          * ``op_pt``: columns 7 onward of the LAST detection only, or ``[]``
            when nothing was detected;
          * ``output``: the array returned by ``output_to_keypoint``.
    """
    detections = non_max_suppression_kpt(
        output,
        confidence,
        threshold,
        nc=model.yaml['nc'],
        nkpt=model.yaml['nkpt'],
        kpt_label=True,
    )

    with torch.no_grad():
        output = output_to_keypoint(detections)

    # CHW tensor -> HWC uint8 array, scaled by 255.
    scaled = image[0].permute(1, 2, 0) * 255
    as_uint8 = scaled.cpu().numpy().astype(np.uint8)
    nimg = cv2.cvtColor(as_uint8, cv2.COLOR_RGB2BGR)

    op_pt = []
    for row in output:
        skeleton = row[7:].T
        plot_skeleton_kpts(nimg, skeleton, 3)

        # Columns 2..5 are used as box centre x/y and box width/height.
        centre_x, centre_y = row[2], row[3]
        half_w, half_h = row[4] / 2, row[5] / 2
        top_left = (int(centre_x - half_w), int(centre_y - half_h))
        bottom_right = (int(centre_x + half_w), int(centre_y + half_h))

        cv2.rectangle(
            nimg,
            top_left,
            bottom_right,
            color=(255, 0, 0),
            thickness=1,
            lineType=cv2.LINE_AA,
        )

        # Overwritten on every pass: only the final detection is kept.
        op_pt = skeleton

    return nimg, op_pt, output


    

Test yolov7-pose on image

In [None]:
YOLOV7MODEL = 'yolov7-w6-pose.pt'

try:
    print("Loading the model...")
    model, yolomodel = loading_yolov7_model(yolomodel=YOLOV7MODEL)
    print("Using the", YOLOV7MODEL, "model")
    print("Done")

except:
    print("[Error] Cannot load the model", YOLOV7MODEL)

imagefile = "test.jpg"
%matplotlib widget
output1, image = running_inference(cv2.imread(imagefile))

pose_image,kpts,output = draw_keypoints(output1, image, confidence=0.25, threshold=0.65)


plt.figure()
plt.axis("off")
plt.imshow(pose_image)



Estimating depth of each human using MiDaS model on persons detected by Yolov7-pose model

In [None]:
transformss = torch.hub.load('intel-isl/MiDaS', 'transforms')
transform1 = transformss.small_transform

image = cv2.imread("test.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

output1, image = running_inference(image)
pose_image,kpts,output = draw_keypoints(output1, image, confidence=0.5, threshold=0.65)

image = cv2.cvtColor(pose_image, cv2.COLOR_BGR2RGB)

image_batch = transform(image).to('cuda')
%matplotlib widget
with torch.no_grad():
    prediction = midas(image_batch)
    prediction=torch.nn.functional.interpolate(
            prediction.unsqueeze(1),
            size = image.shape[:2],
            mode = 'bicubic',
            align_corners=False
            ).squeeze()

    det = prediction.cpu().numpy()
    boxes = []
    inverse_depth = []
    person=[]
    for idx in range(output.shape[0]):
        
        
        detc=0
        plot_skeleton_kpts(det, output[idx, 7:].T, 3)
        xmin, ymin = (output[idx, 2]-output[idx, 4]/2), (output[idx, 3]-output[idx, 5]/2)
        xmax, ymax = (output[idx, 2]+output[idx, 4]/2), (output[idx, 3]+output[idx, 5]/2)

        person.append([idx])
        cv2.rectangle(
                det,
                (int(xmin), int(ymin)),
                (int(xmax), int(ymax)),
                color=(0,0,0),
                thickness=2,
                lineType=cv2.LINE_AA
            )
        plot_skeleton_kpts(image, output[idx, 7:].T, 3)
        cv2.putText(det, f"p{idx}", (int(xmin), int(ymin-10)), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255,0,0),2)   
        
        cv2.rectangle(
                image,
                (int(xmin), int(ymin)),
                (int(xmax), int(ymax)),
                color=(255,255,255),
                thickness=2,
                lineType=cv2.LINE_AA
            )
        
        cv2.putText(image, f"p{idx}", (int(xmin), int(ymin-10)), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (255,255,255),2)   
        
        
        box = [int(xmin),int(ymin),int(xmax),int(ymax)]
        boxes.append(box)
        
        inv_depth = det[boxes[idx][1]:boxes[idx][3],boxes[idx][0]:boxes[idx][2]].mean()
        inverse_depth.append(inv_depth)
        if (inv_depth/1000)<0.25:
            cv2.putText(det, f'far', (int(xmin), int(ymax-25)), cv2.FONT_HERSHEY_PLAIN, 2,
                        (0, 255, 25), 2)
            cv2.putText(image, f'far', (int(xmin), int(ymax-25)), cv2.FONT_HERSHEY_PLAIN, 2,
                        (0, 255, 25), 2)
        elif (inv_depth/1000)>0.25:
                cv2.putText(det, f'near!', (int(xmin), int(ymax-25)), cv2.FONT_HERSHEY_PLAIN, 2,
                        (255, 0, 0), 3)
                cv2.putText(image, f'near!', (int(xmin), int(ymax-25)), cv2.FONT_HERSHEY_PLAIN, 2,
                        (255, 0, 0), 3)

        #cv2.putText(det, f'proximity:{(inv_depth)*0.001:.2f}', (int(xmin-20), int(ymax+10)), cv2.FONT_HERSHEY_PLAIN, 1.5,
                        #(255, 255, 0), 2)
        #cv2.putText(image, f'proximity:{inv_depth*0.001:.2f}', (int(xmin-20), int(ymax+10)), cv2.FONT_HERSHEY_PLAIN, 1.5,
                        #(255, 255, 0), 2)
        
    

    plt.subplot(1, 2,1)
    plt.imshow(det)
    plt.tight_layout(pad=0.0)
    plt.axis("off") 
    plt.subplot(1, 2,2)
    plt.imshow(image)
    plt.axis("off") 



Saving the depth detections data into a .csv and .txt data file

In [None]:
# Turn the per-person lists from the depth cell into 2-D columns. The explicit
# reshapes keep the arrays 2-D even when no person was detected.
ped = np.array(person).reshape(-1, 1)
boxes = np.array(boxes).reshape(-1, 4)
inverse_depth = np.array(inverse_depth).reshape(-1, 1)
np.set_printoptions(suppress=True)

# One row per person: index, all four box coordinates (x1, y1, x2, y2), depth.
# This matches the six names in the header below.
merged_array = np.concatenate((ped, boxes[:, 0:4], inverse_depth), axis=1)

# Write to a CSV file
np.savetxt("Depth_data.csv", merged_array, delimiter=",",header="person,x1,y1,x2,y2,depth",fmt='%f')

# Write to a TXT file
np.savetxt("Depth_data.txt", merged_array, delimiter="\t",header="person,x1,y1,x2,y2,depth",fmt='%f')

Testing on a video feed

In [None]:
# A person is labelled 'far' when (mean inverse depth / DEPTH_SCALE) is below
# NEAR_THRESHOLD and 'near!' otherwise.
DEPTH_SCALE = 1000
NEAR_THRESHOLD = 0.25

transformss = torch.hub.load('intel-isl/MiDaS', 'transforms')
transform1 = transformss.small_transform

cap = cv2.VideoCapture('test1.mp4')
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No frame was delivered (e.g. end of the video): stop cleanly.
            break

        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Feed the RGB image, as the still-image cell does, so that after the
        # two channel swaps below MiDaS receives RGB rather than BGR.
        output1, image = running_inference(image)
        pose_image, kpts, output = draw_keypoints(output1, image, confidence=0.25, threshold=0.65)

        image = cv2.cvtColor(pose_image, cv2.COLOR_BGR2RGB)

        # Use the transform loaded in this cell and the notebook-wide device.
        image_batch = transform1(image).to(device)

        with torch.no_grad():
            prediction = midas(image_batch)
            prediction = torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=image.shape[:2],
                mode='bicubic',
                align_corners=False,
            ).squeeze()

        det = prediction.cpu().numpy()
        # Measurements are taken from an untouched copy: `det` itself gets
        # skeletons, boxes and text drawn into it.
        depth_map = det.copy()
        map_height, map_width = depth_map.shape[:2]

        boxes = []
        inverse_depth = []
        person = []
        for idx in range(output.shape[0]):
            keypoints = output[idx, 7:].T
            # Columns 2..5 are used as box centre x/y and box width/height.
            xmin, ymin = (output[idx, 2] - output[idx, 4] / 2), (output[idx, 3] - output[idx, 5] / 2)
            xmax, ymax = (output[idx, 2] + output[idx, 4] / 2), (output[idx, 3] + output[idx, 5] / 2)

            box = [int(xmin), int(ymin), int(xmax), int(ymax)]
            person.append([idx])
            boxes.append(box)

            # Clip to the map: a negative start index would wrap around.
            x1, x2 = max(box[0], 0), min(box[2], map_width)
            y1, y2 = max(box[1], 0), min(box[3], map_height)
            region = depth_map[y1:y2, x1:x2]

            if region.size:
                inv_depth = region.mean()
                if (inv_depth / DEPTH_SCALE) < NEAR_THRESHOLD:
                    label, label_color, label_thickness = 'far', (0, 255, 25), 2
                else:
                    label, label_color, label_thickness = 'near!', (255, 0, 0), 3
            else:
                # Box lies completely outside the map: nothing to measure.
                inv_depth = np.nan
                label = None
            inverse_depth.append(inv_depth)

            # Same annotations on the depth map and on the colour image; only
            # the box and tag colours differ.
            for canvas, box_color, tag_color in ((det, (0, 0, 0), (255, 0, 0)),
                                                 (image, (255, 255, 255), (255, 255, 255))):
                plot_skeleton_kpts(canvas, keypoints, 3)
                cv2.rectangle(
                    canvas,
                    (box[0], box[1]),
                    (box[2], box[3]),
                    color=box_color,
                    thickness=2,
                    lineType=cv2.LINE_AA,
                )
                cv2.putText(canvas, f"p{idx}", (int(xmin), int(ymin - 10)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.9, tag_color, 2)
                if label is not None:
                    cv2.putText(canvas, label, (int(xmin), int(ymax - 25)),
                                cv2.FONT_HERSHEY_PLAIN, 2, label_color, label_thickness)
                    cv2.putText(canvas, f'proximity:{1.2*inv_depth/1000:.2f}',
                                (int(xmin - 20), int(ymax + 25)),
                                cv2.FONT_HERSHEY_PLAIN, 1.5, (255, 255, 0), 2)

        # NOTE(review): the OpenCV window shows the raw frame; the annotated
        # `image` is never displayed. Confirm whether that is intended.
        cv2.namedWindow("img", cv2.WINDOW_NORMAL)
        cv2.imshow("img", frame)
        cv2.resizeWindow("img", 720, 440)

        # Clear the previous frame so images do not pile up on the figure.
        plt.clf()
        plt.imshow(det)
        plt.pause(0.00001)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture and close the windows, including after an error.
    cap.release()
    cv2.destroyAllWindows()

plt.show()


Testing on a webcam feed

In [None]:
# Live MiDaS prediction on the default camera; relies on `midas` and
# `transform` from the MiDaS still-image cell. Press 'q' in the OpenCV window
# to stop.
cap = cv2.VideoCapture(0)
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # The camera delivered no frame: stop instead of crashing in cvtColor.
            break

        image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Use the notebook-wide device rather than a hard-coded 'cuda'.
        image_batch = transform(image).to(device)

        with torch.no_grad():
            prediction = midas(image_batch)
            # Resample the prediction back to the frame's (height, width).
            prediction = torch.nn.functional.interpolate(
                prediction.unsqueeze(1),
                size=image.shape[:2],
                mode='bicubic',
                align_corners=False,
            ).squeeze()

        output = prediction.cpu().numpy()

        # Clear the previous frame so images do not pile up on the figure.
        plt.clf()
        plt.imshow(output)
        cv2.imshow('CV2Frame', frame)
        plt.pause(0.0001)

        if cv2.waitKey(10) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the windows, including after an error.
    cap.release()
    cv2.destroyAllWindows()

plt.show()
