In [1]:
# # install dependencies
# !pip install -U torch torchvision cython
# !pip install -U 'git+https://github.com/facebookresearch/fvcore.git' 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
# import torch, torchvision
# torch.__version__

# !git clone https://github.com/facebookresearch/detectron2
# !pip install -e detectron2

# from detectron2.config import get_cfg

# !mkdir ./temp
# !rm -rf video.mp4

# !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1X0GR6xAT0Jw6XI-y8whn3T-KmDIrRa9n' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1X0GR6xAT0Jw6XI-y8whn3T-KmDIrRa9n" -O video.mp4

# !rm -rf ./temp/*
# !ffmpeg -i video.mp4 ./temp/frame%04d.jpg 

# !wget https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/143908701/model_final_dd99d2.pkl
# !mkdir ./processed

# ! git clone https://github.com/erkil1452/gaze360
# ! wget http://gaze360.csail.mit.edu/files/gaze360_model.pth.tar

# !pip install -q lucid>=0.2.3
# !pip install -q moviepy

In [2]:
# Directory of the DensePose project; it is appended to sys.path so that the
# `densepose` imports in the following cell can be resolved. The same prefix is
# reused later to locate the model config YAML.
DENSEPOSE_PATH = '../densepose/detectron2/projects/DensePose/'
import sys
sys.path.append(DENSEPOSE_PATH)

In [3]:
import cv2
import numpy as np
import matplotlib.patches as patches

import matplotlib.pyplot as plt
import cv2
import numpy as np

from typing import ClassVar, Dict

from detectron2.config import get_cfg
from detectron2.structures.instances import Instances
from detectron2.engine.defaults import DefaultPredictor

from densepose import add_densepose_config
from densepose.vis.base import CompoundVisualizer
from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer
from densepose.vis.extractor import CompoundExtractor, create_extractor

from densepose.structures import  DensePoseResult

from densepose.vis.densepose import DensePoseResultsFineSegmentationVisualizer

%matplotlib inline

In [4]:
def mask_to_bbox(mask):
    """Return a padded bounding box around the active region of ``mask``.

    The tight box spanning every column/row whose sum is positive is grown by
    15% of its width/height on each side and then clipped to the mask extent.

    Args:
        mask: 2-D array indexed as (row, column).

    Returns:
        ``np.float32`` array ``(x0, y0, x1, y1)`` in pixel-index coordinates,
        or ``None`` when no column or row has a positive sum.
    """
    xs = np.where(np.sum(mask, axis=0) > 0)[0]
    ys = np.where(np.sum(mask, axis=1) > 0)[0]

    if len(xs) == 0 or len(ys) == 0:
        return None

    x0, x1 = xs[0], xs[-1]
    y0, y1 = ys[0], ys[-1]
    pad_x = (x1 - x0) * 0.15
    pad_y = (y1 - y0) * 0.15

    height, width = mask.shape[:2]

    # Lower edges are clamped at 0, upper edges at the last valid pixel index.
    # (Clamping the upper edges with max(0, ...) never clips anything, which
    # let the padded box run past the mask.)
    x0 = max(0, x0 - pad_x)
    y0 = max(0, y0 - pad_y)
    x1 = min(width - 1, x1 + pad_x)
    y1 = min(height - 1, y1 + pad_y)

    return np.array((x0, y0, x1, y1), dtype=np.float32)

def draw_bbox(im, bbox, color):
    """Draw ``bbox`` onto ``im`` and show the result in a new matplotlib figure.

    Args:
        im: image array handed to ``cv2.rectangle`` and ``plt.imshow``; the
            rectangle is drawn directly onto it.
        bbox: indexable ``(x0, y0, x1, y1)`` corner coordinates.
        color: colour forwarded to ``cv2.rectangle``.
    """
    # cv2 wants plain integer points; the boxes produced in this notebook are
    # numpy scalars (float32 / unsigned), so convert them explicitly.
    x0, y0, x1, y1 = (int(bbox[k]) for k in range(4))
    cv2.rectangle(im, (x0, y0), (x1, y1), color, 20)
    plt.figure(figsize=[12,12])
    plt.imshow(im)


def find_id(bbox,id_dict):
    """Pick the stored identity whose box overlaps ``bbox`` the most.

    Args:
        bbox: box compared against every stored box with ``compute_iou``.
        id_dict: mapping from identity key to a tuple whose first element is
            that identity's box.

    Returns:
        The key with the highest IoU strictly greater than 0.5, or ``None``
        when no stored box exceeds that threshold.
    """
    best_key = None
    best_overlap = 0.5
    for key, entry in id_dict.items():
        overlap = compute_iou(bbox, entry[0])
        if overlap > best_overlap:
            best_key, best_overlap = key, overlap
    return best_key

def compute_iou(bb1,bb2):
    """Intersection-over-union of two axis-aligned boxes.

    Args:
        bb1: indexable ``(x0, y0, x1, y1)`` box.
        bb2: indexable ``(x0, y0, x1, y1)`` box.

    Returns:
        The IoU as a float, or ``0.0`` when the boxes do not overlap, when
        their union has no area, or when the ratio falls outside ``(0, 1]``.
    """
    # Work in plain floats so unsigned numpy scalars cannot wrap around on
    # subtraction and so the division below is ordinary float division.
    ax0, ay0, ax1, ay1 = (float(bb1[k]) for k in range(4))
    bx0, by0, bx1, by1 = (float(bb2[k]) for k in range(4))

    # Corners of the intersection rectangle.
    x_left = max(ax0, bx0)
    y_top = max(ay0, by0)
    x_right = min(ax1, bx1)
    y_bottom = min(ay1, by1)

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The intersection of two axis-aligned boxes is itself axis-aligned.
    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    bb1_area = (ax1 - ax0) * (ay1 - ay0)
    bb2_area = (bx1 - bx0) * (by1 - by0)

    # Union = sum of both areas minus the part counted twice.
    union_area = bb1_area + bb2_area - intersection_area
    if union_area <= 0.0:
        # Both boxes are degenerate (zero area); avoid dividing by zero.
        return 0.0

    iou = intersection_area / union_area
    eps = 1e-8

    if iou <= 0.0 or iou > 1.0 + eps:
        return 0.0

    return iou


def extract_heads_bbox(IUV,INDS):
    """Return the head bounding box found in a channels-last IUV image.

    Pixels whose first channel equals 23 or 24 (the labels this notebook
    treats as the head) form the mask handed to ``mask_to_bbox``.

    Args:
        IUV: array indexed as ``[row, column, channel]``.
        INDS: unused; kept so existing calls stay valid.

    Returns:
        A one-element list holding the box cast to ``np.uint``, or an empty
        list when no pixel carries either label.
    """
    part_labels = IUV[:,:,0]
    iuv_mask_head = (part_labels == 23) | (part_labels == 24)
    head_box = mask_to_bbox(iuv_mask_head)
    if head_box is None:
        # No head pixels: report "no heads" instead of calling .astype on None.
        return []
    return [head_box.astype(np.uint)]

def predict1Person(img, vis, ext):
    """Run the global ``predictor`` on ``img`` and decode one DensePose result.

    Args:
        img: image passed straight to ``predictor``.
        vis: unused; kept so existing calls stay valid.
        ext: callable applied to the predicted instances; its return value is
            expected to expose ``results`` and ``boxes_xywh``.

    Returns:
        ``(bbox_xywh, iuv_arr)`` for the last entry of ``results`` (the entry
        the previous loop ended on), or ``(None, None)`` when there is nothing
        to decode. An empty result used to surface as ``UnboundLocalError``.
    """
    outputs = predictor(img)['instances']
    densepose_result = ext(outputs)

    # NOTE(review): the extractor presumably returns None when the instances
    # carry no DensePose output -- confirm against the installed version.
    if densepose_result is None:
        return (None, None)

    results = list(densepose_result.results)
    if not results:
        return (None, None)

    # Only the last instance was ever returned, so decode just that one.
    last = len(results) - 1
    iuv_arr = DensePoseResult.decode_png_data(*results[last])
    bbox_xywh = densepose_result.boxes_xywh[last]
    return (bbox_xywh, iuv_arr)

def mergeIUVWithImage(imShape, IUV, bbox):
    """Paste a box-sized IUV array into a zero-filled, image-sized canvas.

    Args:
        imShape: shape of the full image; only its first two entries
            (rows, columns) are used.
        IUV: array written into the canvas region ``[:, y:y+h, x:x+w]``.
        bbox: indexable ``(x, y, w, h)``; every value is truncated with ``int``.

    Returns:
        ``np.uint8`` array of shape ``(3, imShape[0], imShape[1])``.
    """
    left, top, width, height = [int(bbox[k]) for k in range(4)]
    canvas = np.zeros((3, imShape[0], imShape[1]), dtype=np.uint8)
    row_span = slice(top, top + height)
    col_span = slice(left, left + width)
    canvas[:, row_span, col_span] = IUV
    return canvas

def transposeIUV(IUV):
    """Convert a channels-first ``(3, H, W)`` array to channels-last ``(H, W, 3)``.

    Args:
        IUV: array whose first axis holds at least three channels; only the
            first three are used.

    Returns:
        A new array in which channel ``k`` of the input is ``out[:, :, k]``.
    """
    # Read from the argument: the body used to read the notebook-level
    # variable ``iuv_arr`` and ignored ``IUV`` entirely.
    channels = [IUV[k, :, :][:, :, np.newaxis] for k in range(3)]
    return np.concatenate(channels, axis=2)

In [5]:
# Build the detectron2 config; add_densepose_config presumably registers the
# DensePose-specific keys that the YAML file below relies on.
cfg = get_cfg()
add_densepose_config(cfg)

# The overrides that follow are applied after the YAML file has been merged.
cfg.merge_from_file('{}configs/densepose_rcnn_R_50_FPN_s1x.yaml'.format(DENSEPOSE_PATH))
# NOTE(review): the device is hard-coded to CUDA.
cfg.MODEL.DEVICE = 'cuda'
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5 

# Weights are read from a file in the notebook's working directory.
cfg.MODEL.WEIGHTS = './model_final_dd99d2.pkl'
# `predictor` is read as a global by predict1Person.
predictor = DefaultPredictor(cfg)

Config '../densepose/detectron2/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml' has no VERSION. Assuming it to be compatible with latest v2.


In [7]:
import glob
# glob gives no ordering guarantee; sorting the paths puts the frames in
# filename order (presumably the zero-padded frame numbers written by ffmpeg).
filenames = sorted(glob.glob("./temp/*.jpg"))

segm_vs = DensePoseResultsFineSegmentationVisualizer()
segm_ext = create_extractor(segm_vs)

# Maps frame index -> list of head boxes found in that frame.
final_results = dict()
for i, path in enumerate(filenames):
    im = cv2.imread(path)
    if im is None:
        # cv2.imread reports an unreadable file by returning None.
        print('skipping unreadable frame: {}'.format(path))
        continue

    (bbox_xywh, iuv_arr) = predict1Person(im, segm_vs, segm_ext)
    if bbox_xywh is None or iuv_arr is None:
        # Defensive: skip the frame if the prediction helper handed back no data.
        continue

    # Place the box-sized IUV result at its position inside a full-size canvas.
    iuv_arr = mergeIUVWithImage(im.shape, iuv_arr, bbox_xywh)

    INDS = iuv_arr[0,:,:]

    IUV = transposeIUV(iuv_arr)

    print(i)
    bbox = extract_heads_bbox(IUV,INDS)
    if len(bbox)>0:
        # Debug aid: draw_bbox(im, bbox[0], (0,255,0))
        final_results[i] = bbox


import imageio
import cv2
import numpy as np
import os

# Greedy frame-to-frame association: a head box inherits the identity of the
# best-overlapping box from the previously processed frame (see find_id),
# otherwise it receives a fresh identity number.
id_num = 0
tracking_id = {}
identity_last = {}
frames_with_people = sorted(final_results.keys())

for i in frames_with_people:
    heads = final_results[i]
    print(len(heads))
    identity_next = {}
    for bbox_head in heads:
        if bbox_head is None:
            continue
        id_val = find_id(bbox_head, identity_last)
        if id_val is None:
            # Nothing in the previous frame matched: start a new identity.
            id_num += 1
            id_val = id_num
        #TODO: Improve eye location
        eye_x = (bbox_head[0] + bbox_head[2]) / 2.0
        eye_y = 0.65 * bbox_head[1] + 0.35 * bbox_head[3]
        identity_next[id_val] = (bbox_head, [eye_x, eye_y])
    # tracking_id[i] maps identity -> (head box, estimated eye position).
    identity_last = identity_next
    tracking_id[i] = identity_last


import sys
import torch
import torchvision.transforms as transforms
import imageio
import cv2
import random
from PIL import Image
import math
import torch
import torchvision.transforms as transforms
import sys
# Per-channel normalisation applied to every head crop before it reaches the
# gaze network (presumably the usual ImageNet statistics -- TODO confirm).
image_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])


# Make the gaze360 code directory importable so `model` resolves to its module.
sys.path.append('./gaze360/code/')
from model import GazeLSTM

# The network is wrapped in DataParallel and moved to the GPU before the
# checkpoint is loaded (presumably the saved keys match the wrapped module --
# TODO confirm). The second .cuda() call repeats the move made one line above.
model = GazeLSTM()
model = torch.nn.DataParallel(model).cuda()
model.cuda()
checkpoint = torch.load('./gaze360_model.pth.tar')
model.load_state_dict(checkpoint['state_dict'])
# Switch to evaluation mode for inference.
model.eval()

def spherical2cartesial(x):
    """Turn rows of two angles into 3-D direction vectors.

    With ``theta = x[:, 0]`` and ``phi = x[:, 1]`` each output row is
    ``(cos(phi) * sin(theta), sin(phi), -cos(phi) * cos(theta))``.

    Args:
        x: tensor with one row per sample and the two angles in columns 0 and 1.

    Returns:
        A ``(x.size(0), 3)`` tensor created with ``torch.zeros`` (default
        device and dtype) and filled with the components above.
    """
    theta = x[:, 0]
    phi = x[:, 1]
    cos_phi = torch.cos(phi)

    output = torch.zeros(x.size(0), 3)
    output[:, 0] = cos_phi * torch.sin(theta)
    output[:, 1] = torch.sin(phi)
    output[:, 2] = -cos_phi * torch.cos(theta)
    return output


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [8]:
tracking_id

{0: {1: (array([ 320,  496,  898, 1488], dtype=uint64), [609.0, 843.2])},
 1: {1: (array([ 316,  492,  892, 1486], dtype=uint64),
   [604.0, 839.9000000000001])},
 2: {1: (array([ 318,  469,  870, 1489], dtype=uint64), [594.0, 826.0])},
 3: {1: (array([ 316,  493,  898, 1485], dtype=uint64), [607.0, 840.2])},
 4: {1: (array([ 320,  491,  892, 1501], dtype=uint64), [606.0, 844.5])},
 5: {1: (array([ 323,  490,  898, 1506], dtype=uint64), [610.5, 845.6])},
 6: {1: (array([ 323,  491,  900, 1493], dtype=uint64), [611.5, 841.7])},
 7: {1: (array([ 324,  467,  877, 1495], dtype=uint64), [600.5, 826.8])},
 8: {1: (array([ 322,  492,  901, 1492], dtype=uint64), [611.5, 842.0])},
 9: {1: (array([ 323,  485,  897, 1492], dtype=uint64),
   [610.0, 837.4499999999999])},
 10: {1: (array([ 320,  459,  880, 1501], dtype=uint64), [600.0, 823.7])},
 11: {1: (array([ 322,  464,  878, 1504], dtype=uint64), [600.0, 828.0])},
 12: {1: (array([ 321,  482,  898, 1500], dtype=uint64), [609.5, 838.3])},
 13: 

In [6]:
import sys
import torch
import torchvision.transforms as transforms
import imageio
import cv2
import random
from PIL import Image
import math
import torch
import torchvision.transforms as transforms
import sys
# Per-channel normalisation applied to every head crop before it reaches the
# gaze network (presumably the usual ImageNet statistics -- TODO confirm).
image_normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])


# Make the gaze360 code directory importable so `model` resolves to its module.
sys.path.append('./gaze360/code/')
from model import GazeLSTM

# The network is wrapped in DataParallel and moved to the GPU before the
# checkpoint is loaded (presumably the saved keys match the wrapped module --
# TODO confirm). The second .cuda() call repeats the move made one line above.
model = GazeLSTM()
model = torch.nn.DataParallel(model).cuda()
model.cuda()
checkpoint = torch.load('./gaze360_model.pth.tar')
model.load_state_dict(checkpoint['state_dict'])
# Switch to evaluation mode for inference.
model.eval()

def spherical2cartesial(x):
    """Turn rows of two angles into 3-D direction vectors.

    With ``theta = x[:, 0]`` and ``phi = x[:, 1]`` each output row is
    ``(cos(phi) * sin(theta), sin(phi), -cos(phi) * cos(theta))``.

    Args:
        x: tensor with one row per sample and the two angles in columns 0 and 1.

    Returns:
        A ``(x.size(0), 3)`` tensor created with ``torch.zeros`` (default
        device and dtype) and filled with the components above.
    """
    theta = x[:, 0]
    phi = x[:, 1]
    cos_phi = torch.cos(phi)

    output = torch.zeros(x.size(0), 3)
    output[:, 0] = cos_phi * torch.sin(theta)
    output[:, 1] = torch.sin(phi)
    output[:, 2] = -cos_phi * torch.cos(theta)
    return output

In [7]:
import numpy as np
import json
import moviepy.editor as mvp
import lucid.misc.io.showing as show
from lucid.misc.gl.glcontext import create_opengl_context
import OpenGL.GL as gl

# Size of the off-screen GL surface; the gaze loop resizes every video frame to
# the same (WIDTH, HEIGHT) so the rendered arrow can be blended pixel for pixel.
WIDTH, HEIGHT = 360, 640
create_opengl_context((WIDTH, HEIGHT))

gl.glClear(gl.GL_COLOR_BUFFER_BIT)

# Read the GLSL sources with context managers so the file handles are closed.
with open("vertex_shader.txt", "r") as _shader_src:
    vertex_shader_code = _shader_src.read()
with open("fragment_shader.txt", "r") as _shader_src:
    fragment_shader_code = _shader_src.read()

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])

  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])

  _np_qint16 = np.dtype([("qint16", np.int16, 1)])

  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])

  _np_qint32 = np.dtype([("qint32", np.int32, 1)])

  np_resource = np.dtype([("resource", np.ubyte, 1)])

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])

  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])

  _np_qint16 = np.dtype([("qint16", np.int16, 1)])

  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])

  _np_qint32 = np.dtype([("qint32", np.int32, 1)])

  np_resource = np.dtype([("resource", np.ubyte, 1)])



In [8]:
from OpenGL.GL import shaders
# Four 2-D corner positions; render_frame draws them as a triangle strip.
vertexPositions = np.float32([[-1, -1], [1, -1], [-1, 1], [1, 1]])
# Compile both shader stages from the source text read in the previous cell.
VERTEX_SHADER = shaders.compileShader(vertex_shader_code, gl.GL_VERTEX_SHADER)

FRAGMENT_SHADER = shaders.compileShader(fragment_shader_code, gl.GL_FRAGMENT_SHADER)

# Link the two stages into the program used by render_frame.
shader = shaders.compileProgram(VERTEX_SHADER, FRAGMENT_SHADER)

# Uniform locations are looked up once here and reused on every render call.
xpos = gl.glGetUniformLocation(shader, 'xpos')
ypos = gl.glGetUniformLocation(shader, 'ypos')

vdir_x = gl.glGetUniformLocation(shader, 'vdir_x')
vdir_y = gl.glGetUniformLocation(shader, 'vdir_y')
vdir_z = gl.glGetUniformLocation(shader, 'vdir_z')

# Note: the Python name is arrow_size, the shader uniform is called 'size'.
arrow_size = gl.glGetUniformLocation(shader, 'size')

res_loc = gl.glGetUniformLocation(shader, 'iResolution')

In [9]:
def render_frame(x_position,y_position,vx,vy,vz,asize):
  """Render one frame with the global shader program and read it back.

  Args:
    x_position, y_position: values written to the 'xpos' / 'ypos' uniforms
      after being scaled by 0.89 and 0.67 respectively.
    vx, vy, vz: values written to the 'vdir_x' / 'vdir_y' / 'vdir_z' uniforms.
    asize: value written to the 'size' uniform.

  Returns:
    uint8 array of shape (HEIGHT, WIDTH, 3) holding the RGB pixels read back
    from GL, with the row order reversed.
  """
  gl.glClear(gl.GL_COLOR_BUFFER_BIT)
  with shader:

    # NOTE(review): 0.89 / 0.67 look like empirical corrections of the arrow
    # position; their origin is not visible here -- confirm before changing.
    x_position = x_position*0.89
    y_position = y_position*0.67
    gl.glUniform1f(xpos, x_position)
    gl.glUniform1f(ypos, y_position)

    gl.glUniform1f(vdir_x, vx)
    gl.glUniform1f(vdir_y, vy)
    gl.glUniform1f(vdir_z, vz)
    gl.glUniform1f(arrow_size, asize)

    gl.glUniform3f(res_loc, WIDTH, HEIGHT, 1.0)
    
    # Draw the four corner vertices as one triangle strip.
    gl.glEnableVertexAttribArray(0);
    gl.glVertexAttribPointer(0, 2, gl.GL_FLOAT, False, 0, vertexPositions)
    gl.glDrawArrays(gl.GL_TRIANGLE_STRIP, 0, 4)
  img_buf = gl.glReadPixels(0, 0, WIDTH, HEIGHT, gl.GL_RGB, gl.GL_UNSIGNED_BYTE)
  # [::-1] reverses the row order of the buffer that was read back.
  img = np.frombuffer(img_buf, np.uint8).reshape(HEIGHT, WIDTH, 3)[::-1]
  return img


Predict gaze and generate the final video:

In [10]:
# Open the source video and create the output writer at the same frame rate.
v_reader = imageio.get_reader('./video.mp4')
fps = v_reader.get_meta_data()['fps']
out = imageio.get_writer('video_gaze_imageio.mp4',fps=fps)

# One random colour (three ints drawn from 0..254) for each of 1000 slots;
# the gaze loop indexes this table by identity number.
color_encoding = [
    [random.randint(0,254), random.randint(0,254), random.randint(0,254)]
    for _ in range(1000)
]

# Frame stride of the temporal window: an eighth of the frame rate, at least 1.
W = max(int(fps//8),1)




In [11]:
frames = []
i = 0
# Decode the whole video into memory, one array per frame.
ims = []
try:
  for frame in v_reader:
    ims.append(frame)
except Exception as read_error:
  # Best effort: keep the frames decoded so far, but report the problem instead
  # of hiding it. Catching Exception (not a bare except) lets KeyboardInterrupt
  # through so a long read can still be cancelled.
  print('Stopped reading video after {} frames: {!r}'.format(len(ims), read_error))


In [12]:
# `v` is the name under which the gaze loop indexes the decoded frames.
v = ims

In [13]:
def print_gaze_in_head(xy, text, img):
  """Write ``text`` onto ``img`` with ``cv2.putText``.

  Args:
    xy: indexable ``(x, y)`` text origin; both values are converted to ``int``.
    text: string to draw.
    img: image array the text is drawn onto.
  """
  font = cv2.FONT_HERSHEY_SIMPLEX
  font_scale = 0.5
  font_color = (0,255,0)
  # The argument after the colour is the stroke thickness in cv2.putText; this
  # value used to be named lineType although it was always passed in that slot.
  thickness = 2
  # cv2 wants plain Python ints for the origin; callers pass numpy scalars.
  origin = (int(xy[0]), int(xy[1]))

  cv2.putText(img, text, origin, font, font_scale, font_color, thickness)

In [14]:
# Built once instead of once per crop: resize a head crop to the 224x224 input
# size used below and convert it to a tensor.
resize_crop = transforms.Resize((224,224))
to_tensor = transforms.ToTensor()

try:
    for i in range(0,len(v)):
        frame_h, frame_w = v[i].shape[0], v[i].shape[1]

        image = v[i].copy()
        image = cv2.resize(image,(WIDTH,HEIGHT))
        image = image.astype(float)

        if i in tracking_id:
            for id_t in tracking_id[i].keys():
                # Seven head crops sampled every W frames around frame i.
                input_image = torch.zeros(7,3,224,224)
                for count, j in enumerate(range(i-3*W,i+4*W,W)):
                    if 0 <= j < len(v) and j in tracking_id and id_t in tracking_id[j]:
                        new_im = Image.fromarray(v[j],'RGB')
                        bbox,eyes = tracking_id[j][id_t]
                    else:
                        # Frame missing or identity not tracked there: fall
                        # back to the current frame's crop.
                        new_im = Image.fromarray(v[i],'RGB')
                        bbox,eyes = tracking_id[i][id_t]
                    new_im = new_im.crop((bbox[0],bbox[1],bbox[2],bbox[3]))
                    input_image[count,:,:,:] = image_normalize(to_tensor(resize_crop(new_im)))

                # astype() copies, so the rescaling below never touches tracking_id.
                bbox,eyes = tracking_id[i][id_t]
                bbox = np.asarray(bbox).astype(int)

                # Inference only: no autograd graph is needed.
                with torch.no_grad():
                    output_gaze,_ = model(input_image.view(1,7,3,224,224).cuda())
                gaze = spherical2cartesial(output_gaze).detach().numpy()
                gaze = gaze.reshape((-1))

                # Eye position as a fraction of the original frame size.
                eyes = np.asarray(eyes).astype(float)
                eyes[0],eyes[1] = eyes[0]/float(frame_w),eyes[1]/float(frame_h)

                img_arrow = render_frame(2*eyes[0]-1,-2*eyes[1]+1,-gaze[0],gaze[1],-gaze[2],0.05)

                # 1.0 where the rendered arrow image is pure black (background).
                # Testing every channel avoids the uint8 wrap-around of a
                # channel sum, which could mark coloured arrow pixels as black.
                binary_img = np.all(img_arrow == 0, axis=2).astype(float)
                binary_img = np.reshape(binary_img,(HEIGHT,WIDTH,1))

                # Keep the video where the arrow is black, the arrow elsewhere
                # (the single mask channel broadcasts over RGB).
                image = binary_img*image + img_arrow*(1-binary_img)
                image = image.astype(np.uint8)

                # Rescale the head box from source-frame to output coordinates.
                bbox[0],bbox[2] = WIDTH*bbox[0]/frame_w,WIDTH*bbox[2]/frame_w
                bbox[1],bbox[3] = HEIGHT*bbox[1]/frame_h,HEIGHT*bbox[3]/frame_h
                # Plain ints for the cv2 drawing calls.
                x0, y0, x1, y1 = (int(c) for c in bbox[:4])

                print_gaze_in_head((x0,y0), 'x: {0:.2f}'.format(gaze[0]*100), image)
                print_gaze_in_head((x0,y0+50), 'y: {0:.2f}'.format(gaze[1]*100), image)
                print_gaze_in_head((x0,y0+100), 'z: {0:.2f}'.format(gaze[2]*100), image)

                image = cv2.rectangle(image, (x0,y0), (x1,y1),color_encoding[min(id_t,900)])
                image = image.astype(float)

        image = image.astype(np.uint8)

        out.append_data(image)
finally:
    # Close the writer even if a frame fails, so the frames already written
    # are flushed to the output file.
    out.close()




Finally, we play the video with the gaze visualization: 

In [15]:
from IPython.display import HTML
from base64 import b64encode
# Read the rendered video with a context manager so the handle is closed, then
# embed it as a base64 data URL so the player works without a file server.
with open('video_gaze_imageio.mp4','rb') as video_file:
    mp4 = video_file.read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)
