In [0]:
#!pip install import_ipynb


In [0]:
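# Prerequisite: later cells read the globals train_videos_list, training_bbox,
# training_length_bbox, training_keypoints, training_video_labeled_frames and
# training_video_unlabeled_frames, none of which are defined in this notebook.
# Presumably they come from data_loader.ipynb - run it first (currently disabled):
#%run data_loader.ipynb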

In [0]:
import os.path as path
import os
import json
from random import randint
import time
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
from skimage.draw import polygon
import skimage.io as sio

from pycocotools.coco import COCO
from operator import add 
import glob
import math
import random

%cd /content/drive/My Drive/Learning Temporal Pose Estimation/simple-HRNet
import cv2
from SimpleHRNet import SimpleHRNet

# Global pose estimator; gen_heatmap() below calls model.predict() on person crops.
# The checkpoint path is relative, so it depends on the %cd above having succeeded.
model = SimpleHRNet(48, 17, "./weights/pose_hrnet_w48_384x288.pth")



In [0]:
# Joint index -> joint name for the 17-joint layout used in this notebook.
keypoints = dict(enumerate((
    "nose",
    "head_bottom",
    "head_top",
    "left_ear",
    "right_ear",
    "left_shoulder",
    "right_shoulder",
    "left_elbow",
    "right_elbow",
    "left_wrist",
    "right_wrist",
    "left_hip",
    "right_hip",
    "left_knee",
    "right_knee",
    "left_ankle",
    "right_ankle",
)))

# Limbs as [joint_a, joint_b] pairs of indices into `keypoints`.
skeleton = [
    list(limb) for limb in (
        # legs and hips
        (15, 13), (13, 11), (16, 14), (14, 12), (11, 12),
        # torso
        (5, 11), (6, 12), (5, 6),
        # arms
        (5, 7), (6, 8), (7, 9), (8, 10),
        # head
        (1, 2), (0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6),
    )
]

In [0]:
def gaussian(xL, yL, H, W, sigma=5):
  """Render an unnormalised 2-D Gaussian blob centred on (xL, yL).

  Args:
    xL: column (x) coordinate of the peak; may lie outside the canvas.
    yL: row (y) coordinate of the peak; may lie outside the canvas.
    H: number of rows of the output.
    W: number of columns of the output.
    sigma: spread of the blob in pixels.

  Returns:
    A float32 array of shape (H, W) whose value at row r, column c is
    exp(-((c - xL)**2 + (r - yL)**2) / (2 * sigma**2)).

  Raises:
    ZeroDivisionError: if sigma is 0.
  """
  denom = 2.0 * float(sigma) ** 2
  if denom == 0:
    # NumPy would silently yield nan/inf here; keep the error explicit.
    raise ZeroDivisionError("sigma must be non-zero")

  # The exponent is a sum of a per-column and a per-row term, so compute each
  # once and broadcast instead of looping over all H * W pixels in Python.
  dx2 = (np.arange(W, dtype=np.float64) - float(xL)) ** 2
  dy2 = (np.arange(H, dtype=np.float64) - float(yL)) ** 2
  channel = np.exp(-(dy2[:, np.newaxis] + dx2[np.newaxis, :]) / denom)
  return channel.astype(np.float32)

In [0]:
def gen_heatmap(path, bbox):
  """Run the global pose `model` on one person crop and render its joints.

  Args:
    path: image file to open with PIL.
    bbox: (x, y, width, height) of the person in image pixels.

  Returns:
    A float32 array of shape (384, 288): the sum of one Gaussian blob per
    joint of the first person returned by the model. When the model returns
    no person (joints of shape (0, 0, 3)) the empty sentinel `np.array([[]])`
    (shape (1, 0)) is returned instead; callers test for that shape.
  """
  x, y, width, height = bbox
  # Clamp the top-left corner to the image; the far corner is used as given.
  left, top, right, bottom = max(0, x), max(0, y), x + width, y + height

  # Use a context manager so the image file handle is closed promptly.
  with Image.open(path) as im:
    # PIL's resize takes (width, height). 288 wide x 384 tall yields an array
    # with 384 rows and 288 columns, matching the (H=384, W=288) canvas used
    # below and the (288, 384) resize in gen_ground_truth.
    crop = np.asarray(im.crop((left, top, right, bottom)).resize((288, 384)))

  # NOTE(review): PIL yields RGB channel order; confirm that is what
  # SimpleHRNet.predict expects.
  joints = model.predict(crop)
  if joints.shape == (0, 0, 3):
    return np.array([[]])

  # Only the first returned person is used.
  person = joints[0].reshape(17, 3)

  heatmap = np.zeros((384, 288), dtype=np.float32)
  # SimpleHRNet documents each joint as (y, x, confidence), whereas gaussian()
  # takes the column (x) first - verify against the installed version.
  for row, col, _confidence in person:
    heatmap += gaussian(col, row, 384, 288)
  return heatmap

In [0]:
def gen_ground_truth(keypoints, bbox):
  """Build the ground-truth heatmap of one annotated person.

  One Gaussian blob per keypoint is drawn on a canvas of 720 rows by 1280
  columns (hard-coded frame size), the blobs are summed, and the person's
  bounding box is cropped out and resized.

  Args:
    keypoints: iterable of (x, y) pairs in full-frame pixel coordinates.
    bbox: (x, y, width, height) of the person in the same coordinates.

  Returns:
    The cropped map after a PIL resize to (288, 384), i.e. 288 wide and
    384 tall, as a NumPy array.
  """
  # NOTE(review): every pair is rendered, including (0, 0) entries, which the
  # caller appears to use for missing joints - confirm that is intended.
  blobs = np.array([gaussian(px, py, 720, 1280) for px, py in keypoints])
  frame_map = blobs.sum(axis = 0)

  x, y, width, height = bbox
  # Same box convention as gen_heatmap: only the top-left corner is clamped.
  crop_box = (max(0,x), max(0,y), x+width, y+height)

  resized = Image.fromarray(frame_map).crop(crop_box).resize((288, 384))
  return np.asarray(resized)

In [0]:
def gen_bbox_unlabelled(path):
  """Average each person's bounding box over the labelled frames of a video.

  The result is used as the bounding box for that person in unlabelled frames.

  Args:
    path: entry of the global `train_videos_list` identifying the video.

  Returns:
    A list with `training_length_bbox[i]` entries (i being the video's index),
    each a 4-element list of floats: the element-wise mean of that person's
    boxes over all frames whose box list is non-empty. A person whose box is
    None in a frame contributes [0, 0, 0, 0] for that frame. If the video has
    no frame with boxes, every entry is [0.0, 0.0, 0.0, 0.0].

  Raises:
    ValueError: if `path` is not in `train_videos_list`.
  """
  video_idx = train_videos_list.index(path)
  n_people = training_length_bbox[video_idx]

  # Frames without annotations are stored as empty lists; leave them out.
  labelled_boxes = [frame for frame in training_bbox[video_idx] if len(frame) != 0]

  if not labelled_boxes:
    # Nothing to average; avoid dividing by zero.
    return [[0.0, 0.0, 0.0, 0.0] for _ in range(n_people)]

  totals = [[0, 0, 0, 0] for _ in range(n_people)]
  for frame in labelled_boxes:
    for person in range(n_people):
      # Read-only: substitute a zero box for None without writing it back
      # into the shared `training_bbox` structure.
      box = frame[person] if frame[person] is not None else [0, 0, 0, 0]
      totals[person] = [acc + value for acc, value in zip(totals[person], box)]

  n_frames = len(labelled_boxes)
  return [[value / n_frames for value in total] for total in totals]

In [0]:
def gen_video_heatmaps_pairs(n):
  """Build augmented (labelled, unlabelled, ground-truth) heatmaps for video n.

  Each unlabelled frame is paired with a labelled frame, cycling through the
  labelled frames in order. For every person with a usable box and pose in the
  labelled frame, three heatmaps are produced: the model's heatmap on the
  labelled frame, the model's heatmap on the unlabelled frame (using the
  averaged box from gen_bbox_unlabelled), and the ground-truth heatmap.

  Args:
    n: index of the video in the global training_* lists.

  Returns:
    Three NumPy arrays (labelled, unlabelled, ground truth). Each holds three
    entries per accepted sample, in the same order - original, rotated by
    45 degrees, horizontally flipped - so index i of every array refers to the
    same sample. All three are empty when nothing is accepted.
  """
  path = train_videos_list[n]
  bbox_unlabeled = gen_bbox_unlabelled(path)
  # Keep only frames that carry boxes; indexed by j in parallel with the
  # labelled frames and with training_keypoints[n].
  bbox_labeled = [boxes for boxes in training_bbox[n] if len(boxes) != 0]
  no_of_people = training_length_bbox[n]
  labeled_frames = training_video_labeled_frames[n]
  unlabeled_frames = training_video_unlabeled_frames[n]

  missing_pose = [(0, 0)] * 17   # pose value that is skipped below
  no_detection = (1, 0)          # shape of gen_heatmap's empty sentinel

  heatmaps1 = []
  heatmaps2 = []
  ground_truth_heatmaps = []

  if len(labeled_frames) == 0:
    # Nothing to pair the unlabelled frames with.
    return np.array(heatmaps1), np.array(heatmaps2), np.array(ground_truth_heatmaps)

  for i, frame_unlabeled in enumerate(unlabeled_frames):
    # Round-robin over all labelled frames. Wrapping one step early skipped
    # the last labelled frame and ran off the end of the list when there was
    # exactly one labelled frame.
    j = i % len(labeled_frames)
    frame_labeled = labeled_frames[j]

    for k, bbox in enumerate(bbox_labeled[j]):
      # NOTE(review): this stops before the last of the `no_of_people` slots;
      # kept as in the original - confirm whether `k >= no_of_people` was meant.
      if k + 1 == no_of_people:
        break
      if bbox is None or bbox == [0, 0, 0, 0]:
        continue
      pose = training_keypoints[n][j][k]
      if pose == missing_pose:
        continue

      # Check for failed detections before building the ground truth, which
      # is by far the most expensive of the three maps.
      labeled_heatmap = gen_heatmap(frame_labeled, bbox)
      if labeled_heatmap.shape == no_detection:
        continue
      unlabeled_heatmap = gen_heatmap(frame_unlabeled, bbox_unlabeled[k])
      if unlabeled_heatmap.shape == no_detection:
        continue
      ground_truth = gen_ground_truth(pose, bbox)

      # Apply the same augmentations to inputs and target so the three output
      # arrays stay aligned sample by sample.
      for source, sink in ((labeled_heatmap, heatmaps1),
                           (unlabeled_heatmap, heatmaps2),
                           (ground_truth, ground_truth_heatmaps)):
        rotated, flipped = _augment_heatmap(source)
        sink.extend((source, rotated, flipped))

  return np.array(heatmaps1), np.array(heatmaps2), np.array(ground_truth_heatmaps)


def _augment_heatmap(heatmap):
  """Return (rotated by 45 degrees, horizontally flipped) copies of a 2-D map."""
  rotated = np.asarray(Image.fromarray(heatmap).rotate(45))
  flipped = heatmap[:, ::-1]
  return rotated, flipped

In [0]:
 #! pip3 install git+https://github.com/tensorlayer/tensorlayer.git

In [0]:
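# Helper for changing the shape of the zhi and fb heatmap batches: every
# single-channel (384, 288) heatmap becomes a (384, 288, 3) array

# by making three identical channels out of one.

# The reshaped arrays are what gets passed through the network's 3x3 convolution blocks.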
def change_shape(zhi,fb):
  """Expand two batches of single-channel heatmaps to three channels.

  Every heatmap is written into each of the three channels of a fresh
  (384, 288, 3) array of zeros.

  Args:
    zhi: sequence of heatmaps assignable to a (384, 288) slice.
    fb: sequence of heatmaps assignable to a (384, 288) slice.

  Returns:
    (fb_final, zhi_final) - note the order is the reverse of the parameter
    order. Each is the np.array of the expanded heatmaps of the corresponding
    input.
  """
  def _triplicate(batch):
    expanded = []
    for plane in batch:
      canvas = np.zeros((384,288,3))
      for channel in range(3):
        canvas[:, :, channel] = plane
      expanded.append(canvas)
    return expanded

  fb_final = _triplicate(fb)
  zhi_final = _triplicate(zhi)
  return np.array(fb_final),  np.array(zhi_final)

In [0]:
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import Input, Dropout, Dense, Conv2d, Elementwise
from tensorlayer.models import Model
def get_model():
  """Build the two-input heatmap warping network.

  `inp_zhi` (the heatmap difference) runs through a trunk of 3x3 convolutions.
  Five 3x3 convolutions with dilation 3, 6, 12, 18 and 24 then each produce an
  18-channel map that is given to a deformable convolution as its offset
  layer. Every deformable convolution is applied to `inp_fb` (the heatmap of
  unlabelled frame B) and the five results are summed.

  Returns:
    A tensorlayer Model whose inputs are [inp_zhi, inp_fb], both declared
    with shape (None, 384, 288, 3), and whose output is the summed result.
  """
  inp_zhi = Input(shape=(None, 384, 288, 3), name='inp_zhi')
  inp_fb = Input(shape=(None, 384, 288, 3), name='inp_fb')

  def conv3x3(n_filter, dilation=1):
    # TensorLayer's Conv2d takes dilation_rate as (height, width) and adds the
    # batch/channel entries itself, so a 4-tuple such as (1, 3, 3, 1) would
    # not give a 3x3 dilation.
    return tl.layers.Conv2d(n_filter=n_filter, filter_size=(3, 3), strides=(1, 1),
                            padding='SAME', dilation_rate=(dilation, dilation))

  # Trunk: stack of 3x3 convolution blocks on the heatmap difference.
  # NOTE(review): no activation is passed to these layers, so they use the
  # library default - confirm a trunk without non-linearities is intended.
  out = inp_zhi
  for n_filter in (9, 18, 36, 36):
    out = conv3x3(n_filter)(out)
  out = tl.layers.BatchNorm2d()(out)
  for n_filter in (36, 45, 45, 45, 64, 64):
    out = conv3x3(n_filter)(out)
  # NOTE(review): presumably the argument is the keep probability (0.85 kept),
  # not the drop rate - confirm against the TensorLayer Dropout documentation.
  out = tl.layers.Dropout(0.85)(out)
  for n_filter in (90, 90, 96, 128):
    out = conv3x3(n_filter)(out)
  out = tl.layers.BatchNorm2d()(out)
  out = conv3x3(128)(out)

  # Offset maps at different dilations: 18 channels each.
  offsets = [conv3x3(18, dilation=rate)(out) for rate in (3, 6, 12, 18, 24)]

  # Offset warping of frame B's heatmap, one deformable convolution per dilation.
  warped = [
    tl.layers.DeformableConv2d(offset_layer=offset, n_filter=3, filter_size=(3, 3),
                               name='deformable%d' % idx)(inp_fb)
    for idx, offset in enumerate(offsets, start=1)
  ]
  output = tl.layers.ElementwiseLambda(fn=lambda a, b, c, d, e: a + b + c + d + e,
                                       name='elementwiselambda')(warped)

  # Both Input layers feed the graph, so both must be declared as model inputs.
  return Model(inputs=[inp_zhi, inp_fb], outputs=output)

In [0]:
# Build a throwaway model instance only to print its representation; the
# training cell below builds its own instance with another get_model() call.
print(get_model())

In [0]:
# Train a single model on the first four training videos, one video at a time.
base_model = get_model()
optimizer = tf.optimizers.Adam(learning_rate=0.0001)
for n in range(4):

  labeled, unlabeled, ground_truth = gen_video_heatmaps_pairs(n)
  # Difference between the labelled-frame and unlabelled-frame heatmaps.
  # (The names labeled_heatmap / unlabeled_heatmap used here before are locals
  # of gen_video_heatmaps_pairs and do not exist at this level.)
  zhi = labeled - unlabeled
  # change_shape returns (fb, zhi): the reverse of its parameter order.
  unlabeled, zhi = change_shape(zhi, unlabeled)
  # FIXME(review): np.array([zhi, unlabeled]) puts the two inputs on a new
  # leading axis of size 2 while y_train's leading axis is the sample count,
  # and ground_truth has no channel axis whereas the model inputs have three
  # channels. Verify that tl.utils.fit can feed a two-input model this way;
  # an explicit training step may be needed.
  tl.utils.fit(network= base_model, train_op= optimizer, cost= tl.cost.mean_squared_error, X_train=np.array([zhi, unlabeled]) , y_train= ground_truth, batch_size= 25, n_epoch=5)

  print(str(n+1) + " video completed")

# inp_zhi / inp_fb are locals of get_model(); presumably the arrays of the last
# processed video were meant here - confirm.
# FIXME(review): verify that tl.utils.predict accepts a dict of inputs.
output = tl.utils.predict(base_model, {'inp_zhi': zhi, 'inp_fb': unlabeled}, batch_size=8)
