In [4]:
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm


class Chomp1d(nn.Module):
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()


class TemporalBlock(nn.Module):
    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super(TemporalBlock, self).__init__()
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size,
                                           stride=stride, padding=padding, dilation=dilation))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        self.net = nn.Sequential(self.conv1, self.chomp1, self.relu1, self.dropout1,
                                 self.conv2, self.chomp2, self.relu2, self.dropout2)
        self.downsample = nn.Conv1d(n_inputs, n_outputs, 1) if n_inputs != n_outputs else None
        self.relu = nn.ReLU()
        self.init_weights()

    def init_weights(self):
        self.conv1.weight.data.normal_(0, 0.01)
        self.conv2.weight.data.normal_(0, 0.01)
        if self.downsample is not None:
            self.downsample.weight.data.normal_(0, 0.01)

    def forward(self, x):
        out = self.net(x)
        res = x if self.downsample is None else self.downsample(x)
        return self.relu(out + res)


class TemporalConvNet(nn.Module):
    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super(TemporalConvNet, self).__init__()
        layers = []
        num_levels = len(num_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers += [TemporalBlock(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size,
                                     padding=(kernel_size-1) * dilation_size, dropout=dropout)]

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)


In [5]:
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 10:42:50 2020

@author: Haoran6
"""

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

class DBS_lstm(nn.Module):

    def __init__(self, input_dim, hidden_dim, num_stacks):
        super(DBS_lstm, self).__init__()
        self.lstm_layer = nn.LSTM(input_dim, hidden_dim, num_stacks).double()
        self.hidden2dbs = nn.Linear(hidden_dim, 6890*3, bias=False).double()

    def forward(self, pose_beta_seq):
        num_frames = pose_beta_seq.shape[0]
        lstm_out, _ = self.lstm_layer(pose_beta_seq.view(num_frames, 1, -1))
        dbs = self.hidden2dbs(lstm_out).view(num_frames, 6890, 3)
        return dbs

class DBS_gru(nn.Module):

    def __init__(self, input_dim, hidden_dim, num_stacks):
        super(DBS_gru, self).__init__()
        self.gru_layer = nn.GRU(input_dim, hidden_dim, num_stacks).double()
        self.hidden2dbs = nn.Linear(hidden_dim, 6890*3, bias=False).double()

    def forward(self, pose_beta_seq):
        num_frames = pose_beta_seq.shape[0]
        gru_out, _ = self.gru_layer(pose_beta_seq.view(num_frames, 1, -1))
        dbs = self.hidden2dbs(gru_out).view(num_frames, 6890, 3)
        return dbs

class DBS_tcn(nn.Module):
    
    def __init__(self, input_size, output_size, num_channels, kernel_size=3, dropout=0.1):
        super(DBS_tcn, self).__init__()
        self.tcn = TemporalConvNet(input_size, num_channels, kernel_size, dropout=dropout).double()
        self.hidden2dbs = nn.Linear(num_channels[-1], output_size, bias=False).double()

    def forward(self, pose_beta_seq):
        """ Input ought to have dimension (N, C_in, L_in), where L_in is the seq_len """
        num_frames = pose_beta_seq.shape[0]
        tcn_out = self.tcn(pose_beta_seq.transpose(0, 1).view(1,-1,num_frames)).transpose(1, 2).squeeze()
        dbs = self.hidden2dbs(tcn_out).view(num_frames, 6890, 3)
        return dbs

In [24]:
import numpy as np
import pickle
import torch
from torch.nn import Module
import os
from time import time

class DBSModel(Module):
  def __init__(self, device=None, model_path='./body_models/smpl/male/model.pkl',\
               dbs_type='tcn',num_c=[512,768,1024],hd=1024,num_sk=5,dbs_model_path=None):
    
    super(DBSModel, self).__init__()
    with open(model_path, 'rb') as f:
      params = pickle.load(f)
    self.J_regressor = torch.from_numpy(
      np.array(params['J_regressor'].todense())
    ).type(torch.float64)
    if 'joint_regressor' in params.keys():
      self.joint_regressor = torch.from_numpy(
        np.array(params['joint_regressor'].T.todense())
      ).type(torch.float64)
    else:
      self.joint_regressor = torch.from_numpy(
        np.array(params['J_regressor'].todense())
      ).type(torch.float64)
    self.weights = torch.from_numpy(params['weights']).type(torch.float64)
    self.posedirs = torch.from_numpy(params['posedirs']).type(torch.float64)
    self.v_template = torch.from_numpy(params['v_template']).type(torch.float64)
    self.shapedirs = torch.from_numpy(params['shapedirs']).type(torch.float64)
    self.kintree_table = params['kintree_table']
    self.faces = params['f']
    self.device = device if device is not None else torch.device('cpu')
    for name in ['J_regressor', 'joint_regressor', 'weights', 'posedirs', 'v_template', 'shapedirs']:
      _tensor = getattr(self, name)
      # print('Tensor {} shape: '.format(name), _tensor.shape)
      setattr(self, name, _tensor.to(device))
    
    if dbs_type == 'lstm':
        dbs_layer = DBS_lstm(input_dim=289,hidden_dim=hd,num_stacks=num_sk)
    elif dbs_type == 'gru':
        dbs_layer = DBS_gru(input_dim=289,hidden_dim=hd,num_stacks=num_sk)
    elif dbs_type == 'tcn':
        dbs_layer = DBS_tcn(input_size=289, output_size=6890*3, num_channels=num_c)
    else:
        raise AssertionError('dbs_type is wrong')
    
    self.dbs_layer = dbs_layer.to(device)
    
    if dbs_model_path is not None:
        self.load_state_dict(torch.load(dbs_model_path))
        self.eval()
    

  @staticmethod
  def rodrigues(r):
    """
    Rodrigues' rotation formula that turns axis-angle tensor into rotation
    matrix in a batch-ed manner.

    Parameter:
    ----------
    r: Axis-angle rotation tensor of shape [batch_size * angle_num, 1, 3].

    Return:
    -------
    Rotation matrix of shape [batch_size * angle_num, 3, 3].

    """
    eps = r.clone().normal_(std=1e-8)
    theta = torch.norm(r + eps, dim=(1, 2), keepdim=True)  # dim cannot be tuple
    theta_dim = theta.shape[0]
    r_hat = r / theta
    cos = torch.cos(theta)
    z_stick = torch.zeros(theta_dim, dtype=torch.float64).to(r.device)
    m = torch.stack(
      (z_stick, -r_hat[:, 0, 2], r_hat[:, 0, 1], r_hat[:, 0, 2], z_stick,
       -r_hat[:, 0, 0], -r_hat[:, 0, 1], r_hat[:, 0, 0], z_stick), dim=1)
    m = torch.reshape(m, (-1, 3, 3))
    i_cube = (torch.eye(3, dtype=torch.float64).unsqueeze(dim=0) \
             + torch.zeros((theta_dim, 3, 3), dtype=torch.float64)).to(r.device)
    A = r_hat.permute(0, 2, 1)
    dot = torch.matmul(A, r_hat)
    R = cos * i_cube + (1 - cos) * dot + torch.sin(theta) * m
    return R

  @staticmethod
  def with_zeros(x):
    """
    Append a [0, 0, 0, 1] tensor to a [3, 4] tensor.

    Parameter:
    ---------
    x: Tensor to be appended.

    Return:
    ------
    Tensor after appending of shape [4,4]

    """
    ones = torch.tensor(
      [[[0.0, 0.0, 0.0, 1.0]]], dtype=torch.float64
    ).expand(x.shape[0],-1,-1).to(x.device)
    ret = torch.cat((x, ones), dim=1)
    return ret

  @staticmethod
  def pack(x):
    """
    Append zero tensors of shape [4, 3] to a batch of [4, 1] shape tensor.

    Parameter:
    ----------
    x: A tensor of shape [batch_size, 4, 1]

    Return:
    ------
    A tensor of shape [batch_size, 4, 4] after appending.

    """
    zeros43 = torch.zeros(
      (x.shape[0], x.shape[1], 4, 3), dtype=torch.float64).to(x.device)
    ret = torch.cat((zeros43, x), dim=3)
    return ret

  def write_obj(self, verts, file_name):
    with open(file_name, 'w') as fp:
      for v in verts:
        fp.write('v %f %f %f\n' % (v[0], v[1], v[2]))

      for f in self.faces + 1:
        fp.write('f %d %d %d\n' % (f[0], f[1], f[2]))

  def forward(self, betas, pose, trans, simplify=False):
    
    """
          Construct a compute graph that takes in parameters and outputs a tensor as
          model vertices. Face indices are also returned as a numpy ndarray.
          
          20190128: Add batch support.

          Parameters:
          ---------
          pose: Also known as 'theta', an [N, 24, 3] tensor indicating child joint rotation
          relative to parent joint. For root joint it's global orientation.
          Represented in a axis-angle format.

          betas: Parameter for model shape. A tensor of shape [N, 10] as coefficients of
          PCA components. Only 10 components were released by SMPL author.

          trans: Global translation tensor of shape [N, 3].

          Return:
          ------
          A 3-D tensor of [N * 6890 * 3] for vertices,
          and the corresponding [N * 19 * 3] joint positions.

    """
    batch_num = betas.shape[0]
    id_to_col = {self.kintree_table[1, i]: i
                 for i in range(self.kintree_table.shape[1])}
    parent = {
      i: id_to_col[self.kintree_table[0, i]]
      for i in range(1, self.kintree_table.shape[1])
    }
    v_shaped = torch.tensordot(betas, self.shapedirs, dims=([1], [2])) + self.v_template
    J = torch.matmul(self.J_regressor, v_shaped)
    R_cube_big = self.rodrigues(pose.view(-1, 1, 3)).reshape(batch_num, -1, 3, 3)

    if simplify:
      v_posed = v_shaped
    else:
      R_cube = R_cube_big[:, 1:, :, :]
      I_cube = (torch.eye(3, dtype=torch.float64).unsqueeze(dim=0) + \
        torch.zeros((batch_num, R_cube.shape[1], 3, 3), dtype=torch.float64)).to(self.device)
      lrotmin = (R_cube - I_cube).reshape(batch_num, -1, 1).squeeze(dim=2)
      v_posed = v_shaped + torch.tensordot(lrotmin, self.posedirs, dims=([1], [2]))
      # print(lrotmin.shape, self.posedirs.shape)
    
    R_pose = R_cube.reshape(batch_num, -1, 1).squeeze(dim=2)
    pose_beta_seq = torch.cat((R_pose, pose, betas),1)
    dbs = self.dbs_layer(pose_beta_seq)
    v_posed += dbs
    
    results = []
    results.append(
      self.with_zeros(torch.cat((R_cube_big[:, 0], torch.reshape(J[:, 0, :], (-1, 3, 1))), dim=2))
    )
    for i in range(1, self.kintree_table.shape[1]):
      results.append(
        torch.matmul(
          results[parent[i]],
          self.with_zeros(
            torch.cat(
              (R_cube_big[:, i], torch.reshape(J[:, i, :] - J[:, parent[i], :], (-1, 3, 1))),
              dim=2
            )
          )
        )
      )
    
    stacked = torch.stack(results, dim=1)
    results = stacked - \
      self.pack(
        torch.matmul(
          stacked,
          torch.reshape(
            torch.cat((J, torch.zeros((batch_num, 24, 1), dtype=torch.float64).to(self.device)), dim=2),
            (batch_num, 24, 4, 1)
          )
        )
      )
    # Restart from here
    T = torch.tensordot(results, self.weights, dims=([1], [1])).permute(0, 3, 1, 2)
    rest_shape_h = torch.cat(
      (v_posed, torch.ones((batch_num, v_posed.shape[1], 1), dtype=torch.float64).to(self.device)), dim=2
    )
    v = torch.matmul(T, torch.reshape(rest_shape_h, (batch_num, -1, 4, 1)))
    v = torch.reshape(v, (batch_num, -1, 4))[:, :, :3]
    result = v + torch.reshape(trans, (batch_num, 1, 3))
    # estimate 3D joint locations
    # print(result.shape)
    # print(self.joint_regressor.shape)
    # joints = torch.tensordot(result, self.joint_regressor, dims=([1], [0])).transpose(1, 2)
        
    # pose_beta_seq = torch.cat((pose, betas),1)
    # dbs = self.dbs_layer(pose_beta_seq)
    # result += dbs
    
    return result


def test_gpu(dbs_type):
  # if len(gpu_id) > 0 and torch.cuda.is_available():
  #   os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id[0])
  #   device = torch.device('cuda')
  # else:
  #   device = torch.device('cpu')
  #print(device)
  
  device = torch.device('cuda')
  print(torch.cuda.get_device_name(0))
  num_frames = 1000
  
  pose_size = 72
  beta_size = 10

  np.random.seed(9608)
  model = DBSModel(device=device, model_path='/content/drive/My Drive/model.pkl', dbs_type=dbs_type,\
                   num_c=[256,64,16],hd=1024)
  time_list = []
  for i in range(10):
      pose = torch.from_numpy((np.random.rand(num_frames, pose_size) - 0.5) * 0.4)\
              .type(torch.float64).to(device)
      betas = torch.from_numpy((np.random.rand(num_frames, beta_size) - 0.5) * 0.06) \
              .type(torch.float64).to(device)
      trans = torch.from_numpy(np.zeros((num_frames, 3))).type(torch.float64).to(device)
      
      s = time()
      result = model(betas, pose, trans)
      cost_time = time() - s
      print(cost_time)
      time_list.append(cost_time)
  print('mean cost:', np.mean(time_list[1:]))
      
   # outmesh_path = './dmpl_batch_obj/dmpl_torch_{}.obj'
   # for i in range(result.shape[0]):
   #      model.write_obj(result[i], outmesh_path.format(i))

if __name__ == '__main__':
  test_gpu(dbs_type='tcn')

Tesla K80
0.0895700454711914
0.07889485359191895
0.07511186599731445
0.0733346939086914
0.07562041282653809
0.07160472869873047
0.07255005836486816
0.07156634330749512
0.07183670997619629
0.07600593566894531
mean cost: 0.07405840026007758


In [26]:
class SMPLModel(Module):
  def __init__(self, device=None, model_path='./body_models/smpl/male/model.pkl'):
    
    super(SMPLModel, self).__init__()
    with open(model_path, 'rb') as f:
      params = pickle.load(f)
    self.J_regressor = torch.from_numpy(
      np.array(params['J_regressor'].todense())
    ).type(torch.float64)
    if 'joint_regressor' in params.keys():
      self.joint_regressor = torch.from_numpy(
        np.array(params['joint_regressor'].T.todense())
      ).type(torch.float64)
    else:
      self.joint_regressor = torch.from_numpy(
        np.array(params['J_regressor'].todense())
      ).type(torch.float64)
    self.weights = torch.from_numpy(params['weights']).type(torch.float64)
    self.posedirs = torch.from_numpy(params['posedirs']).type(torch.float64)
    self.v_template = torch.from_numpy(params['v_template']).type(torch.float64)
    self.shapedirs = torch.from_numpy(params['shapedirs']).type(torch.float64)
    self.kintree_table = params['kintree_table']
    self.faces = params['f']
    self.device = device if device is not None else torch.device('cpu')
    for name in ['J_regressor', 'joint_regressor', 'weights', 'posedirs', 'v_template', 'shapedirs']:
      _tensor = getattr(self, name)
      # print(' Tensor {} shape: '.format(name), _tensor.shape)
      setattr(self, name, _tensor.to(device))

  @staticmethod
  def rodrigues(r):
    """
    Rodrigues' rotation formula that turns axis-angle tensor into rotation
    matrix in a batch-ed manner.

    Parameter:
    ----------
    r: Axis-angle rotation tensor of shape [batch_size * angle_num, 1, 3].

    Return:
    -------
    Rotation matrix of shape [batch_size * angle_num, 3, 3].

    """
    eps = r.clone().normal_(std=1e-8)
    theta = torch.norm(r + eps, dim=(1, 2), keepdim=True)  # dim cannot be tuple
    theta_dim = theta.shape[0]
    r_hat = r / theta
    cos = torch.cos(theta)
    z_stick = torch.zeros(theta_dim, dtype=torch.float64).to(r.device)
    m = torch.stack(
      (z_stick, -r_hat[:, 0, 2], r_hat[:, 0, 1], r_hat[:, 0, 2], z_stick,
       -r_hat[:, 0, 0], -r_hat[:, 0, 1], r_hat[:, 0, 0], z_stick), dim=1)
    m = torch.reshape(m, (-1, 3, 3))
    i_cube = (torch.eye(3, dtype=torch.float64).unsqueeze(dim=0) \
             + torch.zeros((theta_dim, 3, 3), dtype=torch.float64)).to(r.device)
    A = r_hat.permute(0, 2, 1)
    dot = torch.matmul(A, r_hat)
    R = cos * i_cube + (1 - cos) * dot + torch.sin(theta) * m
    return R

  @staticmethod
  def with_zeros(x):
    """
    Append a [0, 0, 0, 1] tensor to a [3, 4] tensor.

    Parameter:
    ---------
    x: Tensor to be appended.

    Return:
    ------
    Tensor after appending of shape [4,4]

    """
    ones = torch.tensor(
      [[[0.0, 0.0, 0.0, 1.0]]], dtype=torch.float64
    ).expand(x.shape[0],-1,-1).to(x.device)
    ret = torch.cat((x, ones), dim=1)
    return ret

  @staticmethod
  def pack(x):
    """
    Append zero tensors of shape [4, 3] to a batch of [4, 1] shape tensor.

    Parameter:
    ----------
    x: A tensor of shape [batch_size, 4, 1]

    Return:
    ------
    A tensor of shape [batch_size, 4, 4] after appending.

    """
    zeros43 = torch.zeros(
      (x.shape[0], x.shape[1], 4, 3), dtype=torch.float64).to(x.device)
    ret = torch.cat((zeros43, x), dim=3)
    return ret

  def write_obj(self, verts, file_name):
    with open(file_name, 'w') as fp:
      for v in verts:
        fp.write('v %f %f %f\n' % (v[0], v[1], v[2]))

      for f in self.faces + 1:
        fp.write('f %d %d %d\n' % (f[0], f[1], f[2]))

  def forward(self, betas, pose, trans, simplify=False):
    
    """
          Construct a compute graph that takes in parameters and outputs a tensor as
          model vertices. Face indices are also returned as a numpy ndarray.
          
          20190128: Add batch support.

          Parameters:
          ---------
          pose: Also known as 'theta', an [N, 24, 3] tensor indicating child joint rotation
          relative to parent joint. For root joint it's global orientation.
          Represented in a axis-angle format.

          betas: Parameter for model shape. A tensor of shape [N, 10] as coefficients of
          PCA components. Only 10 components were released by SMPL author.

          trans: Global translation tensor of shape [N, 3].

          Return:
          ------
          A 3-D tensor of [N * 6890 * 3] for vertices,
          and the corresponding [N * 19 * 3] joint positions.

    """
    batch_num = betas.shape[0]
    id_to_col = {self.kintree_table[1, i]: i
                 for i in range(self.kintree_table.shape[1])}
    parent = {
      i: id_to_col[self.kintree_table[0, i]]
      for i in range(1, self.kintree_table.shape[1])
    }
    v_shaped = torch.tensordot(betas, self.shapedirs, dims=([1], [2])) + self.v_template
    J = torch.matmul(self.J_regressor, v_shaped)
    R_cube_big = self.rodrigues(pose.view(-1, 1, 3)).reshape(batch_num, -1, 3, 3)

    if simplify:
      v_posed = v_shaped
    else:
      R_cube = R_cube_big[:, 1:, :, :]
      I_cube = (torch.eye(3, dtype=torch.float64).unsqueeze(dim=0) + \
        torch.zeros((batch_num, R_cube.shape[1], 3, 3), dtype=torch.float64)).to(self.device)
      lrotmin = (R_cube - I_cube).reshape(batch_num, -1, 1).squeeze(dim=2)
      v_posed = v_shaped + torch.tensordot(lrotmin, self.posedirs, dims=([1], [2]))

    results = []
    results.append(
      self.with_zeros(torch.cat((R_cube_big[:, 0], torch.reshape(J[:, 0, :], (-1, 3, 1))), dim=2))
    )
    for i in range(1, self.kintree_table.shape[1]):
      results.append(
        torch.matmul(
          results[parent[i]],
          self.with_zeros(
            torch.cat(
              (R_cube_big[:, i], torch.reshape(J[:, i, :] - J[:, parent[i], :], (-1, 3, 1))),
              dim=2
            )
          )
        )
      )
    
    stacked = torch.stack(results, dim=1)
    results = stacked - \
      self.pack(
        torch.matmul(
          stacked,
          torch.reshape(
            torch.cat((J, torch.zeros((batch_num, 24, 1), dtype=torch.float64).to(self.device)), dim=2),
            (batch_num, 24, 4, 1)
          )
        )
      )
    # Restart from here
    T = torch.tensordot(results, self.weights, dims=([1], [1])).permute(0, 3, 1, 2)
    rest_shape_h = torch.cat(
      (v_posed, torch.ones((batch_num, v_posed.shape[1], 1), dtype=torch.float64).to(self.device)), dim=2
    )
    v = torch.matmul(T, torch.reshape(rest_shape_h, (batch_num, -1, 4, 1)))
    v = torch.reshape(v, (batch_num, -1, 4))[:, :, :3]
    result = v + torch.reshape(trans, (batch_num, 1, 3))
    # estimate 3D joint locations
    # print(result.shape)
    # print(self.joint_regressor.shape)
    # joints = torch.tensordot(result, self.joint_regressor, dims=([1], [0])).transpose(1, 2)
    return result

def test_gpu():
  # if len(gpu_id) > 0 and torch.cuda.is_available():
  #   os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id[0])
  #   device = torch.device('cuda')
  # else:
  #   device = torch.device('cpu')
  #print(device)
  
  device = torch.device('cuda')
  print(torch.cuda.get_device_name(0))
  num_frames = 1000
  
  pose_size = 72
  beta_size = 10

  np.random.seed(9608)
  model = SMPLModel(device=device, model_path='/content/drive/My Drive/model.pkl')
  time_list = []
  for i in range(10):
      pose = torch.from_numpy((np.random.rand(num_frames, pose_size) - 0.5) * 0.4)\
              .type(torch.float64).to(device)
      betas = torch.from_numpy((np.random.rand(num_frames, beta_size) - 0.5) * 0.06) \
              .type(torch.float64).to(device)
      trans = torch.from_numpy(np.zeros((num_frames, 3))).type(torch.float64).to(device)
      
      s = time()
      result = model(betas, pose, trans)
      cost_time = time() - s
      print(cost_time)
      time_list.append(cost_time)
  print('mean cost:', np.mean(time_list[1:]))
      
   # outmesh_path = './dmpl_batch_obj/dmpl_torch_{}.obj'
   # for i in range(result.shape[0]):
   #      model.write_obj(result[i], outmesh_path.format(i))

if __name__ == '__main__':
  test_gpu()

Tesla K80
0.07825493812561035
0.07197904586791992
0.0688169002532959
0.06967878341674805
0.0648200511932373
0.06434917449951172
0.06195974349975586
0.06177520751953125
0.06447529792785645
0.0614781379699707
mean cost: 0.06548137134975857


In [30]:
class DMPLModel(Module):
  def __init__(self, device=None, model_path='./body_models/smpl/male/model.pkl'):
    
    super(DMPLModel, self).__init__()
    with open(model_path, 'rb') as f:
      params = pickle.load(f)
    self.J_regressor = torch.from_numpy(
      np.array(params['J_regressor'].todense())
    ).type(torch.float64)
    if 'joint_regressor' in params.keys():
      self.joint_regressor = torch.from_numpy(
        np.array(params['joint_regressor'].T.todense())
      ).type(torch.float64)
    else:
      self.joint_regressor = torch.from_numpy(
        np.array(params['J_regressor'].todense())
      ).type(torch.float64)
    self.weights = torch.from_numpy(params['weights']).type(torch.float64)
    self.posedirs = torch.from_numpy(params['posedirs']).type(torch.float64)
    self.v_template = torch.from_numpy(params['v_template']).type(torch.float64)
    self.shapedirs = torch.from_numpy(params['shapedirs']).type(torch.float64)
    self.kintree_table = params['kintree_table']
    self.faces = params['f']
    self.device = device if device is not None else torch.device('cpu')
    for name in ['J_regressor', 'joint_regressor', 'weights', 'posedirs', 'v_template', 'shapedirs']:
      _tensor = getattr(self, name)
      # print(' Tensor {} shape: '.format(name), _tensor.shape)
      setattr(self, name, _tensor.to(device))
    DMPL_params = np.load("/content/drive/My Drive/model.npz")
    self.dmpls_eig = torch.Tensor(DMPL_params['eigvec']).type(torch.float64).to(device)
    # self.dmpls_eig *= 2

  @staticmethod
  def rodrigues(r):
    """
    Rodrigues' rotation formula that turns axis-angle tensor into rotation
    matrix in a batch-ed manner.

    Parameter:
    ----------
    r: Axis-angle rotation tensor of shape [batch_size * angle_num, 1, 3].

    Return:
    -------
    Rotation matrix of shape [batch_size * angle_num, 3, 3].

    """
    eps = r.clone().normal_(std=1e-8)
    theta = torch.norm(r + eps, dim=(1, 2), keepdim=True)  # dim cannot be tuple
    theta_dim = theta.shape[0]
    r_hat = r / theta
    cos = torch.cos(theta)
    z_stick = torch.zeros(theta_dim, dtype=torch.float64).to(r.device)
    m = torch.stack(
      (z_stick, -r_hat[:, 0, 2], r_hat[:, 0, 1], r_hat[:, 0, 2], z_stick,
       -r_hat[:, 0, 0], -r_hat[:, 0, 1], r_hat[:, 0, 0], z_stick), dim=1)
    m = torch.reshape(m, (-1, 3, 3))
    i_cube = (torch.eye(3, dtype=torch.float64).unsqueeze(dim=0) \
             + torch.zeros((theta_dim, 3, 3), dtype=torch.float64)).to(r.device)
    A = r_hat.permute(0, 2, 1)
    dot = torch.matmul(A, r_hat)
    R = cos * i_cube + (1 - cos) * dot + torch.sin(theta) * m
    return R

  @staticmethod
  def with_zeros(x):
    """
    Append a [0, 0, 0, 1] tensor to a [3, 4] tensor.

    Parameter:
    ---------
    x: Tensor to be appended.

    Return:
    ------
    Tensor after appending of shape [4,4]

    """
    ones = torch.tensor(
      [[[0.0, 0.0, 0.0, 1.0]]], dtype=torch.float64
    ).expand(x.shape[0],-1,-1).to(x.device)
    ret = torch.cat((x, ones), dim=1)
    return ret

  @staticmethod
  def pack(x):
    """
    Append zero tensors of shape [4, 3] to a batch of [4, 1] shape tensor.

    Parameter:
    ----------
    x: A tensor of shape [batch_size, 4, 1]

    Return:
    ------
    A tensor of shape [batch_size, 4, 4] after appending.

    """
    zeros43 = torch.zeros(
      (x.shape[0], x.shape[1], 4, 3), dtype=torch.float64).to(x.device)
    ret = torch.cat((zeros43, x), dim=3)
    return ret

  def write_obj(self, verts, file_name):
    with open(file_name, 'w') as fp:
      for v in verts:
        fp.write('v %f %f %f\n' % (v[0], v[1], v[2]))

      for f in self.faces + 1:
        fp.write('f %d %d %d\n' % (f[0], f[1], f[2]))

  def forward(self, betas, pose, trans, dmpls, simplify=False):
    
    """
          Construct a compute graph that takes in parameters and outputs a tensor as
          model vertices. Face indices are also returned as a numpy ndarray.
          
          20190128: Add batch support.

          Parameters:
          ---------
          pose: Also known as 'theta', an [N, 24, 3] tensor indicating child joint rotation
          relative to parent joint. For root joint it's global orientation.
          Represented in a axis-angle format.

          betas: Parameter for model shape. A tensor of shape [N, 10] as coefficients of
          PCA components. Only 10 components were released by SMPL author.

          trans: Global translation tensor of shape [N, 3].

          Return:
          ------
          A 3-D tensor of [N * 6890 * 3] for vertices,
          and the corresponding [N * 19 * 3] joint positions.

    """
    batch_num = betas.shape[0]
    id_to_col = {self.kintree_table[1, i]: i
                 for i in range(self.kintree_table.shape[1])}
    parent = {
      i: id_to_col[self.kintree_table[0, i]]
      for i in range(1, self.kintree_table.shape[1])
    }
    v_shaped = torch.tensordot(betas, self.shapedirs, dims=([1], [2])) + self.v_template
    J = torch.matmul(self.J_regressor, v_shaped)
    R_cube_big = self.rodrigues(pose.view(-1, 1, 3)).reshape(batch_num, -1, 3, 3)

    if simplify:
      v_posed = v_shaped
    else:
      R_cube = R_cube_big[:, 1:, :, :]
      I_cube = (torch.eye(3, dtype=torch.float64).unsqueeze(dim=0) + \
        torch.zeros((batch_num, R_cube.shape[1], 3, 3), dtype=torch.float64)).to(self.device)
      lrotmin = (R_cube - I_cube).reshape(batch_num, -1, 1).squeeze(dim=2)
      v_posed = v_shaped + torch.tensordot(lrotmin, self.posedirs, dims=([1], [2]))
    
    dbs = torch.tensordot(self.dmpls_eig, dmpls, dims=([2],[1]))
    dbs = dbs.permute(2,0,1)
    v_posed += dbs
    
    results = []
    results.append(
      self.with_zeros(torch.cat((R_cube_big[:, 0], torch.reshape(J[:, 0, :], (-1, 3, 1))), dim=2))
    )
    for i in range(1, self.kintree_table.shape[1]):
      results.append(
        torch.matmul(
          results[parent[i]],
          self.with_zeros(
            torch.cat(
              (R_cube_big[:, i], torch.reshape(J[:, i, :] - J[:, parent[i], :], (-1, 3, 1))),
              dim=2
            )
          )
        )
      )
    
    stacked = torch.stack(results, dim=1)
    results = stacked - \
      self.pack(
        torch.matmul(
          stacked,
          torch.reshape(
            torch.cat((J, torch.zeros((batch_num, 24, 1), dtype=torch.float64).to(self.device)), dim=2),
            (batch_num, 24, 4, 1)
          )
        )
      )
    # Restart from here
    T = torch.tensordot(results, self.weights, dims=([1], [1])).permute(0, 3, 1, 2)
    rest_shape_h = torch.cat(
      (v_posed, torch.ones((batch_num, v_posed.shape[1], 1), dtype=torch.float64).to(self.device)), dim=2
    )
    v = torch.matmul(T, torch.reshape(rest_shape_h, (batch_num, -1, 4, 1)))
    v = torch.reshape(v, (batch_num, -1, 4))[:, :, :3]
    result = v + torch.reshape(trans, (batch_num, 1, 3))
    # estimate 3D joint locations
    # print(result.shape)
    # print(self.joint_regressor.shape)
    # joints = torch.tensordot(result, self.joint_regressor, dims=([1], [0])).transpose(1, 2)
    
    # dbs = torch.tensordot(self.dmpls_eig, dmpls, dims=([2],[1]))
    # dbs = dbs.permute(2,0,1)
    # result += dbs
    return result

def test_gpu():
  # if len(gpu_id) > 0 and torch.cuda.is_available():
  #   os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id[0])
  #   device = torch.device('cuda')
  # else:
  #   device = torch.device('cpu')
  #print(device)
  
  device = torch.device('cuda')
  print(torch.cuda.get_device_name(0))
  num_frames = 1000
  
  pose_size = 72
  beta_size = 10
  dmpls_size = 8

  np.random.seed(9608)
  model = DMPLModel(device=device, model_path='/content/drive/My Drive/model.pkl')
  time_list = []
  for i in range(10):
      pose = torch.from_numpy((np.random.rand(num_frames, pose_size) - 0.5) * 0.4)\
              .type(torch.float64).to(device)
      betas = torch.from_numpy((np.random.rand(num_frames, beta_size) - 0.5) * 0.06) \
              .type(torch.float64).to(device)
      trans = torch.from_numpy(np.zeros((num_frames, 3))).type(torch.float64).to(device)
      dmpls = torch.from_numpy((np.random.rand(num_frames, dmpls_size))).type(torch.float64).to(device)

      s = time()
      result = model(betas, pose, trans, dmpls)
      cost_time = time() - s
      print(cost_time)
      time_list.append(cost_time)
  print('mean cost:', np.mean(time_list[1:]))
      
   # outmesh_path = './dmpl_batch_obj/dmpl_torch_{}.obj'
   # for i in range(result.shape[0]):
   #      model.write_obj(result[i], outmesh_path.format(i))

if __name__ == '__main__':
  test_gpu()

Tesla K80
0.09332990646362305
0.08585476875305176
0.08218264579772949
0.07542800903320312
0.07521462440490723
0.07555079460144043
0.07386207580566406
0.07398128509521484
0.07248687744140625
0.074127197265625
mean cost: 0.07652091979980469
