# <div align="center">Discriminator</div>
---------------------------------------------------------------------

You can find me on GitHub:
> ###### [GitHub](https://github.com/lev1khachatryan)


The reprojection loss encourages the network to produce a 3D body that explains the 2D joint locations; however, anthropometrically implausible 3D bodies or bodies with gross self-intersections may still minimize the reprojection loss. To regularize this, we use a discriminator network $D$ that is trained to tell whether SMPL parameters correspond to a real body or not. We refer to this as an adversarial prior, since the discriminator acts as a data-driven prior that guides the 3D inference.

A further benefit of employing a rich, explicit 3D representation like SMPL is that we precisely know the meaning of the latent space. In particular SMPL has a factorized form that we can take advantage of to make the adversary more data efficient and stable to train. More concretely, we mirror the shape and pose decomposition of SMPL and train a discriminator for shape and pose independently. The pose is based on a kinematic tree, so we further decompose the pose discriminators and train one for each joint rotation. This amounts to learning the angle limits for each joint. In order to capture the joint distribution of the entire kinematic tree, we also learn a discriminator that takes in all the rotations.

Since the input to each discriminator is very low dimensional (10-D for $\beta$, 9-D for each joint and 9K-D for all joints), they can each be small networks, making them rather stable to train. All pose discriminators share a common feature space of rotation matrices and only the final classifiers are learned separately.

In all we train $K + 2$ discriminators. Each discriminator $D_{i}$ outputs a value in $[0, 1]$, representing the probability that $\theta$ came from the data. In practice we use the least-squares formulation for its stability. Let $E$ represent the encoder, including the image encoder and the 3D module.

Then the adversarial loss function for the encoder is:

<img src='assets/1.png'>

and the objective for each discriminator is

<img src='assets/2.png'>

We optimize $E$ and all $D_{i}$'s jointly.

In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim

from tensorflow.contrib.layers.python.layers.initializers import variance_scaling_initializer

In [4]:
def Encoder_resnet(x, is_training=True, weight_decay=0.001, reuse=False):
    """
    Image encoder: slim ResNet v2-50 without a classification head
    (num_classes=None).

    Input:
    - x: image batch handed straight to resnet_v2_50
      (assumed N x H x W x 3, channels last -- TODO confirm with caller)
    - is_training: bool, forwarded to resnet_v2_50
    - weight_decay: float, forwarded to resnet_arg_scope
    - reuse: bool, forwarded to resnet_v2_50

    Outputs:
    - features: the network output with axes 1 and 2 squeezed away
    - variables: tf variables collected under the 'resnet_v2_50' scope
    """
    from tensorflow.contrib.slim.python.slim.nets import resnet_v2

    with tf.name_scope("Encoder_resnet"), \
            slim.arg_scope(
                resnet_v2.resnet_arg_scope(weight_decay=weight_decay)):
        # Second return value (end_points) is not needed here.
        features, _ = resnet_v2.resnet_v2_50(
            x,
            num_classes=None,
            is_training=is_training,
            reuse=reuse,
            scope='resnet_v2_50')
        # Drop axes 1 and 2 of the network output.
        features = tf.squeeze(features, axis=[1, 2])

    return features, tf.contrib.framework.get_variables('resnet_v2_50')

In [5]:
def Encoder_fc3_dropout(x,
                        num_output=85,
                        is_training=True,
                        reuse=False,
                        name="3D_module"):
    """
    3D regression module: two 1024-unit fully connected layers, each followed
    by dropout (keep probability 0.5), then a linear output layer.

    Input:
    - x: 2D input features
      (presumably N x [|img_feat|, |3D_param|] -- verify against caller)
    - num_output: int, width of the output layer
    - is_training: bool, forwarded to both dropout layers
    - reuse: bool, forwarded to tf.variable_scope
    - name: str, variable scope that holds the layers

    Outputs:
    - 3D params: N x num_output
    - variables: tf variables collected under the `name` scope
    """
    if reuse:
        print('Reuse is on!')

    hidden_scopes = (('fc1', 'dropout1'), ('fc2', 'dropout2'))
    with tf.variable_scope(name, reuse=reuse) as scope:
        hidden = x
        for fc_scope, drop_scope in hidden_scopes:
            hidden = slim.fully_connected(hidden, 1024, scope=fc_scope)
            hidden = slim.dropout(
                hidden, 0.5, is_training=is_training, scope=drop_scope)

        # The output layer uses a variance-scaling initializer with a small
        # factor (.01) and no activation.
        output = slim.fully_connected(
            hidden,
            num_output,
            activation_fn=None,
            weights_initializer=variance_scaling_initializer(
                factor=.01, mode='FAN_AVG', uniform=True),
            scope='fc3')

    return output, tf.contrib.framework.get_variables(scope)

In [6]:
def get_encoder_fn_separate(model_type):
    """
    Retrieves the image-encoder fn and the 3D-module fn named by `model_type`.

    Selection is by substring match on `model_type`:
    - 'resnet'      -> Encoder_resnet (image encoder)
    - 'fc3_dropout' -> Encoder_fc3_dropout (3D module)

    Input:
    - model_type: str, e.g. 'resnet_fc3_dropout'

    Outputs:
    - encoder_fn: image encoder function
    - threed_fn: 3D module function

    Raises:
    - ValueError: if `model_type` does not name a known image encoder or a
      known 3D module. (This used to call exit(1) or drop into an ipdb
      debugger session, neither of which a caller can handle.)
    """
    if 'resnet' not in model_type:
        raise ValueError('Unknown encoder %s!' % model_type)

    if 'fc3_dropout' not in model_type:
        raise ValueError('Dont know what encoder to use for %s' % model_type)

    return Encoder_resnet, Encoder_fc3_dropout

In [7]:
# Look up the image-encoder and 3D-module builder functions by model-type string.
img_enc_fn, threed_enc_fn = get_encoder_fn_separate('resnet_fc3_dropout')

In [8]:
import skimage.io as io
import numpy as np
from src.util import image as img_util
from matplotlib import pyplot as plt

In [9]:
def preprocess_image(img_path, json_path=None, img_size=224):
    """
    Load an image, scale/crop it to `img_size` and normalize it to [-1, 1].

    Input:
    - img_path: path of the image, read with skimage.io.imread
    - json_path: optional path handed to op_util.get_bbox to obtain the
      scale and center; when None the image is scaled so that its longer
      side equals `img_size` and cropped around the image center
    - img_size: int, target size handed to img_util.scale_and_crop
      (default 224, the value that was previously hard-coded)

    Outputs:
    - crop: the scaled and cropped image, mapped from [0, 255] to [-1, 1]
    - proc_param: second return value of img_util.scale_and_crop
    - img: the loaded image (alpha channel dropped; grayscale expanded to
      3 channels)
    """
    img = io.imread(img_path)
    if img.ndim == 2:
        # Grayscale input has no channel axis; replicate it into 3 channels
        # instead of failing on img.shape[2].
        img = np.stack([img] * 3, axis=-1)
    elif img.shape[2] == 4:
        # Drop the 4th (alpha) channel.
        img = img[:, :, :3]

    if json_path is None:
        max_side = np.max(img.shape[:2])
        if max_side != img_size:
            print('Resizing so the max image size is %d..' % img_size)
            scale = (float(img_size) / max_side)
        else:
            scale = 1.
        center = np.round(np.array(img.shape[:2]) / 2).astype(int)
        # image center in (x,y)
        center = center[::-1]
    else:
        # op_util is not imported anywhere in the visible notebook, so this
        # branch used to raise NameError.
        # NOTE(review): module path follows the upstream HMR layout
        # (src/util/openpose.py) -- confirm it exists in this project.
        from src.util import openpose as op_util
        scale, center = op_util.get_bbox(json_path)

    crop, proc_param = img_util.scale_and_crop(img, scale, center, img_size)

    # Normalize image to [-1, 1]
    crop = 2 * ((crop / 255.) - 0.5)

    return crop, proc_param, img

In [10]:
# Example input. json_path=None makes preprocess_image scale/crop around the
# image center instead of using a bounding box read from a JSON file.
img_path = 'data/im1963.jpg'
json_path = None
input_img, proc_param, img = preprocess_image(img_path, json_path)
# Add a leading axis of size 1 (a batch holding this single image).
input_img = np.expand_dims(input_img, 0)

Resizing so the max image size is 224..


In [11]:
# Cast the image batch to float32.
input_img = np.float32(input_img)

In [12]:
# Extract image features: build the image encoder in inference mode
# (is_training=False) with fresh variables (reuse=False).
# E_var is the encoder's variable list; later cells append to it.
img_feat, E_var = img_enc_fn(input_img, is_training=False, reuse=False)

In [13]:
import os.path as osp
import os
import sys
# Parent directory of the current working directory.
curr_path = osp.dirname(os.getcwd())
# NOTE(review): model_dir is only referenced by the commented-out
# SMPL_MODEL_PATH below, so it is currently unused.
model_dir = osp.join(curr_path, '..', 'models')

# Working-directory-relative locations of the SMPL files, disabled in favour
# of the absolute paths at the end of this cell.
# SMPL_MODEL_PATH = osp.join(model_dir, 'neutral_smpl_with_cocoplus_reg.pkl')
# SMPL_FACE_PATH = osp.join(curr_path, '../src/tf_smpl', 'smpl_faces.npy')

# smpl_model_path = SMPL_MODEL_PATH
# smpl_face_path = SMPL_FACE_PATH

# NOTE(review): machine-specific absolute Windows paths. The notebook cannot
# run on another machine until these are edited or replaced by paths derived
# from the repository location.
smpl_model_path = r'C:\_Files\MyProjects\ASDS_3\Photo_Wake-Up\src\HMR\TensorFlow\models\neutral_smpl_with_cocoplus_reg.pkl'
smpl_face_path = r'C:\_Files\MyProjects\ASDS_3\Photo_Wake-Up\src\HMR\TensorFlow\src\tf_smpl\smpl_faces.npy'

In [14]:
import deepdish as dd
from os.path import join, dirname
def load_mean_param(batch_size=1):
    """
    Build the trainable mean parameter vector and tile it over the batch.

    Reads 'neutral_smpl_mean_params.h5' from the directory that contains
    the module-level `smpl_model_path`, and appends the created variable to
    the module-level list `E_var`.

    Input:
    - batch_size: int, number of rows the 1 x 85 mean is tiled to
      (default 1, the value that was previously hard-coded)

    Outputs:
    - init_mean: batch_size x 85 tensor; column 0 starts at 0.9 and
      columns 3: hold the mean pose followed by the mean shape
    """
    mean = np.zeros((1, 85))
    # Initialize scale at 0.9
    mean[0, 0] = 0.9
    mean_path = join(
        dirname(smpl_model_path), 'neutral_smpl_mean_params.h5')
    mean_vals = dd.io.load(mean_path)

    mean_pose = mean_vals['pose']
    # Ignore the global rotation.
    mean_pose[:3] = 0.
    mean_shape = mean_vals['shape']

    # This initializes the global pose to be up-right when projected
    mean_pose[0] = np.pi

    mean[0, 3:] = np.hstack((mean_pose, mean_shape))
    mean_var = tf.Variable(
        tf.constant(mean, tf.float32),
        name="mean_param",
        dtype=tf.float32,
        trainable=True)
    # Register the mean with the encoder variables (module-level list).
    E_var.append(mean_var)
    # One copy of the mean per batch element.
    return tf.tile(mean_var, [batch_size, 1])

In [15]:
from src.tf_smpl.batch_lbs import batch_rodrigues
from src.tf_smpl.batch_smpl import SMPL
from src.tf_smpl.projection import batch_orth_proj_idrot

In [16]:
# Build the SMPL layer from the model file; it is called in the IEF loop
# below as smpl(shapes, poses, get_skin=True).
smpl = SMPL(smpl_model_path)

In [17]:
# Layout of the 85-D parameter vector used below:
# first num_cam entries are the camera, the next num_theta the pose,
# the remainder the shape.
num_cam=3
num_theta=72
loss_kps = []
# if self.use_3d_label:
#     loss_3d_joints, loss_3d_params = [], []
# For discriminator
fake_rotations, fake_shapes = [], []
# Start loop
# 85D
theta_prev = load_mean_param()

# For visualizations
# NOTE(review): these lists are never filled in this cell; the code that
# appended to them is commented out at the end of the loop body.
all_verts = []
all_pred_kps = []
all_pred_cams = []
all_delta_thetas = []
all_theta_prev = []

# Number of iterations of the loop below.
num_stage = 1

# Main IEF loop
for i in np.arange(num_stage):
    print('Iteration %d' % i)
    # ---- Compute outputs
    # The regressor input is the image feature concatenated (along axis 1)
    # with the current parameter estimate.
    state = tf.concat([img_feat, theta_prev], 1)

    # The first iteration creates the 3D-module variables and adds them to
    # E_var; later iterations reuse them.
    if i == 0:
        delta_theta, threeD_var = threed_enc_fn(
            state, num_output=85, reuse=False)
        E_var.extend(threeD_var)
    else:
        delta_theta, _ = threed_enc_fn(
            state, num_output=85, reuse=True)

    # Compute new theta
    theta_here = theta_prev + delta_theta
    # cam = N x 3, pose N x self.num_theta, shape: N x 10
    cams = theta_here[:, : num_cam]
    poses = theta_here[:, num_cam:(num_cam + num_theta)]
    shapes = theta_here[:, (num_cam + num_theta):]
    
    # pred_Rs holds the rotation matrices of poses
    # (Nx24x3x3 per the original comment -- TODO confirm against SMPL)
    verts, Js, pred_Rs = smpl(shapes, poses, get_skin=True)
    pred_kp = batch_orth_proj_idrot(
        Js, cams, name='proj2d_stage%d' % i)
#     # --- Compute losses:
#     loss_kps.append(self.e_loss_weight * self.keypoint_loss(
#         self.kp_loader, pred_kp))
    # Flatten each rotation matrix into a 9-vector: N x 24 x 9.
    pred_Rs = tf.reshape(pred_Rs, [-1, 24, 9])
#     if self.use_3d_label:
#         loss_poseshape, loss_joints = self.get_3d_loss(
#             pred_Rs, shapes, Js)
#         loss_3d_params.append(loss_poseshape)
#         loss_3d_joints.append(loss_joints)

    # Save pred_rotations for Discriminator
    # (index 0 along axis 1 is skipped, leaving 23 rotations).
    fake_rotations.append(pred_Rs[:, 1:, :])
    fake_shapes.append(shapes)

#     # Save things for visualiations:
#     self.all_verts.append(tf.gather(verts, self.show_these))
#     self.all_pred_kps.append(tf.gather(pred_kp, self.show_these))
#     self.all_pred_cams.append(tf.gather(cams, self.show_these))

    # Finally update to end iteration.
    theta_prev = theta_here


Iteration 0


W0216 21:22:17.310510 16060 deprecation.py:323] From C:\_Files\MyProjects\ASDS_3\Photo_Wake-Up\src\HMR\TensorFlow\src\tf_smpl\batch_lbs.py:55: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


In [20]:
def Discriminator_separable_rotations(
        poses,
        shapes,
        weight_decay,
):
    """
    23 Discriminators on each joint + 1 for all joints + 1 for shape.
    To share the params on rotations, this treats the 23 rotation matrices
    as a "vertical image":
    Do 1x1 conv, then send off to 23 independent classifiers.

    Input:
    - poses: N x 23 x 1 x 9, NHWC ALWAYS!!
    - shapes: N x 10
    - weight_decay: float, L2 regularization weight for the conv and
      fully connected layers

    Outputs:
    - prediction: N x (23 + 1 + 1); columns are the 23 per-joint outputs,
      then the all-joints output, then the shape output
    - variables: tf variables collected under the "D" variable scope
    """
    data_format = "NHWC"
    # The value list must go in `values`: the second positional argument of
    # tf.name_scope is `default_name`, which expects a string.
    with tf.name_scope("Discriminator_sep_rotations",
                       values=[poses, shapes]):
        with tf.variable_scope("D") as scope:
            with slim.arg_scope(
                    [slim.conv2d, slim.fully_connected],
                    weights_regularizer=slim.l2_regularizer(weight_decay)):
                with slim.arg_scope([slim.conv2d], data_format=data_format):
                    # Shared feature space: two 1x1 convs over the 9-D
                    # rotation of every joint.
                    poses = slim.conv2d(poses, 32, [1, 1], scope='D_conv1')
                    poses = slim.conv2d(poses, 32, [1, 1], scope='D_conv2')
                    # One independent linear classifier per joint.
                    theta_out = []
                    for i in range(0, 23):
                        theta_out.append(
                            slim.fully_connected(
                                poses[:, i, :, :],
                                1,
                                activation_fn=None,
                                scope="pose_out_j%d" % i))
                    # N x 23 x 1 x 1 -> N x 23. Squeeze only the two
                    # trailing axes: a bare squeeze would also drop the
                    # batch axis when N == 1 and break the concat below.
                    theta_out_all = tf.squeeze(
                        tf.stack(theta_out, axis=1), axis=[2, 3])

                    # Do shape on its own:
                    shapes = slim.stack(
                        shapes,
                        slim.fully_connected, [10, 5],
                        scope="shape_fc1")
                    shape_out = slim.fully_connected(
                        shapes, 1, activation_fn=None, scope="shape_final")

                    # Compute joint correlation prior: one classifier over
                    # the flattened features of all joints.
                    nz_feat = 1024
                    poses_all = slim.flatten(poses, scope='vectorize')
                    poses_all = slim.fully_connected(
                        poses_all, nz_feat, scope="D_alljoints_fc1")
                    poses_all = slim.fully_connected(
                        poses_all, nz_feat, scope="D_alljoints_fc2")
                    poses_all_out = slim.fully_connected(
                        poses_all,
                        1,
                        activation_fn=None,
                        scope="D_alljoints_out")
                    out = tf.concat([theta_out_all,
                                     poses_all_out, shape_out], 1)

            variables = tf.contrib.framework.get_variables(scope)
            return out, variables

In [21]:
def setup_discriminator(fake_rotations, fake_shapes):
    """
    Build the discriminator on real + predicted samples and its
    least-squares losses.

    NOTE(review): besides its arguments this reads `pose_loader`,
    `shape_loader`, `batch_size`, `num_stage` and `d_wd` from module scope.
    Only `num_stage` is defined in the visible part of this notebook; the
    others must exist before this is called.

    Input:
    - fake_rotations: list of predicted rotations, one entry per stage,
      each reshaped here to (batch, -1, 9)
    - fake_shapes: list of predicted shape tensors, one entry per stage

    Outputs:
    - d_loss_real: discriminator loss on the real half, mean of sum((D - 1)^2)
    - d_loss_fake: discriminator loss on the fake half, mean of sum(D^2)
    - e_loss_disc: encoder loss on the fake half, mean of sum((D - 1)^2)
    - D_var: discriminator variables
    (These were previously computed and then discarded; the function
    returned None.)
    """
    # Compute the rotation matrices of "real" pose.
    # These guys are in 24 x 3.
    real_rotations = batch_rodrigues(tf.reshape(pose_loader, [-1, 3]))
    real_rotations = tf.reshape(real_rotations, [-1, 24, 9])
    # Ignoring global rotation. N x 23*9
    # The # of real rotation is B*num_stage so it's balanced.
    real_rotations = real_rotations[:, 1:, :]
    all_fake_rotations = tf.reshape(
        tf.concat(fake_rotations, 0),
        [batch_size * num_stage, -1, 9])
    # Real samples first, fake samples second; tf.split below relies on
    # this order and on both halves having the same size.
    comb_rotations = tf.concat(
        [real_rotations, all_fake_rotations], 0, name="combined_pose")

    # Insert a size-1 axis at position 2 to get the 4D layout the
    # discriminator expects.
    comb_rotations = tf.expand_dims(comb_rotations, 2)
    all_fake_shapes = tf.concat(fake_shapes, 0)
    comb_shapes = tf.concat(
        [shape_loader, all_fake_shapes], 0, name="combined_shape")

    disc_input = {
        'weight_decay': d_wd,
        'shapes': comb_shapes,
        'poses': comb_rotations
    }

    d_out, D_var = Discriminator_separable_rotations(
        **disc_input)

    d_out_real, d_out_fake = tf.split(d_out, 2)
    # Compute losses:
    with tf.name_scope("comp_d_loss"):
        d_loss_real = tf.reduce_mean(
            tf.reduce_sum((d_out_real - 1)**2, axis=1))
        d_loss_fake = tf.reduce_mean(
            tf.reduce_sum((d_out_fake)**2, axis=1))
        # Encoder loss
        e_loss_disc = tf.reduce_mean(
            tf.reduce_sum((d_out_fake - 1)**2, axis=1))

    return d_loss_real, d_loss_fake, e_loss_disc, D_var

In [24]:
# Display the shape tensors collected by the IEF loop (one per stage).
fake_shapes

[<tf.Tensor 'strided_slice_2:0' shape=(1, 10) dtype=float32>]

In [25]:
# Display the rotation tensors collected by the IEF loop (one per stage).
fake_rotations

[<tf.Tensor 'strided_slice_3:0' shape=(1, 23, 9) dtype=float32>]

In [23]:
# setup_discriminator(fake_rotations, fake_shapes)