In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import datetime
import os
import sys

In [None]:
sys.path.append("/home/caleml/main-pe/")

In [None]:
import numpy as np
import tensorflow as tf

from tensorflow.keras import Model, Input, Sequential
from tensorflow.keras.layers import MaxPooling2D, UpSampling2D, Convolution2D, Activation, BatchNormalization, Reshape
from tensorflow.keras.layers import Permute, add, concatenate
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.optimizers import RMSprop

from tensorflow.keras.applications import ResNet50

In [None]:
from data.datasets.mpii import MpiiSinglePerson
from data.utils.data_utils import TEST_MODE, TRAIN_MODE, VALID_MODE
from data.loader import BatchLoader

from model import blocks
from model import layers
from model import losses
from model import config
from model import callbacks
from model.models import BaseModel, AppearanceModel
from model.utils import pose_format

# model

In [None]:
class Encoder(object):
    '''
    Work-in-progress two-branch auto-encoder:
    a pose branch (E_p) and an appearance branch (E_a) feeding one decoder.

    The concatenation step, the decoder and the pose head are not implemented yet,
    so build() cannot complete in its current state (see the notes below).
    '''

    def __init__(self):
        self.input_shape = (256, 256, 3)
        self.start_lr = 0.001

    def stem(self, inp):
        '''
        common first stem

        Builds the stem as a nested Keras model named 'stem' and applies it to `inp`.
        Returns the resulting feature tensor.
        '''
        print(inp.shape)
        stem_input = Input(shape=inp.shape[1:]) # 256 x 256 x 3

        x = layers.conv_bn_act(stem_input, 32, (3, 3), strides=(2, 2))
        x = layers.conv_bn_act(x, 32, (3, 3))
        x = layers.conv_bn_act(x, 64, (3, 3))

        a = layers.conv_bn_act(x, 96, (3, 3), strides=(2, 2))
        b = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
        x = concatenate([a, b])

        a = layers.conv_bn_act(x, 64, (1, 1))
        a = layers.conv_bn(a, 96, (3, 3))
        b = layers.conv_bn_act(x, 64, (1, 1))
        b = layers.conv_bn_act(b, 64, (5, 1))
        b = layers.conv_bn_act(b, 64, (1, 5))
        b = layers.conv_bn(b, 96, (3, 3))
        x = concatenate([a, b])

        a = layers.act_conv_bn(x, 192, (3, 3), strides=(2, 2))
        b = MaxPooling2D((2, 2), strides=(2, 2))(x)
        x = concatenate([a, b])

        # Every other call in this notebook takes sepconv_residual from `blocks`,
        # so use the same module here.
        # NOTE(review): confirm that model.blocks is where sepconv_residual lives.
        x = blocks.sepconv_residual(x, 3*192, name='sepconv1')

        model = Model(stem_input, x, name='stem')
        x = model(inp)

        return x

    def pose_model(self, inp):
        '''
        Pose branch (E_p).

        For now this only returns the stem features; no pose head exists yet.
        '''
        stem_out = self.stem(inp)

        out = stem_out

        return out

    def appearance_model(self, inp):
        '''
        Appearance branch (E_a): ImageNet-pretrained ResNet50 without its classification head,
        wired onto `inp`. Returns the final feature map of the network.
        '''
        # ResNet50's first positional parameter is `include_top`, so the tensor must be
        # passed through `input_tensor`; the call returns a Model, hence `.output`.
        enc_model = ResNet50(include_top=False, weights='imagenet', input_tensor=inp)
        return enc_model.output

    def prepare_concat(self, z_p, z_a):
        '''
        Merge the pose and appearance representations before decoding.

        Not implemented yet; raises NotImplementedError so that build() fails with a clear message.
        '''
        raise NotImplementedError('Encoder.prepare_concat is not implemented yet')

    def decoder(self, concat):
        '''
        Decode the merged representation back into an image.

        Not implemented yet; raises NotImplementedError so that build() fails with a clear message.
        '''
        raise NotImplementedError('Encoder.decoder is not implemented yet')

    def build(self):
        '''
        Input: 256 x 256 x 3 image
        Outputs: 
            - pose tensor
            - reconstructed image

        1. E_p is the encoder for the pose estimation
        2. E_a is the encoder for the appearance
        3. concat z_a and z_p to form the input of the decoder
        4. decode into an image
        '''
        inp = Input(shape=self.input_shape)

        # 1. E_p
        # NOTE(review): pose_model() currently returns only the stem features, so this
        # two-value unpacking cannot succeed until a pose head returning
        # (z_p, pred_pose) is implemented.
        z_p, pred_pose = self.pose_model(inp)

        # 2. E_a
        z_a = self.appearance_model(inp)

        # 3. reconstruction base
        concat = self.prepare_concat(z_p, z_a)

        # 4. decoding
        rec_img = self.decoder(concat)

        outputs = [pred_pose, rec_img]
        self.model = Model(inputs=inp, outputs=outputs)

        # compile it
        loss = losses.combined_loss()
        self.model.compile(loss=loss, optimizer=RMSprop(lr=self.start_lr))
        self.model.summary()

    def train(self, data_tr, steps_per_epoch, weights_path=None, extra_callbacks=None):
        '''
        Fit self.model on the `data_tr` generator for 60 epochs.

        data_tr: batch generator handed to fit_generator
        steps_per_epoch: number of batches per epoch
        weights_path: optional checkpoint path; when given, a callbacks.SaveModel is registered
        extra_callbacks: optional iterable of additional Keras callbacks (e.g. evaluation callbacks)
        '''
        # Local list deliberately not named `callbacks`, which would shadow the
        # imported `callbacks` module needed for SaveModel.
        cb_list = []
        if weights_path is not None:
            cb_list.append(callbacks.SaveModel(weights_path))
        if extra_callbacks:
            cb_list.extend(extra_callbacks)

        # Train the model owned by this instance, not a notebook-level global.
        self.model.fit_generator(
            data_tr,
            steps_per_epoch=steps_per_epoch,
            epochs=60,
            callbacks=cb_list,
            workers=8,
            initial_epoch=0)
        

In [None]:
class OldAppearanceModel(object):
    '''
    Appearance-only auto-encoder (z_a): a ResNet50 encoder followed by an
    upsampling decoder, trained with a mean squared error reconstruction loss.
    '''

    def __init__(self):
        self.start_lr = 0.001
        self.input_shape = (256, 256, 3)

    def decoder(self):
        '''
        Placeholder, unused: the decoder is currently built inline in build().
        '''
        pass

    def build(self):
        '''
        Assemble encoder + decoder into self.model, compile it and print its summary.
        '''
        image_in = Input(shape=self.input_shape)

        # encoder: ImageNet ResNet50 without the classification head
        encoder = ResNet50(include_top=False, weights='imagenet', input_tensor=image_in)

        # Decoder plan: one entry per upsampling stage, listing the widths of the
        # 3x3 conv_bn_act layers applied after that stage's layers.up call.
        # Resolutions in the comments assume an 8 x 8 x 2048 encoder output and a
        # 2x upsampling in layers.up -- TODO confirm.
        decoder_stages = (
            (512, 512, 512),   # 16 x 16
            (512, 512, 256),   # 32 x 32
            (256, 256, 128),   # 64 x 64
            (128, 64),         # 128 x 128
            (3,),              # 256 x 256
        )

        feat = encoder.output
        for stage_widths in decoder_stages:
            feat = layers.up(feat)
            for n_filters in stage_widths:
                feat = layers.conv_bn_act(feat, n_filters, (3, 3))

        # final 1x1 projection to 3 channels
        # TODO: should we permute here or have the input formatted with channels first?
        reconstruction = layers.conv_bn(feat, 3, (1, 1))

        self.model = Model(inputs=image_in, outputs=reconstruction)

        # plain reconstruction loss for now (losses.combined_loss() is the intended replacement)
        optimizer = RMSprop(lr=self.start_lr)
        self.model.compile(loss=mean_squared_error, optimizer=optimizer)
        self.model.summary()

    def train(self, data_tr, steps_per_epoch, model_folder):
        '''
        Fit self.model on the `data_tr` generator for 60 epochs.

        A callbacks.SaveModel callback is registered with a per-epoch weights
        file name located inside `model_folder`.
        '''
        checkpoint_pattern = os.path.join(model_folder, 'weights_mpii_{epoch:03d}.h5')
        cb_list = [callbacks.SaveModel(checkpoint_pattern)]

        fit_options = dict(
            steps_per_epoch=steps_per_epoch,
            epochs=60,
            callbacks=cb_list,
            workers=4,
            initial_epoch=0,
        )
        self.model.fit_generator(data_tr, **fit_options)

In [None]:
class PoseModel(object):
    '''
    Stacked pose estimation network: a stem followed by `n_blocks` prediction blocks.

    Each block produces heatmaps (one per joint) and a 2D pose regressed from
    them with a soft-argmax. The assembled Keras model is stored in `self.model`;
    its output tensors are also exposed through the `output` property.
    '''

    def __init__(self, input_tensor, n_joints, n_blocks, kernel_size):
        '''
        input_tensor: Keras Input tensor the network is built on
        n_joints: number of joints (one heatmap per joint)
        n_blocks: number of stacked prediction blocks
        kernel_size: kernel size of the separable convolutions
        '''
        self.n_joints = n_joints
        self.n_blocks = n_blocks
        self.kernel_size = kernel_size

        self.n_heatmaps = self.n_joints   # this seems silly but we will augment with context later

        # soft-argmax sub-model, created on first use and shared by all blocks
        self._sam_model = None

        # __init__ must not return a value (Python raises TypeError otherwise);
        # the built network is available as self.model / self.output.
        self.build(input_tensor)

    @property
    def output(self):
        '''
        Output tensors of the underlying Keras model.
        With several outputs this is the list [pose_1, heatmaps_1, pose_2, heatmaps_2, ...].
        '''
        return self.model.output

    def build(self, inp):
        '''
        1. stem
        2. stacking the blocks

        Every block contributes two outputs, in this order: pose, heatmaps.
        '''

        outputs = list()
        x = self.stem(inp)

        for i_block in range(self.n_blocks):
            block_no = i_block + 1

            # channel count of the features entering the block, used to map the
            # heatmaps back to feature space below
            n_features = x.get_shape().as_list()[-1]

            x = self.reception_block(x, name='rBlock%d' % block_no)
            ident_map = x   # kept for the residual sum feeding the next block

            x = self.sepconv_block(x, name='SepConv%d' % block_no)
            h = self.pose_block(x, name='RegMap%d' % block_no)

            pose = self.pose_regression_2d(h)

            outputs.append(pose)
            # outputs.append(visible)
            outputs.append(h)

            # all blocks but the last re-inject their heatmaps into the feature stream
            if i_block < self.n_blocks - 1:
                h = self.fremap_block(h, n_features, name='fReMap%d' % block_no)
                x = add([ident_map, x, h])

        self.model = Model(inputs=inp, outputs=outputs)

    def stem(self, inp):
        '''
        inception v4 stem

        input: 256 x 256 x 3
        output: 32 x 32 x 576
        '''
        xi = Input(shape=inp.get_shape().as_list()[1:]) # 256 x 256 x 3

        x = layers.conv_bn_act(xi, 32, (3, 3), strides=(2, 2))
        x = layers.conv_bn_act(x, 32, (3, 3))
        x = layers.conv_bn_act(x, 64, (3, 3))

        a = layers.conv_bn_act(x, 96, (3, 3), strides=(2, 2))
        b = MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
        x = concatenate([a, b])

        a = layers.conv_bn_act(x, 64, (1, 1))
        a = layers.conv_bn(a, 96, (3, 3))
        b = layers.conv_bn_act(x, 64, (1, 1))
        b = layers.conv_bn_act(b, 64, (5, 1))
        b = layers.conv_bn_act(b, 64, (1, 5))
        b = layers.conv_bn(b, 96, (3, 3))
        x = concatenate([a, b])

        a = layers.act_conv_bn(x, 192, (3, 3), strides=(2, 2))
        b = MaxPooling2D((2, 2), strides=(2, 2))(x)
        x = concatenate([a, b])

        x = blocks.sepconv_residual(x, 3*192, name='sepconv1')

        model = Model(xi, x, name='Stem')
        x = model(inp)

        return x

    def reception_block(self, inp, name):
        '''
        each pose block starts with a reception block
        it is u-shaped and relies on separable convolutions

        inp ------------------------- a (SR 576) -------------------- + -- out
          |                                                           |
          |                                                           |
          MP --- C 288 -- SR 288 ---- b (SR 288) ---- + -- SR 576 -- US
                            |                         |
                            |                         |
                            MP --- SR -- SR -- SR --- US     <- all 288 channels


        SR: Sepconv Residual (all 5x5)
        C: Conv (1x1)
        MP: Max Pooling (2x2 with stride 2x2)
        US: UpSampling (2x2)

        input: 32 x 32 x 576
        output: 32 x 32 x 576
        '''
        ksize = self.kernel_size

        input_shape = inp.get_shape().as_list()[1:]
        print("INPUT SHAPE %s %s" % (type(input_shape), str(input_shape)))
        size = int(input_shape[-1])

        # first branch
        xi = Input(shape=input_shape)
        a = blocks.sepconv_residual(xi, size, name='sepconv_l1', kernel_size=ksize)

        # second branch
        low1 = MaxPooling2D((2, 2))(xi)
        low1 = layers.act_conv_bn(low1, int(size/2), (1, 1))
        low1 = blocks.sepconv_residual(low1, int(size/2), name='sepconv_l2_1', kernel_size=ksize)
        b = blocks.sepconv_residual(low1, int(size/2), name='sepconv_l2_2', kernel_size=ksize)

        # third branch
        c = MaxPooling2D((2, 2))(low1)
        c = blocks.sepconv_residual(c, int(size/2), name='sepconv_l3_1', kernel_size=ksize)
        c = blocks.sepconv_residual(c, int(size/2), name='sepconv_l3_2', kernel_size=ksize)
        c = blocks.sepconv_residual(c, int(size/2), name='sepconv_l3_3', kernel_size=ksize)
        c = UpSampling2D((2, 2))(c)

        # merge second and third branches
        b = add([b, c])
        b = blocks.sepconv_residual(b, size, name='sepconv_l2_3', kernel_size=ksize)
        b = UpSampling2D((2, 2))(b)

        # merge first and second branches
        x = add([a, b])
        model = Model(inputs=xi, outputs=x, name=name)

        return model(inp)

    def sepconv_block(self, inp, name):
        '''
        Separable convolution
        (keeps the number of channels of the input)
        '''
        input_shape = inp.get_shape().as_list()[1:]

        xi = Input(shape=input_shape)
        x = layers.separable_act_conv_bn(xi, input_shape[-1], self.kernel_size)

        model = Model(inputs=xi, outputs=x, name=name)

        return model(inp)

    def pose_block(self, inp, name):
        '''
        input: 32 x 32 x 576
        output: 32 x 32 x 16 (number of heatmaps)
        '''
        input_shape = inp.get_shape().as_list()[1:]

        xi = Input(shape=input_shape)
        x = layers.act_conv(xi, self.n_heatmaps, (1, 1))

        model = Model(inputs=xi, outputs=x, name=name)

        return model(inp)

    def fremap_block(self, inp, n_filters, name):
        '''
        Map heatmaps back to `n_filters` channels with a 1x1 act_conv_bn so that
        they can be summed with the feature maps feeding the next block.
        '''
        input_shape = inp.get_shape().as_list()[1:]

        xi = Input(shape=input_shape)
        x = layers.act_conv_bn(xi, n_filters, (1, 1))

        model = Model(inputs=xi, outputs=x, name=name)

        return model(inp)

    def pose_regression_2d(self, heatmaps):
        '''
        soft argmax to get the pose from the heatmaps

        input: 32 x 32 x 16 (number of joints)
        output: 16 x 2, one (x, y) pair per heatmap
        '''
        # The soft-argmax model has no weights, so a single instance is built on
        # first use and shared by every block. This also keeps its layer name
        # unique inside the enclosing Keras model.
        sam_model = getattr(self, '_sam_model', None)
        if sam_model is None:
            sams_input_shape = tuple(heatmaps.get_shape().as_list()[1:])
            sam_model = self.soft_argmax(sams_input_shape, rho=0, name='sSAM')
            self._sam_model = sam_model

        pose = sam_model(heatmaps)
        # visible = jprob_s_model(h)

        return pose

    def soft_argmax(self, sams_input_shape, rho=0, name='sSAM'):
        '''
        Build a weight-free Keras model computing a 2D soft-argmax.

        sams_input_shape: (rows, cols, n_maps) shape of the heatmaps, without the batch axis
        rho: regularisation strength; only 0 (no regularisation) is supported for now
        name: name given to the returned model

        The returned model maps heatmaps of shape (batch, rows, cols, n_maps) to
        coordinates of shape (batch, n_maps, 2). Each heatmap goes through a spatial
        softmax, and the (x, y) pair is the expected position under that distribution.
        Coordinates are normalised to [0, 1] using pixel centres:
        x runs along the columns, y along the rows.

        NOTE: the computation lives in a Lambda layer wrapping a closure, which may not
        survive full-model serialisation; saving weights only is unaffected.
        '''
        if rho:
            raise NotImplementedError('soft_argmax: rho regularisation is not implemented')

        n_rows, n_cols, n_maps = [int(dim) for dim in sams_input_shape]

        def expected_xy(maps):
            # softmax over all spatial positions, independently for each map
            flat = tf.reshape(maps, (-1, n_rows * n_cols, n_maps))
            probs = tf.nn.softmax(flat, axis=1)
            probs = tf.reshape(probs, (-1, n_rows, n_cols, n_maps))

            # normalised pixel-centre coordinates along each axis
            col_centres = (tf.cast(tf.range(n_cols), probs.dtype) + 0.5) / n_cols
            row_centres = (tf.cast(tf.range(n_rows), probs.dtype) + 0.5) / n_rows

            # marginalise over the other axis, then take the expectation
            x = tf.reduce_sum(tf.reduce_sum(probs, axis=1) * col_centres[:, tf.newaxis], axis=1)
            y = tf.reduce_sum(tf.reduce_sum(probs, axis=2) * row_centres[:, tf.newaxis], axis=1)

            return tf.stack([x, y], axis=-1)

        xi = Input(shape=(n_rows, n_cols, n_maps))
        x = tf.keras.layers.Lambda(expected_xy, name=name + '_xy')(xi)

        model = Model(inputs=xi, outputs=x, name=name)
        model.trainable = False

        return model

In [None]:
class MultiBranchModel(BaseModel):
    '''
    2 branch model :
    - appearance (z_a)
    - pose (z_p)
    One common decoder to recreate the image

    The concatenation and the decoder are still stubs: for now the model simply
    exposes the outputs of both encoders.
    '''

    def __init__(self, n_joints=16, nb_pose_blocks=8, reception_kernel_size=(5,5)):
        '''
        n_joints: number of joints predicted by the pose branch
        nb_pose_blocks: number of stacked blocks in the pose branch
        reception_kernel_size: kernel size of the separable convolutions of the pose branch
        '''
        self.n_joints = n_joints
        self.n_blocks = nb_pose_blocks
        self.reception_kernel_size = reception_kernel_size

        BaseModel.__init__(self)

    def build(self):
        '''
        Wire both encoders on a shared input, then compile and summarise the model.

        NOTE(review): self.input_shape and self.start_lr are not set in this class;
        presumably BaseModel provides them -- confirm.
        '''
        inp = Input(shape=self.input_shape)

        z_a = self.appearance_encoder(inp)
        z_p = self.pose_encoder(inp)

        # concat = self.concat(z_a, z_p)

        # i_hat = self.decoder(concat)

        # self.model = Model(inputs=inp, outputs=[z_p, i_hat])

        # The pose branch can expose several tensors (a list); Keras expects a flat
        # list of output tensors, so flatten before appending z_a.
        pose_outputs = list(z_p) if isinstance(z_p, (list, tuple)) else [z_p]
        self.model = Model(inputs=inp, outputs=pose_outputs + [z_a])

        # loss = multi_loss
        loss = mean_squared_error
        # the optimizer imported by this notebook is spelled `RMSprop`
        self.model.compile(loss=loss, optimizer=RMSprop(lr=self.start_lr))
        self.model.summary()

    def appearance_encoder(self, inp):
        '''
        resnet50 for now
        input: 256 x 256 x 3
        output: 8 x 8 x 2048
        '''
        enc_model = ResNet50(include_top=False, weights='imagenet', input_tensor=inp)

        z_a = enc_model.output   # 8 x 8 x 2048
        return z_a

    def pose_encoder(self, inp):
        '''
        reception / stacked hourglass
        input: 256 x 256 x 3
        output: whatever PoseModel exposes through its `output` attribute
        '''
        pose_model = PoseModel(inp, self.n_joints, self.n_blocks, self.reception_kernel_size)
        out = pose_model.output

        return out

    def concat(self, z_a, z_p):
        '''
        concat pose and appearance representations before decoding
        input:
            - z_p: 
            - z_a: 8 x 8 x 2048
        output:

        Not implemented yet.
        '''
        pass

    def decoder(self, concat):
        '''
        from concatenated representations to image reconstruction
        input:
        output: 256 x 256 x 3

        Not implemented yet.
        '''
        pass
    
        

# dataset

In [None]:
# Root folders of the two datasets (absolute, machine-specific paths).
h36m_path = "/share/DEEPLEARNING/datasets/human36m"
mpii_path = "/share/DEEPLEARNING/datasets/mpii"

In [None]:
# h36m dataset loading
# NOTE(review): `Human36M` and `num_predictions` are not imported or defined in any
# cell of this notebook; this cell cannot run on a fresh kernel until they are.
h36m = Human36M(h36m_path, dataconf=config.human36m_dataconf, poselayout=pose_format.pa17j3d, topology='frames')

# Training loader: 'frame' as input key, 'pose' as target key.
# The batch size is set to the full length of the training split.
data_tr = BatchLoader(
    [h36m], 
    ['frame'], 
    ['pose'],
    TRAIN_MODE, 
    batch_size=h36m.get_length(TRAIN_MODE),
    num_predictions=num_predictions, 
    shuffle=True)

# batch_size=[batch_size_mpii, batch_size_mpii, batch_size_ar, batch_size_ar], 

In [None]:
# validation
# The batch size is the full length of the validation split, and only the first
# batch (index 0) is read below.
# NOTE(review): the dataset is passed bare here but wrapped in a list for the
# training loader -- confirm BatchLoader accepts both forms.
h36m_val = BatchLoader(
    h36m, 
    ['frame'],
    ['pose_w', 'pose_uvd', 'afmat', 'camera', 'action'], 
    VALID_MODE,
    batch_size=h36m.get_length(VALID_MODE), 
    shuffle=True)

# Unpack the single input and the five targets in the order requested above.
[x_val], [pw_val, puvd_val, afmat_val, scam_val, action] = h36m_val[0]

# NOTE(review): `H36MEvalCallback` and `logdir` are not imported or defined in this notebook.
h36m_callback = H36MEvalCallback(x_val, pw_val, afmat_val, puvd_val[:,0,2], scam_val, action, logdir=logdir)

In [None]:
# MPII single-person dataset, configured with config.mpii_dataconf and the pa17j3d pose layout.
mpii = MpiiSinglePerson(mpii_path, dataconf=config.mpii_dataconf, poselayout=pose_format.pa17j3d)

In [None]:
# MPII training loader for auto-encoding: 'frame' is both the input key and the target key.
data_tr_mpii = BatchLoader(
    mpii, 
    ['frame'], 
    ['frame'], 
    TRAIN_MODE,
    shuffle=False)

In [None]:
len(data_tr_mpii)

# training

In [None]:
# Instantiate and build the Encoder defined in this notebook.
model = Encoder()
model.build()

# steps_per_epoch = h36m.get_length(TRAIN_MODE) // batch_size_h36m
# NOTE(review): `batch_size_mpii` is not defined in any cell of this notebook;
# it must be set before this line can run.
steps_per_epoch = mpii.get_length(TRAIN_MODE) // batch_size_mpii

In [None]:
# NOTE(review): `data_tr` is the Human3.6M loader, while `steps_per_epoch` was
# computed from the MPII training length -- confirm which dataset is intended.
model.train(data_tr, steps_per_epoch)

In [None]:
# AppearanceModel is the class imported from model.models, not the
# OldAppearanceModel defined in this notebook. Rebinds the notebook-level `model`.
model = AppearanceModel()
model.build()


In [None]:
# Train the appearance model on MPII, writing into a timestamped experiment folder.
model_name = 'appearance'
dataset_name = 'mpii'
model_folder = '/home/caleml/pe_experiments/exp_%s_%s_%s' % (model_name, dataset_name, datetime.datetime.now().strftime("%Y%m%d%H%M"))
# The timestamp has minute resolution: without exist_ok, re-running this cell
# within the same minute raises FileExistsError.
os.makedirs(model_folder, exist_ok=True)
# one epoch = one full pass over the loader
model.train(data_tr_mpii, steps_per_epoch=len(data_tr_mpii), model_folder=model_folder)

In [None]:
# Instantiate the two-branch model with its default arguments and build it.
# Rebinds the notebook-level `model`.
model = MultiBranchModel()
model.build()

In [None]:
# Timestamped experiment folder for the multi-branch model (no training call yet).
model_name = 'multib'
dataset_name = 'mpii'
model_folder = '/home/caleml/pe_experiments/exp_%s_%s_%s' % (model_name, dataset_name, datetime.datetime.now().strftime("%Y%m%d%H%M"))
# The timestamp has minute resolution: without exist_ok, re-running this cell
# within the same minute raises FileExistsError.
os.makedirs(model_folder, exist_ok=True)


# eval

In [None]:
from model.models import AppearanceModel

In [None]:
# Checkpoints of two earlier appearance-model runs (absolute, machine-specific paths).
model_folder = '/home/caleml/pe_experiments/exp_appearance_mpii_201902051901'
model_checkpoint = '/home/caleml/pe_experiments/exp_appearance_mpii_201902051901/weights_mpii_058.h5'  # weights
checkpoint_2 = '/home/caleml/pe_experiments/exp_appearance_mpii_201902061614/weights_mpii_013.h5'  # made with save_model

In [None]:
# Load the checkpoint annotated as "made with save_model"; `model_checkpoint` is not used here.
model = AppearanceModel()
model.load(checkpoint_2)

In [None]:
# eval data
# Re-creates the MPII dataset object (same arguments as in the dataset section)
# and builds a validation loader where 'frame' is both input and target.
mpii = MpiiSinglePerson(mpii_path, dataconf=config.mpii_dataconf, poselayout=pose_format.pa17j3d)
data_val_mpii = BatchLoader(
    mpii, 
    ['frame'], 
    ['frame'], 
    VALID_MODE,
    shuffle=False)

# displayed as the cell output: length of the validation loader
len(data_val_mpii)

# debug

In [None]:
def pouet(definition):
    '''
    Debug helper: returns a tuple holding one 'truc' per element of `definition`.
    '''
    return tuple('truc' for _ in definition)

a = pouet('a')
        
    

In [None]:
a

In [None]:
b

In [None]:
b