In [13]:
import sys
import os 
import random
from random import randint
import numpy as np
sys.path.append('..')
import Video_Utils
import CNN_Utils
import h5py
import time
import numpy as np
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Input, Convolution2D, UpSampling2D, MaxPooling2D, ZeroPadding2D, Reshape, merge
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.utils.np_utils import to_categorical
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.optimizers import RMSprop, SGD
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from keras.models import Model
from keras.models import load_model
from keras import backend as K
from keras.layers.core import Lambda, Merge
from keras.engine import Layer

'''
From the command line you can convert a notebook to python with this command:

ipython nbconvert --to python <YourNotebook>.ipynb

You may have to install the python mistune package:

sudo pip install mistune
'''


'\nFrom the command line you can convert a notebook to python with this command:\n\nipython nbconvert --to python <YourNotebook>.ipynb\n\nYou may have to install the python mistune package:\n\nsudo pip install mistune\n'

In [14]:
############################################## GLOBAL VARIABLES #########################################
# These variables are specific to the dataset of the problem.
# Our data is the following:
#
# time-series of 4 cameras (128 x 128 x 3) looking at a robot complete a task (pick up object, place in bin)
# + corresponding time series of 7-d vector of joint commands (6 DOF + gripper) denoting the position delta command.
#
# In future comments, we will refer to the states and actions as x(t) and u(t) respectively, where 'state'
# denotes the visual data sensed from the world (the 4 cameras) and action is the 7-d joint position delta vector.
#
# Data was generated and collected using the V-REP (http://www.coppeliarobotics.com/) robot simulator
# (free/education version). Please contact the author of this notebook for the specific scene and
# data-generation script.
#
# NOTE: these are ordinary module-level assignments. A `global` statement at module scope has no effect,
# so none is used here; only a function that *rebinds* one of these names needs `global NAME` inside it.

CAM_W = 128            # width of one camera image
CAM_H = 128            # height of one camera image
NUM_CAM_CHANNELS = 3   # R, G, B per camera
NUM_CAMS = 4           # number of cameras observing the scene
# Channels of one stacked multi-camera frame (R,G,B x 4 cameras). Derived once from the two values
# above instead of first being set to 3 and then silently overwritten.
CAM_C = NUM_CAM_CHANNELS * NUM_CAMS
ACTION_LEN = 7         # 6 DOF + gripper

In [24]:
####################################### MODEL PARAMETERS #########################################
# Layer widths and feature-map sizes for the dynamics model. Everything here is a plain module-level
# assignment (a `global` statement at module scope is a no-op, so none is used).
#
# NOTE: CONVD1/CONVD2/CONVD3_FEAT_MAPS are not defined yet; nothing visible in this notebook reads them.

VGG_FEAT_W = 64        # using "block1_pool"
VGG_FEAT_H = 64        # using "block1_pool"
NUM_VGG_FEAT_MAPS = 64 # using "block1_pool" (per camera)
NUM_FUTURE_FRAMES = 10
NUM_PAST_FRAMES = 10

# Past-frames branch: halve the number of feature maps at every conv layer.
CONVP1_FEAT_MAPS = round(NUM_VGG_FEAT_MAPS*NUM_PAST_FRAMES/2)
CONVP2_FEAT_MAPS = round(CONVP1_FEAT_MAPS/2)
CONVP3_FEAT_MAPS = round(CONVP2_FEAT_MAPS/2)

# Convs applied to the merged (past frames + past actions + future actions) tensor.
CONV1_FEAT_MAPS = round(4*NUM_PAST_FRAMES)
CONV2_FEAT_MAPS = round(2*NUM_PAST_FRAMES)
CONV3_FEAT_MAPS = NUM_PAST_FRAMES

# Shared frame-prediction convs. The last one (CONVF4) emits one stacked multi-camera frame, i.e.
# CAM_C = NUM_CAM_CHANNELS * NUM_CAMS channels. CAM_C already includes the per-camera channel count,
# so it must not be multiplied by NUM_CAM_CHANNELS again (that gave 36 instead of 12).
CONVF4_FEAT_MAPS = CAM_C
CONVF3_FEAT_MAPS = round(CONVF4_FEAT_MAPS*2)
CONVF2_FEAT_MAPS = round(CONVF3_FEAT_MAPS*2)    # e.g. 48
CONVF1_FEAT_MAPS = CONVF2_FEAT_MAPS             # e.g. 48

# Dense stack that projects an action vector up to one VGG_FEAT_W x VGG_FEAT_H plane.
D_DENSE3 = VGG_FEAT_W * VGG_FEAT_H
D_DENSE2 = round(D_DENSE3 / 4)
D_DENSE1 = round(D_DENSE2 / 4)


In [25]:
################################################# THE MODEL ####################################################
'''
An explanation of the dynamics model and the buffers: 

We are training a model to produce p(x(t+1: t+F) | x(t-P: t), u(t-P: t), u(t+1: t+F-1)) 

I.e.: predict the future F frames, 
      given the past P frames, past P actions, and the future F-1 actions 
      
This can be viewed as unrolling the classic p(x(t+1) | x(t), u(t)) dynamics model. 

We have proposed this 'unrolled dynamics' problem formulation for the following reasons:

1) Conditioning on longer sequences to capture the effects of repeated actions, reduce discretization errors
2) Improved training due to longer sequences. Single-step visual models have a tendency to "copy" prev. frame
   as the output
3) Be able to visualize a future state trajectory based on a proposed set of actions.
   ---> this has applications in control planning, and can be useful to a human operator/co-worker, or for
        another system to check for safety hazards, etc. Plus makes a sick demo ("this is what the robot is
        thinking") lol. 
        
PREV_FRAMES_BUFFER ---> input, this is x(t-P: t)         (size P,W,H,3*num_cams)
PREV_ACTION_BUFFER ---> input, this is u(t-P: t)         (size P,A)
FUTURE_FRAMES_BUFFER ---> output, this is x(t+1: t+F)    (size F,W,H,3*num_cams)
FUTURE_ACTION_BUFFER ---> input, this is u(t+1: t+F-1)   (size F-1,A)

(note: visual inputs may be pre-processed with another pre-trained model)
'''
# Use part of pre-trained VGG model to seed features with reasonable features: (used online during training)
vgg_preprocessor = CNN_Utils.GetVGGModel("block1_pool", CAM_W, CAM_H, print_timing=1)

# Model input #1: past frames
# The Input tensor keeps its own name so it can still be handed to Model(input=[...]) later;
# `input_prev_frames` ends up bound to the processed branch, exactly as before.
input_prev_frames_raw = Input(shape=(VGG_FEAT_W, VGG_FEAT_H, NUM_VGG_FEAT_MAPS*NUM_PAST_FRAMES), name='input_prev_frames_raw')
# below: some layers to learn what information is important from the past
input_prev_frames = Convolution2D(CONVP1_FEAT_MAPS, 3, 3, activation='elu', border_mode='same')(input_prev_frames_raw)
input_prev_frames = Convolution2D(CONVP2_FEAT_MAPS, 3, 3, activation='elu', border_mode='same')(input_prev_frames)
input_prev_frames = Convolution2D(CONVP3_FEAT_MAPS, 3, 3, activation='elu', border_mode='same')(input_prev_frames)

# Model input #2: past actions
input_prev_actions_raw = Input(shape=(NUM_PAST_FRAMES, ACTION_LEN), name='input_prev_actions_raw')
# Below: flatten, project and reshape, so we can merge inputs later.
# The (P, A) action matrix must be flattened first: the Dense output has to contain exactly
# VGG_FEAT_W*VGG_FEAT_H values per sample for the Reshape to a single W x H plane to be valid.
input_prev_actions_flat = Flatten(name='input_prev_actions_flat')(input_prev_actions_raw)
input_prev_actions_p = Dense(output_dim=VGG_FEAT_W*VGG_FEAT_H*1, activation='elu')(input_prev_actions_flat)
input_prev_actions_p_r = Reshape((VGG_FEAT_W, VGG_FEAT_H, 1), name='input_prev_actions_p_r')(input_prev_actions_p)

# Model input #3: future actions
# Shape follows FUTURE_ACTION_BUFFER, i.e. u(t+1: t+F-1) -> (F-1, A), not the number of past frames.
input_future_actions_raw = Input(shape=(NUM_FUTURE_FRAMES-1, ACTION_LEN), name='input_future_actions_raw')
input_future_actions_flat = Flatten(name='input_future_actions_flat')(input_future_actions_raw)
# Below: these parameters gather information for an action pertaining to how it affects a future state
D_A1 = Dense(output_dim=D_DENSE1, activation='elu', name='D_A1')(input_future_actions_flat)
D_A2 = Dense(output_dim=D_DENSE2, activation='elu', name='D_A2')(D_A1)
D_A3 = Dense(output_dim=D_DENSE3, activation='elu', name='D_A3')(D_A2)
future_action_branch = Reshape((VGG_FEAT_W, VGG_FEAT_H, 1), name='action_branch_r')(D_A3)

# MERGE PAST INPUTS:
merged_input = merge([input_prev_frames, input_prev_actions_p_r, future_action_branch],
                      mode='concat', concat_axis=3, name='merged_prev_inputs')
# ^ so now past frames, info from past actions has been merged into a tensor that is size e.g.:
# [VGG_W, VGG_H, (VGG_feat_maps * NUM_PAST_FRAMES)/8 + 2] (e.g. == 82 for 10 past frames)

# Pre-process merged inputs that contain past information:
# This is meant to form a more efficient representation to be used in predicting future frames sequentially given actions:
merged_input = Convolution2D(CONV1_FEAT_MAPS, 3, 3, activation='elu', border_mode='same', name='merged_lvl_1')(merged_input)
merged_input = Convolution2D(CONV2_FEAT_MAPS, 3, 3, activation='elu', border_mode='same', name='merged_lvl_2')(merged_input)
merged_input = Convolution2D(CONV3_FEAT_MAPS, 3, 3, activation='elu', border_mode='same', name='merged_lvl_3')(merged_input)

# Shared conv layers for predicting the next frame; the same four layer objects (and therefore the
# same weights) are re-applied at every unrolled step.
SHARED_C1 = Convolution2D(CONVF1_FEAT_MAPS, 3, 3, activation='elu', border_mode='same', name='SHARED_C1')
SHARED_C2 = Convolution2D(CONVF2_FEAT_MAPS, 3, 3, activation='elu', border_mode='same', name='SHARED_C2')
SHARED_C3 = Convolution2D(CONVF3_FEAT_MAPS, 3, 3, activation='elu', border_mode='same', name='SHARED_C3')
SHARED_C4 = Convolution2D(CONVF4_FEAT_MAPS, 3, 3, activation='elu', border_mode='same', name='SHARED_C4')
# ^ SHARED_C4 is the final output layer; it emits CONVF4_FEAT_MAPS channels, meant to be one frame
#   (defined as WxHx3*num_cams)


def _predict_next_frame(features):
    """Apply the four shared conv layers to `features` and return the predicted frame tensor."""
    out = SHARED_C1(features)
    out = SHARED_C2(out)
    out = SHARED_C3(out)
    return SHARED_C4(out)


# First predicted frame, from the past info and proposed future actions.
# (Previously this read an undefined name `merged_data`; the merged tensor is `merged_input`.)
nfp = _predict_next_frame(merged_input)
# Seed the accumulator with the first prediction so the loop below has something to concatenate onto.
next_frames = nfp

# SEQUENTIAL PREDICTION:
# Use the shared layers to predict each remaining frame and stack the predictions along the
# channel axis; `next_frames` ends with NUM_FUTURE_FRAMES * CONVF4_FEAT_MAPS channels.
#
# TODO / NOTE(review): every step currently runs the shared layers on the *same* `merged_input`,
# so all predicted frames are identical. The per-step action conditioning ("pick off" the t-th
# future action and feed it, together with the previous prediction, into the shared layers) is
# still to be designed -- this is the open "what are we doing here?" question.
for t in range(1, NUM_FUTURE_FRAMES):
    nfp = _predict_next_frame(merged_input)
    next_frames = merge([next_frames, nfp], mode='concat', concat_axis=3)

VGG model loaded. Time elasped: 3.1 secs


In [None]:
####################################### INITIALIZATION, SETUP OF MODEL #########################################

In [17]:
########################################## Globals for training params ####################################
# Module-level training state. These are plain assignments: a `global` statement at module scope is a
# no-op, so none is used here. A function that rebinds one of these names must declare it `global`
# inside that function.
#
# Assigned by the training cell rather than here:
#   DEMONSTRATION_FOLDERS, NUM_DEMONSTRATIONS,
#   TRAINING_FOLDERS, TESTING_FOLDERS,
#   NUM_TRAINING_DEMONSTRATIONS, NUM_TESTING_DEMONSTRATIONS

# Rolling buffers holding one training example (see the model cell for the notation).
PREV_FRAMES_BUFFER = np.zeros((NUM_PAST_FRAMES, CAM_W, CAM_H, CAM_C))      # x(t-P: t)
PREV_ACTION_BUFFER = np.zeros((NUM_PAST_FRAMES, ACTION_LEN))               # u(t-P: t)
FUTURE_FRAMES_BUFFER = np.zeros((NUM_FUTURE_FRAMES, CAM_W, CAM_H, CAM_C))  # x(t+1: t+F)
FUTURE_ACTION_BUFFER = np.zeros((NUM_FUTURE_FRAMES-1, ACTION_LEN))         # u(t+1: t+F-1)

TOTAL_TRAINING_ITRS = 100000
SAVE_CHECKPOINT_ITRS = 100
CURR_DEMONSTRATION = -1          # start on the first one, loop to another one each itr
LENGTH_CURR_DEMONSTRATION = -1   # e.g. current demo we're looking at is 230 timesteps
T_CURR_DEMONSTRATION = -1        # and e.g. we're currently on timestep 87 (-1 = not started)
PERCENT_TRAIN = 0.75             # percent of data used for training vs. validation

In [10]:
########################################## START THE TRAINING #######################################

print("Running program...")

DEMONSTRATION_FOLDERS = Video_Utils.GetFoldersForRuns()
NUM_DEMONSTRATIONS = len(DEMONSTRATION_FOLDERS)

for folder in DEMONSTRATION_FOLDERS:
    print(folder)

# Separate into training and testing data based on the number of recordings (folders) we have:
random.shuffle(DEMONSTRATION_FOLDERS)
# int(...) guarantees a valid slice index regardless of what round() returns.
lim_separate = int(round(NUM_DEMONSTRATIONS * PERCENT_TRAIN))
TRAINING_FOLDERS = DEMONSTRATION_FOLDERS[:lim_separate]
TESTING_FOLDERS = DEMONSTRATION_FOLDERS[lim_separate:]
NUM_TRAINING_DEMONSTRATIONS = len(TRAINING_FOLDERS)
NUM_TESTING_DEMONSTRATIONS = len(TESTING_FOLDERS)

# a + b = c
print(NUM_TRAINING_DEMONSTRATIONS)
print(NUM_TESTING_DEMONSTRATIONS)
print(NUM_DEMONSTRATIONS)

# Fail early with a readable message instead of letting randint(0, -1) raise an opaque
# "empty range" ValueError inside the loop.
if NUM_TRAINING_DEMONSTRATIONS == 0:
    raise ValueError("No training demonstrations available (found %d folders, PERCENT_TRAIN=%s)"
                     % (NUM_DEMONSTRATIONS, PERCENT_TRAIN))

# Data flow process: we train on entire folder (sample run of a robot) before moving on to the next to
# amortize the time required to load that folder's training data into RAM (multiple seconds). For a dynamics
# model this should be perfectly acceptable because the dynamics to be learned are ideally the *same* between
# different sample runs recorded on the (simulated) robot.
itrs = 0
while itrs < TOTAL_TRAINING_ITRS:

    # Choose which expert demonstration we're using:
    CURR_DEMONSTRATION = randint(0, NUM_TRAINING_DEMONSTRATIONS-1)

    frames, actions = Video_Utils.LoadFramesActionsFromFolder(TRAINING_FOLDERS[CURR_DEMONSTRATION], CAM_W, CAM_H, CAM_C, ACTION_LEN)
    # ^ about 5 secs

    LENGTH_CURR_DEMONSTRATION = frames.shape[0] # number of timesteps for this demonstration

    print("\n===== Training on demonstration #:"+str(CURR_DEMONSTRATION)+" with num timesteps "+str(LENGTH_CURR_DEMONSTRATION))

    # TODO: fill the PREV_*/FUTURE_* buffers from `frames`/`actions` and run the training step(s) here.

    # One iteration == one demonstration processed. Without this increment the loop condition never
    # changes and the cell only stops on a manual interrupt.
    itrs += 1
    
    

Running program...
u_sequence_1486408000621
u_sequence_1486420877768
u_sequence_1486421372155
u_sequence_1486422003939
u_sequence_1486423780289
u_sequence_1486424226636
u_sequence_1486424664609
u_sequence_1486425113072
u_sequence_1486425723665
u_sequence_1486426394928
u_sequence_1486483126414
u_sequence_1486483372904
u_sequence_1486483607589
u_sequence_1486483842933
u_sequence_1486484146973
u_sequence_1486484663068
u_sequence_1486485509467
u_sequence_1486485832231
u_sequence_1486486076841
u_sequence_1486486803219
15
5
20

===== Training on demonstration #:0 with num timesteps 185

===== Training on demonstration #:1 with num timesteps 169

===== Training on demonstration #:13 with num timesteps 226


KeyboardInterrupt: 