In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local project modules
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import tensorflow as tf
import numpy as np
from dataset import DataManager, FrameDataGenerator

# Set up the data manager that indexes the recordings under data_path.
# The train/val/test loaders are later built from its get_training_files(),
# get_validation_files() and get_test_files() results.
# NOTE(review): data_path is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs on other machines.
# NOTE(review): train_val_test_split is presumably the (train, val, test)
# fractions of the files — confirm against DataManager.
data_manager = DataManager(
        data_path=r"/home/ji/Dropbox/Robotics/ENPM809K_Fundamentals_in_AI_and_DL/Data",
        train_val_test_split=(0.80,0.1,0.1))

2023-11-24 09:33:43.297107: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[32m2023-11-24 09:33:45.984[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m28[0m - [1mFound video file: /home/ji/Dropbox/Robotics/ENPM809K_Fundamentals_in_AI_and_DL/Data/Data_2023-11-06_09-22-28.avi[0m
[32m2023-11-24 09:33:45.985[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m28[0m - [1mFound video file: /home/ji/Dropbox/Robotics/ENPM809K_Fundamentals_in_AI_and_DL/Data/Data_2023-11-12_09-53-23.avi[0m
[32m2023-11-24 09:33:45.985[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m28[0m - [1mFound video file: /home/ji/Dropbox/Robotics/ENPM809K_Fundamentals_in_AI_and_DL/Data/Data_2023-11-06_09-42-29.avi[0m
[32m2023-11-24 

In [15]:
# Configuration shared by the three frame generators.
# fold_n_frames: fold n frames together to predict frame n+1.
config = {"fold_n_frames": 4}

# One generator per split, each fed the file list the data manager
# assigned to that split.
train_loader = FrameDataGenerator(
    file_list=data_manager.get_training_files(),
    config=config,
)
val_loader = FrameDataGenerator(
    file_list=data_manager.get_validation_files(),
    config=config,
)
test_loader = FrameDataGenerator(
    file_list=data_manager.get_test_files(),
    config=config,
)

# Element signature produced by the generators, in order:
#   1. the folded input frames (3 channels per folded frame),
#   2. the target frame (3 channels),
#   3. the flattened action vector (n_col values per folded frame).
# The two leading frame dimensions are left unspecified (None).
n_col = 7  # the txt files have 7 columns
output_signature = (
    tf.TensorSpec(
        shape=(None, None, 3 * config["fold_n_frames"]),
        dtype=train_loader.frame_dtype,
    ),
    tf.TensorSpec(
        shape=(None, None, 3),
        dtype=train_loader.frame_dtype,
    ),
    tf.TensorSpec(
        shape=(n_col * config["fold_n_frames"],),
        dtype=train_loader.action_dtype,
    ),
)

# Wrap each generator in a tf.data.Dataset using the shared signature.
train_ds = tf.data.Dataset.from_generator(
    train_loader, output_signature=output_signature)
val_ds = tf.data.Dataset.from_generator(
    val_loader, output_signature=output_signature)
test_ds = tf.data.Dataset.from_generator(
    test_loader, output_signature=output_signature)

# Mini-batches of 8, with up to 20 batches prefetched.
train_ds = train_ds.batch(8).prefetch(buffer_size=20)

# Validation and test evaluation reuse the training pipeline: instead of
# computing the loss over the whole validation/test set, we sample
# mini-batches, compute the loss on each, repeat several times and take
# the mean.
val_ds = val_ds.batch(8).prefetch(buffer_size=20)
test_ds = test_ds.batch(8).prefetch(buffer_size=20)

[32m2023-11-24 09:46:03.444[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m122[0m - [1mFound action file: /home/ji/Dropbox/Robotics/ENPM809K_Fundamentals_in_AI_and_DL/Data/Data_2023-10-27_10-00-04_merge.txt[0m
[32m2023-11-24 09:46:03.444[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m122[0m - [1mFound action file: /home/ji/Dropbox/Robotics/ENPM809K_Fundamentals_in_AI_and_DL/Data/Data_2023-11-06_09-32-29_merge.txt[0m
[32m2023-11-24 09:46:03.445[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m122[0m - [1mFound action file: /home/ji/Dropbox/Robotics/ENPM809K_Fundamentals_in_AI_and_DL/Data/Data_2023-10-26_10-15-07_merge.txt[0m
[32m2023-11-24 09:46:03.446[0m | [1mINFO    [0m | [36mdataset[0m:[36m__init__[0m:[36m122[0m - [1mFound action file: /home/ji/Dropbox/Robotics/ENPM809K_Fundamentals_in_AI_and_DL/Data/Data_2023-10-25_09-33-20_merge.txt[0m
[32m2023-11-24 09:46:03.446[0m | [1mINFO    [0m | [36mdataset[0m:

In [4]:
import numpy as np

# image in
def outdim(d,f,p,s):
    """Output size of a convolution along one spatial dimension.

    Computes floor((d + 2*p - f) / s) + 1.

    Args:
        d: input size along the dimension.
        f: kernel size.
        p: padding applied on each side.
        s: stride.

    Returns:
        The output size, as the float-typed value produced by np.floor
        plus one.
    """
    # The floor must be taken after dividing by the stride. Flooring only
    # the numerator yields fractional sizes (e.g. 44.5 instead of 44)
    # whenever (d + 2*p - f) is not a multiple of s.
    return np.floor((d+2*p-f)/s)+1

# image out size
def outdim_conv2d_tr(d,f,p,s, output_padding):
    """Output size of a transposed convolution along one spatial dimension.

    Computes (d - 1) * s + f - 2 * p + output_padding.

    Args:
        d: input size along the dimension.
        f: kernel size.
        p: padding.
        s: stride.
        output_padding: extra size added to the output.

    Returns:
        The output size along that dimension.
    """
    upsampled = (d - 1) * s
    return upsampled + f - 2 * p + output_padding

# Encoder path: follow one spatial dimension of size 192 through four
# unpadded, stride-2 convolutions with kernel sizes 8, 6, 6 and 4.
d1 = outdim(d=192, f=8, p=0, s=2)
d2 = outdim(d=d1, f=6, p=0, s=2)
d3 = outdim(d=d2, f=6, p=0, s=2)
d4 = outdim(d=d3, f=4, p=0, s=2)
print(d4)

# Decoder path: undo the encoder with transposed convolutions in reverse
# kernel order (4, 6, 6, 8); the third one uses output_padding=1.
d4_ = outdim_conv2d_tr(d=d4, f=4, p=0, s=2, output_padding=0)
d3_ = outdim_conv2d_tr(d=d4_, f=6, p=0, s=2, output_padding=0)
d2_ = outdim_conv2d_tr(d=d3_, f=6, p=0, s=2, output_padding=1)
d1_ = outdim_conv2d_tr(d=d2_, f=8, p=0, s=2, output_padding=0)
print(d1_)

9.0
192.0


In [16]:
from model import EncoderNet, DecoderNet, InteractionModule, FramePredictionModel

# some hyper parameters for the network
# encoder_output_dim is the number of units of the encoder's final dense
# layer and is also passed to the interaction module as encoder_dim.
encoder_output_dim = 1024


# Shape passed to the first conv layer as input_shape; the last entry is
# 3 channels times the number of folded frames.
# NOTE(review): presumably ordered (height, width, channels) — confirm.
img_dim = (192,256,3*config["fold_n_frames"])

# Encoder: four stride-2 convolutions (kernel sizes 8, 6, 6, 4), then a
# flatten and a dense layer of encoder_output_dim units.
encoder = EncoderNet(layer_specs = [
                        {"type":"conv2d","kwargs":{"filters": 64,  "kernel_size": 8, "strides":2,"activation":"relu","input_shape":img_dim}},
                        {"type":"conv2d","kwargs":{"filters": 128, "kernel_size": 6, "strides":2,"activation":"relu"}},
                        {"type":"conv2d","kwargs":{"filters": 128, "kernel_size": 6, "strides":2,"activation":"relu"}},
                        {"type":"conv2d","kwargs":{"filters": 128, "kernel_size": 4, "strides":2,"activation":"relu"}},
                        {"type":"flatten"},
                        {"type":"dense","kwargs":{"units":encoder_output_dim,"activation":"relu"}}
                        ]
                    )

# get the output size of the last conv layer
# Index 3 is the fourth entry of layer_specs (the last conv2d); [1:] drops
# the first entry of its output_shape.
# NOTE(review): this reaches into the private `_layers` attribute and uses a
# hard-coded index, so it must be updated if layer_specs changes. The dropped
# first entry is presumably the batch dimension — confirm against EncoderNet.
conv_out_size = encoder._layers.layers[3].output_shape[1:]

# create the interaction module
# NOTE(review): action_dim=7 repeats the literal 7 used for n_col in the
# dataset cell — presumably the same quantity; confirm and share a constant.
interaction_dim, interaction_output_dim = 2048, 2048
interaction = InteractionModule(encoder_dim=encoder_output_dim,
                                action_dim=7,
                                intermediate_dim=interaction_dim,
                                output_dim=interaction_output_dim)

# fc_dim is the dimension of the first fc layer in decoder
# It is the product of the first two entries of conv_out_size times nchan,
# so the dense output can be reshaped to
# (conv_out_size[0], conv_out_size[1], nchan) by the reshape layer below.
nchan = 64
fc_dim = np.prod(conv_out_size[:2])*nchan

# Decoder: dense + reshape, then four stride-2 transposed convolutions with
# the encoder's kernel sizes in reverse order (4, 6, 6, 8). The third one
# sets output_padding=1, and the last one has 3 filters and no activation.
decoder = DecoderNet(layer_specs = [
                        {"type":"dense","kwargs":{"units":fc_dim,"activation":"relu","input_shape":(interaction_output_dim,)}},
                        {"type":"reshape","kwargs":{"target_shape":(conv_out_size[0],conv_out_size[1],nchan)}},
                        {"type":"conv2dtr","kwargs":{"filters": 128, "kernel_size": 4, "strides":2,"activation":"relu"}},
                        {"type":"conv2dtr","kwargs":{"filters": 128, "kernel_size": 6, "strides":2,"activation":"relu"}},
                        {"type":"conv2dtr","kwargs":{"filters": 128, "kernel_size": 6, "strides":2,"activation":"relu", "output_padding":1}},
                        {"type":"conv2dtr","kwargs":{"filters":   3, "kernel_size": 8, "strides":2,"activation":None}},
                        ]
                    )

# make sure decoder output matches the input image's size
# NOTE(review): the first assert compares decoder.output_shape[:2] directly
# with img_dim[:2], which assumes DecoderNet.output_shape carries no leading
# batch dimension — confirm against DecoderNet.
assert(decoder.output_shape[:2]==img_dim[:2])
assert(decoder.output_shape[-1]==3)


In [17]:
# Assemble the end-to-end frame prediction model from its three parts.
fpm = FramePredictionModel(
    encoder=encoder,
    decoder=decoder,
    interaction=interaction,
)

# Smoke test on one training batch: feed the input frames and actions
# through the model and check that the prediction has the same shape as
# the target frames.
frame_input_sample, frame_output_sample, actions_sample = next(iter(train_ds))
out_test = fpm((frame_input_sample, actions_sample))
assert out_test.shape == frame_output_sample.shape

In [24]:
# try fitting on a small dataset, e.g., a single 5 min video
# With the training step commented out below, this loop only iterates the
# training dataset and prints the cumulative elapsed time after each batch.
# It stops after 31 batches (the break fires once i exceeds 30).
# NOTE(review): `i`, `data`, `t0` and the `time` import remain defined after
# this cell; the single-step timing cell reads `data` and `time` from here.
i = 0
import time
t0 = time.time()
for data in train_ds:
    # fpm.step(mini_batch=data,verbose=True)
    i+=1
    print(i,time.time()-t0)
    if i > 30:
        break

1 0.6197526454925537
2 1.065995454788208
3 1.5411694049835205
4 1.9713358879089355
5 2.44104266166687
6 2.7647764682769775
7 3.1039600372314453
8 3.435509443283081
9 3.773118257522583
10 4.11020827293396
11 4.455305099487305
12 4.793283462524414
13 5.132407903671265
14 5.4733850955963135
15 5.794481515884399
16 6.125990629196167
17 6.46504807472229
18 6.793807029724121
19 7.09549617767334
20 7.385892629623413
21 7.673980951309204
22 7.965121746063232
23 8.268872737884521
24 8.558481216430664
25 8.852063655853271
26 9.141260147094727
27 9.427605867385864
28 9.716598510742188
29 10.011133909225464
30 10.303534269332886
31 10.599039554595947


In [23]:
# Time one call to fpm.step on a single mini-batch and print the elapsed
# seconds.
# NOTE(review): neither `time` nor `data` is defined in this cell; both come
# from the timing-loop cell (`import time` and the last batch its loop
# assigned to `data`), so that cell must be run first.
t0 = time.time()
fpm.step(mini_batch=data,verbose=True)
print(time.time()-t0)

[32m2023-11-24 10:20:35.982[0m | [1mINFO    [0m | [36mmodel[0m:[36mstep[0m:[36m213[0m - [1mloss: 0.09967827796936035[0m


0.20226407051086426


In [27]:
from PIL import Image

# Preview the first target frame of the sample batch as an 8-bit image.
# A dedicated name is used instead of `data`, because `data` holds the
# mini-batch that the training-step cells pass to fpm.step; rebinding it to
# an image array would break those cells when they are re-run.
frame_img = frame_output_sample.numpy()[0,:,:,:].squeeze()
# NOTE(review): the x255 scaling assumes frame values are floats in [0, 1] —
# confirm against FrameDataGenerator.
# Clip before casting so out-of-range values saturate at 0/255 instead of
# wrapping around in uint8.
frame_img = np.clip(frame_img*255, 0, 255).astype(np.uint8)
im = Image.fromarray(frame_img)
im.show()

/snap/core20/current/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.29' not found (required by /lib/x86_64-linux-gnu/libproxy.so.1)
Failed to load module: /home/ji/snap/code/common/.cache/gio-modules/libgiolibproxy.so
