In [None]:
# Standard-library, NumPy and OpenCV imports used throughout the notebook.
import os
import time
import numpy as np
from glob import glob
import json
import random
import cv2
import gc
# NOTE(review): this rebinds the name `time` from the module imported above to
# the function `time.time`, so `time.sleep(...)` etc. would no longer work.
from time import time
# Must be set before TensorFlow is imported (next statement) to take effect:
# asks TensorFlow to grow GPU memory on demand instead of reserving it all.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
# Apply the Keras "mixed_float16" mixed-precision policy globally, i.e. to
# every layer/model created or loaded after this point.
policy = tf.keras.mixed_precision.Policy("mixed_float16")
tf.keras.mixed_precision.set_global_policy(policy)
# NOTE(review): the strategy object is created here but no visible cell enters
# `strategy.scope()` -- confirm whether it is still needed.
strategy = tf.distribute.MultiWorkerMirroredStrategy()
from tensorflow import keras
from tensorflow.keras import layers
# Third-party package that provides the EfficientNetV2 backbone used below.
from keras_cv_attention_models import efficientnet

In [None]:
# Clip geometry: `image_frames` frames per clip, each a square image of
# `frame_draw_size` pixels per side with 3 channels.
image_frames, frame_draw_size = 60, 384
input_shape = (image_frames,) + (frame_draw_size,) * 2 + (3,)

In [None]:
# Make sure the pretrained backbone weights are available locally.
# The file is downloaded to a temporary name and only moved into place once the
# transfer has completed, so an interrupted or failed download can never leave
# a truncated `backbone_file` behind that later runs would mistake for valid.
backbone_file = 'efficientnetv2-s-21k-ft1k.h5'
# A zero-byte file is treated as missing: `wget -O` (used by earlier versions of
# this cell) creates the output file even when the download fails.
if not os.path.exists(backbone_file) or os.path.getsize(backbone_file) == 0:
    import urllib.request

    print('downloading backbone')
    backbone_url = 'https://github.com/leondgarse/keras_efficientnet_v2/releases/download/effnetv2_pretrained/' + backbone_file
    partial_file = backbone_file + '.part'
    try:
        # Raises (URLError / HTTPError / ContentTooShortError) on failure
        # instead of silently continuing without usable weights.
        urllib.request.urlretrieve(backbone_url, partial_file)
        os.replace(partial_file, backbone_file)
    finally:
        # Only still present if the download or the rename failed.
        if os.path.exists(partial_file):
            os.remove(partial_file)


In [None]:
# Load the most recently modified checkpoint found in models/.
models = glob('models/*.h5')
if not models:
    # Without this guard `models[-1]` fails with an uninformative IndexError.
    raise FileNotFoundError("no trained model found: 'models/*.h5' matched no files")
# Oldest first, so the last entry is the newest file on disk.
models.sort(key=os.path.getmtime)
model_name = models[-1]
print('loading model: ' + model_name)
model = keras.models.load_model(model_name)
model.summary()

In [None]:
# Open the test video, seek to the segment of interest and build the per-frame
# feature extractor. Fails fast when the video is missing or unreadable instead
# of letting later cells run on an empty capture.
test_video = 'test.mp4'
if not os.path.exists(test_video):
    raise FileNotFoundError('no video: ' + test_video)

# Start position inside the video, converted to milliseconds for OpenCV.
start_time_minute = 31
start_time_second = 40
start_time = (start_time_minute * 60 + start_time_second) * 1000

# open the video
cap = cv2.VideoCapture(test_video)
if not cap.isOpened():
    # The file exists but OpenCV could not open/decode it.
    raise RuntimeError('could not open video: ' + test_video)

cap.set(cv2.CAP_PROP_POS_MSEC, start_time)
frames = []
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Side length (pixels) each frame is resized to before feature extraction.
image_size = 384

# Backbone without a classification head (num_classes=0); it is used to turn
# every frame into a feature vector that is fed to `model`.
featuremodel = efficientnet.EfficientNetV2S(pretrained=backbone_file,dropout=1e-6, num_classes=0, include_preprocessing = True)
# Accumulators filled by the inference loop in the next cell.
predictions = []
frame_batch = []


In [None]:

# Stream frames from `cap`, extract per-frame features with `featuremodel`,
# group them into clips of `image_frames` frames and run `model` on every
# `clips_per_batch` clips. Stops at end of video, on ESC, or once
# `max_predictions` clip predictions have been collected.
clips_per_batch = 2
max_predictions = 10

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Keep only the left half of the source frame, then shrink it to the
        # square size the feature extractor is fed with.
        frame = frame[:, : width // 2]
        frame = cv2.resize(frame, (image_size, image_size))
        cv2.imshow('frame', frame)
        # OpenCV decodes to BGR; convert to RGB before feature extraction.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        k = cv2.waitKey(1)
        if k == 27:  # ESC
            break
        # verbose=0: one progress bar per frame would flood the cell output.
        frame_features = featuremodel.predict(np.expand_dims(frame, axis=0), verbose=0)
        frames.append(frame_features[0])
        if len(frames) == image_frames:
            # A full clip has been collected; queue it and start the next one.
            frame_batch.append(np.array(frames))
            frames = []
        if len(frame_batch) == clips_per_batch:
            prediction = model.predict(np.array(frame_batch), verbose=0)
            predictions.extend(prediction)
            frame_batch = []
        if len(predictions) >= max_predictions:
            break
finally:
    # Close the preview window even on error/interrupt; an OpenCV window that
    # is no longer serviced by waitKey otherwise stays open and unresponsive.
    cv2.destroyAllWindows()
           

In [None]:

import matplotlib.pyplot as plt

predictions_nparray = np.array(predictions)
print(predictions_nparray.shape)

# Cut every clip-level prediction into consecutive chunks of 6 entries, so that
# each row of `new_predictions` holds one chunk.
new_predictions = np.array([
    clip_prediction[offset:offset + 6]
    for clip_prediction in predictions_nparray
    for offset in range(0, len(clip_prediction), 6)
])
print(new_predictions.shape)

# One figure per column: first column 0, then column 1, across all chunks.
for column in (0, 1):
    plt.plot(new_predictions[:, column])
    plt.show()


In [None]:
# Display the raw list of clip-level model outputs collected by the inference
# loop (unsliced; the per-chunk view is `new_predictions`).
predictions

In [None]:
def _overlay_segments(frame_prediction, size):
    """Map one 6-value prediction to the overlay line segments to draw.

    Axis names below follow the original author's notes.

    Args:
        frame_prediction: indexable with numeric entries at indices 0-5. Each
            entry is scaled by ``size``, so values are presumably normalised
            to roughly 0..1 -- TODO confirm against the model output.
        size: side length in pixels of the square frame that is drawn on.

    Returns:
        A list of ``((x1, y1), (x2, y2), color)`` tuples with integer pixel
        coordinates. ``color`` is handed to ``cv2.line`` unchanged, i.e. it is
        interpreted in the channel order of the frame being drawn on (BGR).
    """
    half = size // 2

    # Axes 0 + 1 merged: a horizontal bar. Axis 0 (vertical) sets its height
    # measured from the bottom edge, axis 1 (depth) sets its width around the
    # horizontal centre.
    bar_y = size - int(size * frame_prediction[0])
    bar_half_width = int(size * frame_prediction[1]) // 2

    # Axes 2 + 3 merged: a vertical bar. Axis 2 (horizontal) sets its x
    # position, axis 3 (pitch) sets its length around the vertical centre.
    bar_x = int(size * frame_prediction[2])
    bar_half_height = int(size * frame_prediction[3]) // 2

    # Axis 4 (roll): a tilting line in the top-left quadrant; it is vertical
    # when the value is 0.5.
    roll_x_top = int(size * frame_prediction[4]) // 2
    roll_x_bottom = int(size - size * frame_prediction[4]) // 2

    # Axis 5 (twist): a tilting line in the top-right quadrant; it is
    # horizontal when the value is 0.5.
    twist_y_left = int(size * frame_prediction[5]) // 2
    twist_y_right = int(size - size * frame_prediction[5]) // 2

    return [
        ((half - bar_half_width, bar_y), (half + bar_half_width, bar_y), (0, 0, 255)),
        ((bar_x, half - bar_half_height), (bar_x, half + bar_half_height), (0, 255, 0)),
        ((roll_x_top, 0), (roll_x_bottom, half), (255, 0, 0)),
        ((half, twist_y_left), (size, twist_y_right), (255, 255, 0)),
    ]


# Replay the analysed segment from its start and draw every per-frame
# prediction on top of the corresponding video frame.
cap.set(cv2.CAP_PROP_POS_MSEC, start_time)
frames = []
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

print(len(new_predictions))
frame_draw_size = 1024
spacing = 16
line_width = 4
frame_for_mp4 = []

try:
    for frame_prediction in new_predictions:
        ret, frame = cap.read()
        if not ret:
            # Video ended (or decoding failed) before all predictions were
            # drawn; `frame` is None here and must not be sliced.
            break
        # Same left-half crop as used for inference, enlarged for display.
        frame = frame[:, : width // 2]
        frame = cv2.resize(frame, (frame_draw_size, frame_draw_size))

        for start_point, end_point, color in _overlay_segments(frame_prediction, frame_draw_size):
            frame = cv2.line(frame, start_point, end_point, color, line_width)

        # OpenCV frames are BGR; imageio expects RGB.
        frame_for_mp4.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        cv2.imshow('frame', frame)
        k = cv2.waitKey(16)
        if k == 27:  # ESC stops the replay; frames rendered so far are kept.
            break
finally:
    cv2.destroyAllWindows()

if not frame_for_mp4:
    raise RuntimeError('no frames were rendered: nothing to write')

import imageio

# Never write to `test_video` itself: it is the input and is still opened by
# `cap`, so saving to the same path would destroy the source video.
overlay_video = os.path.splitext(test_video)[0] + '_overlay.mp4'
imageio.mimsave(overlay_video, frame_for_mp4, fps=60)

# write gif
imageio.mimsave('test.gif', frame_for_mp4, duration=1/60)

