In [1]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

In [2]:
# IMG_SIZE = 224
# BATCH_SIZE = 32
# EPOCHS = 20

# MAX_SEQ_LENGTH = 100
# NUM_FEATURES = 2048

In [3]:
MAX_SEQ_LENGTH = 20
NUM_FEATURES = 2048
IMG_SIZE = 128

EPOCHS = 10

In [4]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

train_df.sample(10)

Total videos for training: 9537
Total videos for testing: 3783


Unnamed: 0,video_name,tag
8154,v_StillRings_g22_c01.avi,StillRings
8263,v_Surfing_g09_c02.avi,Surfing
6404,v_PlayingViolin_g23_c02.avi,PlayingViolin
1564,v_BoxingPunchingBag_g09_c04.avi,BoxingPunchingBag
1853,v_BrushingTeeth_g11_c01.avi,BrushingTeeth
1170,v_Billiards_g22_c02.avi,Billiards
7913,v_SoccerJuggling_g12_c04.avi,SoccerJuggling
4490,v_JumpingJack_g22_c01.avi,JumpingJack
9455,v_YoYo_g09_c06.avi,YoYo
3136,v_GolfSwing_g20_c06.avi,GolfSwing


In [4]:
# The following two methods are taken from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub


def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]


def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)


In [5]:

def build_feature_extractor():
    feature_extractor = keras.applications.ResNet152(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.resnet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

2022-07-10 00:08:35.766152: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-10 00:08:36.264055: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10415 MB memory:  -> device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:02:00.0, compute capability: 6.1


In [6]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
print(label_processor.get_vocabulary())

['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress', 'Biking', 'Billiards', 'BlowDryHair', 'BlowingCandles', 'BodyWeightSquats', 'Bowling', 'BoxingPunchingBag', 'BoxingSpeedBag', 'BreastStroke', 'BrushingTeeth', 'CleanAndJerk', 'CliffDiving', 'CricketBowling', 'CricketShot', 'CuttingInKitchen', 'Diving', 'Drumming', 'Fencing', 'FieldHockeyPenalty', 'FloorGymnastics', 'FrisbeeCatch', 'FrontCrawl', 'GolfSwing', 'Haircut', 'HammerThrow', 'Hammering', 'HandstandPushups', 'HandstandWalking', 'HeadMassage', 'HighJump', 'HorseRace', 'HorseRiding', 'HulaHoop', 'IceDancing', 'JavelinThrow', 'JugglingBalls', 'JumpRope', 'JumpingJack', 'Kayaking', 'Knitting', 'LongJump', 'Lunges', 'MilitaryParade', 'Mixing', 'MoppingFloor', 'Nunchucks', 'ParallelBars', 'PizzaTossing', 'PlayingCello', 'PlayingDaf', 'PlayingDhol', 'PlayingFlute', 'PlayingGuitar', 'PlayingPiano', 'PlayingSitar', 'PlayingTabla', 'P

In [7]:
from tqdm import tqdm

def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = df["tag"].values
    labels = label_processor(labels[..., None]).numpy()

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in tqdm(enumerate(video_paths)):
        # Gather all its frames and add a batch dimension.
        frames = load_video(os.path.join(root_dir, path))
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :]
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels


# train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")

# print(f"Frame features in train set: {train_data[0].shape}")
# print(f"Frame masks in train set: {train_data[1].shape}")

0it [00:00, ?it/s]2022-07-10 00:08:42.919387: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8100
2022-07-10 00:08:43.148367: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-07-10 00:08:43.148778: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-07-10 00:08:43.148815: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2022-07-10 00:08:43.149460: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2022-07-10 00:08:43.149574: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
3783it [6:55:47,  6.59s/it]


In [9]:
train_data, train_labels = prepare_all_videos(train_df, "train")

2999it [5:29:26,  5.61s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

9537it [17:48:27,  6.72s/it]


In [None]:
# Utility for our sequence model.
# Bidirectional

def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.Bidirectional(keras.layers.LSTM(2048, return_sequences=False,dropout=0.5))(#16, 512
        frame_features_input, mask=mask_input
    )
#     x = keras.layers.LSTM(2048)(x)#8, 256
#     x = keras.layers.Dropout(0.1)(x)
    x = keras.layers.Dense(1024)(x)#8, 256
    x = keras.layers.LeakyReLU()(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

    rnn_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return rnn_model


# Utility for running experiments.
def run_experiment():
    filepath = "/tmp/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    print(seq_model.summary())
    
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.2,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


_, sequence_model = run_experiment()

In [None]:
#40.13% LSTM(2048,dropout=0.1)
#46.47% LSTM(2048,dropout=0.5)
#43.96% LSTM(4096,dropout=0.5)

In [None]:
class_vocab = label_processor.get_vocabulary()

frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

# Refer to the following tutorial to understand the significance of using `mask`:
# https://keras.io/api/layers/recurrent_layers/gru/
x = keras.layers.LSTM(2048, return_sequences=True,dropout=0.1)(#16, 512
    frame_features_input, mask=mask_input
)
#     x = keras.layers.LSTM(2048)(x)#8, 256
#     x = keras.layers.Dropout(0.1)(x)
x = keras.layers.Dense(1024, activation="relu")(x)#8, 256
output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

rnn_model = keras.Model([frame_features_input, mask_input], output)

rnn_model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

print(rnn_model.summary())

In [None]:
class_vocab = label_processor.get_vocabulary()

frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

# Refer to the following tutorial to understand the significance of using `mask`:
# https://keras.io/api/layers/recurrent_layers/gru/
x = keras.layers.LSTM(2048, return_sequences=False,dropout=0.1)(#16, 512
    frame_features_input, mask=mask_input
)
# x = keras.layers.LSTM(2048)(x)#8, 256
#     x = keras.layers.Dropout(0.1)(x)
x = keras.layers.Dense(1024, activation="relu")(x)#8, 256
output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

rnn_model = keras.Model([frame_features_input, mask_input], output)

rnn_model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

print(rnn_model.summary())

In [None]:

def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("test", path))
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames


# This utility is for visualization.
# Referenced from:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return embed.embed_file("animation.gif")


test_video = np.random.choice(test_df["video_name"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = sequence_prediction(test_video)
to_gif(test_frames[:MAX_SEQ_LENGTH])

In [None]:
ls -l

In [None]:
import numpy as np
import tensorflow as tf 

np.save('Inception_CNNRNN_train_data.npy', train_data)    # .npy extension is added if not given
# np.save('Inception_CNNRNN_test_data.npy', test_data[0])    # .npy extension is added if not given
# np.save('Inception_CNNRNN_train_labels.npy', train_data[1])    # .npy extension is added if not given
# np.save('Inception_CNNRNN_test_labels.npy', test_labels)    # .npy extension is added if not given

In [None]:
print(train_data[0].shape)
print(train_data[1].shape)

In [None]:
print(train_data.type)

In [10]:
import pickle

with open('Resnet152_CNNRNN_train_100.pickle', 'wb') as f:
    pickle.dump(train_data, f)

In [8]:
import pickle

with open('Resnet152_CNNRNN_test_100.pickle', 'wb') as f:
    pickle.dump(test_data, f)

In [None]:
import numpy as np
import tensorflow as tf 

# np.save('densenet_train_data.npy', train_data)    # .npy extension is added if not given
# np.save('densenet_test_data.npy', test_data)    # .npy extension is added if not given
np.save('CNNRNN_train_labels.npy', train_labels)    # .npy extension is added if not given
np.save('CNNRNN_test_labels.npy', test_labels)    # .npy extension is added if not given

In [None]:
ls -l

In [None]:
mv Inception_CNNRNN_20.pickle Inception_CNNRNN_train_20.pickle

In [4]:
import numpy as np
import tensorflow as tf 
import pickle

with open('Resnet152_CNNRNN_train_100.pickle', 'rb') as f:
     d_train_data = pickle.load(f)
with open('Resnet152_CNNRNN_test_100.pickle', 'rb') as f:
     d_test_data = pickle.load(f)
        
# d_train_data = np.load('densenet_train_data.npy')
# d_test_data = np.load('densenet_test_data.npy')
d_train_labels = np.load('CNNRNN_train_labels.npy')
d_test_labels = np.load('CNNRNN_test_labels.npy')

In [None]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
# print(label_processor.get_vocabulary())

def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.LSTM(2048, return_sequences=False,dropout=0.5)(#16, 512
        frame_features_input, mask=mask_input
    )    
#     x = keras.layers.Bidirectional(keras.layers.LSTM(2048, return_sequences=False,dropout=0.5),merge_mode='concat')(#16, 512
#         frame_features_input, mask=mask_input
#     )
#     x = keras.layers.LSTM(2048)(x)#8, 256
#     x = keras.layers.Dropout(0.1)(x)
    x = keras.layers.Dense(1024)(x)#8, 256
    x = keras.layers.GaussianNoise(0.4)(x)
    x = keras.layers.LeakyReLU(0.1)(x)    
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

#     rnn_model.compile(
#         loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
#     )
    return rnn_model

# Utility for running experiments.
def d_run_experiment():
    filepath = "/tmp/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    print(seq_model.summary())
    
    optimizer = keras.optimizers.SGD(lr=1e-3)
#     optimizer = keras.optimizers.Adadelta()
    
    seq_model.compile(
        loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
    )    
    
    history = seq_model.fit(
        [d_train_data[0], d_train_data[1]],
        d_train_labels,
#         validation_split=0.2,
        validation_data=([d_test_data[0], d_test_data[1]],
        d_test_labels),
        epochs=100,
#         epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([d_test_data[0], d_test_data[1]], d_test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


d_run_experiment()

In [None]:
#lr=0.1 77.40%
#lr=0.01 78.69%
#lr=0.001 77.85%

In [None]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
# print(label_processor.get_vocabulary())

def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.LSTM(2048, return_sequences=False,dropout=0.5)(#16, 512
        frame_features_input, mask=mask_input
    )    
#     x = keras.layers.Bidirectional(keras.layers.LSTM(2048, return_sequences=False,dropout=0.5),merge_mode='concat')(#16, 512
#         frame_features_input, mask=mask_input
#     )
#     x = keras.layers.LSTM(2048)(x)#8, 256
#     x = keras.layers.Dropout(0.1)(x)
    x = keras.layers.Dense(1024)(x)#8, 256
    x = keras.layers.GaussianNoise(0.4)(x)
    x = keras.layers.LeakyReLU(0.1)(x)    
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

#     rnn_model.compile(
#         loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
#     )
    return rnn_model

# Utility for running experiments.
def d_run_experiment():
    filepath = "/tmp/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    print(seq_model.summary())
    

    lr_schedule = keras.callbacks.LearningRateScheduler(
                  lambda epoch: 1e-6 * 10**(4*epoch / 10))
    
    optimizer = keras.optimizers.SGD(lr=1e-3)
    
    seq_model.compile(
        loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
    )    
#     seq_model.compile(optimizer=optimizer,
#                       loss='categorical_crossentropy',
#                      metrics=['accuracy'])     
    
    history = seq_model.fit(
        [d_train_data[0], d_train_data[1]],
        d_train_labels,
#         validation_split=0.2,
        validation_data=([d_test_data[0], d_test_data[1]],
        d_test_labels),
        epochs=20,
#         epochs=EPOCHS,
        callbacks=[lr_schedule,checkpoint],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([d_test_data[0], d_test_data[1]], d_test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


his, seq = d_run_experiment()


In [None]:
from matplotlib import pyplot as plt

plt.semilogx(his.history['lr'], his.history['accuracy'])
plt.axis([1e-6, 1, 0, 1])
plt.xlabel('lr')
plt.ylabel('accuracy')
plt.show()

In [6]:
import numpy as np

MAX_SEQ_LENGTH = 20#override max sequence length
#max_seq=40
# copy_indices=[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38]
# copy_indices=[0,4,8,12,16,20,24,28,32,36]
# copy_indices=[0,5,10,15,20,25,30,35]
# copy_indices=[0,8,16,24,32]
# copy_indices=[0,19]
# copy_indices=[0]
# max_seq=100
# # copy_indices=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,
#               26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,
#               51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,
#               76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99]
copy_indices=[0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95]
# copy_indices=[0,10,20,30,40,50,60,70,80,90]
#copy_indices=[0,20,40,60,80]
# copy_indices=[0,25,50,75]
# copy_indices=[0,50]
d_train_data2_1 = np.copy(d_train_data[0][:,copy_indices, ])
d_train_data2_2 = np.copy(d_train_data[1][:,copy_indices, ])
d_train_data2 = (d_train_data2_1,d_train_data2_2)

d_test_data2_1 = np.copy(d_test_data[0][:,copy_indices, ])
d_test_data2_2 = np.copy(d_test_data[1][:,copy_indices, ])
d_test_data2 = (d_test_data2_1,d_test_data2_2)
# print(d_train_data2.shape)

label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
# print(label_processor.get_vocabulary())

def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.LSTM(2048, return_sequences=False,dropout=0.5)(#16, 512
        frame_features_input, mask=mask_input
    )    
#     x = keras.layers.Bidirectional(keras.layers.LSTM(2048, return_sequences=False,dropout=0.5),merge_mode='concat')(#16, 512
#         frame_features_input, mask=mask_input
#     )
#     x = keras.layers.LSTM(2048)(x)#8, 256
#     x = keras.layers.Dropout(0.1)(x)
    x = keras.layers.Dense(1024)(x)#8, 256
    x = keras.layers.GaussianNoise(0.4)(x)
    x = keras.layers.LeakyReLU(0.1)(x)    
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

#     rnn_model.compile(
#         loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
#     )
    return rnn_model

# Utility for running experiments.
def d_run_experiment():
    filepath = "/tmp/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    print(seq_model.summary())
    
    optimizer = keras.optimizers.SGD(lr=1e-2)
#     optimizer = keras.optimizers.Adadelta()
    
    seq_model.compile(
        loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
    )    
    
    history = seq_model.fit(
        [d_train_data2[0], d_train_data2[1]],
        d_train_labels,
#         validation_split=0.2,
        validation_data=([d_test_data2[0], d_test_data2[1]],
        d_test_labels),
        epochs=100,
#         epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([d_test_data2[0], d_test_data2[1]], d_test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


d_run_experiment()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 20, 2048)]   0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 lstm_1 (LSTM)                  (None, 2048)         33562624    ['input_3[0][0]',                
                                                                  'input_4[0][0]']                
                                                                                                  
 dense_2 (Dense)                (None, 1024)         2098176     ['lstm_1[0][0]']           

2022-07-11 02:21:40.196429: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1562542080 exceeds 10% of free system memory.
2022-07-11 02:21:41.123292: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1562542080 exceeds 10% of free system memory.


Epoch 1/100
Epoch 1: val_loss improved from inf to 3.90117, saving model to /tmp/video_classifier
Epoch 2/100
Epoch 2: val_loss did not improve from 3.90117
Epoch 3/100
Epoch 3: val_loss did not improve from 3.90117
Epoch 4/100
Epoch 4: val_loss improved from 3.90117 to 0.79690, saving model to /tmp/video_classifier
Epoch 5/100
Epoch 5: val_loss improved from 0.79690 to 0.77655, saving model to /tmp/video_classifier
Epoch 6/100
Epoch 6: val_loss improved from 0.77655 to 0.74427, saving model to /tmp/video_classifier
Epoch 7/100
Epoch 7: val_loss did not improve from 0.74427
Epoch 8/100
Epoch 8: val_loss did not improve from 0.74427
Epoch 9/100
Epoch 9: val_loss did not improve from 0.74427
Epoch 10/100
Epoch 10: val_loss did not improve from 0.74427
Epoch 11/100
Epoch 11: val_loss did not improve from 0.74427
Epoch 12/100
Epoch 12: val_loss did not improve from 0.74427
Epoch 13/100
Epoch 13: val_loss did not improve from 0.74427
Epoch 14/100
Epoch 14: val_loss did not improve from 0.74

Epoch 30/100
Epoch 30: val_loss did not improve from 0.74427
Epoch 31/100
Epoch 31: val_loss did not improve from 0.74427
Epoch 32/100
Epoch 32: val_loss did not improve from 0.74427
Epoch 33/100
Epoch 33: val_loss did not improve from 0.74427
Epoch 34/100
Epoch 34: val_loss did not improve from 0.74427
Epoch 35/100
Epoch 35: val_loss did not improve from 0.74427
Epoch 36/100
Epoch 36: val_loss did not improve from 0.74427
Epoch 37/100
Epoch 37: val_loss did not improve from 0.74427
Epoch 38/100
Epoch 38: val_loss did not improve from 0.74427
Epoch 39/100
Epoch 39: val_loss did not improve from 0.74427
Epoch 40/100
Epoch 40: val_loss did not improve from 0.74427
Epoch 41/100
Epoch 41: val_loss did not improve from 0.74427
Epoch 42/100
Epoch 42: val_loss did not improve from 0.74427
Epoch 43/100
Epoch 43: val_loss did not improve from 0.74427
Epoch 44/100
Epoch 44: val_loss did not improve from 0.74427
Epoch 45/100
Epoch 45: val_loss did not improve from 0.74427
Epoch 46/100
Epoch 46: v

Epoch 60/100
Epoch 60: val_loss did not improve from 0.74427
Epoch 61/100
Epoch 61: val_loss did not improve from 0.74427
Epoch 62/100
Epoch 62: val_loss did not improve from 0.74427
Epoch 63/100
Epoch 63: val_loss did not improve from 0.74427
Epoch 64/100
Epoch 64: val_loss did not improve from 0.74427
Epoch 65/100
Epoch 65: val_loss did not improve from 0.74427
Epoch 66/100
Epoch 66: val_loss did not improve from 0.74427
Epoch 67/100
Epoch 67: val_loss did not improve from 0.74427
Epoch 68/100
Epoch 68: val_loss did not improve from 0.74427
Epoch 69/100
Epoch 69: val_loss did not improve from 0.74427
Epoch 70/100
Epoch 70: val_loss did not improve from 0.74427
Epoch 71/100
Epoch 71: val_loss did not improve from 0.74427
Epoch 72/100
Epoch 72: val_loss did not improve from 0.74427
Epoch 73/100
Epoch 73: val_loss did not improve from 0.74427
Epoch 74/100
Epoch 74: val_loss did not improve from 0.74427
Epoch 75/100
Epoch 75: val_loss did not improve from 0.74427
Epoch 76/100
Epoch 76: v

Epoch 89/100
Epoch 89: val_loss did not improve from 0.74427
Epoch 90/100
Epoch 90: val_loss did not improve from 0.74427
Epoch 91/100
Epoch 91: val_loss did not improve from 0.74427
Epoch 92/100
Epoch 92: val_loss did not improve from 0.74427
Epoch 93/100
Epoch 93: val_loss did not improve from 0.74427
Epoch 94/100
Epoch 94: val_loss did not improve from 0.74427
Epoch 95/100
Epoch 95: val_loss did not improve from 0.74427
Epoch 96/100
Epoch 96: val_loss did not improve from 0.74427
Epoch 97/100
Epoch 97: val_loss did not improve from 0.74427
Epoch 98/100
Epoch 98: val_loss did not improve from 0.74427
Epoch 99/100
Epoch 99: val_loss did not improve from 0.74427
Epoch 100/100
Epoch 100: val_loss did not improve from 0.74427


ValueError: Shapes (2048, 8192) and (1920, 8192) are incompatible

In [None]:
#max_seq=40
#seq=1 75.76%
#seq=2 78.48%
#seq=5 79.96%
#seq=8 79.88%
#seq=10 79.62%
#seq=20 79.62%

#max_seq=100
#seq=2 78.91%
#seq=4 80.89
#seq=5 80.31
#seq=10 81.07
#seq=20 81.18

In [5]:
import numpy as np
import tensorflow as tf 
import pickle

with open('Resnet152_CNNRNN_train_100.pickle', 'rb') as f:
     d_train_data = pickle.load(f)
with open('Resnet152_CNNRNN_test_100.pickle', 'rb') as f:
     d_test_data = pickle.load(f)
        
# d_train_data = np.load('densenet_train_data.npy')
# d_test_data = np.load('densenet_test_data.npy')
d_train_labels = np.load('CNNRNN_train_labels.npy')
d_test_labels = np.load('CNNRNN_test_labels.npy')

In [6]:
train_labels = d_train_labels
test_labels = d_test_labels

# print(f"Frame features in train set: {train_data.shape}")

In [7]:
import numpy as np

MAX_SEQ_LENGTH = 20#override max sequence length
#max_seq=40
# copy_indices=[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38]
# copy_indices=[0,4,8,12,16,20,24,28,32,36]
# copy_indices=[0,5,10,15,20,25,30,35]
# copy_indices=[0,8,16,24,32]
# copy_indices=[0,19]
# copy_indices=[0]
# max_seq=100
# # copy_indices=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,
#               26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,
#               51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,
#               76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99]
copy_indices=[0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95]
# copy_indices=[0,10,20,30,40,50,60,70,80,90]
#copy_indices=[0,20,40,60,80]
# copy_indices=[0,25,50,75]
# copy_indices=[0,50]
train_data = np.copy(d_train_data[0][:,copy_indices, ])
test_data = np.copy(d_test_data[0][:,copy_indices, ])
print(train_data.shape)

(9537, 20, 2048)


In [8]:

class PositionalEmbedding(keras.layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = keras.layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
        return mask


In [9]:

class TransformerEncoder(keras.layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
        )
        self.dense_proj = keras.Sequential(
            [keras.layers.Dense(dense_dim, activation=tf.nn.gelu), keras.layers.Dense(embed_dim),]
        )
        self.layernorm_1 = keras.layers.LayerNormalization()
        self.layernorm_2 = keras.layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


In [11]:
# Label preprocessing with StringLookup.
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"]), mask_token=None
)

def get_compiled_model():
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = NUM_FEATURES
    dense_dim = 16
    num_heads = 6
    classes = len(label_processor.get_vocabulary())

    inputs = keras.Input(shape=(None, None))
    x = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs)
#     x = TransformerEncoder2(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)
#     x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)
#     x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer2")(x)
#     x = TransformerEncoder3(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)

#     attention_output = keras.layers.MultiHeadAttention(
#         num_heads=num_heads, key_dim=embed_dim, dropout=0.1
#     )(x, x)

#     proj_input = keras.layers.LayerNormalization()(x + attention_output)
#     proj_output = keras.Sequential(
#         [keras.layers.Dense(dense_dim, activation=tf.nn.gelu), keras.layers.Dense(embed_dim),]
#     ) (proj_input)
#     representation = keras.layers.LayerNormalization()(proj_input + proj_output)

    # Layer normalization and MHSA
    encoded_patches = x
    
    for _ in range(2):
#         x1 = keras.layers.LayerNormalization()(encoded_patches)
        x1 = encoded_patches
        attention_output = keras.layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.1
        )(x1, x1)

        # Skip connection
        x2 = keras.layers.Add()([attention_output, x1])

        # Layer Normalization and MLP
        x3 = keras.layers.LayerNormalization()(x2)
        x3 = keras.Sequential(
            [
                keras.layers.Dense(units=dense_dim, activation=tf.nn.gelu),
                keras.layers.Dense(units=embed_dim),
            ]
        )(x3)

        # Skip connection
        encoded_patches = keras.layers.Add()([x3, x2])
        encoded_patches = keras.layers.LayerNormalization()(encoded_patches)

    representation = keras.layers.GlobalMaxPooling1D()(encoded_patches)
    representation = keras.layers.Dropout(0.5)(representation)
    
    outputs = keras.layers.Dense(classes, activation="softmax")(representation)
    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    return model


def run_experiment():
    filepath = "/tmp/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    model = get_compiled_model()
    history = model.fit(
        train_data,
        train_labels,
        # validation_split=0.15,
        validation_data=([test_data, test_labels]),
        epochs=200,
        callbacks=[checkpoint],
    )
    
    print(f"Test best accuracy: {round(max(history.history['val_accuracy'])*100,2)}%")

#     model.load_weights(filepath)
#     _, accuracy = model.evaluate(test_data, test_labels)
#     print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return model

trained_model = run_experiment()

Epoch 1/200
Epoch 1: val_loss improved from inf to 4.47149, saving model to /tmp/video_classifier
Epoch 2/200
Epoch 2: val_loss improved from 4.47149 to 4.03899, saving model to /tmp/video_classifier
Epoch 3/200
Epoch 3: val_loss improved from 4.03899 to 3.87626, saving model to /tmp/video_classifier
Epoch 4/200
Epoch 4: val_loss improved from 3.87626 to 3.38392, saving model to /tmp/video_classifier
Epoch 5/200
Epoch 5: val_loss improved from 3.38392 to 3.17215, saving model to /tmp/video_classifier
Epoch 6/200
Epoch 6: val_loss improved from 3.17215 to 2.89690, saving model to /tmp/video_classifier
Epoch 7/200
Epoch 7: val_loss improved from 2.89690 to 2.74776, saving model to /tmp/video_classifier
Epoch 8/200
Epoch 8: val_loss improved from 2.74776 to 2.54474, saving model to /tmp/video_classifier
Epoch 9/200
Epoch 9: val_loss did not improve from 2.54474
Epoch 10/200
Epoch 10: val_loss improved from 2.54474 to 2.33083, saving model to /tmp/video_classifier
Epoch 11/200
Epoch 11: va

Epoch 28: val_loss did not improve from 1.85592
Epoch 29/200
Epoch 29: val_loss did not improve from 1.85592
Epoch 30/200
Epoch 30: val_loss did not improve from 1.85592
Epoch 31/200
Epoch 31: val_loss did not improve from 1.85592
Epoch 32/200
Epoch 32: val_loss did not improve from 1.85592
Epoch 33/200
Epoch 33: val_loss did not improve from 1.85592
Epoch 34/200
Epoch 34: val_loss did not improve from 1.85592
Epoch 35/200
Epoch 35: val_loss did not improve from 1.85592
Epoch 36/200
Epoch 36: val_loss did not improve from 1.85592
Epoch 37/200
Epoch 37: val_loss did not improve from 1.85592
Epoch 38/200
Epoch 38: val_loss did not improve from 1.85592
Epoch 39/200
Epoch 39: val_loss did not improve from 1.85592
Epoch 40/200
Epoch 40: val_loss did not improve from 1.85592
Epoch 41/200
Epoch 41: val_loss did not improve from 1.85592
Epoch 42/200
Epoch 42: val_loss did not improve from 1.85592
Epoch 43/200
Epoch 43: val_loss did not improve from 1.85592
Epoch 44/200
Epoch 44: val_loss did n

Epoch 57: val_loss did not improve from 1.85592
Epoch 58/200
Epoch 58: val_loss did not improve from 1.85592
Epoch 59/200
Epoch 59: val_loss did not improve from 1.85592
Epoch 60/200
Epoch 60: val_loss did not improve from 1.85592
Epoch 61/200
Epoch 61: val_loss did not improve from 1.85592
Epoch 62/200
Epoch 62: val_loss did not improve from 1.85592
Epoch 63/200
Epoch 63: val_loss did not improve from 1.85592
Epoch 64/200
Epoch 64: val_loss did not improve from 1.85592
Epoch 65/200
Epoch 65: val_loss did not improve from 1.85592
Epoch 66/200
Epoch 66: val_loss did not improve from 1.85592
Epoch 67/200
Epoch 67: val_loss did not improve from 1.85592
Epoch 68/200
Epoch 68: val_loss did not improve from 1.85592
Epoch 69/200
Epoch 69: val_loss did not improve from 1.85592
Epoch 70/200
Epoch 70: val_loss did not improve from 1.85592
Epoch 71/200
Epoch 71: val_loss did not improve from 1.85592
Epoch 72/200
Epoch 72: val_loss did not improve from 1.85592
Epoch 73/200
Epoch 73: val_loss did n

Epoch 86: val_loss did not improve from 1.85592
Epoch 87/200
Epoch 87: val_loss did not improve from 1.85592
Epoch 88/200
Epoch 88: val_loss did not improve from 1.85592
Epoch 89/200
Epoch 89: val_loss did not improve from 1.85592
Epoch 90/200
Epoch 90: val_loss did not improve from 1.85592
Epoch 91/200
Epoch 91: val_loss did not improve from 1.85592
Epoch 92/200
Epoch 92: val_loss did not improve from 1.85592
Epoch 93/200
Epoch 93: val_loss did not improve from 1.85592
Epoch 94/200
Epoch 94: val_loss did not improve from 1.85592
Epoch 95/200
Epoch 95: val_loss did not improve from 1.85592
Epoch 96/200
Epoch 96: val_loss did not improve from 1.85592
Epoch 97/200
Epoch 97: val_loss did not improve from 1.85592
Epoch 98/200
Epoch 98: val_loss did not improve from 1.85592
Epoch 99/200
Epoch 99: val_loss did not improve from 1.85592
Epoch 100/200
Epoch 100: val_loss did not improve from 1.85592
Epoch 101/200
Epoch 101: val_loss did not improve from 1.85592
Epoch 102/200
Epoch 102: val_loss

Epoch 115: val_loss did not improve from 1.85592
Epoch 116/200
Epoch 116: val_loss did not improve from 1.85592
Epoch 117/200
Epoch 117: val_loss did not improve from 1.85592
Epoch 118/200
Epoch 118: val_loss did not improve from 1.85592
Epoch 119/200
Epoch 119: val_loss did not improve from 1.85592
Epoch 120/200
Epoch 120: val_loss did not improve from 1.85592
Epoch 121/200
Epoch 121: val_loss did not improve from 1.85592
Epoch 122/200
Epoch 122: val_loss did not improve from 1.85592
Epoch 123/200
Epoch 123: val_loss did not improve from 1.85592
Epoch 124/200
Epoch 124: val_loss did not improve from 1.85592
Epoch 125/200
Epoch 125: val_loss did not improve from 1.85592
Epoch 126/200
Epoch 126: val_loss did not improve from 1.85592
Epoch 127/200
Epoch 127: val_loss did not improve from 1.85592
Epoch 128/200
Epoch 128: val_loss did not improve from 1.85592
Epoch 129/200
Epoch 129: val_loss did not improve from 1.85592
Epoch 130/200
Epoch 130: val_loss did not improve from 1.85592
Epoch 

Epoch 144: val_loss did not improve from 1.85592
Epoch 145/200
Epoch 145: val_loss did not improve from 1.85592
Epoch 146/200
Epoch 146: val_loss did not improve from 1.85592
Epoch 147/200
Epoch 147: val_loss did not improve from 1.85592
Epoch 148/200
Epoch 148: val_loss did not improve from 1.85592
Epoch 149/200
Epoch 149: val_loss did not improve from 1.85592
Epoch 150/200
Epoch 150: val_loss did not improve from 1.85592
Epoch 151/200
Epoch 151: val_loss did not improve from 1.85592
Epoch 152/200
Epoch 152: val_loss did not improve from 1.85592
Epoch 153/200
Epoch 153: val_loss did not improve from 1.85592
Epoch 154/200
Epoch 154: val_loss did not improve from 1.85592
Epoch 155/200
Epoch 155: val_loss did not improve from 1.85592
Epoch 156/200
Epoch 156: val_loss did not improve from 1.85592
Epoch 157/200
Epoch 157: val_loss did not improve from 1.85592
Epoch 158/200
Epoch 158: val_loss did not improve from 1.85592
Epoch 159/200
Epoch 159: val_loss did not improve from 1.85592
Epoch 

Epoch 173: val_loss did not improve from 1.85592
Epoch 174/200
Epoch 174: val_loss did not improve from 1.85592
Epoch 175/200
Epoch 175: val_loss did not improve from 1.85592
Epoch 176/200
Epoch 176: val_loss did not improve from 1.85592
Epoch 177/200
Epoch 177: val_loss did not improve from 1.85592
Epoch 178/200
Epoch 178: val_loss did not improve from 1.85592
Epoch 179/200
Epoch 179: val_loss did not improve from 1.85592
Epoch 180/200
Epoch 180: val_loss did not improve from 1.85592
Epoch 181/200
Epoch 181: val_loss did not improve from 1.85592
Epoch 182/200
Epoch 182: val_loss did not improve from 1.85592
Epoch 183/200
Epoch 183: val_loss did not improve from 1.85592
Epoch 184/200
Epoch 184: val_loss did not improve from 1.85592
Epoch 185/200
Epoch 185: val_loss did not improve from 1.85592
Epoch 186/200
Epoch 186: val_loss did not improve from 1.85592
Epoch 187/200
Epoch 187: val_loss did not improve from 1.85592
Epoch 188/200
Epoch 188: val_loss did not improve from 1.85592
Epoch 

In [None]:
#total sequence = 100
#max_sequence = 4
#head=1
#dense_dim=4 69.26
#8 71.50
#16 70.82
#32 68.70
#64 71.21
#128 69.65
#256
#512
#1024


#max_sequence = 2
#head=1
#dense_dim=4 71.08
#128 70.05
#head=4
#128 75.02
#256 74.76
#head=6
#128 75.57
#64 75.39

##max_sequence = 10
#head=6
#64 77.24
#128 76.53
#32 77.4
#1024 74.81
#16 77.13
#8 76.55

##max_sequence =20
#head=6
#8 77.08
#16 76.55
#4 75.73
#32 77.24
#64 75.84