# Video Classifier Using CNN and RNN

In [3]:
import os
import shutil
from sklearn.model_selection import train_test_split

def split_dataset(root_dir, train_dir, test_dir, test_size=0.2, random_state=42):
    """
    Splits the dataset into train and test sets.
    
    :param root_dir: Root directory containing class subfolders
    :param train_dir: Directory to store the training set
    :param test_dir: Directory to store the test set
    :param test_size: Proportion of the dataset to include in the test split
    :param random_state: Random seed for reproducibility
    """
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
        
    classes = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
    
    for class_name in classes:
        class_path = os.path.join(root_dir, class_name)
        videos = [os.path.join(class_path, f) for f in os.listdir(class_path) if os.path.isfile(os.path.join(class_path, f))]
        
        train_videos, test_videos = train_test_split(videos, test_size=test_size, random_state=random_state)
        
        train_class_dir = os.path.join(train_dir, class_name)
        test_class_dir = os.path.join(test_dir, class_name)
        
        if not os.path.exists(train_class_dir):
            os.makedirs(train_class_dir)
        if not os.path.exists(test_class_dir):
            os.makedirs(test_class_dir)
            
        for video in train_videos:
            shutil.copy(video, train_class_dir)
        
        for video in test_videos:
            shutil.copy(video, test_class_dir)
            
    print(f"Dataset split into {train_dir} and {test_dir} with test size {test_size}")

# Usage
root_dir = 'raw-data/Videos'
train_dir = 'dataset/train'
test_dir = 'dataset/test'


# split_dataset(root_dir, train_dir, test_dir)

Dataset split into dataset/train and dataset/test with test size 0.2


In [4]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os

dataset_path = os.listdir('dataset/train')

label_types = os.listdir('dataset/train')
print (label_types)  

['storytelling', 'coverage']


# Preparing Training Data

In [5]:
rooms = []

for item in dataset_path:
 # Get all the file names
 all_rooms = os.listdir('dataset/train' + '/' +item)

 # Add them to the list
 for room in all_rooms:
    rooms.append((item, str('dataset/train' + '/' +item) + '/' + room))
    
# Build a dataframe        
train_df = pd.DataFrame(data=rooms, columns=['tag', 'video_name'])
print(train_df.head())
print(train_df.tail())


            tag                                         video_name
0  storytelling  dataset/train/storytelling/FQIoAkMzLBdALabpeNT...
1  storytelling  dataset/train/storytelling/FQIoAkMzLBdAY9wo9cK...
2  storytelling  dataset/train/storytelling/fRgSZGFzaF9iYXNlbGl...
3  storytelling  dataset/train/storytelling/0B485B7E5AD678A4E18...
4  storytelling  dataset/train/storytelling/324F38FF657A54C14F3...
          tag                                         video_name
203  coverage  dataset/train/coverage/F4490C5A6035F152E38A305...
204  coverage  dataset/train/coverage/774140244F7C71D31F382DC...
205  coverage  dataset/train/coverage/FQIoAkMzLBdATrhR64UeuBg...
206  coverage  dataset/train/coverage/qdit5AFQIoAkMzLBdAMzMzM...
207  coverage  dataset/train/coverage/FQIoAkMzLBdANCj1wo9cKRg...


In [6]:
df = train_df.loc[:,['video_name','tag']]
df
df.to_csv('train.csv')

# Preparing Test Data

In [7]:
dataset_path = os.listdir('dataset/test')
print(dataset_path)

room_types = os.listdir('dataset/test')
print("Types of activities found: ", len(dataset_path))

rooms = []

for item in dataset_path:
 # Get all the file names
 all_rooms = os.listdir('dataset/test' + '/' +item)

 # Add them to the list
 for room in all_rooms:
    rooms.append((item, str('dataset/test' + '/' +item) + '/' + room))
    
# Build a dataframe        
test_df = pd.DataFrame(data=rooms, columns=['tag', 'video_name'])
print(test_df.head())
print(test_df.tail())

df = test_df.loc[:,['video_name','tag']]
df
df.to_csv('test.csv')

['storytelling', 'coverage']
Types of activities found:  2
            tag                                         video_name
0  storytelling  dataset/test/storytelling/194EC19DA59882F7A18D...
1  storytelling  dataset/test/storytelling/FQIoAkMzLBdAQ2ZmZmZm...
2  storytelling  dataset/test/storytelling/GGmHeQIop4EHxEDABRfN...
3  storytelling  dataset/test/storytelling/FQIoAkMzLBdALNP3ztkW...
4  storytelling  dataset/test/storytelling/FQIoAkMzLBdAPFHrhR64...
         tag                                         video_name
48  coverage  dataset/test/coverage/FQIoAkMzLBdAJrhR64UeuBgS...
49  coverage  dataset/test/coverage/204B3F057FA28AFAECED8753...
50  coverage  dataset/test/coverage/FQIoAkMzLBdAMgAAAAAAABgS...
51  coverage  dataset/test/coverage/FQIoAkMzLBdANqPXCj1wpBgS...
52  coverage  dataset/test/coverage/GICWmABTJQkazQcDAJU137yp...


In [5]:
#!pip install git+https://github.com/tensorflow/docs

In [8]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

2024-07-10 18:28:52.975740: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],[tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)])
  except RuntimeError as e:
    print(e)

2024-07-10 18:28:55.170229: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-07-10 18:28:55.202551: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


# Data preparation

In [10]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")


train_df.sample(10)

Total videos for training: 208
Total videos for testing: 53


Unnamed: 0.1,Unnamed: 0,video_name,tag
179,179,dataset/train/coverage/FQIoAkMzLBdAL0euFHrhSBg...,coverage
50,50,dataset/train/storytelling/6749D6C4DC9B2377389...,storytelling
158,158,dataset/train/coverage/FQIoAkMzLBdAMHrhR64Uexg...,coverage
46,46,dataset/train/storytelling/A54D4EA6DFEBB45471B...,storytelling
10,10,dataset/train/storytelling/FQIoAkMzLBdANLhR64U...,storytelling
22,22,dataset/train/storytelling/8D4B39FCACC958236FC...,storytelling
105,105,dataset/train/coverage/FQIoAkMzLBdAMBQ5WBBiThg...,coverage
97,97,dataset/train/coverage/FQIoAkMzLBdAMFHrhR64RgS...,coverage
109,109,dataset/train/coverage/FQIoAkMzLBdASHCj1wo9cRg...,coverage
87,87,dataset/train/coverage/B84C805E46DFD0436D57E27...,coverage


# Feed the videos to a network:


In [11]:
IMG_SIZE = 224


def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]



In [12]:

def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

   ### Feature Extraction

In [13]:


def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

### Label Encoding
StringLookup layer encode the class labels as integers.

In [14]:
label_processor = keras.layers.StringLookup(num_oov_indices=0, vocabulary=np.unique(train_df["tag"]))
print(label_processor.get_vocabulary())

labels = train_df["tag"].values
labels = label_processor(labels[..., None]).numpy()
labels

['coverage', 'storytelling']


array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
    

Finally, we can put all the pieces together to create our data processing utility.

In [17]:
#print(train_data[0].shape)
#train_data[0]

In [15]:
#Define hyperparameters

IMG_SIZE = 224
BATCH_SIZE = 64
EPOCHS = 50

MAX_SEQ_LENGTH = 50
NUM_FEATURES = 2048

In [69]:
# def prepare_all_videos(df, root_dir):
#     num_samples = len(df)
#     video_paths = df["video_name"].values.tolist()
    
#     ##take all classlabels from train_df column named 'tag' and store in labels
#     labels = df["tag"].values
    
#     #convert classlabels to label encoding
#     labels = label_processor(labels[..., None]).numpy()

#     # `frame_masks` and `frame_features` are what we will feed to our sequence model.
#     # `frame_masks` will contain a bunch of booleans denoting if a timestep is
#     # masked with padding or not.
#     frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool") # 145,20
#     frame_features = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32") #145,20,2048

#     # For each video.
#     for idx, path in enumerate(video_paths):
#         # Gather all its frames and add a batch dimension.
#         frames = load_video(path)
#         frames = frames[None, ...]

#         # Initialize placeholders to store the masks and features of the current video.
#         temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
#         temp_frame_features = np.zeros(
#             shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
#         )

#         # Extract features from the frames of the current video.
#         for i, batch in enumerate(frames):
#             video_length = batch.shape[0]
#             length = min(MAX_SEQ_LENGTH, video_length)
#             for j in range(length):
#                 temp_frame_features[i, j, :] = feature_extractor.predict(
#                     batch[None, j, :]
#                 )
#             temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked
            

#         frame_features[idx,] = temp_frame_features.squeeze()
#         frame_masks[idx,] = temp_frame_mask.squeeze()

#     return (frame_features, frame_masks), labels


# train_data, train_labels = prepare_all_videos(train_df, "train")
# test_data, test_labels = prepare_all_videos(test_df, "test")

# print(f"Frame features in train set: {train_data[0].shape}")
# print(f"Frame masks in train set: {train_data[1].shape}")



# print(f"train_labels in train set: {train_labels.shape}")

# print(f"test_labels in train set: {test_labels.shape}")

# MAX_SEQ_LENGTH = 20, NUM_FEATURES = 2048. We have defined this above under hyper parameters

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25

In [16]:

def prepare_all_videos(df, root_dir, feature_extractor):
    video_paths = df["video_name"].values
    labels = df["tag"].values
    # convert classlabels to label encoding
    labels = label_processor(labels[..., None]).numpy()
    
    all_features = []
    all_labels = []
    all_masks = []

    for video_path, label in zip(video_paths, labels):

        frames = load_video( video_path)
        print(len(frames))
        total_clips = len(frames) // MAX_SEQ_LENGTH
        remaining_frames = len(frames) % MAX_SEQ_LENGTH

        # Loop through all the clips that can be formed
        for clip_start in range(0, len(frames) - remaining_frames, MAX_SEQ_LENGTH):
            clip_frames = frames[clip_start:clip_start + MAX_SEQ_LENGTH]
            clip_features = feature_extractor.predict(clip_frames)
            
            all_features.append(clip_features)
            all_labels.append(label)
            all_masks.append([True] * MAX_SEQ_LENGTH)  # No masking, all frames are used

        # Optionally handle remaining frames
        if remaining_frames > 0:
            last_clip = frames[-remaining_frames:]  # take the last few frames
            padded_clip = np.zeros((MAX_SEQ_LENGTH,) + last_clip.shape[1:], dtype=last_clip.dtype)
            padded_clip[:remaining_frames] = last_clip
            clip_features = feature_extractor.predict(padded_clip)
            
            mask = [True] * remaining_frames + [False] * (MAX_SEQ_LENGTH - remaining_frames)
            
            all_features.append(clip_features)
            all_labels.append(label)
            all_masks.append(mask)

    return (np.array(all_features), np.array(all_masks)), np.array(all_labels)

train_data, train_labels = prepare_all_videos(train_df, "train", feature_extractor)
test_data, test_labels = prepare_all_videos(test_df, "test", feature_extractor)


444


2024-07-10 18:30:49.118946: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 50466816 exceeds 10% of free system memory.
2024-07-10 18:30:49.140616: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 48664576 exceeds 10% of free system memory.
2024-07-10 18:30:49.160369: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 97329152 exceeds 10% of free system memory.
2024-07-10 18:30:49.214754: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 66453504 exceeds 10% of free system memory.


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1s/step


2024-07-10 18:30:50.512455: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 54747648 exceeds 10% of free system memory.


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 176ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 171ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 171ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 167ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 170ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 169ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 171ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 169ms/step
3972
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 170ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 181ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 175ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 169ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 173ms/step
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [32]:
len(train_data[0])

4522

In [17]:
print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")



print(f"train_labels in train set: {train_labels.shape}")

print(f"test_labels in train set: {test_labels.shape}")


Frame features in train set: (4522, 50, 2048)
Frame masks in train set: (4522, 50)
train_labels in train set: (4522, 1)
test_labels in train set: (1046, 1)


# The sequence model
Now, we can feed this data to a sequence model consisting of recurrent layers like GRU.

In [33]:
# Utility for our sequence model.
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.GRU(16, return_sequences=True)(frame_features_input, mask=mask_input)
    x = keras.layers.GRU(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

    rnn_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy", "f1_score"]
    )
    return rnn_model

EPOCHS = 100
# Utility for running experiments.
def run_experiment():
    filepath = "./tmp/video_classifier.weights.h5"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )
    

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


_, sequence_model = run_experiment()

Epoch 1/100
[1m98/99[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 17ms/step - accuracy: 0.7207 - loss: 0.5399
Epoch 1: val_loss improved from inf to 0.64097, saving model to ./tmp/video_classifier.weights.h5
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 26ms/step - accuracy: 0.7211 - loss: 0.5387 - val_accuracy: 0.8931 - val_loss: 0.6410
Epoch 2/100
[1m97/99[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - accuracy: 0.7910 - loss: 0.3886
Epoch 2: val_loss improved from 0.64097 to 0.55505, saving model to ./tmp/video_classifier.weights.h5
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.7912 - loss: 0.3885 - val_accuracy: 0.9175 - val_loss: 0.5550
Epoch 3/100
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8221 - loss: 0.3679
Epoch 3: val_loss did not improve from 0.55505
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.8222 

In [53]:
test_labels

array([1, 1, 1, ..., 0, 0, 0])

In [52]:
from sklearn.metrics import f1_score

# Load the sequence model
seq_model = get_sequence_model()
filepath = "./tmp/video_classifier.weights.h5"
seq_model.load_weights(filepath)
# Predict classes to compute F1 score
predictions = seq_model.predict([test_data[0], test_data[1]])
# Convert probabilities to binary predictions
# Assuming your model outputs class probabilities or logits for each class
predicted_classes = np.argmax(predictions, axis=1)

# Ensure labels are in the correct format (convert if they are one-hot encoded)
if test_labels.ndim > 1 and test_labels.shape[1] > 1:
    test_labels = np.argmax(test_labels, axis=1)

# Calculate F1 score for multiclass
f1 = f1_score(test_labels, predicted_classes, average='macro')
print(f"Test F1 score: {round(f1, 2)}")



  saveable.load_own_variables(weights_store.get(inner_path))


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
Test F1 score: 0.78


In [54]:
print(predictions)

[[0.8795304  0.12046959]
 [0.00334397 0.996656  ]
 [0.8669338  0.13306618]
 ...
 [0.8766199  0.12338011]
 [0.00174107 0.9982589 ]
 [0.00149419 0.99850583]]


# Inference

In [41]:
test_df['video_name']

0     dataset/test/storytelling/194EC19DA59882F7A18D...
1     dataset/test/storytelling/FQIoAkMzLBdAQ2ZmZmZm...
2     dataset/test/storytelling/GGmHeQIop4EHxEDABRfN...
3     dataset/test/storytelling/FQIoAkMzLBdALNP3ztkW...
4     dataset/test/storytelling/FQIoAkMzLBdAPFHrhR64...
5     dataset/test/storytelling/FQIoAkMzLBdAVoAAAAAA...
6     dataset/test/storytelling/4phAFQIoAkMzLBdAZ6j1...
7     dataset/test/storytelling/fOxgSZGFzaF9iYXNlbGl...
8     dataset/test/storytelling/C14496A44F3F513B0303...
9     dataset/test/storytelling/FQIoAkMzLBdAVeuFHrhR...
10    dataset/test/storytelling/FQIoAkMzLBdAPwvGpdsh...
11    dataset/test/storytelling/FQIoAkMzLBdAYArAgxJu...
12    dataset/test/storytelling/FQIoAkMzLBdAXaPXCj1w...
13    dataset/test/storytelling/fRgSZGFzaF9iYXNlbGlu...
14    dataset/test/coverage/oBAFQIoAkMzLBdAUvCj1wo9c...
15    dataset/test/coverage/FQIoAkMzLBdARoUeuFHrhRgS...
16    dataset/test/coverage/E44863E32842B3B74B69F093...
17    dataset/test/coverage/BF45386F13CB0295C92D

In [42]:
def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask
          

def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(path)
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames

# test_video = np.random.choice(test_df["video_name"].values.tolist())
# test_video = test_df['video_name'][0]
# print(f"Test video path: {test_video}")

# test_frames = sequence_prediction(test_video)

for i in range(10):
    test_video = test_df['video_name'][i+15]
    print(f"Test video path: {test_video}")

    test_frames = sequence_prediction(test_video)

Test video path: dataset/test/coverage/FQIoAkMzLBdARoUeuFHrhRgSZGFzaF9iYXNlbGluZV8xX3YxEQB1AAA_18032370904777467.mp4
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━

In [43]:
for i in range(10):
    test_video = test_df['video_name'][i]
    print(f"Test video path: {test_video}")

    test_frames = sequence_prediction(test_video)

Test video path: dataset/test/storytelling/194EC19DA59882F7A18D75ADF410EFA9_video_dashinit_17859344523146387.mp4
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━