diff --git a/examples/vision/video_classify.py b/examples/vision/video_classify.py
new file mode 100644
index 0000000000..c02e17a5fb
--- /dev/null
+++ b/examples/vision/video_classify.py
@@ -0,0 +1,211 @@
"""
Title: Video Classification With TimeDistributed Layer
Author: [Sujoy K Goswami](https://www.linkedin.com/in/sujoy-kumar-goswami/)
Date created: 2022/01/09
Last modified: 2022/01/10
Description: How to sanity-check a video classification model quickly, without any GPU.
"""

"""
## Introduction

Video classification models are heavy and need large amounts of data, so training
them is time-consuming and expensive. This guide shows how to sanity-check your
model quickly, before feeding it the actual data, and without needing any GPU.

A synthetic video dataset is created for this purpose: a white rectangle moving in
different directions on a black canvas. The code below generates videos of a
left-moving rectangle.
"""

import numpy as np
import skvideo.io as sk
from IPython.display import Video

# creating sample video data
num_vids = 5
num_imgs = 50
img_size = 50
min_object_size = 1
max_object_size = 5

for i_vid in range(num_vids):
    imgs = np.zeros((num_imgs, img_size, img_size))  # black background
    vid_name = "vid" + str(i_vid) + ".mp4"
    # random rectangle size and starting position (randint's upper bound is exclusive)
    w, h = np.random.randint(min_object_size, max_object_size, size=2)
    x = np.random.randint(0, img_size - w)
    y = np.random.randint(0, img_size - h)
    i_img = 0
    while x > 0:  # slide the rectangle left by one pixel per frame
        imgs[i_img, y : y + h, x : x + w] = 255  # white rectangle as foreground
        x = x - 1
        i_img = i_img + 1
    sk.vwrite(vid_name, imgs.astype(np.uint8))

# play one of the videos; the script and the generated videos are in the same folder
Video("vid3.mp4")
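"""
To eyeball a generated video without opening the `.mp4` file, a few of its frames
can also be plotted directly. This is a minimal sketch, assuming `matplotlib` is
installed; it shows every 10th frame of the last video generated above.
"""

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 5, figsize=(10, 2))
for ax, t in zip(axes, range(0, num_imgs, 10)):
    ax.imshow(imgs[t], cmap="gray", vmin=0, vmax=255)
    ax.set_title("frame " + str(t))
    ax.axis("off")
plt.show()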
+""" + + +# preparing dataset +X_train = [] +Y_train = [] +labels = {0: "left", 1: "right", 2: "up", 3: "down"} # 4 classes +num_vids = 40 +num_imgs = 40 +img_size = 40 +min_object_size = 1 +max_object_size = 5 +# video frames with left moving object +for i_vid in range(num_vids): + imgs = np.zeros((num_imgs, img_size, img_size)) # set background to 0 + # vid_name = 'vid' + str(i_vid) + '.mp4' + w, h = np.random.randint(min_object_size, max_object_size, size=2) + x = np.random.randint(0, img_size - w) + y = np.random.randint(0, img_size - h) + i_img = 0 + while x > 0: + imgs[i_img, y : y + h, x : x + w] = 255 # set rectangle as foreground + x = x - 1 + i_img = i_img + 1 + X_train.append(imgs) +for i in range(0, num_imgs): + Y_train.append(0) +# video frames with right moving object +for i_vid in range(num_vids): + imgs = np.zeros((num_imgs, img_size, img_size)) # set background to 0 + # vid_name = 'vid' + str(i_vid) + '.mp4' + w, h = np.random.randint(min_object_size, max_object_size, size=2) + x = np.random.randint(0, img_size - w) + y = np.random.randint(0, img_size - h) + i_img = 0 + while x < img_size: + imgs[i_img, y : y + h, x : x + w] = 255 # set rectangle as foreground + x = x + 1 + i_img = i_img + 1 + X_train.append(imgs) +for i in range(0, num_imgs): + Y_train.append(1) +# video frames with up moving object +for i_vid in range(num_vids): + imgs = np.zeros((num_imgs, img_size, img_size)) # set background to 0 + # vid_name = 'vid' + str(i_vid) + '.mp4' + w, h = np.random.randint(min_object_size, max_object_size, size=2) + x = np.random.randint(0, img_size - w) + y = np.random.randint(0, img_size - h) + i_img = 0 + while y > 0: + imgs[i_img, y : y + h, x : x + w] = 255 # set rectangle as foreground + y = y - 1 + i_img = i_img + 1 + X_train.append(imgs) +for i in range(0, num_imgs): + Y_train.append(2) +# video frames with down moving object +for i_vid in range(num_vids): + imgs = np.zeros((num_imgs, img_size, img_size)) # set background to 0 + # vid_name = 'vid' + str(i_vid) + '.mp4' + w, h = np.random.randint(min_object_size, max_object_size, size=2) + x = np.random.randint(0, img_size - w) + y = np.random.randint(0, img_size - h) + i_img = 0 + while y < img_size: + imgs[i_img, y : y + h, x : x + w] = 255 # set rectangle as foreground + y = y + 1 + i_img = i_img + 1 + X_train.append(imgs) +for i in range(0, num_imgs): + Y_train.append(3) + +# data pre-processing +from tensorflow.keras.utils import to_categorical + +X_train = np.array(X_train, dtype=np.float32) / 255 +X_train = X_train.reshape(X_train.shape[0], num_imgs, img_size, img_size, 1) +print(X_train.shape) +Y_train = np.array(Y_train, dtype=np.uint8) +Y_train = Y_train.reshape(X_train.shape[0], 1) +print(Y_train.shape) +Y_train = to_categorical(Y_train, 4) + +""" +## Model Building and Training + +TimeDistributed layer is used to pass temporal information of videos to the network. +**No GPU is needed.** Training gets completed within few minutes. 
+""" + +# building model +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout +from tensorflow.keras.layers import MaxPooling2D +from tensorflow.keras.layers import LSTM +from tensorflow.keras.layers import TimeDistributed + +model = Sequential() +model.add( + TimeDistributed( + Conv2D(8, (3, 3), strides=(1, 1), activation="relu", padding="same"), + input_shape=(num_imgs, img_size, img_size, 1), + ) +) +model.add( + TimeDistributed( + Conv2D(8, (3, 3), kernel_initializer="he_normal", activation="relu") + ) +) +model.add(TimeDistributed(MaxPooling2D((1, 1), strides=(1, 1)))) +model.add(TimeDistributed(Flatten())) +model.add(Dropout(0.3)) +model.add(LSTM(64, return_sequences=False, dropout=0.3)) +model.add(Dense(4, activation="softmax")) +model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) +model.summary() +# model training +model.fit(X_train, Y_train, epochs=40, verbose=1) + +""" +## Model Inferencing + +Model is tested on new generated video data. +""" + +# model testing with new data (4 videos) +X_test = [] +Y_test = [] +for i_vid in range(2): + imgs = np.zeros((num_imgs, img_size, img_size)) # set background to 0 + w, h = np.random.randint(min_object_size, max_object_size, size=2) + x = np.random.randint(0, img_size - w) + y = np.random.randint(0, img_size - h) + i_img = 0 + while x < img_size: + imgs[i_img, y : y + h, x : x + w] = 255 # set rectangle as foreground + x = x + 1 + i_img = i_img + 1 + X_test.append(imgs) # 2nd class - 'right' +for i_vid in range(2): + imgs = np.zeros((num_imgs, img_size, img_size)) # set background to 0 + w, h = np.random.randint(min_object_size, max_object_size, size=2) + x = np.random.randint(0, img_size - w) + y = np.random.randint(0, img_size - h) + i_img = 0 + while y < img_size: + imgs[i_img, y : y + h, x : x + w] = 255 # set rectangle as foreground + y = y + 1 + i_img = i_img + 1 + X_test.append(imgs) # 4th class - 'down' +X_test = np.array(X_test, dtype=np.float32) / 255 +X_test = X_test.reshape(X_test.shape[0], num_imgs, img_size, img_size, 1) +pred = np.argmax(model.predict(X_test), axis=-1) +for i in range(len(X_test)): + print(labels[pred[i]]) + +""" +Clearly, the model is examined well on this synthetic dataset. +"""