# Test on handling video file in Python
## Use trained model to detect character and extract section of video

In [3]:
import math
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from keras.preprocessing import image
from keras.utils import np_utils
from skimage.transform import resize

from keras.applications.vgg16 import preprocess_input
from sklearn.model_selection import train_test_split


Using TensorFlow backend.


In [4]:
from keras.models import Sequential, load_model
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, InputLayer, Dropout
from keras.callbacks import ModelCheckpoint

## Use 1st video file to train model, sample 1 frame per second.

In [5]:
inputFolder = "./SampleVideo/"
outputFolder = "./OutputVideo/"

In [6]:
# output layer has 3 neurons as we have 3 classes to predict.
model = Sequential()
model.add(InputLayer((7*7*512,)))
model.add(Dense(units=1024, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(units=512, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(units=256, activation='sigmoid'))
model.add(Dense(3, activation='softmax'))
#model.summary()
model.load_weights("best_model.hdf5")
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1024)              25691136  
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 771       
Total params: 26,348,035
Trainable params: 26,348,035
Non-trainable params: 0
__________________________________________

In [7]:
# use pretrained model VGG16
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))   #top layer not included

In [13]:
videoFile = inputFolder + "Tom and Jerry 3.mp4"
categoryToFind = 1    # Interested in finding Jerry
extractTime = 2 #sec
framesToSkipIfFound = 

df = pd.DataFrame(columns = ['Category','FrameID','StartTime'])

# Create handle for video
capture = cv2.VideoCapture(videoFile)
frameRate = capture.get(5)
samplingRate = math.floor(frameRate)

# Iterate through video and position of 1st character

found = False
startTime = 0
count = 0
skipToCount = 0
while(capture.isOpened()):
    frameId = capture.get(1)   # frame number
    ret, frame = capture.read()
    test_frame = []
    if (ret != True):
        break
    if (frameId % samplingRate == 0):    # extracting 1 frames per second
        ######################################
        #   PROCESS FRAME AND RUN MODEL ON IT
        ######################################
        count += 1
        if count>=skipToCount:
            test_frame.append(frame)
            test_img = np.array(test_frame)
            # resize the image as required by vgg16
            test_frame = []
            a = resize(test_img[0], preserve_range=True, output_shape=(224,224)).astype(int)
            test_frame.append(a)
            test_img = np.array(test_frame)
            # preprocess the image
            test_img = preprocess_input(test_img, mode='tf')
            # use pretrained model VGG16
            test_img = base_model.predict(test_img)
            # reshape and center the data
            test_img = test_img.reshape(1, 7*7*512)
            test_img = test_img / test_img.max()
            # predict
            test_prediction = model.predict_classes(test_img)
            # add item to df if it's the category we are after
            if test_prediction[0]==categoryToFind:
                skipToCount = count + 2
                df = df.append({'Category':2,'FrameID':frameId,'StartTime':startTime}, ignore_index=True)
                startTime = math.floor(count/2)
                found =True
capture.release()
if found:
    print(df)

   Category  FrameID  StartTime
0       2.0    145.0        0.0
1       2.0   1044.0        3.0
2       2.0   1624.0       18.0
3       2.0   1682.0       28.0
4       2.0   1740.0       29.0
5       2.0   3306.0       30.0


## Cut 3 secs of the video starting there

In [28]:
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
if found:
    for index,row in df.iterrows():
        targetVideo = outputFolder + f"extract{row['StartTime']:02f}.mp4"
        ffmpeg_extract_subclip(videoFile, row['StartTime'], row['StartTime']+2, targetname=targetVideo)
        print(row['StartTime'])

Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
0.0
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
3.0
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
18.0
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
28.0
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
29.0
Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
30.0


# Labels for images already in CSV file
## Classes as follow:
0 - neither JERRY nor TOM

1 - for JERRY

2 - for TOM

In [21]:
df

Unnamed: 0,Category,FrameID,StartTime
0,2.0,145.0,0.0
1,2.0,1044.0,3.0
2,2.0,1624.0,18.0
3,2.0,1682.0,28.0
4,2.0,1740.0,29.0
5,2.0,3306.0,30.0


# Check visualizing a frame