In [1]:
# Training the Video Classification Model
#     1. Read all the frames that we extracted earlier for the training images
#     2. Create a validation set which will help us examine how well our model
#         will perform on unseen data
#     3. Define the architecture of our model
#     4. Finally, train the model and save its weights

In [2]:
import keras
from keras.models import Sequential
from keras.applications.vgg16 import VGG16
from keras.layers import Dense, InputLayer, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D
from keras.preprocessing import image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [3]:
# Reading the .csv file that contains the names of each frame and their tag
train = pd.read_csv('UCF/train_new.csv')
train.head()

Unnamed: 0,image,class
0,v_ApplyEyeMakeup_g08_c01.avi_frame0.jpg,ApplyEyeMakeup
1,v_ApplyEyeMakeup_g08_c01.avi_frame1.jpg,ApplyEyeMakeup
2,v_ApplyEyeMakeup_g08_c01.avi_frame2.jpg,ApplyEyeMakeup
3,v_ApplyEyeMakeup_g08_c01.avi_frame3.jpg,ApplyEyeMakeup
4,v_ApplyEyeMakeup_g08_c01.avi_frame4.jpg,ApplyEyeMakeup


In [4]:
# Read the frames that we extracted earlier and then store those frames as 
# a NumPy array

# creating an empty list
train_image = []

# for loop to read and store frames
for i in tqdm(range(train.shape[0])):
    # loading the image and keeping the target size as (224,224,3)
    img = image.load_img('train_1/'+train['image'][i], target_size=(224,224,3))
    # converting it to array
    img = image.img_to_array(img)
    # normalizing the pixel value
    img = img/255
    # appending the image to the train_image list
    train_image.append(img)
    
# converting the list to numpy array
X = np.array(train_image)

# shape of the array
X.shape

 68%|██████████████████████████████████████████████████▊                        | 50061/73844 [03:27<01:31, 260.89it/s]

MemoryError: Unable to allocate array with shape (224, 224, 3) and data type float32

 68%|██████████████████████████████████████████████████▊                        | 50061/73844 [03:40<01:31, 260.89it/s]

In [None]:
#Creating a validation set
#Need to make sure that the distribution of each class is similar in both
# training and validation sets --> stratify parameter

# Here, stratify = y (which is the class or tags of each frame) keeps the 
# similar distribution of classes in both the training as well as 
# the validation set.

# separating the target
y = train['class']

# creating the training and validation set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, stratify = y)

In [None]:
# Remember – there are 101 categories in which a video can be classified. 
# So, we will have to create 101 different columns in the target, one for 
# each category. 
# We will use the get_dummies() function for that


# creating dummies of target variable for train and validation set
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)

In [None]:
#Defining the architecture of the video classification model
#uses pretrained model: VGG-16 

#include_top = False will remove the last layer of this model so we can 
#tune it as per our need

# creating the base model of pre-trained VGG16 model
base_model = VGG16(weights='imagenet', include_top=False)

In [None]:
#Extract features from this pre-trained model for training and 
#validation images

# extracting features for training frames
X_train = base_model.predict(X_train)
X_train.shape

In [None]:
# extracting features for validation frames
X_test = base_model.predict(X_test)
X_test.shape

In [None]:
# We will use a fully connected network now to fine-tune the model. 
# This fully connected network takes input in single dimension. 
# So, we will reshape the images into a single dimension:

# reshaping the training as well as validation frames in single dimension
X_train = X_train.reshape(59075, 7*7*512)
X_test = X_test.reshape(14769, 7*7*512)

In [None]:
# It is always advisable to normalize the pixel values, 
# i.e., keep the pixel values between 0 and 1. 
# This helps the model to converge faster.

# normalizing the pixel values
max = X_train.max()
X_train = X_train/max
X_test = X_test/max

In [None]:
# Next, we will create the architecture of the model. 
# We have to define the input shape for that. 
# So, let’s check the shape of our images:

# shape of images
X_train.shape

In [None]:
#defining the model architecture
model = Sequential()
model.add(Dense(1024, activation='relu', input_shape=(25088,)))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(101, activation='softmax'))

In [None]:
#Training the Video Classification Model

# defining a function to save the weights of best model
from keras.callbacks import ModelCheckpoint
mcp_save = ModelCheckpoint('weight.hdf5', save_best_only=True, monitor='val_loss', mode='min')

In [None]:
# We will decide the optimum model based on the validation loss. 
# Note that the weights will be saved as weights.hdf5. 
# You can rename the file if you wish. 
# Before training the model, we have to compile it
# We are using the categorical_crossentropy as the loss function 
#and the optimizer is Adam

# compiling the model
model.compile(loss='categorical_crossentropy',optimizer='Adam',metrics=['accuracy'])

In [None]:
# training the model
model.fit(X_train, y_train, epochs=200, validation_data=(X_test, y_test), callbacks=[mcp_save], batch_size=128)