In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image  # needed by Dataset_3DCNN to read the frame JPEGs

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.autograd import Variable
from torch.utils.data import DataLoader

import torchvision
import torchvision.datasets as datasets
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision import transforms

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

# **Dataset**

You can download the dataset from [here](https://www.crcv.ucf.edu/research/data-sets/ucf101/) and then run this notebook. I have not downloaded the dataset myself because it is huge (~6 GB), but the notebook will most likely work once you have uploaded the dataset to Google Colab's file storage.

In [None]:
class Dataset_3DCNN(data.Dataset):

    '''
    Map-style dataset that serves one video per index as a stack of grayscale frames.

    folders : one directory per video, each storing the video as images like

      - 1
          - frame1.jpg
          - frame2.jpg
          - ...
          - frame28.jpg

      - 2
          - frame1.jpg
          - frame2.jpg
          - ...
          - frame28.jpg

      ...

    labels : class label for each video, in the same order as `folders`

    n_frames : how many leading frames (frame1.jpg .. frame<n_frames>.jpg) are read
               per video (default 28)

    img_size : (height, width) each frame is resized to by the default transform
               (default (256, 342))

    transform : optional callable applied to every grayscale PIL frame. When omitted,
                Resize(img_size) -> ToTensor -> Normalize(mean=0.5, std=0.5) is used.
                A custom transform is expected to return a (1, H, W) tensor, because
                the leading dimension is squeezed away before stacking.
    '''

    def __init__(self, folders, labels, n_frames=28, img_size=(256, 342), transform=None):
        # A silent length mismatch would otherwise surface as an IndexError (or a
        # wrong label) deep inside a DataLoader worker.
        if len(folders) != len(labels):
            raise ValueError(
                f'folders and labels must have the same length, got {len(folders)} and {len(labels)}'
            )

        self.folders = folders
        self.labels = labels
        self.n_frames = n_frames
        self.img_size = tuple(img_size)

        if transform is None:
            transform = transforms.Compose([transforms.Resize(list(self.img_size)),  # resize each frame to img_size
                                            transforms.ToTensor(),
                                            transforms.Normalize(mean=[0.5], std=[0.5])])
        self.transform = transform

    def __len__(self):
        # One sample per video folder
        return len(self.folders)

    def __getitem__(self, index):
        '''
        Returns (X, y) for the video at `index`:
          X : tensor of shape (1, n_frames, H, W)  -- channel, time, height, width
          y : LongTensor of shape (1,) holding the class label
        '''

        # Selecting that 1 video using the index
        folder = self.folders[index]

        # Load the first n_frames frames (files are numbered from 1)
        frames = []
        for i in range(1, self.n_frames + 1):
            # The context manager releases the file handle as soon as the frame
            # has been decoded into the converted grayscale copy.
            with Image.open(os.path.join(folder, f'frame{i}.jpg')) as raw:
                gray = raw.convert('L')
            frames.append(self.transform(gray).squeeze(0))  # (1, H, W) -> (H, W)

        # (n_frames, H, W) -> (1, n_frames, H, W): add the single input channel
        X = torch.stack(frames, dim=0).unsqueeze(0)

        # class information
        y = torch.LongTensor([self.labels[index]])

        return X, y

In [None]:
# NOTE(review): `folders` and `labels` are not defined in the cells shown here;
# they must be built from the downloaded dataset before this cell is run.
train_set = Dataset_3DCNN(folders, labels)

# Shuffled mini-batches of 30 videos, loaded by 4 worker processes into pinned memory
train_loader = data.DataLoader(
    dataset=train_set,
    batch_size=30,
    shuffle=True,
    num_workers=4,
    pin_memory=True,
)


# **Modelling**

In [None]:
def conv3D_output_size(img_size, padding, kernel_size, stride):
    """
    Compute the output size of a 3D convolution along each of its three axes.

    Every argument is a length-3 sequence indexed by axis. For each axis the
    result is floor((size + 2*pad - (kernel - 1) - 1) / stride + 1); there is
    no dilation term.

    Returns a 3-tuple of integers, one per axis.
    """
    def _axis_length(axis):
        # Positions the kernel can still slide over once its own extent is removed
        span = img_size[axis] + 2 * padding[axis] - (kernel_size[axis] - 1) - 1
        return np.floor(span / stride[axis] + 1).astype(int)

    return tuple(_axis_length(axis) for axis in range(3))

In [None]:
class CNN3D(nn.Module):
    '''
    Two 3D-convolution stages followed by a small fully connected classifier.

    input_shape : (frames, height, width) of each single-channel input clip.
                  Default (28, 256, 342).
    num_classes : number of output scores per clip. Default 101.

    forward() takes a tensor of shape (batch, 1, frames, height, width) and
    returns raw scores (no softmax) of shape (batch, num_classes).
    '''

    def __init__(self, input_shape=(28, 256, 342), num_classes=101):
        super().__init__()

        ## 3D CNN Layer
        self.cnn_3d_layer = nn.Sequential(
            nn.Conv3d(in_channels=1, out_channels=32, kernel_size=(5, 5, 5), stride=(2, 2, 2), padding=(0, 0, 0)),
            nn.BatchNorm3d(32), nn.ReLU(inplace=True), nn.Dropout3d(0.2),
            nn.Conv3d(in_channels=32, out_channels=48, kernel_size=(3, 3, 3), stride=(2, 2, 2), padding=(0, 0, 0)),
            nn.BatchNorm3d(48), nn.ReLU(inplace=True), nn.Dropout3d(0.2),
        )

        ## compute conv1 & conv2 output shape to determine neurons in FFNN Layer
        ## (the kernel / stride / padding values must mirror the two Conv3d layers above)
        self.conv1_outshape = conv3D_output_size(tuple(input_shape), (0, 0, 0), (5, 5, 5), (2, 2, 2))
        self.conv2_outshape = conv3D_output_size(self.conv1_outshape, (0, 0, 0), (3, 3, 3), (2, 2, 2))

        # 48 output channels times the remaining (frames, height, width) volume;
        # cast to a plain int because the helper may hand back numpy integers.
        flat_features = int(48 * self.conv2_outshape[0] * self.conv2_outshape[1] * self.conv2_outshape[2])

        ## FFNN Layer
        self.ffnn_layer = nn.Sequential(
            nn.Linear(flat_features, 256), nn.ReLU(inplace=True),
            # nn.Dropout is the module class; `nn.dropout` does not exist
            nn.Linear(256, 128), nn.ReLU(inplace=True), nn.Dropout(0.2),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):

        # passing through 3d cnn layer
        x = self.cnn_3d_layer(x)

        # performing flattening: (batch, C, D, H, W) -> (batch, C*D*H*W)
        x = x.view(x.size(0), -1)

        # passing to FFNN layers
        x = self.ffnn_layer(x)

        return x

# **Training**

In [None]:
# Build the 3D CNN, move its parameters to the selected device and put it in training mode
cnn3d = CNN3D()
cnn3d = cnn3d.to(device)
cnn3d.train()

# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(params=cnn3d.parameters(), lr=1e-4)

# Training history: the training loop appends one list of per-batch values per epoch
epoch_train_losses, epoch_train_scores = [], []

In [None]:
NUM_EPOCHS = 15   # full passes over train_loader
LOG_EVERY = 10    # print progress every LOG_EVERY batches

for epoch in range(NUM_EPOCHS):

    # Re-assert training mode so this cell still behaves correctly if the model
    # was switched to eval() since the setup cell ran.
    cnn3d.train()

    losses = []   # per-batch loss for this epoch
    scores = []   # per-batch accuracy (fraction in [0, 1]) for this epoch
    N_count = 0   # counting total trained sample in one epoch

    for batch_idx, (X, y) in enumerate(train_loader):
        X = X.to(device)
        y = y.to(device).view(-1)  # flatten the labels to shape (batch,)

        N_count += X.size(0)

        optimizer.zero_grad()
        output = cnn3d(X)  # output size = (batch, number of classes)

        loss = F.cross_entropy(output, y)
        losses.append(loss.item())

        # to compute accuracy: index of the highest score per sample
        y_pred = torch.max(output, 1)[1]  # y_pred != output
        # Compare on tensors directly. The previous squeeze()+accuracy_score path
        # turned a final batch of size 1 into a 0-d array, which scikit-learn rejects.
        step_score = (y_pred == y).sum().item() / y.size(0)
        scores.append(step_score)

        loss.backward()
        optimizer.step()

        # show information
        if (batch_idx + 1) % LOG_EVERY == 0:
            print(f'Train Epoch: {epoch + 1} [{N_count}/{len(train_loader.dataset)} ({round(100. * (batch_idx + 1)/ len(train_loader),2)}%)]\tLoss: {round(loss.item(),5)}, Accu: {round(100 * step_score,2)}%')

    epoch_train_losses.append(losses)
    epoch_train_scores.append(scores)
