In [None]:
# Fetch the three notebook inputs from Google Drive and unpack the frame archive.
# Each command runs wget twice: the inner call (inside $(...)) requests the
# download page, saves the session cookies to ~/cookies.txt and pipes the page
# through sed to extract the value of the "confirm=" token; the outer call
# repeats the request with that token and the saved cookies and writes the
# payload to the file named after -O. The cookie file is removed after each
# successful download.
# NOTE(review): the inner wget runs with --no-check-certificate, i.e. TLS
# certificate verification is disabled for that request.
# 1) crop_mean.npy  - array loaded later with np.load
!wget --load-cookies ~/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies ~/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Br9F1os2dLkUqwIXwJjByzre2wXTez1W' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1Br9F1os2dLkUqwIXwJjByzre2wXTez1W" -O crop_mean.npy && rm -rf ~/cookies.txt
# 2) data_split.pkl - pickled train/test split loaded later with pickle.load
!wget --load-cookies ~/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies ~/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1BnU8a7l9tGxZN7wVpeCQx0CIgutW-742' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1BnU8a7l9tGxZN7wVpeCQx0CIgutW-742" -O data_split.pkl && rm -rf ~/cookies.txt
# 3) dataset.tar.gz - archive extracted by the tar command below
!wget --load-cookies ~/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies ~/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1BQC3l22wya-sFYQMoUEYbmjhGZOYFZF-' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1BQC3l22wya-sFYQMoUEYbmjhGZOYFZF-" -O dataset.tar.gz && rm -rf ~/cookies.txt
# Extract the archive into the current working directory (verbose listing).
!tar -zxvf dataset.tar.gz

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import PIL.Image as Image
import random
import numpy as np
import os
import os.path
from os.path import join
import time
import pickle
import cv2
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn import functional as F

# --- GPU selection -----------------------------------------------------------
# Enumerate GPUs in PCI bus order and expose only device 0 to this process.
# (Use a list such as "0, 1, 2, 3" for CUDA_VISIBLE_DEVICES to expose several.)
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})

# Fall back to the CPU when no CUDA device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Paths -------------------------------------------------------------------
TRAIN_CHECK_POINT = 'check_point/'
DATA_SPLIT_PATH = 'data_split.pkl'

# --- Input geometry ----------------------------------------------------------
CLIP_LENGTH = 16    # frames per clip
CROP_SIZE = 112     # side length of the square crop, in pixels
CHANNEL_NUM = 3     # colour channels per frame

# --- Experiment / optimisation options ---------------------------------------
NUM_CLASSES = 11
BATCH_SIZE = 10
EPOCH_NUM = 50
LEARNING_RATE = 1e-4

## Data Processing : Define UCF11Dataset class

In [None]:
# Per-frame mean image that frame_process subtracts from every cropped frame.
# CLIP_LENGTH, CROP_SIZE and CHANNEL_NUM are taken from the configuration cell
# rather than being redefined / hard-coded here, so a change made there cannot
# be silently overridden by this cell.
np_mean = np.load('crop_mean.npy').reshape([CLIP_LENGTH, CROP_SIZE, CROP_SIZE, CHANNEL_NUM])

def get_test_num(filename):
    """Return the number of lines in the text file ``filename``.

    The file is opened in a ``with`` block so the handle is closed
    deterministically, and lines are counted while streaming instead of first
    materialising the whole file as a list.
    """
    with open(filename, 'r') as list_file:
        return sum(1 for _ in list_file)

def frame_process(clip, clip_length=CLIP_LENGTH, crop_size=112, channel_num=3):
    """Resize, centre-crop and mean-subtract every frame of ``clip``.

    Each frame is cast to uint8 and rescaled so that its shorter side equals
    ``crop_size`` (the longer side is scaled by the same factor, plus one
    pixel, truncated to int). A ``crop_size`` x ``crop_size`` window is then
    cut from the centre and the matching frame of the module-level ``np_mean``
    array is subtracted.

    :param clip: sequence of frame arrays (each must support ``.astype``)
    :param clip_length: accepted for signature compatibility; not used here
    :param crop_size: side length of the square output crop
    :param channel_num: size of the last axis of the output buffer
    :return: float32 array of shape [len(clip), crop_size, crop_size, channel_num]
    """
    total = len(clip)
    processed = np.zeros([total, crop_size, crop_size, channel_num]).astype(np.float32)

    for idx, raw_frame in enumerate(clip):
        pil_frame = Image.fromarray(raw_frame.astype(np.uint8))
        width, height = pil_frame.width, pil_frame.height

        # cv2.resize expects the target size as (width, height).
        if width > height:
            ratio = float(crop_size) / float(height)
            target_size = (int(width * ratio + 1), crop_size)
        else:
            ratio = float(crop_size) / float(width)
            target_size = (crop_size, int(height * ratio + 1))
        resized = np.array(cv2.resize(np.array(pil_frame), target_size)).astype(np.float32)

        # Offsets of the centred window along the row and column axes.
        row_start = int((resized.shape[0] - crop_size) / 2)
        col_start = int((resized.shape[1] - crop_size) / 2)
        window = resized[row_start: row_start + crop_size, col_start: col_start + crop_size, :]

        processed[idx, :, :, :] = window - np_mean[idx]

    return processed


def convert_images_to_clip(filename, clip_length=CLIP_LENGTH, crop_size=112, channel_num=3):
    """Load ``clip_length`` frames from the image directory ``filename``.

    Only the files directly inside ``filename`` are used, in sorted name order.
    Sub-directories are ignored: the previous implementation walked them too,
    but built every path from the root directory, so nested files were looked
    up in the wrong place and the clip could grow beyond ``clip_length``.

    * Fewer files than ``clip_length``: all frames are used and the last one
      is repeated until the clip is full.
    * Otherwise: a random window of ``clip_length`` consecutive frames is
      taken (start index drawn with ``random.randint``).
    * No files at all (missing or empty directory): the directory name is
      printed and an empty clip is handed to ``frame_process``.

    :param filename: directory that holds the frame images of one video
    :param clip_length: number of frames in the returned clip
    :param crop_size: forwarded to ``frame_process``
    :param channel_num: forwarded to ``frame_process``
    :return: result of ``frame_process`` --
        shape [clip_length, crop_size, crop_size, channel_num]
    """
    def _read_frame(frame_name):
        # The context manager closes the file handle as soon as the pixels
        # have been copied into the array.
        with Image.open(os.path.join(str(filename), str(frame_name))) as img:
            return np.array(img)

    # First os.walk entry == the top-level directory; the default covers a
    # directory that does not exist (os.walk then yields nothing).
    _, _, frame_names = next(os.walk(filename), (filename, [], []))
    frame_names = sorted(frame_names)

    if not frame_names:
        print(filename)
        clip = []
    elif len(frame_names) < clip_length:
        clip = [_read_frame(name) for name in frame_names]
        # Pad by reusing the already decoded last frame instead of re-reading
        # it from disk once per missing frame.
        clip.extend([clip[-1]] * (clip_length - len(frame_names)))
    else:
        s_index = random.randint(0, len(frame_names) - clip_length)
        clip = [_read_frame(name) for name in frame_names[s_index:s_index + clip_length]]

    clip = frame_process(clip, clip_length, crop_size, channel_num)
    return clip # shape: [clip_length, crop_size, crop_size, channel_num]

class UCF11Dataset(Dataset):
    """Map-style dataset that yields one clip and its label per video directory.

    ``data_list`` is a mapping whose keys identify the frame directory of a
    video and whose values are the class labels (anything ``int()`` accepts).

    :param data_list: mapping ``entry -> label``
    :param num_classes: number of classes (stored, not used by this class)
    :param crop_size: spatial crop size forwarded to ``convert_images_to_clip``
    :param channel_num: channel count forwarded to ``convert_images_to_clip``
    :param clip_length: frames per clip; ``None`` (default) means "use the
        module-level ``CLIP_LENGTH``", which is the previous behaviour
    """

    def __init__(self, data_list, num_classes, crop_size=112, channel_num=3, clip_length=None):
        self.data_list = data_list
        self.video_list = list(data_list)   # keys of data_list, in iteration order
        self.crop_size = crop_size
        self.channel_num = channel_num
        self.num_classes = num_classes
        self.clip_length = clip_length

    def __len__(self):
        """Number of videos in the split."""
        return len(self.video_list)

    def __getitem__(self, i):
        """Return ``{'clips': float32 array [C, T, H, W], 'labels': int}``."""
        entry = self.video_list[i]
        # The directory is the first whitespace-separated token of the entry.
        dirname = entry.strip('\n').split()[0]
        # Look the label up with the ORIGINAL key: indexing with the stripped
        # token raised KeyError whenever the key carried trailing whitespace.
        label = int(self.data_list[entry])

        clip_length = CLIP_LENGTH if self.clip_length is None else self.clip_length
        clips = convert_images_to_clip(dirname, clip_length, self.crop_size, self.channel_num)

        # [T, H, W, C] -> [C, T, H, W], the layout nn.Conv3d expects per sample.
        clips = np.transpose(np.array(clips).astype(np.float32), (3, 0, 1, 2))

        batch_data = {'clips': clips, 'labels': label}

        return batch_data

## Load the UCF11 (UCF YouTube Action) Dataset Split

In [None]:
# DATA_SPLIT_PATH is defined once, in the configuration cell.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load a split file that comes from a trusted source.
with open(DATA_SPLIT_PATH, 'rb') as split_file:   # closes the handle after loading
    ucf11_dataset = pickle.load(split_file)
train_set = ucf11_dataset['train']
test_set = ucf11_dataset['test']

## Set Dataset and Dataloader

In [None]:
# Wrap the two splits in datasets ...
train_video_dataset = UCF11Dataset(train_set, NUM_CLASSES)
test_video_dataset = UCF11Dataset(test_set, NUM_CLASSES)

# ... and batch them. Only the training data is shuffled: the test metrics are
# sums over the whole split, so the visiting order does not affect them and a
# fixed order keeps evaluation runs comparable.
train_video_dataloader = DataLoader(train_video_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_video_dataloader = DataLoader(test_video_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Define NonLocal Block

In [None]:
class NonLocalBlock3D(nn.Module):
    """Residual non-local (self-attention) block for 5-D video features.

    Three 1x1x1 convolutions project the input to ``in_channels // 2`` channels
    (at least 1): ``theta`` (queries), ``phi`` (keys) and ``g`` (values). The
    softmax-normalised ``theta . phi`` similarity weights the values, and a
    final 1x1x1 convolution ``W`` maps the result back to ``in_channels``
    before it is added to the input. ``W`` is zero-initialised, so a freshly
    constructed block returns its input unchanged.

    :param in_channels: channel count of the input feature map
    :param test_mode: when True, ``forward`` prints intermediate tensor shapes
    :param dimension: stored on the instance; not otherwise used
    :param sub_sample: when True, ``g`` and ``phi`` are followed by a
        (1, 2, 2) max pooling, shrinking the key/value set
    """

    def __init__(self, in_channels, test_mode=False, dimension=3, sub_sample=True):
        super(NonLocalBlock3D, self).__init__()

        self.test_mode = test_mode
        self.dimension = dimension
        self.sub_sample = sub_sample
        self.in_channels = in_channels
        # Half the input width, but never zero channels.
        self.inter_channels = in_channels // 2 or 1

        def pointwise_conv(c_in, c_out):
            # 1x1x1 convolution, stride 1, no padding.
            return nn.Conv3d(in_channels=c_in, out_channels=c_out,
                             kernel_size=1, stride=1, padding=0)

        # Projections are created in the order g, theta, phi, W.
        value_conv = pointwise_conv(self.in_channels, self.inter_channels)
        query_conv = pointwise_conv(self.in_channels, self.inter_channels)
        key_conv = pointwise_conv(self.in_channels, self.inter_channels)
        output_conv = pointwise_conv(self.inter_channels, self.in_channels)

        # Zero weight and bias: the residual branch contributes nothing at start.
        nn.init.constant_(output_conv.weight, 0)
        nn.init.constant_(output_conv.bias, 0)

        if sub_sample:
            # One pooling module shared by the value and key branches.
            spatial_pool = nn.MaxPool3d(kernel_size=(1, 2, 2))
            value_conv = nn.Sequential(value_conv, spatial_pool)
            key_conv = nn.Sequential(key_conv, spatial_pool)

        self.g = value_conv
        self.theta = query_conv
        self.phi = key_conv
        self.W = output_conv

    def forward(self, x):
        '''
        :param x: (b, c, t, h, w)
        :return: tensor with the same shape as x
        '''
        n = x.size(0)

        def flatten_positions(feat):
            # (b, c', t, h, w) -> (b, c', positions)
            return feat.view(n, self.inter_channels, -1)

        g_x = flatten_positions(self.g(x)).permute(0, 2, 1)          # (b, Nkv, c')
        theta_x = flatten_positions(self.theta(x)).permute(0, 2, 1)  # (b, Nq, c')
        phi_x = flatten_positions(self.phi(x))                       # (b, c', Nkv)

        # Pairwise similarities, normalised over the key positions.
        f = torch.matmul(theta_x, phi_x)
        attention = F.softmax(f, dim=-1)

        # Attention-weighted sum of the values: (b, Nq, c').
        y = torch.matmul(attention, g_x)

        if self.test_mode:
            for label, tensor in (("x", x), ("g_x", g_x), ("theta_x", theta_x),
                                  ("phi_x", phi_x), ("f", f), ("y", y)):
                print((label + ": {}").format(tensor.shape))

        # Back to (b, c', t, h, w), project to in_channels and add the residual.
        y = y.permute(0, 2, 1).contiguous().view(n, self.inter_channels, *x.size()[2:])
        return self.W(y) + x

## Define C3D Network

In [None]:
class C3D(nn.Module):
    """
    The C3D network.

    Eight 3x3x3 convolutions (padding 1) arranged in five stages, each stage
    followed by a max pooling, then two fully connected layers. ``fc6`` expects
    8192 features per sample, which is what the five poolings produce for a
    16-frame 112x112 clip (512 channels x 1 x 4 x 4).

    Four ``NonLocalBlock3D`` modules are constructed (and therefore appear in
    ``state_dict()`` / ``parameters()``), but their calls in ``forward`` are
    commented out, so they do not take part in the computation.

    :param num_classes: size of the output logit vector
    :param pretrained: optional path to a checkpoint handled by
        ``__load_pretrained_weights``; an empty string skips loading
    """

    def __init__(self, num_classes, pretrained=""):
        super(C3D, self).__init__()

        #============================================================
        #All of convolution layers use kernel_size (3,3,3) and padding (1, 1, 1)
        #conv1 3 -> 64
        #conv2 64 -> 128
        #conv3a 128 -> 256
        #conv3b 256 -> 256
        #conv4a 256 -> 512
        #conv4b 512 -> 512
        #conv5a 512 -> 512
        #conv5b 512 -> 512
        #fc6 8192 -> 4096
        #fc7 4096 -> num_classes
        #============================================================

        self.conv1 = nn.Conv3d(3, 64, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        # pool1 keeps the temporal length; the later pools halve it.
        self.pool1 = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
        self.nonlocal1 = NonLocalBlock3D(64)

        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool2 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
        self.nonlocal2 = NonLocalBlock3D(128)

        self.conv3a = nn.Conv3d(128, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv3b = nn.Conv3d(256, 256, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool3 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
        self.nonlocal3 = NonLocalBlock3D(256)

        self.conv4a = nn.Conv3d(256, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv4b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.pool4 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
        self.nonlocal4 = NonLocalBlock3D(512)

        self.conv5a = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        self.conv5b = nn.Conv3d(512, 512, kernel_size=(3, 3, 3), padding=(1, 1, 1))
        # Spatial padding turns the 7x7 map of a 112x112 input into 4x4.
        self.pool5 = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2), padding=(0, 1, 1))

        self.fc6 = nn.Linear(8192, 4096)
        self.fc7 = nn.Linear(4096, num_classes)
        #============================================================

        self.dropout = nn.Dropout(p=0.5)

        self.relu = nn.ReLU()

        self.__init_weight()

        if pretrained:
            self.__load_pretrained_weights(pretrained)

    def forward(self, x):
        """Compute class logits.

        :param x: float tensor (batch, 3, frames, height, width)
        :return: logits of shape (batch, num_classes)
        """
        x = self.relu(self.conv1(x))
        x = self.pool1(x)
        #x = self.nonlocal1(x)

        x = self.relu(self.conv2(x))
        x = self.pool2(x)
        #x = self.nonlocal2(x)

        x = self.relu(self.conv3a(x))
        x = self.relu(self.conv3b(x))
        x = self.pool3(x)
        #x = self.nonlocal3(x)

        x = self.relu(self.conv4a(x))
        x = self.relu(self.conv4b(x))
        x = self.pool4(x)
        #x = self.nonlocal4(x)

        x = self.relu(self.conv5a(x))
        x = self.relu(self.conv5b(x))
        x = self.pool5(x)

        # Flatten per sample. With view(-1, 8192) an input of unexpected size
        # was silently regrouped across the batch axis; now the batch size is
        # preserved and fc6 reports a shape error instead.
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc6(x))
        x = self.dropout(x)

        logits = self.fc7(x)

        return logits

    def __load_pretrained_weights(self, model_path):
        """Copy matching tensors from the checkpoint at ``model_path``.

        The checkpoint must be a dict with a ``'state_dict'`` entry whose keys
        follow the ``features.N`` / ``classifier.N`` naming in the table below.
        Tensors are loaded onto the CPU first so that a checkpoint saved on a
        GPU can be read on a CPU-only machine. Entries that have no
        counterpart in this model, or whose shape differs from the target
        parameter (for instance a classifier head with a different output
        size), are skipped instead of making ``load_state_dict`` fail.
        """
        corresp_name = {
                        # Conv1
                        "features.0.weight": "conv1.weight",
                        "features.0.bias": "conv1.bias",
                        # Conv2
                        "features.3.weight": "conv2.weight",
                        "features.3.bias": "conv2.bias",
                        # Conv3a
                        "features.6.weight": "conv3a.weight",
                        "features.6.bias": "conv3a.bias",
                        # Conv3b
                        "features.8.weight": "conv3b.weight",
                        "features.8.bias": "conv3b.bias",
                        # Conv4a
                        "features.11.weight": "conv4a.weight",
                        "features.11.bias": "conv4a.bias",
                        # Conv4b
                        "features.13.weight": "conv4b.weight",
                        "features.13.bias": "conv4b.bias",
                        # Conv5a
                        "features.16.weight": "conv5a.weight",
                        "features.16.bias": "conv5a.bias",
                         # Conv5b
                        "features.18.weight": "conv5b.weight",
                        "features.18.bias": "conv5b.bias",
                        # fc6
                        "classifier.0.weight": "fc6.weight",
                        "classifier.0.bias": "fc6.bias",
                        # fc7
                        "classifier.3.weight": "fc7.weight",
                        "classifier.3.bias": "fc7.bias",
                        }

        p_dict = torch.load(model_path, map_location='cpu')['state_dict']
        s_dict = self.state_dict()
        for name, tensor in p_dict.items():
            target = corresp_name.get(name)
            if target is None or target not in s_dict:
                continue
            if s_dict[target].shape != tensor.shape:
                print('Skipping pretrained tensor %s -> %s: shape %s does not match %s'
                      % (name, target, tuple(tensor.shape), tuple(s_dict[target].shape)))
                continue
            s_dict[target] = tensor
        self.load_state_dict(s_dict)

    def __init_weight(self):
        """Kaiming-normal init for every Conv3d; unit/zero init for BatchNorm3d.

        ``self.modules()`` also yields the convolutions nested inside the
        ``NonLocalBlock3D`` children, so the zero-initialised output projection
        ``W`` of those blocks would be overwritten. It is reset to zero
        afterwards to keep each block an identity mapping at initialisation.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                # m.weight.data.normal_(0, math.sqrt(2. / n))
                torch.nn.init.kaiming_normal_(m.weight)
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

        for m in self.modules():
            if isinstance(m, NonLocalBlock3D):
                nn.init.constant_(m.W.weight, 0)
                nn.init.constant_(m.W.bias, 0)

## Set Network and Optimizer

In [None]:
# Build the network and move it to the device chosen in the configuration
# cell ("cuda" when available, otherwise "cpu") instead of calling .cuda()
# unconditionally, which fails on a machine without a GPU.
net = C3D(num_classes=NUM_CLASSES).to(device)

# Multi-GPU alternative:
#net = torch.nn.DataParallel(net).to(device)

optimizer = torch.optim.Adam(net.parameters(), lr=LEARNING_RATE)

## Train and Test C3D

In [None]:
def _run_epoch(model, dataloader, optimizer=None, epoch=0, log_every=10):
    """Run one pass over ``dataloader`` and return ``(mean loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch, and progress is printed every ``log_every`` batches. Without
    one the model is put in evaluation mode and gradients are disabled.

    :param model: network mapping a clip batch to class logits
    :param dataloader: yields dicts with 'clips' and 'labels' entries
    :param optimizer: optimizer used for the update step, or None to evaluate
    :param epoch: zero-based epoch index, used only in the progress message
    :param log_every: print a progress line every this many training batches
    :return: (sum of batch losses / number of batches,
              correct predictions / number of samples in the dataset)
    """
    is_training = optimizer is not None
    model.train(is_training)

    # Plain Python numbers: accumulating the loss tensors themselves would
    # chain every batch's autograd history together for the whole epoch.
    loss_total = 0.0
    correct_total = 0

    with torch.set_grad_enabled(is_training):
        for i, batch in enumerate(dataloader):
            batch_clips = batch['clips'].to(device)
            batch_labels = batch['labels'].to(device)

            logits = model(batch_clips)

            loss = F.cross_entropy(logits, batch_labels)
            correct = (torch.argmax(logits, 1) == batch_labels).sum().item()

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_loss = loss.item()
            loss_total += batch_loss
            correct_total += correct

            if is_training and i % log_every == 0:
                # Division of Python ints: a true ratio on every PyTorch version.
                print('Epoch %d, Batch %d: Loss is %.5f; Accuracy is %.5f'
                      % (epoch + 1, i, batch_loss, correct / batch_clips.shape[0]))

    return loss_total / len(dataloader), correct_total / len(dataloader.dataset)


for epoch in range(EPOCH_NUM):
    # train
    train_loss, train_accuracy = _run_epoch(net, train_video_dataloader, optimizer, epoch)
    print('Epoch %d: Average loss is: %.5f; Average accuracy is: %.5f' % (epoch + 1, train_loss, train_accuracy))

    # test
    test_loss, test_accuracy = _run_epoch(net, test_video_dataloader)
    print('Test loss is %.5f; Accuracy is %.5f' % (test_loss, test_accuracy))


### 