In [33]:
!pip install icecream



In [59]:
import sys
import os

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision import transforms
from torchsummary import summary

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from icecream import ic  # better than print()

# Set a random seed for predictable behavior
torch.manual_seed(6862)

<torch._C.Generator at 0x7f4db4fabbd0>

In [35]:
# Returns a copy of the model but without the last layer, according to model.children().
def strip_last_layer(model):
    return torch.nn.Sequential(*(list(model.children())[:-1]))

resnet50_pretrained = torchvision.models.resnet50(pretrained=True, progress=True)
# print(strip_last_layer(resnet18_pretrained))

# torchsummary.summary(strip_last_layer(resnet50_pretrained), (3, 224, 224))

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))




In [36]:
# Clone and Import VideoFrameDataLoader utils.

if not os.path.isdir('/content/Video-Dataset-Loading-Pytorch'):
  !git clone https://github.com/RaivoKoot/Video-Dataset-Loading-Pytorch

sys.path.insert(1,'/content/Video-Dataset-Loading-Pytorch')  # don't use sys.path.append, as we might re-run repeatedly

from video_dataset import VideoFrameDataset, ImglistToTensor

Cloning into 'Video-Dataset-Loading-Pytorch'...
remote: Enumerating objects: 250, done.[K
remote: Counting objects: 100% (250/250), done.[K
remote: Compressing objects: 100% (211/211), done.[K
remote: Total 250 (delta 103), reused 144 (delta 38), pack-reused 0[K
Receiving objects: 100% (250/250), 688.74 KiB | 4.72 MiB/s, done.
Resolving deltas: 100% (103/103), done.


In [None]:
'''
Download the simplest 2-class toy data, already split into frames:
moments2classes/
-- training/
   |-- clapping/
      |-- clapping1/ # video sample file
           |-- <frame_01>.jpg
           |-- <frame_02>.jpg
           |-- ...
      |-- clapping2/
           |-- ...
      |-- .../
   |-- constructing/
      |-- construction01/
           |-- ...
'''

if not os.path.isfile('moments2classes.zip'):
  !wget 'https://www.dropbox.com/s/6rdvj8qr2foju90/moments2classes.zip'

if not os.path.isdir('moments2classes'):
  !unzip 'moments2classes.zip'

root_training_data_path = 'moments2classes/training/'
annotations_file = 'moments2classes/annotations.txt'
if not os.path.isfile(annotations_file):
  f = open(annotations_file, "w")
  f.write("clapping/clapping1 1 87 0\n")
  f.write("clapping/clapping2 1 90 0\n")
  f.write("constructing/constructing1 1 90 0\n")
  f.write("constructing/constructing2 1 18 0\n")
  f.close()

All pre-trained models expect input images normalized in the same way, i.e. mini-batches of 3-channel RGB images of shape (3 x H x W), where H and W are expected to be at least 224. The images have to be loaded in to a range of [0, 1] and then normalized using mean = `[0.485, 0.456, 0.406]` and std = `[0.229, 0.224, 0.225]`. You can use the following transform to normalize:
```
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
```
(source: https://pytorch.org/vision/stable/models.html)

In [38]:
preprocess = torchvision.transforms.Compose([
    ImglistToTensor(),  # list of PIL images to (FRAMES x CHANNELS x HEIGHT x WIDTH) tensor
    transforms.Resize(256),  # image batch, resize smaller edge to 299
    transforms.CenterCrop(224),  # image batch, center crop to square 299x299
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load single, continuous tensors without any temporal sampling.
# i.e. load all frames
dataset = VideoFrameDataset(
    root_path=root_training_data_path,
    annotationfile_path=annotations_file,
    num_segments=1,
    frames_per_segment=18,  # SMALLEST AMOUNT OF FRAMES IN A SAMPLE
    imagefile_template='f{:04d}.jpg',  # 4 digits in our naming convention here
    transform=preprocess,
    random_shift=False,
    test_mode=False
)

dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=2,
    shuffle=True,
    num_workers=2,  # max suggested by runtime
    pin_memory=True
)

In [39]:
# Each processed video tensor has shape (BATCH, FRAMES, C, W, H)
for video_batch, labels in dataloader:
    print("Labels:", labels)
    print("Video Batch Tensor Size:", video_batch.size())
    print("Batch Labels Size:", labels.size())
    print("")

Labels: tensor([0, 0])
Video Batch Tensor Size: torch.Size([2, 18, 3, 224, 224])
Batch Labels Size: torch.Size([2])

Labels: tensor([0, 0])
Video Batch Tensor Size: torch.Size([2, 18, 3, 224, 224])
Batch Labels Size: torch.Size([2])



In [40]:
def plot_video(rows, cols, frame_list, plot_width, plot_height):
    fig = plt.figure(figsize=(plot_width, plot_height))
    grid = ImageGrid(fig, 111,  # similar to subplot(111)
                     nrows_ncols=(rows, cols),  # creates 2x2 grid of axes
                     axes_pad=0.3,  # pad between axes in inch.
                     )

    for index, (ax, im) in enumerate(zip(grid, frame_list)):
        # Iterating over the grid returns the Axes.
        ax.imshow(im)
        ax.set_title(index)
    plt.show()

In [44]:
class MyRNN(nn.Module):
    '''
    gru_hidden_size: ???
    '''
    # 
    def __init__(self, gru_hidden_size, device="cpu"):
        super(MyRNN, self).__init__()
        
        self.gru_hidden_size = gru_hidden_size

        # Use pre-trained ResNet50 model, stripped of the last fully-connected
        # classification layer.
        self.cnn_pretrained = strip_last_layer(torchvision.models.resnet50(pretrained=True)).cuda()
        # Indicate that no training is needed.
        for p in self.cnn_pretrained.parameters():
            p.requires_grad = False

        # Found by: torchsummary.summary(strip_last_layer(resnet50_pretrained), (3, 224, 224))
        self.cnn_pretrained_out_feature_size = 2048

        # TODO: Use GRU that takes prior output (not just hidden state) as next input.
        # That is, directly concatenate the 'y' output to the next 'x' input,
        # rather than just relying on the GRU's inner hidden state.
        # Use out-of-the-box GRU for now.
        self.gru = nn.GRU(input_size=self.cnn_pretrained_out_feature_size,
                          hidden_size=gru_hidden_size, num_layers=1, bias=True,
                          batch_first=True, dropout=0, bidirectional=False)
        
        '''
        # Layer between the convolutional layer and GRU, which takes the
        # convolutional features PLUS the GRU output features as input,
        # and returns the same size. The purpose is to "learn" the importance of
        # features from the CNN and the GRU.
        # Consider this a pre-processing of the GRU input.
        self.fc_combine = nn.Linear(in_features=2048 + output_size, out_features=2048 + output_size, bias=True)

        # GRU cell that will be looped over. We need to loop over a cell manually
        # (rather than a nn.GRU module that takes a whole sequence) so that we
        # can concatenate the output from each GRU cell to the input of the next.
        # input_size: output of a convolution of a frame (2048), plus the output.
        self.gru = nn.GRUCell(input_size=2048 + output_size, hidden_size=gru_hidden_size, bias=True)
        '''


    def forward(self, input):
        input = input.to(device)
        ic(input.size())
        batch_size, frames, C, H, W = input.size()

        # The CNN expects (BATCH x C x H x W). Re-size our batched video frames
        # as one sequence just for the CNN's per-batch sample operations...
        cnn_input = input.view(batch_size * frames, C, H, W)
        ic(cnn_input.size())
        cnn_output = self.cnn_pretrained(cnn_input)
        # [BATCH * FRAMES, 2048, 1, 1]
        ic(cnn_output.size())

        # ... Then return them to our batched frames.
        # WARNING: Very big possibility I'm fucking up the matrix shapes.
        # batch_first=True -> (batch, seq_len, feature_size)
        gru_input = cnn_output.view(batch_size, frames, self.cnn_pretrained_out_feature_size)
        ic(gru_input.size())

        # Run feature vectors (one per frame per batch sample) through GRU.
        gru_output, h_n = self.gru(gru_input)
        # batch_first=True -> [BATCH, FRAMES, HIDDEN_SIZE]
        ic(gru_output.size())
        # [1, BATCH, HIDDEN_SIZE] (ignores batch_first and is shared across batches!)
        ic(h_n.size())

        return gru_output

        # TODO: code for the custom GRU cell
        '''
        output = torch.tensor([])
        output = output.to(input.get_device())
        h_0 = self.initHidden()
        last_gru_out = self.initHidden()

        for frame in range(frames):
            # Pre-trained CNN expects one frame at a time
            cnn_out = self.cnn(input[:, f, :, :, :]).unsqueeze(0)
            print('cnn_out: ', cnn_out.size)
        '''

    # def initHidden(self):
    #     return torch.zeros(1, self.gru_hidden_size)

In [56]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

net = MyRNN(gru_hidden_size=3, device=device).to(device)
# ic(net)

# Each processed video tensor has shape (BATCH x FRAMES x CHANNELS x HEIGHT x WIDTH)
for video_batch, labels in dataloader:
  ic(video_batch.size())
  output = net(video_batch)
  ic(output.size())

ic| video_batch.size(): torch.Size([2, 18, 3, 224, 224])
ic| input.size(): torch.Size([2, 18, 3, 224, 224])
ic| cnn_input.size(): torch.Size([36, 3, 224, 224])
ic| cnn_output.size(): torch.Size([36, 2048, 1, 1])
ic| gru_input.size(): torch.Size([2, 18, 2048])
ic| gru_output.size(): torch.Size([2, 18, 3])
    h_n.size(): torch.Size([1, 2, 3])
ic| output.size(): torch.Size([2, 18, 3])
ic| video_batch.size(): torch.Size([2, 18, 3, 224, 224])
ic| input.size(): torch.Size([2, 18, 3, 224, 224])
ic| cnn_input.size(): torch.Size([36, 3, 224, 224])
ic| cnn_output.size(): torch.Size([36, 2048, 1, 1])
ic| gru_input.size(): torch.Size([2, 18, 2048])
ic| gru_output.size(): torch.Size([2, 18, 3])
    h_n.size(): torch.Size([1, 2, 3])
ic| output.size(): torch.Size([2, 18, 3])
ic| input.size(): torch.Size([2, 18, 3, 224, 224])
ic| cnn_input.size(): torch.Size([36, 3, 224, 224])
ic| cnn_output.size(): torch.Size([36, 2048, 1, 1])
ic| gru_input.size(): torch.Size([2, 18, 2048])
ic| gru_output.size(): to

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [62]:
summary(net, (100, 3, 224, 224))

ic| input.size(): torch.Size([2, 100, 3, 224, 224])
ic| cnn_input.size(): torch.Size([200, 3, 224, 224])
ic| cnn_output.size(): torch.Size([200, 2048, 1, 1])
ic| gru_input.size(): torch.Size([2, 100, 2048])
ic| gru_output.size(): torch.Size([2, 100, 3])
    h_n.size(): torch.Size([1, 2, 3])


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,