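Note: This notebook requires a GPU. In Colab, select Runtime → Change runtime type → GPU before running the cells below (the model is moved to CUDA when it is loaded).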

In [1]:
# Better printing.
!pip install icecream
# Simple Recurrent Units for Highly Parallelizable Recurrence (required by attention-network).
!pip install sru
# Another torch summary tool.
!pip install torchinfo

Collecting icecream
  Downloading https://files.pythonhosted.org/packages/31/cc/5454531fe9ae123720b496fdea806e282843d6e75e5718a5e8b1d8e5c47f/icecream-2.1.0-py2.py3-none-any.whl
Collecting asttokens>=2.0.1
  Downloading https://files.pythonhosted.org/packages/16/d5/b0ad240c22bba2f4591693b0ca43aae94fbd77fb1e2b107d54fff1462b6f/asttokens-2.0.5-py2.py3-none-any.whl
Collecting executing>=0.3.1
  Downloading https://files.pythonhosted.org/packages/e1/a6/07d28b53b1fab42985cba6b704d685a60a2e3a5efce4cfaaad42a4494bd8/executing-0.6.0-py2.py3-none-any.whl
Collecting colorama>=0.3.9
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collected packages: asttokens, executing, colorama, icecream
Successfully installed asttokens-2.0.5 colorama-0.4.4 executing-0.6.0 icecream-2.1.0
Collecting sru
  Downloading https://files.pythonhosted.org/packages/61/58/61244b88eb5a63e1f10f7d4c9eccee960ce

In [2]:
import os
import sys

import numpy as np
from icecream import ic
import torch
from torchsummary import summary
from torchinfo import summary as torchinfo_summary

In [3]:
if not os.path.isdir('attention-network'):
  !git clone https://github.com/ZhenxingZheng/attention-network

sys.path.insert(1,'/content/attention-network')

from Models import Spatial_TemporalNet

Cloning into 'attention-network'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 64 (delta 27), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (64/64), done.


In [4]:
# Download the pre-trained model.
if not os.path.isfile('kinetics-tnnls.pkl'):
  !wget https://www.dropbox.com/s/y7qy1e8g0luciy7/kinetics-tnnls.pkl

--2021-05-01 01:29:06--  https://www.dropbox.com/s/y7qy1e8g0luciy7/kinetics-tnnls.pkl
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:6019:18::a27d:412
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/y7qy1e8g0luciy7/kinetics-tnnls.pkl [following]
--2021-05-01 01:29:06--  https://www.dropbox.com/s/raw/y7qy1e8g0luciy7/kinetics-tnnls.pkl
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucf3991ff3d3c552271921863858.dl.dropboxusercontent.com/cd/0/inline/BNrtjM2ZIUgvelMfaszlPdllmACFdJvICjP_PHJztix2-ItonX4paJ_s-dJcx-zpSXbPtCw6-ZKA6UyOgw4nfHk70tTi_tif8FAyNN4L33E5DMsuaVtIvXj-Qb4QarXSlRhi4sYnGUqXp4tFaUhVTe_l/file# [following]
--2021-05-01 01:29:07--  https://ucf3991ff3d3c552271921863858.dl.dropboxusercontent.com/cd/0/inline/BNrtjM2ZIUgvelMfaszlPdllmACFdJvICjP_PHJztix2-ItonX4paJ_s-dJcx-zpSXbPtCw6

In [5]:
# Initialize attention network as suggested by default args:
# https://github.com/ZhenxingZheng/attention-network/blob/master/opts.py
attn_net = Spatial_TemporalNet(basemodel='resnet34',
                               dataset='kinetics',
                               segment=1,
                               attention_type='all',
                               hidden_size=1024,
                               img_dim=512,
                               kernel_size=7)  # pre-trained param

# Required for the model to be loaded below.
# NB: `.cuda()` requires a GPU environment.
attn_net = torch.nn.DataParallel(attn_net).cuda()

# Load the model as it appears to be done in:
# https://github.com/ZhenxingZheng/attention-network/blob/396aa9e12f451a83e4723586ba2bd9c6ac541847/Main.py
# The checkpoint is read from the working directory, i.e. the same relative
# path that the download cell checks and writes.
# map_location='cpu' deserializes the tensors on the CPU regardless of the
# device they were saved from; load_state_dict then copies them into the
# model's parameters on the GPU.
# SECURITY: torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints obtained from a source you trust.
attn_net.load_state_dict(torch.load('kinetics-tnnls.pkl', map_location='cpu'))

Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /root/.cache/torch/hub/checkpoints/resnet34-333f7ec4.pth


HBox(children=(FloatProgress(value=0.0, max=87306240.0), HTML(value='')))


using all attention  for action recognition


<All keys matched successfully>

In [6]:
# Layer-by-layer overview of the network (uncomment for the plain module repr):
# print(attn_net)
#
# The forward pass returns several outputs:
#   output_average, output_auto, output_learned, output = net(input)
# so the regular torchsummary call
#   summary(attn_net, (3, 224, 224))
# can't be used with this pre-trained model; torchinfo is used instead.
summary_input_size = (100, 3, 224, 224)
torchinfo_summary(attn_net, summary_input_size, depth=7)

Layer (type:depth-idx)                                  Output Shape              Param #
├─Spatial_TemporalNet: 1-1                              [100, 600]                --
|    └─Spatial_Net: 2-1                                 [100, 512, 7, 7]          --
|    |    └─Sequential: 3-1                             [100, 512, 7, 7]          --
|    |    |    └─Conv2d: 4-1                            [100, 64, 112, 112]       9,408
|    |    |    └─BatchNorm2d: 4-2                       [100, 64, 112, 112]       128
|    |    |    └─ReLU: 4-3                              [100, 64, 112, 112]       --
|    |    |    └─MaxPool2d: 4-4                         [100, 64, 56, 56]         --
|    |    |    └─Sequential: 4-5                        [100, 64, 56, 56]         --
|    |    |    |    └─BasicBlock: 5-1                   [100, 64, 56, 56]         --
|    |    |    |    |    └─Conv2d: 6-1                  [100, 64, 56, 56]         36,864
|    |    |    |    |    └─BatchNorm2d: 6-2         

In [7]:
# Smoke-test the forward pass on random input.
# The tensor is created as (2, 12, 3, 224, 224) and flattened to
# (24, 3, 224, 224) before being fed to the network
# (presumably 2 clips x 12 frames -- TODO confirm against the repo's data loader).
# torch.autograd.Variable is deprecated and is a no-op wrapper on current
# PyTorch, so the tensor is used directly.
faka_data = torch.randn(2, 12, 3, 224, 224).cuda().view(-1, 3, 224, 224)

# NOTE(review): no eval()/no_grad() is applied in the visible cells, so this
# presumably runs in training mode with autograd enabled -- confirm that is intended.
output = attn_net(faka_data)

# The network returns four tensors; print each size, then display the first one.
ic(output[0].size(), output[1].size(), output[2].size(), output[3].size())
output[0]

ic| output[0].size(): torch.Size([24, 600])
    output[1].size(): torch.Size([24, 600])
    output[2].size(): torch.Size([24, 600])
    output[3].size(): torch.Size([24, 600])


tensor([[ 0.6851,  2.7169,  2.0585,  ...,  2.0774,  0.8687, -6.6654],
        [-0.7182,  1.7635,  2.3280,  ...,  0.5655,  3.0825,  1.6397],
        [-1.2921, -3.0752, -0.3863,  ..., -0.7206, -0.5377, -2.6047],
        ...,
        [ 3.8728, -0.2080, -2.2923,  ...,  0.4866, -0.1789, -2.2905],
        [ 0.1388,  0.0520, -0.7689,  ..., -0.1901,  1.7421,  1.2032],
        [ 2.7553, -1.8898,  4.4638,  ..., -2.2066,  0.3647, -2.0597]],
       device='cuda:0', grad_fn=<MeanBackward1>)