# torchvision UFC101, pytorchvideo X3D scratch

torchvisionのdatasetを使ってUFC101を読み込み，pytorchvideoのx3dモデルをスクラッチで学習してみる．
UFC101はあらかじめダウンロードして展開済みであるとする．

- https://pytorch.org/vision/stable/datasets.html?highlight=ucf101#torchvision.datasets.UCF101

- https://github.com/facebookresearch/pytorchvideo/blob/ef2d3a96bb939b12aa0f21fb467d2175b0f05e9f/pytorchvideo/models/x3d.py#L537



In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate

from torchvision.models import resnet18
from torchvision import transforms
from torchvision.datasets import UCF101

from pytorchvideo.models import x3d

import torchinfo

from tqdm.notebook import tqdm

import os
import pickle

argparseを真似たパラメータ設定．
- rootで指定したディレクトリには，101クラスのサブディレクトリがあること
- annotation_pathには，UCF101のアノテーションファイルであるtrainlist0{1,2,3}.txtなどがあること


```bash
$ ls UFC101/video | head
ApplyEyeMakeup
ApplyLipstick
Archery
BabyCrawling
BalanceBeam
BandMarching
BaseballPitch
Basketball
BasketballDunk
BenchPress

$ head UCF101/ucfTrainTestlist/trainlist01.txt
ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi 1
ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c02.avi 1
ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c03.avi 1
ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c04.avi 1
ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c05.avi 1
ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c01.avi 1
ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c02.avi 1
ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c03.avi 1
ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c04.avi 1
ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c05.avi 1
```

In [4]:
class Args:
    def __init__(self):
        self.metadata_path = '/mnt/HDD10TB/dataset/UFC101/'
        self.root = self.metadata_path + 'video/'
        self.annotation_path = self.metadata_path + 'ucfTrainTestlist/'
        self.frames_per_clip = 16
        self.step_between_clips = 16
        self.model = 'X3D'
        self.batch_size = 16
        self.num_workers = 24

args = Args()

transformの定義．
- UCF101を読み込むとuint8なので，255で割ってfloatにする．
- torchvisionのUCF101データセットは(T, H, W, C)の形式．しかしpytorchvideoのx3dの入力形式は(B, C, T, H, W)らしいので，それに合わせる．
- X3D-Mを想定して，画像を224x224にリサイズする．torchvision.transforms.Resizeはshapeが`[..., H, W]`ならOKなので，画像だけでなく動画もOK
 - https://github.com/pytorch/vision/blob/183a722169421c83638e68ee2d8fc5bd3415c4b4/torchvision/transforms/transforms.py#L227

In [5]:
# https://www.kaggle.com/pevogam/starter-ucf101-with-pytorch
transform = transforms.Compose([
    transforms.Lambda(lambda x: x / 255.),
    # (T, H, W, C) --> (C, T, H, W)
    transforms.Lambda(lambda x: x.permute(3, 0, 1, 2)),
    # transforms.Lambda(
    #     lambda x: nn.functional.interpolate(x, (224, 224))),
    transforms.Resize(224),
])

データセットはimage, audio, labelの三組を返すが，UCF101には音声がない動画もあり，そのまま使うとdataloaderがバッチにできないというエラーが出てしまう（audioの次元数がサンプルによって異なるため）．そこでcollateでaudioを取り除く．

In [6]:
def remove_audio_collate(batch):
    # https://www.kaggle.com/pevogam/starter-ucf101-with-pytorch
    '''
    remove audio channel because
    not all of UCF101 vidoes have audio channel
    '''
    video_only_batch = []
    for video, audio, label in batch:
        video_only_batch.append((video, label))
    return default_collate(video_only_batch)

custom_collate = remove_audio_collate

メタデータの準備．UCF101の全動画をスキャンして，FPSなどの情報を取得するらしい．かなり時間がかかる．
それを保存して再利用（毎回計算し直すと時間の無駄）．
コードを見たところ，foldやtrainには無関係で，fpcとsbcにだけ依存するらしいので，それをファイル名にして保存する．

In [7]:
metadata_filename = os.path.join(
    args.metadata_path,
    'UCF101metadata_fpc{}_sbc{}.pickle'.format(
        args.frames_per_clip,
        args.step_between_clips))

if not os.path.exists(metadata_filename):
    # precompute and save metadata
    dataset_dict = UCF101(root=args.root,
                            annotation_path=args.annotation_path,
                            frames_per_clip=args.frames_per_clip,
                            step_between_clips=args.step_between_clips,
                            num_workers=args.num_workers,
                            )
    with open(metadata_filename, "wb") as f:
        pickle.dump(dataset_dict.metadata, f)

with open(metadata_filename, 'rb') as f:
    metadata = pickle.load(f)

UCF101には3つのスプリットがあるので，foldでそれを指定

In [8]:
train_set = UCF101(root=args.root,
                    annotation_path=args.annotation_path,
                    frames_per_clip=args.frames_per_clip,
                    step_between_clips=args.step_between_clips,
                    fold=1,
                    train=True,
                    transform=transform,
                    _precomputed_metadata=metadata)
val_set = UCF101(root=args.root,
                    annotation_path=args.annotation_path,
                    frames_per_clip=args.frames_per_clip,
                    step_between_clips=args.step_between_clips,
                    fold=1,
                    train=False,
                    transform=transform,
                    _precomputed_metadata=metadata)
n_classes = 101

データローダーの作成．collateをここで指定．

In [9]:
train_loader = DataLoader(train_set,
                            batch_size=args.batch_size,
                            shuffle=True,
                            drop_last=True,
                            collate_fn=custom_collate,
                            num_workers=args.num_workers)
val_loader = DataLoader(val_set,
                        batch_size=args.batch_size,
                        shuffle=False,
                        drop_last=True,
                        collate_fn=custom_collate,
                        num_workers=args.num_workers)


data loaderの挙動を確認．ランダムなラベルが得られている

In [10]:

# torchvisionのvideo.pyで，ワーニングが多数出るのでそれを抑制．
import warnings
warnings.filterwarnings("ignore", category=UserWarning,
                                   module='torchvision')


for i, (data, label) in enumerate(train_loader):
    print(label.cpu().numpy())
    if i > 10:
        break

[72 56  0 74 53 61 84 46 77 70 49 82 68 32 73 36]
[73 68 30 93  8 81 87 93 84 67 70 65 46 44 64 77]
[51 20 75 34 50  3  5 60 13 94  3 67 24 43 43 34]
[58 77 25  9 43 94 65 61 78 17 90 12 85 67 20 74]
[40 11 61 61 88 59 87 57 73  0 94  6 55 94 98 73]
[ 66  19  68  75  94  10  88  91  64  46  55  14  73  50  55 100]
[88 36  3 61 11 45 86 90 52 83 75 37 40 18 80 40]
[41 61  7 63 55 32 35 46 69 73 59 77 38 46 61 50]
[39 53 65 24 21 36 62 94 40 40 53 83 83 40 20 26]
[69 70 68  7 51  1 70 68 60 13 76 11 61 73 68 71]
[13 24 75 26 79 62 75 52 68 46 95 89 45 54 33 31]
[51 45 70 93 70 64 12 13 77  7 62 76 97 75 70 85]


データローダーのlenを確認する．

- 学習用ビデオ数は9000程度のはずなのに，train_setのlengthは非常に多い
  - おそらく，各ビデオからサンプリしたclip数になっている
  - 各ビデオから同じ数のクリップがサンプルされているとは限らない（確認できていない）

In [13]:
len(train_loader), len(train_set), len(train_set) / args.batch_size

(6692, 107085, 6692.8125)

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

pytorchvideoのx3dモデルを作成．
webマニュアルにはないが，コードをみると，クリップ長とサイズが指定できる．
X3Dは数種類あるが，ここではX3D-Mに合わせた数字を指定（コードのコメントに書いてある）

In [15]:
# X3D-M
# https://github.com/facebookresearch/pytorchvideo/blob/master/pytorchvideo/models/x3d.py#L601
model = x3d.create_x3d(
    input_clip_length=16,
    input_crop_size=224,
    depth_factor=2.2,
    model_num_class=101
).to(device)

# model = nn.DataParallel(model)

ランダムなデータを流し込んで出力されるかを確認する

In [16]:
data = torch.randn(2, 3, 16, 224, 224).to(device)

In [17]:
model(data)

tensor([[0.0114, 0.0083, 0.0091, 0.0103, 0.0102, 0.0109, 0.0089, 0.0087, 0.0097,
         0.0104, 0.0103, 0.0089, 0.0098, 0.0101, 0.0092, 0.0092, 0.0083, 0.0104,
         0.0082, 0.0104, 0.0098, 0.0112, 0.0095, 0.0087, 0.0114, 0.0095, 0.0095,
         0.0077, 0.0104, 0.0122, 0.0095, 0.0110, 0.0103, 0.0105, 0.0071, 0.0101,
         0.0090, 0.0096, 0.0085, 0.0080, 0.0127, 0.0094, 0.0129, 0.0105, 0.0089,
         0.0094, 0.0105, 0.0091, 0.0077, 0.0091, 0.0099, 0.0104, 0.0096, 0.0086,
         0.0083, 0.0106, 0.0087, 0.0092, 0.0102, 0.0116, 0.0101, 0.0091, 0.0099,
         0.0103, 0.0094, 0.0102, 0.0118, 0.0099, 0.0089, 0.0094, 0.0104, 0.0139,
         0.0102, 0.0100, 0.0075, 0.0117, 0.0102, 0.0116, 0.0089, 0.0089, 0.0092,
         0.0118, 0.0092, 0.0098, 0.0103, 0.0113, 0.0088, 0.0105, 0.0094, 0.0081,
         0.0127, 0.0099, 0.0097, 0.0102, 0.0103, 0.0106, 0.0101, 0.0095, 0.0106,
         0.0109, 0.0109],
        [0.0094, 0.0090, 0.0088, 0.0100, 0.0106, 0.0103, 0.0104, 0.0099, 0.0108,
  

summaryで中身を確認

In [18]:
torchinfo.summary(
    model,
    (4, 3, 16, 224, 224),
    depth=4,
    col_names=["input_size",
               "output_size"],
    row_settings=("var_names",)
)

Layer (type (var_name))                                      Input Shape               Output Shape
Net                                                          --                        --
├─ModuleList (blocks)                                        --                        --
│    └─ResNetBasicStem (0)                                   [4, 3, 16, 224, 224]      [4, 24, 16, 112, 112]
│    │    └─Conv2plus1d (conv)                               [4, 3, 16, 224, 224]      [4, 24, 16, 112, 112]
│    │    │    └─Conv3d (conv_t)                             [4, 3, 16, 224, 224]      [4, 24, 16, 112, 112]
│    │    │    └─Conv3d (conv_xy)                            [4, 24, 16, 112, 112]     [4, 24, 16, 112, 112]
│    │    └─BatchNorm3d (norm)                               [4, 24, 16, 112, 112]     [4, 24, 16, 112, 112]
│    │    └─ReLU (activation)                                [4, 24, 16, 112, 112]     [4, 24, 16, 112, 112]
│    └─ResStage (1)                                          [4, 2

便利関数を定義

In [14]:
class AverageMeter(object):
    """
    Computes and stores the average and current value
    Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    https://github.com/machine-perception-robotics-group/attention_branch_network/blob/ced1d97303792ac6d56442571d71bb0572b3efd8/utils/misc.py#L59
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        if type(val) == torch.Tensor:
            val = val.item()
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def top1(outputs, targets):
    batch_size = outputs.size(0)
    _, predicted = outputs.max(1)
    return predicted.eq(targets).sum().item() / batch_size

torchvisionのvideo.pyで，ワーニングが多数出るのでそれを抑制．

In [15]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning,
                                   module='torchvision')

In [16]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

In [17]:
num_epochs = 5

with tqdm(range(num_epochs)) as pbar_epoch:
    for epoch in pbar_epoch:
        pbar_epoch.set_description("[Epoch %d]" % (epoch))


        with tqdm(enumerate(train_loader),
                  total=len(train_loader),
                  leave=True) as pbar_loss:

            train_loss = AverageMeter()
            train_acc = AverageMeter()
            model.train()

            for batch_idx, (inputs, targets) in pbar_loss:
                pbar_loss.set_description("[train]")

                inputs, targets = inputs.to(device), targets.to(device)
                bs = inputs.size(0)  # current batch size, may vary at the end of the epoch

                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                train_loss.update(loss, bs)
                train_acc.update(top1(outputs, targets), bs)

                pbar_loss.set_postfix_str(
                    ' | loss={:6.04f} , top1={:6.04f}'
                    ''.format(
                    train_loss.avg, train_acc.avg,
                ))



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6692.0), HTML(value='')))





KeyboardInterrupt: 

- 4GPUでおよそ2.4it/s，1エポック約50分
- 1GPUでおよそ1.3it/s，1エポック約1時間半


以下の設定のとおり
- frames_per_clip = 16
- step_between_clips = 16
- batch_size = 16
- num_workers = 24