In [1]:
import torch
import torch.nn as nn
import torchvision
import torchinfo

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_org = torchvision.models.vgg16(pretrained=True)
model_org = model_org.to(device)

In [19]:
# データセットに合わせてモデルの出力次元を変更すするため，オリジナルのモデルの出力層への入力次元を取得

# model_org_features = model_org.classifier[6].in_features
# print(model_org_features)
# print(type(model_org_features))

In [69]:

batch_size = 1
torchinfo.summary(
    model=model_org,
    input_size=(batch_size, 3, 256, 256),
    col_names=["input_size",
                "output_size"],
    row_settings=["var_names"],
    depth=3 
)

Layer (type (var_name))                  Input Shape               Output Shape
VGG                                      --                        --
├─Sequential (features)                  [1, 3, 256, 256]          [1, 512, 8, 8]
│    └─Conv2d (0)                        [1, 3, 256, 256]          [1, 64, 256, 256]
│    └─ReLU (1)                          [1, 64, 256, 256]         [1, 64, 256, 256]
│    └─Conv2d (2)                        [1, 64, 256, 256]         [1, 64, 256, 256]
│    └─ReLU (3)                          [1, 64, 256, 256]         [1, 64, 256, 256]
│    └─MaxPool2d (4)                     [1, 64, 256, 256]         [1, 64, 128, 128]
│    └─Conv2d (5)                        [1, 64, 128, 128]         [1, 128, 128, 128]
│    └─ReLU (6)                          [1, 128, 128, 128]        [1, 128, 128, 128]
│    └─Conv2d (7)                        [1, 128, 128, 128]        [1, 128, 128, 128]
│    └─ReLU (8)                          [1, 128, 128, 128]        [1, 128, 128, 128]

In [2]:
class Adapter(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(dim)
        self.conv1 = nn.Conv2d(dim, dim, 1)       
        self.bn2 = nn.BatchNorm2d(dim)


    def forward(self, x):
        residual = x

        out = self.bn1(x)
        out = self.conv1(out)
        
        out += residual
        out = self.bn2(out)        

        return out

In [3]:
class ReconstructNet(nn.Module):
    def __init__(self):
        super().__init__()
        model = torchvision.models.vgg16(pretrained=True)
        model_num_features = model.classifier[6].in_features
        num_class = 400

        self.net_bottom_0 = nn.Sequential(
            model.features[:17]
        )

        self.adapter = Adapter(256)

        self.net_bottom_1 = nn.Sequential(
            model.features[17:],
            model.avgpool
        )

        self.net_top = nn.Sequential(
            model.classifier[:6],
            nn.Linear(model_num_features, num_class)
        )


    def forward(self, x):
        x = self.net_bottom_0(x)
        x = self.adapter(x)
        x = self.net_bottom_1(x)
        x = torch.flatten(x,1)
        x = self.net_top(x)
        return x

In [22]:
model_new = ReconstructNet()
model_new = model_new.to(device)

In [73]:
torchinfo.summary(
    model=model_new,
    input_size=(batch_size, 3, 256, 256),
    col_names=["input_size",
                "output_size"],
    row_settings=["var_names"],
    depth=3 
)

Layer (type (var_name))                  Input Shape               Output Shape
ReconstructNet                           --                        --
├─Sequential (net_bottom_0)              [1, 3, 256, 256]          [1, 256, 32, 32]
│    └─Sequential (0)                    [1, 3, 256, 256]          [1, 256, 32, 32]
│    │    └─Conv2d (0)                   [1, 3, 256, 256]          [1, 64, 256, 256]
│    │    └─ReLU (1)                     [1, 64, 256, 256]         [1, 64, 256, 256]
│    │    └─Conv2d (2)                   [1, 64, 256, 256]         [1, 64, 256, 256]
│    │    └─ReLU (3)                     [1, 64, 256, 256]         [1, 64, 256, 256]
│    │    └─MaxPool2d (4)                [1, 64, 256, 256]         [1, 64, 128, 128]
│    │    └─Conv2d (5)                   [1, 64, 128, 128]         [1, 128, 128, 128]
│    │    └─ReLU (6)                     [1, 128, 128, 128]        [1, 128, 128, 128]
│    │    └─Conv2d (7)                   [1, 128, 128, 128]        [1, 128, 128, 128]

In [74]:
# ダミーデータを用意し，出力が一致するか確認
data = torch.randn(1, 3, 256, 256).to(device)
print(data.shape)
print(type(data))

# data1 = torch.full((1,3,256,256), 2).to(device)
# print(data1.shape)
# print(type(data1))

torch.Size([1, 3, 256, 256])
<class 'torch.Tensor'>


In [75]:
model_org.eval()
model_new.eval()
output_org = model_org(data).max(axis=1)
output_new = model_new(data).max(axis=1)
print(output_org)
print(output_new)
# output_org = model_org(data)
# output_new = model_new(data)
# # print(output_org.shape)
# print(output_new.shape)


torch.return_types.max(
values=tensor([6.1813], device='cuda:0', grad_fn=<MaxBackward0>),
indices=tensor([556], device='cuda:0'))
torch.return_types.max(
values=tensor([0.5234], device='cuda:0', grad_fn=<MaxBackward0>),
indices=tensor([113], device='cuda:0'))


In [76]:
# テンソルの出力のまま比較する場合
# flag = torch.allclose(output_org,output_new, atol=1e-8)
# print(flag)

### 実際に学習させてみる
- data：Kinetics400
- model:vgg16 (Imagenetでpretrain)

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import DistributedSampler, RandomSampler

from torchvision import transforms

from pytorchvideo.models import x3d
from pytorchvideo.data import Ucf101, RandomClipSampler, UniformClipSampler, Kinetics

# from torchvision.transforms._transforms_video import (
#     CenterCropVideo,
#     NormalizeVideo,
# )

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
)

from tqdm.notebook import tqdm
from collections import OrderedDict
import itertools
import os

- batch_sizeが16だとCudaのメモリ不足だったので4に変更
- cluster-smiで確認したらモデルが1000MiB,batch_sizeが4でモデルと合計4000MiB

In [5]:
class Args:
    def __init__(self):
        self.metadata_path = '/mnt/NAS-TVS872XT/dataset/'
        self.root = self.metadata_path
        self.annotation_path = self.metadata_path
        self.FRAMES_PER_CLIP = 16
        self.STEP_BETWEEN_CLIPS = 16
        self.BATCH_SIZE = 4
        self.NUM_WORKERS = 8  # kinetics:8, ucf101:24
        # self.CLIP_DURATION = 16 / 25
        self.CLIP_DURATION = (8 * 8) / 30  # (num_frames * sampling_rate)/fps
        self.VIDEO_NUM_SUBSAMPLED = 8  # 事前学習済みモデルに合わせて16→8
        self.UCF101_NUM_CLASSES = 101
        self.KINETIC400_NUM_CLASSES = 400


In [6]:
class LimitDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        self.dataset_iter = itertools.chain.from_iterable(
            itertools.repeat(iter(dataset), 2)
        )

    def __getitem__(self, index):
        return next(self.dataset_iter)

    def __len__(self):
        return self.dataset.num_videos



In [7]:

def get_kinetics(subset):
    """
    Kinetics400のデータセットを取得

    Args:
        subset (str): "train" or "val" or "test"

    Returns:
        pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset: 取得したデータセット
    """
    args = Args()
    transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(args.VIDEO_NUM_SUBSAMPLED),
                transforms.Lambda(lambda x: x / 255.),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                ShortSideScale(size=256),
                # RandomShortSideScale(min_size=256, max_size=320,),
                # CenterCropVideo(crop_size=(256, 256)),
                CenterCrop(256),
                # RandomCrop(224),
                RandomHorizontalFlip(),
            ]),
        ),
        ApplyTransformToKey(
            key="label",
            transform=transforms.Lambda(lambda x: x),
        ),
        RemoveKey("audio"),
    ])

    root_kinetics = '/mnt/NAS-TVS872XT/dataset/Kinetics400/'

    if subset == "test":
        dataset = Kinetics(
            data_path=root_kinetics + "test_list.txt",
            video_path_prefix=root_kinetics + 'test/',
            clip_sampler=RandomClipSampler(clip_duration=args.CLIP_DURATION),
            video_sampler=RandomSampler,
            decode_audio=False,
            transform=transform,
        )
        return dataset
    else:
        dataset = Kinetics(
            data_path=root_kinetics + subset,
            video_path_prefix=root_kinetics + subset,
            clip_sampler=RandomClipSampler(clip_duration=args.CLIP_DURATION),
            video_sampler=RandomSampler,
            decode_audio=False,
            transform=transform,
        )
        return dataset

    return False

In [8]:
def make_loader(dataset):
    """
    データローダーを作成

    Args:
        dataset (pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset): get_datasetメソッドで取得したdataset

    Returns:
        torch.utils.data.DataLoader: 取得したデータローダー
    """
    args = Args()
    loader = DataLoader(LimitDataset(dataset),
                        batch_size=args.BATCH_SIZE,
                        drop_last=True,
                        num_workers=args.NUM_WORKERS)
    return loader

In [9]:
class AverageMeter(object):
    """
    Computes and stores the average and current value
    Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    https://github.com/machine-perception-robotics-group/attention_branch_network/blob/ced1d97303792ac6d56442571d71bb0572b3efd8/utils/misc.py#L59
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        if type(val) == torch.Tensor:
            val = val.item()
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def top1(outputs, targets):
    batch_size = outputs.size(0)
    _, predicted = outputs.max(1)
    return predicted.eq(targets).sum().item() / batch_size

In [10]:
def make_new_batch(inputs, labels):
    """
    動画データを画像データに分割

    Args:
        inputs torch.Tensor: inputs
        labels torch.Tensor: labels

    Returns:
        new_inputs torch.Tensor: new_inputs
        new_labels torch.Tensor: new_labels
    """

    num_frame = inputs.size()[2]
    inputs = inputs.permute(0,2,1,3,4)
    video_data_list = []
    for i in range(inputs.size()[0]):
        video_data_list.append(inputs[i])
    new_inputs = torch.cat(video_data_list, dim=0)

    label_list = []

    for i in range(labels.size()[0]):
        target_id = labels[i].item()
        label = torch.full((1,num_frame), target_id)
        label_list.append(label)
    new_labels = torch.cat(label_list, dim=1)
    new_labels = torch.squeeze(new_labels)

    return new_inputs, new_labels

In [11]:
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
print(device)
dataset = get_kinetics("val")
dataset.video_sampler._num_samples = 100
train_loader = make_loader(dataset)

cuda:2


In [12]:
model = ReconstructNet()
model = model.to(device)
if device == 'cuda:2':
    model = torch.nn.DataParallel(model)
    cudnn.benchmark = True
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()

In [13]:
num_epochs = 1

with tqdm(range(num_epochs)) as pbar_epoch:
    for epoch in pbar_epoch:
        pbar_epoch.set_description("[Epoch %d]" % (epoch))


        with tqdm(enumerate(train_loader),
                  total=len(train_loader),
                  leave=True) as pbar_batch:

            train_loss = AverageMeter()
            train_acc = AverageMeter()
            model.train()

            for batch_idx, batch in pbar_batch:
                pbar_batch.set_description("[train]")

                inputs = batch['video'].to(device)
                labels = batch['label'].to(device)

                new_inputs, new_labels = make_new_batch(inputs, labels)
                bs = inputs.size(0)  # current batch size, may vary at the end of the epoch

                optimizer.zero_grad()
                outputs = model(new_inputs)
                # print(outputs.shape)
                # print(new_labels.shape)
                loss = criterion(outputs, new_labels)
                loss.backward()
                optimizer.step()
                train_loss.update(loss, bs)
                train_acc.update(top1(outputs, new_labels), bs)

                pbar_batch.set_postfix_str(
                    ' | loss={:6.04f} , top1={:6.04f}'
                    ' | loss={:6.04f} , top1={:6.04f}'
                    ''.format(
                    train_loss.avg, train_acc.avg,
                    train_loss.val, train_acc.val,
                ))



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25.0), HTML(value='')))





RuntimeError: Expected object of device type cuda but got device type cpu for argument #2 'target' in call to _thnn_nll_loss_forward