In [1]:
import torch
import torch.nn as nn
import torchvision
import torchinfo

In [2]:
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model_org = torchvision.models.resnet152(pretrained=True)
# model_org = model_org.to(device)

In [3]:
# データセットに合わせてモデルの出力次元を変更するため，オリジナルのモデルの出力層への入力次元を取得

# model_org_features = model_org.classifier[6].in_features
# print(model_org_features)
# print(type(model_org_features))

In [4]:

# batch_size = 1
# torchinfo.summary(
#     model=model_org,
#     input_size=(batch_size, 3, 256, 256),
#     col_names=["input_size",
#                 "output_size"],
#     row_settings=["var_names"],
#     depth=3 
# )

In [5]:
class Adapter(nn.Module):
    """Residual 1x1-convolution adapter for feature maps.

    The forward pass evaluates ``bn2(conv1(bn1(x)) + x)``. The convolution
    maps ``dim`` channels to ``dim`` channels with a 1x1 kernel, so the output
    has the same shape as the input.

    Args:
        dim (int): Number of channels of the incoming feature map.
    """

    def __init__(self, dim):
        super().__init__()
        # Submodule names and creation order are kept so that state_dict keys
        # and seeded initialisation are unchanged.
        self.bn1 = nn.BatchNorm2d(dim)
        self.conv1 = nn.Conv2d(dim, dim, kernel_size=1)
        self.bn2 = nn.BatchNorm2d(dim)

    def forward(self, x):
        """Apply the adapter to ``x`` and return the result."""
        transformed = self.conv1(self.bn1(x))
        # Skip connection: the untouched input is added (in place on the
        # convolution output, not on ``x``) before the second normalisation.
        transformed += x
        return self.bn2(transformed)

In [6]:
class ReconstructNet(nn.Module):
    """ResNet-152 feature extractor followed by a new linear classifier.

    The torchvision ResNet-152 is split in two parts:

    * ``net_bottom``: every layer from ``conv1`` up to and including
      ``avgpool`` (the original ``fc`` layer is left out).
    * ``net_top``: a newly created ``nn.Linear`` whose input size is the
      ``in_features`` of the original ``fc`` layer.

    Args:
        num_class (int): Output dimension of the new classifier. Defaults to
            101, the value that used to be hard-coded.
        pretrained (bool): Forwarded to ``torchvision.models.resnet152``.
            Defaults to True, matching the previous behaviour.
    """

    def __init__(self, num_class=101, pretrained=True):
        super().__init__()
        # NOTE(review): newer torchvision releases prefer the ``weights=``
        # argument over ``pretrained=`` — confirm against the installed version.
        backbone = torchvision.models.resnet152(pretrained=pretrained)
        model_num_features = backbone.fc.in_features

        self.net_bottom = nn.Sequential(
            backbone.conv1,
            backbone.bn1,
            backbone.relu,
            backbone.maxpool,
            backbone.layer1,
            backbone.layer2,
            backbone.layer3,
            backbone.layer4,
            backbone.avgpool
        )

        # Optional adapter hook, currently disabled.
        # self.adapter = Adapter(256)

        # Kept as a Sequential so the parameters stay named
        # "net_top.0.weight" / "net_top.0.bias".
        self.net_top = nn.Sequential(
            # backbone.fc
            nn.Linear(model_num_features, num_class)
        )

    def forward(self, x):
        """Return class scores of shape (N, num_class) for an image batch ``x``."""
        x = self.net_bottom(x)
        # x = self.adapter(x)
        # Flatten everything except the batch dimension before the classifier.
        x = torch.flatten(x, 1)
        x = self.net_top(x)
        return x

In [7]:
# model_new = ReconstructNet()
# model_new = model_new.to(device)

In [8]:
# torchinfo.summary(
#     model=model_new,
#     input_size=(batch_size, 3, 256, 256),
#     col_names=["input_size",
#                 "output_size"],
#     row_settings=["var_names"],
#     depth=3 
# )

In [9]:
# # ダミーデータを用意し，出力が一致するか確認
# data = torch.randn(1, 3, 256, 256).to(device)
# print(data.shape)
# print(type(data))

# # data1 = torch.full((1,3,256,256), 2).to(device)
# # print(data1.shape)
# # print(type(data1))

In [10]:
# model_org.eval()
# model_new.eval()
# output_org = model_org(data).max(axis=1)
# output_new = model_new(data).max(axis=1)
# print(output_org)
# print(output_new)
# output_org = model_org(data)
# output_new = model_new(data)
# # print(output_org.shape)
# print(output_new.shape)


In [11]:
# テンソルの出力のまま比較する場合
# flag = torch.allclose(output_org,output_new, atol=1e-8)
# print(flag)

### 実際に学習させてみる
- data：UCF101（下のセルでは `get_ucf101("train")` を使用）
- model：ResNet-152（ImageNetでpretrain，`ReconstructNet` で出力層を101クラスに差し替え）

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import DistributedSampler, RandomSampler

from torchvision import transforms

from pytorchvideo.models import x3d
from pytorchvideo.data import Ucf101, RandomClipSampler, UniformClipSampler, Kinetics

# from torchvision.transforms._transforms_video import (
#     CenterCropVideo,
#     NormalizeVideo,
# )

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
)

from tqdm.notebook import tqdm
from collections import OrderedDict
import itertools
import os

- batch_sizeが16だとCUDAのメモリ不足になったため4に変更した（注：下の `Args` では `BATCH_SIZE = 64` になっており，この記述と一致しないため要確認）
- cluster-smiで確認したところ，モデル単体で1000MiB，batch_sizeが4のときはモデルと合わせて合計4000MiB

In [13]:
class Args:
    """Hard-coded experiment settings: dataset paths, clip sampling and loader sizes."""

    def __init__(self):
        # Dataset location; ``root`` and ``annotation_path`` mirror ``metadata_path``.
        dataset_dir = '/mnt/NAS-TVS872XT/dataset/'
        self.metadata_path = dataset_dir
        self.root = dataset_dir
        self.annotation_path = dataset_dir

        self.FRAMES_PER_CLIP = 16
        self.STEP_BETWEEN_CLIPS = 16

        # DataLoader settings.
        self.BATCH_SIZE = 64
        self.NUM_WORKERS = 32

        # Clip length: (num_frames * sampling_rate) / fps.
        # A previous setting was 16 / 25.
        num_frames, sampling_rate, fps = 8, 8, 30
        self.CLIP_DURATION = (num_frames * sampling_rate) / fps

        # Value handed to UniformTemporalSubsample by the dataset builders.
        # NOTE(review): the earlier comment said "16 -> 8 to match the
        # pretrained model", but the value is 2 — confirm which is intended.
        self.VIDEO_NUM_SUBSAMPLED = 2

        self.UCF101_NUM_CLASSES = 101
        self.KINETIC400_NUM_CLASSES = 400


# Module-level settings instance used by the training cell.
args = Args()

In [14]:
class LimitDataset(torch.utils.data.Dataset):
    """Map-style wrapper that gives an iterable dataset a fixed length.

    ``__len__`` reports ``dataset.num_videos`` and ``__getitem__`` ignores the
    requested index, returning the next sample from the wrapped dataset.

    The previous implementation chained the *same* iterator object twice
    (``itertools.repeat(iter(dataset), 2)``). For an ordinary iterable that
    yields a single pass, and even for datasets whose ``__iter__`` resets
    themselves it yields at most two passes, after which ``__getitem__``
    leaked ``StopIteration``. Iteration is now restarted with a fresh
    ``iter(dataset)`` whenever the current pass is exhausted.

    Args:
        dataset: Iterable dataset exposing a ``num_videos`` attribute.
    """

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        self.dataset_iter = iter(dataset)

    def __getitem__(self, index):
        """Return the next sample; ``index`` is ignored.

        Raises:
            IndexError: If the wrapped dataset yields no samples at all.
        """
        try:
            return next(self.dataset_iter)
        except StopIteration:
            # Current pass is exhausted: start a new one.
            self.dataset_iter = iter(self.dataset)
        try:
            return next(self.dataset_iter)
        except StopIteration:
            # A fresh pass produced nothing, so the dataset is empty.
            raise IndexError("wrapped dataset yielded no samples") from None

    def __len__(self):
        """Return the number of videos reported by the wrapped dataset."""
        return self.dataset.num_videos



In [15]:
def get_ucf101(subset):
    """
    Build the UCF101 dataset for the requested subset.

    The "video" entry of each sample is temporally subsampled, divided by
    255, normalised, randomly rescaled (short side between 256 and 320),
    randomly cropped to 224 and randomly flipped horizontally. The "label"
    entry is shifted by -1 and the "audio" key is removed.

    Args:
        subset (str): "train" or "test". Only the exact string "test" selects
            the test list; any other value selects the training list.

    Returns:
        pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset: the dataset.
    """
    cfg = Args()
    dataset_root = '/mnt/dataset/UCF101/'
    split_file = (
        'ucfTrainTestlist/testlist.txt'
        if subset == "test"
        else 'ucfTrainTestlist/trainlist01.txt'
    )

    # Per-frame preprocessing; the order of the steps matters.
    video_transform = Compose([
        UniformTemporalSubsample(cfg.VIDEO_NUM_SUBSAMPLED),
        transforms.Lambda(lambda x: x / 255.),
        Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
        RandomShortSideScale(min_size=256, max_size=320,),
        RandomCrop(224),
        RandomHorizontalFlip(),
    ])

    # Presumably the list files use 1-based class ids, hence the -1 shift
    # — TODO confirm against the annotation files.
    label_transform = transforms.Lambda(lambda x: x-1)

    sample_transform = Compose([
        ApplyTransformToKey(key="video", transform=video_transform),
        ApplyTransformToKey(key="label", transform=label_transform),
        RemoveKey("audio"),
    ])

    return Ucf101(
        data_path=dataset_root + split_file,
        video_path_prefix=dataset_root + 'video/',
        clip_sampler=RandomClipSampler(clip_duration=cfg.CLIP_DURATION),
        video_sampler=RandomSampler,
        decode_audio=False,
        transform=sample_transform,
    )

In [16]:

def get_kinetics(subset):
    """
    Build the Kinetics400 dataset for the requested subset.

    The "video" entry of each sample is temporally subsampled, divided by
    255, normalised, rescaled so that the short side is 256, centre-cropped
    to 256 and randomly flipped horizontally. The "label" entry is passed
    through unchanged and the "audio" key is removed.

    Args:
        subset (str): "train" or "val" or "test". For "test", ``data_path`` is
            ``<root>/test_list.txt`` and videos are read from ``<root>/test/``.
            For any other value, ``subset`` is appended to the dataset root
            for both ``data_path`` and ``video_path_prefix``.

    Returns:
        pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset: the dataset.
    """
    args = Args()
    # NOTE(review): the random horizontal flip is applied to every subset,
    # including "val" and "test" — confirm this is intended.
    transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(args.VIDEO_NUM_SUBSAMPLED),
                transforms.Lambda(lambda x: x / 255.),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                ShortSideScale(size=256),
                # RandomShortSideScale(min_size=256, max_size=320,),
                # CenterCropVideo(crop_size=(256, 256)),
                CenterCrop(256),
                # RandomCrop(224),
                RandomHorizontalFlip(),
            ]),
        ),
        ApplyTransformToKey(
            key="label",
            # Identity: labels are used as they are.
            transform=transforms.Lambda(lambda x: x),
        ),
        RemoveKey("audio"),
    ])

    root_kinetics = '/mnt/NAS-TVS872XT/dataset/Kinetics400/'

    # Only the file locations differ between the subsets; the dataset is
    # built once below. (The old trailing ``return False`` could never run
    # because both branches returned.)
    if subset == "test":
        data_path = root_kinetics + "test_list.txt"
        video_path_prefix = root_kinetics + 'test/'
    else:
        data_path = root_kinetics + subset
        video_path_prefix = root_kinetics + subset

    return Kinetics(
        data_path=data_path,
        video_path_prefix=video_path_prefix,
        clip_sampler=RandomClipSampler(clip_duration=args.CLIP_DURATION),
        video_sampler=RandomSampler,
        decode_audio=False,
        transform=transform,
    )

In [17]:
def make_loader(dataset):
    """
    Wrap a dataset in a DataLoader.

    The dataset is first wrapped in ``LimitDataset``; batch size and worker
    count come from a fresh ``Args`` instance, and incomplete final batches
    are dropped.

    Args:
        dataset (pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset):
            dataset returned by one of the dataset builder functions.

    Returns:
        torch.utils.data.DataLoader: loader over the wrapped dataset.
    """
    cfg = Args()
    limited = LimitDataset(dataset)
    return DataLoader(
        limited,
        batch_size=cfg.BATCH_SIZE,
        num_workers=cfg.NUM_WORKERS,
        drop_last=True,
    )

In [18]:
class AverageMeter(object):
    """
    Computes and stores the average and current value
    Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    https://github.com/machine-perception-robotics-group/attention_branch_network/blob/ced1d97303792ac6d56442571d71bb0572b3efd8/utils/misc.py#L59

    Attributes:
        val: Most recent value passed to ``update``.
        sum: Weighted sum of all values seen since the last ``reset``.
        count: Total weight (sum of ``n``) since the last ``reset``.
        avg: ``sum / count``, or 0 while ``count`` is 0.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n``.

        Args:
            val: Python number or single-element ``torch.Tensor``.
            n (int): Weight of the value, e.g. the batch size it was averaged over.
        """
        # isinstance (rather than an exact type comparison) also unwraps
        # Tensor subclasses such as nn.Parameter to a plain Python number,
        # so the meter never keeps a tensor (and its graph) alive.
        if isinstance(val, torch.Tensor):
            val = val.item()
        self.val = val
        self.sum += val * n
        self.count += n
        # Avoid ZeroDivisionError when only zero-weight updates were recorded.
        self.avg = self.sum / self.count if self.count else 0

def top1(outputs, targets):
    """Return the fraction of rows whose highest-scoring class equals the target.

    Args:
        outputs (torch.Tensor): Scores with classes along dimension 1.
        targets (torch.Tensor): Target class index for each row of ``outputs``.

    Returns:
        float: Number of correct predictions divided by ``outputs.size(0)``.
    """
    predicted = outputs.max(1)[1]
    num_correct = (predicted == targets).sum().item()
    return num_correct / outputs.size(0)

In [19]:
def make_new_batch(inputs, labels):
    """
    Split a batch of videos into a batch of individual frames.

    The frame axis (dimension 2 of ``inputs``) is moved next to the batch axis
    and merged with it, so all frames of video 0 come first, then all frames
    of video 1, and so on. Every label is repeated once per frame in the same
    order.

    Compared with the earlier loop-based version this avoids one ``.item()``
    call (a host synchronisation for GPU tensors) per label, keeps the labels
    on their original device and dtype, and always returns a 1-D label tensor
    (``torch.squeeze`` used to collapse a single-frame result to 0-D).

    Args:
        inputs (torch.Tensor): 5-D tensor indexed as
            (batch, channel, frame, height, width).
        labels (torch.Tensor): one label per video.

    Returns:
        new_inputs torch.Tensor: shape (batch * frame, channel, height, width)
        new_labels torch.Tensor: shape (batch * frame,)
    """
    batch_size, num_channel, num_frame, height, width = inputs.shape

    # (B, C, T, H, W) -> (B, T, C, H, W) -> (B*T, C, H, W)
    new_inputs = inputs.permute(0, 2, 1, 3, 4).reshape(
        batch_size * num_frame, num_channel, height, width
    )

    # [l0, l1, ...] -> [l0, l0, ..., l1, l1, ...], matching the frame order above.
    new_labels = labels.reshape(-1).repeat_interleave(num_frame)

    return new_inputs, new_labels

In [20]:
def frame_out_to_video_out(output, batch_size, num_frame):
    """
    Convert per-frame outputs into per-video outputs.

    Rows ``i * num_frame`` to ``(i + 1) * num_frame - 1`` of ``output`` are
    treated as the frames of video ``i`` and averaged along dimension 0.

    Args:
        output (torch.Tensor): per-frame outputs, frames of one video stored
            in consecutive rows
        batch_size (int): number of videos
        num_frame (int): number of frames per video

    Returns:
        torch.Tensor: per-video outputs stacked along dimension 0
    """
    per_video_means = [
        output[video_idx * num_frame:(video_idx + 1) * num_frame].mean(dim=0)
        for video_idx in range(batch_size)
    ]
    return torch.stack(per_video_means, dim=0)

In [21]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Build the UCF101 training dataset and wrap it in a DataLoader.
dataset = get_ucf101("train")
# dataset.video_sampler._num_samples = 100
train_loader = make_loader(dataset)

cuda


In [22]:
# Instantiate the ResNet-152-based network and move it to the selected device.
model = ReconstructNet()
model = model.to(device)
# print(device)

# if device == "cuda":
#     print("test")
#     model = torch.nn.DataParallel(model)
#     cudnn.benchmark = True
#     print("test")

# model = torch.nn.DataParallel(model)
# model = torch.nn.parallel.DistributedDataParallel(model)
# torch.backends.cudnn.benchmark = True


In [23]:
# torch.backends.cudnn.version()

In [24]:
# パラメータの名前を確認する
# for name, params in model.named_parameters():
#     print(name)

In [25]:
# Collect the parameters that are trained during transfer learning in
# params_to_update. This list can also be passed to the optimizer.

# Names of the parameters to train.
update_param_names = ["net_top.0.weight", "net_top.0.bias"]
params_to_update = []

# Enable gradients only for the listed parameters; every other parameter is
# frozen so that it does not change.
for name, param in model.named_parameters():
    param.requires_grad = name in update_param_names
    if param.requires_grad:
        params_to_update.append(param)
        print(name)

net_top.0.weight
net_top.0.bias


In [26]:
# for name, param in model.named_parameters():
#     param.requires_grad = True
#     # print(name)

In [27]:
# SGD with momentum and weight decay, constructed over all model parameters.
# NOTE(review): params_to_update is collected in an earlier cell but is not
# passed here. Frozen parameters receive no gradient, so presumably only the
# classifier is updated either way — confirm, and consider passing
# params_to_update to make the intent explicit.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
# Cross-entropy loss for classification.
criterion = nn.CrossEntropyLoss()

In [28]:
# Training loop: every video is split into frames, the frames are classified
# individually, and the per-frame outputs are averaged per video before the
# loss is computed.
num_epochs = 30

# Per-epoch averages of accuracy and loss.
acc_list = []
loss_list = []

with tqdm(range(num_epochs)) as pbar_epoch:
    for epoch in pbar_epoch:
        pbar_epoch.set_description("[Epoch %d]" % (epoch))


        with tqdm(enumerate(train_loader),
                  total=len(train_loader),
                  leave=True) as pbar_batch:

            # Fresh meters for every epoch.
            train_loss = AverageMeter()
            train_acc = AverageMeter()
            # NOTE(review): train() switches every submodule to training mode,
            # including the BatchNorm layers of the frozen backbone, whose
            # running statistics would then keep updating — confirm this is
            # intended.
            model.train()


            for batch_idx, batch in pbar_batch:
                pbar_batch.set_description("[Epoch :{}]".format(epoch))

                inputs = batch['video'].to(device)
                labels = batch['label'].to(device)

                # Split the videos into individual frames.
                # NOTE(review): new_labels (and new_bs / preds below) are not
                # used again inside this cell; the loss uses the per-video
                # labels. The extra .to(device) on new_inputs looks redundant
                # because inputs is already on the device — confirm.
                new_inputs, new_labels = make_new_batch(inputs, labels)
                new_inputs = new_inputs.to(device)
                new_labels = new_labels.to(device)
                bs = inputs.size(0)
                new_bs = new_inputs.size(0)  # current batch size, may vary at the end of the epoch

                optimizer.zero_grad()
                outputs = model(new_inputs)
                # print(outputs.device)
                # print(new_labels.device)

                # Convert the per-frame outputs into per-video outputs here.
                video_outputs = frame_out_to_video_out(outputs, bs, args.VIDEO_NUM_SUBSAMPLED) 

                loss = criterion(video_outputs, labels)
                loss.backward()
                optimizer.step()

                
                # Predicted class index for each video.
                preds = torch.squeeze(video_outputs.max(dim=1)[1])
                # print(video_outputs.shape)
                # print(preds.shape)

                # acc = (preds == labels).float().mean().item()
                # acc_list.append(acc)
                # pbar_batch.set_postfix(OrderedDict(loss=loss.item(),acc=acc))

                # Weight both meters by the number of videos in the batch.
                train_loss.update(loss, bs)
                train_acc.update(top1(video_outputs, labels), bs)

                # First loss/top1 pair: running averages over the epoch so far;
                # second pair: values of the current batch.
                pbar_batch.set_postfix_str(
                    ' | loss={:6.04f} , top1={:6.04f}'
                    ' | loss={:6.04f} , top1={:6.04f}'
                    ''.format(
                    train_loss.avg, train_acc.avg,
                    train_loss.val, train_acc.val,
                ))

            acc_list.append(train_acc.avg)
            loss_list.append(train_loss.avg)
        # Shows the mean of the per-epoch averages over all epochs so far,
        # not the values of the latest epoch alone.
        pbar_epoch.set_postfix(OrderedDict(
            acc=sum(acc_list)/len(acc_list),
            loss=sum(loss_list)/len(loss_list)
        ))



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=149.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=149.0), HTML(value='')))