In [1]:
from comet_ml import Experiment
import torch
model = torch.hub.load('facebookresearch/pytorchvideo', 'x3d_m', pretrained=True)

Using cache found in /home/omi/.cache/torch/hub/facebookresearch_pytorchvideo_main


In [2]:
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import DistributedSampler, RandomSampler, SequentialSampler


from torchvision import transforms


from pytorchvideo.models import x3d
from pytorchvideo.data import (
    Ucf101, 
    RandomClipSampler, 
    UniformClipSampler, 
    Kinetics,
    SSv2
)


from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
)


import torchinfo

import numpy as np
from tqdm.notebook import tqdm
import itertools
import os
import pickle
import random
import matplotlib.pyplot as plt
import shutil
from sklearn import mixture
from sklearn import svm
from sklearn import decomposition
import os.path as osp
import argparse

In [3]:
class Args:
    def __init__(self):
        self.NUM_EPOCH = 5
        self.FRAMES_PER_CLIP = 16
        self.STEP_BETWEEN_CLIPS = 16
        self.BATCH_SIZE = 32
        self.NUM_WORKERS = 32
        # self.CLIP_DURATION = 16 / 25
        # (num_frames * sampling_rate)/fps
        self.kinetics_clip_duration = (8 * 8) / 30
        self.ucf101_clip_duration = 16 / 25
        self.VIDEO_NUM_SUBSAMPLED = 16
        self.UCF101_NUM_CLASSES = 101
        self.KINETIC400_NUM_CLASSES = 400

In [4]:
class ReconstructNet(nn.Module):
    def __init__(self):
        super().__init__()
        model = torch.hub.load(
        'facebookresearch/pytorchvideo', "x3d_m", pretrained=True)
        self.model_num_features = model.blocks[5].proj.in_features
        self.num_class = 101

        self.net_bottom = nn.Sequential(
            model.blocks[0],
            model.blocks[1],
            model.blocks[2],
            model.blocks[3],
        )

        self.blocks4 = model.blocks[4]

        self.net_top = nn.Sequential(
            model.blocks[5].pool,
            model.blocks[5].dropout
        )

        # self.linear = model.blocks[5].proj
        self.linear = nn.Linear(self.model_num_features, self.num_class)

        # 学習させるパラメータ名
        self.update_param_names = ["linear.weight", "linear.bias"]
        # 学習させるパラメータ以外は勾配計算をなくし、変化しないように設定
        for name, param in self.named_parameters():
            if name in self.update_param_names:
                param.requires_grad = True
                # print(name)
            else:
                param.requires_grad = False
               

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x = x.permute(0,2,1,3,4)
        x = self.net_bottom(x)
        x = self.blocks4(x)
        x = self.net_top(x)
        x = x.permute(0,2,3,4,1)
        x = self.linear(x)
        x = x.view(-1,self.num_class)
        return x




In [5]:
class Adapter2D(nn.Module):
    expansion = 1

    def __init__(self, dim):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(dim)
        self.conv1 = nn.Conv2d(dim, dim, 1)       
        self.bn2 = nn.BatchNorm2d(dim)
    
    def video_to_frame(self, inputs):
        batch_size = inputs.size(0)
        num_frame = inputs.size(2)

        inputs = inputs.permute(0, 2, 1, 3, 4)
        outputs = inputs.reshape(batch_size * num_frame,
                                 inputs.size(2),
                                 inputs.size(3),
                                 inputs.size(4))

        return outputs

    def frame_to_video(
            self, input: torch.Tensor, batch_size, num_frame, channel, height, width) -> torch.Tensor:
        output = input.reshape(batch_size, num_frame, channel, height, width)
        output = output.permute(0,2,1,3,4)
        return output


    def forward(self, x):
        batch_size = x.size(0)
        num_frame = x.size(2)
        channel= x.size(1)
        height = x.size(3)

        # print(x.shape)
        x = self.video_to_frame(x)
        # print(x.shape)
        
        residual = x
        out = self.bn1(x)
        out = self.conv1(out)
        out += residual
        out = self.bn2(out)

        out = self.frame_to_video(out, batch_size, num_frame, channel, height, height)
        # print(out.shape)

        return out

In [6]:
class ReconstructNet2D(nn.Module):
    def __init__(self):
        super().__init__()
        model = torch.hub.load(
        'facebookresearch/pytorchvideo', "x3d_m", pretrained=True)
        self.model_num_features = model.blocks[5].proj.in_features
        self.num_class = 101

        self.net_bottom = nn.Sequential(
            model.blocks[0],
            model.blocks[1],
            model.blocks[2],
            model.blocks[3],
        )
        
        self.adapter0 = Adapter2D(96)

        self.blocks4 = model.blocks[4]

        self.adapter1 = Adapter2D(192)

        self.net_top = nn.Sequential(
            model.blocks[5].pool,
            model.blocks[5].dropout
        )

        # self.linear = model.blocks[5].proj
        self.linear = nn.Linear(self.model_num_features, self.num_class)

        # 学習させるパラメータ名
        self.update_param_names = ["adapter0.bn1.weight", "adapter0.bn1.bias",
                                   "adapter0.conv1.weight", "adapter0.conv1.bias",
                                   "adapter0.bn2.weight", "adapter0.bn2.bias",
                                   "adapter1.bn1.weight", "adapter1.bn1.bias",
                                   "adapter1.conv1.weight", "adapter1.conv1.bias",
                                   "adapter1.bn2.weight", "adapter1.bn2.bias",
                                   "linear.weight", "linear.bias"]
        # 学習させるパラメータ以外は勾配計算をなくし、変化しないように設定
        for name, param in self.named_parameters():
            if name in self.update_param_names:
                param.requires_grad = True
                # print(name)
            else:
                param.requires_grad = False
               

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x = x.permute(0,2,1,3,4)
        x = self.net_bottom(x)
        x = self.adapter0(x)
        x = self.blocks4(x)
        x = self.adapter1(x)
        x = self.net_top(x)
        x = x.permute(0,2,3,4,1)
        x = self.linear(x)
        x = x.view(-1,self.num_class)
        return x




In [7]:
# adapter = Adapter2D(192)

# torchinfo.summary(
#     adapter,
#     input_size=(1,192,16,7,7),
#     depth=4,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

In [8]:
# model_new = ReconstructNet()

# torchinfo.summary(
#     model_new,
#     input_size=(1,3,16,224,224),
#     depth=6,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

# torchinfo.summary(
#     model_new.net_bottom[4].res_blocks[0],
#     input_size=(1,96,16,14,14),
#     depth=4,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

# torchinfo.summary(
#     model_new.net_bottom[4].res_blocks[6],
#     input_size=(1,192,16,7,7),
#     depth=4,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

In [9]:
def get_kinetics(subset):
    """
    Kinetics400のデータセットを取得

    Args:
        subset (str): "train" or "val" or "test"

    Returns:
        pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset: 取得したデータセット
    """
    args = Args()
    train_transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(args.VIDEO_NUM_SUBSAMPLED),
                transforms.Lambda(lambda x: x / 255.),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                RandomShortSideScale(min_size=256, max_size=320,),
                RandomCrop(224),
                RandomHorizontalFlip(),
            ]),
        ),
        RemoveKey("audio"),
    ])

    val_transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(args.VIDEO_NUM_SUBSAMPLED),
                transforms.Lambda(lambda x: x / 255.),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                ShortSideScale(256),
                CenterCrop(224),
            ]),
        ),
        RemoveKey("audio"),
    ])

    transform = val_transform if subset == "val" else train_transform

    root_kinetics = '/mnt/dataset/Kinetics400/'

    if subset == "test":
        dataset = Kinetics(
            data_path=root_kinetics + "test_list.txt",
            video_path_prefix=root_kinetics + 'test/',
            clip_sampler=RandomClipSampler(
                clip_duration=args.kinetics_clip_duration),
            video_sampler=RandomSampler,
            decode_audio=False,
            transform=transform,
        )
        return dataset
    else:
        dataset = Kinetics(
            data_path=root_kinetics + subset,
            video_path_prefix=root_kinetics + subset,
            clip_sampler=RandomClipSampler(
                clip_duration=args.kinetics_clip_duration),
            video_sampler=RandomSampler,
            decode_audio=False,
            transform=transform,
        )
        return dataset

    return False


In [10]:
def get_ucf101(subset):
    """
    ucf101のデータセットを取得

    Args:
        subset (str): "train" or "test"

    Returns:
        pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset: 取得したデータセット
    """
    subset_root_Ucf101 = 'ucfTrainTestlist/trainlist01.txt' if subset == "train" else 'ucfTrainTestlist/testlist.txt'
    # if subset == "test":
    #     subset_root_Ucf101 = 'ucfTrainTestlist/testlist.txt'

    args = Args()
    train_transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(args.VIDEO_NUM_SUBSAMPLED),
                transforms.Lambda(lambda x: x / 255.),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                RandomShortSideScale(min_size=256, max_size=320,),
                RandomCrop(224),
                RandomHorizontalFlip(),
            ]),
        ),
        ApplyTransformToKey(
            key="label",
            transform=transforms.Lambda(lambda x: x - 1),
        ),
        RemoveKey("audio"),
    ])

    val_transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose([
                UniformTemporalSubsample(args.VIDEO_NUM_SUBSAMPLED),
                transforms.Lambda(lambda x: x / 255.),
                Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
                ShortSideScale(256),
                CenterCrop(224),
            ]),
        ),
        ApplyTransformToKey(
            key="label",
            transform=transforms.Lambda(lambda x: x - 1),
        ),
        RemoveKey("audio"),
    ])

    transform = train_transform if subset == "train" else val_transform

    root_ucf101 = '/mnt/dataset/UCF101/'
    # root_ucf101 = '/mnt/NAS-TVS872XT/dataset/UCF101/'

    dataset = Ucf101(
        data_path=root_ucf101 + subset_root_Ucf101,
        video_path_prefix=root_ucf101 + 'video/',
        clip_sampler=RandomClipSampler(
            clip_duration=args.ucf101_clip_duration),
        video_sampler=RandomSampler,
        decode_audio=False,
        transform=transform,
    )

    return dataset


In [11]:
class LimitDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        self.dataset_iter = itertools.chain.from_iterable(
            itertools.repeat(iter(dataset), 2)
        )

    def __getitem__(self, index):
        return next(self.dataset_iter)

    def __len__(self):
        return self.dataset.num_videos

In [12]:
def make_loader(dataset):
    """
    データローダーを作成

    Args:
        dataset (pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset): get_datasetメソッドで取得したdataset

    Returns:
        torch.utils.data.DataLoader: 取得したデータローダー
    """
    args = Args()
    loader = DataLoader(LimitDataset(dataset),
                        batch_size=args.BATCH_SIZE,
                        drop_last=True,
                        num_workers=args.NUM_WORKERS,
                        shuffle=True)
    return loader

In [13]:
class AverageMeter(object):
    """
    Computes and stores the average and current value
    Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    https://github.com/machine-perception-robotics-group/attention_branch_network/blob/ced1d97303792ac6d56442571d71bb0572b3efd8/utils/misc.py#L59
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        if isinstance(val, torch.Tensor):
            val = val.item()
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def top1(outputs, targets):
    batch_size = outputs.size(0)
    _, predicted = outputs.max(1)
    return predicted.eq(targets).sum().item() / batch_size



In [14]:
import os.path as osp
import shutil

def save_checkpoint(state, is_best, filename, best_model_file, dir_data_name):
    file_path = osp.join(dir_data_name, filename)
    if not os.path.exists(dir_data_name):
        os.makedirs(dir_data_name)
    torch.save(state.state_dict(), file_path)
    if is_best:
        shutil.copyfile(file_path, osp.join(dir_data_name, best_model_file))

In [15]:
# save_checkpoint()メソッドのテスト

# test_net = ReconstructNet()
# test_net.to("cuda")
# print(test_net)
# save_checkpoint(test_net, True)

In [16]:
def train_head():
    args = Args()
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

    train_dataset = get_ucf101("train")
    val_dataset = get_ucf101("val")
    train_loader = make_loader(train_dataset)
    val_loader = make_loader(val_dataset)

    model = ReconstructNet()
    model = model.to(device)
    # model = torch.nn.DataParallel(model)
    torch.backends.cudnn.benchmark = True

    lr = 0.001
    weight_decay = 5e-4
    # optimizer = torch.optim.SGD(
    #     model.parameters(),
    #     lr=lr,
    #     momentum=0.9,
    #     weight_decay=5e-4)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=lr,
        betas=(0.9, 0.999),
        weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    hyper_params = {
        "Dataset": "UCF101",
        "epoch": args.NUM_EPOCH,
        "batch_size": args.BATCH_SIZE,
        "num_frame": args.VIDEO_NUM_SUBSAMPLED,
        "optimizer": "Adam(0.9, 0.999)",
        "learning late": lr,
        "weight decay": weight_decay,
        "mode": "train only head",
        # "Adapter": "adp:0, adp:1",
    }

    experiment = Experiment(
        api_key="TawRAwNJiQjPaSMvBAwk4L4pF",
        project_name="feeature-extract",
        workspace="kazukiomi",
    )

    experiment.add_tag('pytorch')
    experiment.log_parameters(hyper_params)

    num_epochs = args.NUM_EPOCH

    step = 0
    best_acc = 0

    with tqdm(range(num_epochs)) as pbar_epoch:
        for epoch in pbar_epoch:
            pbar_epoch.set_description("[Epoch %d]" % (epoch))

            """Training mode"""

            train_loss = AverageMeter()
            train_acc = AverageMeter()

            with tqdm(enumerate(train_loader),
                      total=len(train_loader),
                      leave=True) as pbar_train_batch:

                model.train()

                for batch_idx, batch in pbar_train_batch:
                    pbar_train_batch.set_description(
                        "[Epoch :{}]".format(epoch))

                    inputs = batch['video'].to(device)
                    labels = batch['label'].to(device)

                    bs = inputs.size(0)

                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    train_loss.update(loss, bs)
                    train_acc.update(top1(outputs, labels), bs)

                    pbar_train_batch.set_postfix_str(
                        ' | loss_avg={:6.04f} , top1_avg={:6.04f}'
                        ' | batch_loss={:6.04f} , batch_top1={:6.04f}'
                        ''.format(
                            train_loss.avg, train_acc.avg,
                            train_loss.val, train_acc.val,
                        ))

                    experiment.log_metric(
                        "batch_accuracy", train_acc.val, step=step)
                    experiment.log_metric(
                        "batch_loss", train_loss.val, step=step)
                    step += 1

            """Val mode"""
            model.eval()
            val_loss = AverageMeter()
            val_acc = AverageMeter()

            with torch.no_grad():
                for batch_idx, val_batch in enumerate(val_loader):
                    inputs = val_batch['video'].to(device)
                    labels = val_batch['label'].to(device)

                    bs = inputs.size(0)

                    val_outputs = model(inputs)
                    loss = criterion(val_outputs, labels)

                    val_loss.update(loss, bs)
                    val_acc.update(top1(val_outputs, labels), bs)
            """Finish Val mode"""

            pbar_epoch.set_postfix_str(
                ' train_loss={:6.04f} , val_loss={:6.04f}, train_acc={:6.04f}, val_acc={:6.04f}'
                ''.format(
                    train_loss.avg,
                    val_loss.avg,
                    train_acc.avg,
                    val_acc.avg)
            )

            experiment.log_metric("train_accuracy",
                                  train_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("train_loss",
                                  train_loss.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_accuracy",
                                  val_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_loss",
                                  val_loss.avg,
                                  step=epoch + 1)

    experiment.end()



In [17]:
# train_head()

In [18]:
def train_2d_adapter():
    args = Args()
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

    train_dataset = get_ucf101("train")
    val_dataset = get_ucf101("val")
    train_loader = make_loader(train_dataset)
    val_loader = make_loader(val_dataset)

    model = ReconstructNet2D()
    model = model.to(device)
    # model = torch.nn.DataParallel(model)
    torch.backends.cudnn.benchmark = True

    lr = 0.001
    weight_decay = 5e-4
    # optimizer = torch.optim.SGD(
    #     model.parameters(),
    #     lr=lr,
    #     momentum=0.9,
    #     weight_decay=5e-4)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=lr,
        betas=(0.9, 0.999),
        weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    hyper_params = {
        "Dataset": "UCF101",
        "epoch": args.NUM_EPOCH,
        "batch_size": args.BATCH_SIZE,
        "num_frame": args.VIDEO_NUM_SUBSAMPLED,
        "optimizer": "Adam(0.9, 0.999)",
        "learning late": lr,
        "weight decay": weight_decay,
        "mode": "train 2d adapter",
        "Adapter": "adp:0, adp:1",
    }

    experiment = Experiment(
        api_key="TawRAwNJiQjPaSMvBAwk4L4pF",
        project_name="feeature-extract",
        workspace="kazukiomi",
    )

    experiment.add_tag('pytorch')
    experiment.log_parameters(hyper_params)

    num_epochs = args.NUM_EPOCH

    step = 0
    best_acc = 0

    with tqdm(range(num_epochs)) as pbar_epoch:
        for epoch in pbar_epoch:
            pbar_epoch.set_description("[Epoch %d]" % (epoch))

            """Training mode"""

            train_loss = AverageMeter()
            train_acc = AverageMeter()

            with tqdm(enumerate(train_loader),
                      total=len(train_loader),
                      leave=True) as pbar_train_batch:

                model.train()

                for batch_idx, batch in pbar_train_batch:
                    pbar_train_batch.set_description(
                        "[Epoch :{}]".format(epoch))

                    inputs = batch['video'].to(device)
                    labels = batch['label'].to(device)

                    bs = inputs.size(0)

                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    train_loss.update(loss, bs)
                    train_acc.update(top1(outputs, labels), bs)

                    pbar_train_batch.set_postfix_str(
                        ' | loss_avg={:6.04f} , top1_avg={:6.04f}'
                        ' | batch_loss={:6.04f} , batch_top1={:6.04f}'
                        ''.format(
                            train_loss.avg, train_acc.avg,
                            train_loss.val, train_acc.val,
                        ))

                    experiment.log_metric(
                        "batch_accuracy", train_acc.val, step=step)
                    experiment.log_metric(
                        "batch_loss", train_loss.val, step=step)
                    step += 1

            """Val mode"""
            model.eval()
            val_loss = AverageMeter()
            val_acc = AverageMeter()

            with torch.no_grad():
                for batch_idx, val_batch in enumerate(val_loader):
                    inputs = val_batch['video'].to(device)
                    labels = val_batch['label'].to(device)

                    bs = inputs.size(0)

                    val_outputs = model(inputs)
                    loss = criterion(val_outputs, labels)

                    val_loss.update(loss, bs)
                    val_acc.update(top1(val_outputs, labels), bs)
            """Finish Val mode"""

            pbar_epoch.set_postfix_str(
                ' train_loss={:6.04f} , val_loss={:6.04f}, train_acc={:6.04f}, val_acc={:6.04f}'
                ''.format(
                    train_loss.avg,
                    val_loss.avg,
                    train_acc.avg,
                    val_acc.avg)
            )

            experiment.log_metric("train_accuracy",
                                  train_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("train_loss",
                                  train_loss.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_accuracy",
                                  val_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_loss",
                                  val_loss.avg,
                                  step=epoch + 1)
    
    experiment.end()



In [19]:
# train_2d_adapter()

## temporal adapter

In [20]:
class TemporalAdapter(nn.Module):

    def __init__(self, channel_dim, frame_dim):
        super().__init__()
        self.bn1 = nn.BatchNorm3d(channel_dim)
        self.conv1 = nn.Conv2d(frame_dim, frame_dim, 1)       
        self.bn2 = nn.BatchNorm3d(channel_dim)
    
    def swap_channel_frame(self, inputs):
        batch_size = inputs.size(0)
        channel = inputs.size(1)
        num_frame = inputs.size(2)

        # inputs = inputs.permute(0, 2, 1, 3, 4)
        outputs = inputs.reshape(batch_size * channel,
                                 num_frame,
                                 inputs.size(3),
                                 inputs.size(4))

        return outputs

    def frame_to_video(
            self, input: torch.Tensor, batch_size, num_frame, channel, height, width) -> torch.Tensor:
        output = input.reshape(batch_size, channel, num_frame, height, width)
        # output = output.permute(0,2,1,3,4)
        return output


    def forward(self, x):
        batch_size = x.size(0)
        channel= x.size(1)
        num_frame = x.size(2)
        height = x.size(3)
        width = x.size(4)
        
        residual = x

        out = self.bn1(x)
        out = self.swap_channel_frame(out)
        out = self.conv1(out)
        out = self.frame_to_video(out, batch_size, num_frame, channel, height, width)
        out += residual
        out = self.bn2(out)

        return out

In [21]:
class ReconstructNetTemporal(nn.Module):
    def __init__(self):
        super().__init__()
        model = torch.hub.load(
        'facebookresearch/pytorchvideo', "x3d_m", pretrained=True)
        self.model_num_features = model.blocks[5].proj.in_features
        self.num_class = 101
        self.num_frame = 16

        self.net_bottom = nn.Sequential(
            model.blocks[0],
            model.blocks[1],
            model.blocks[2],
            model.blocks[3],
        )
        
        self.adapter0 = TemporalAdapter(96, self.num_frame)

        self.blocks4 = model.blocks[4]

        self.adapter1 = TemporalAdapter(192, self.num_frame)

        self.net_top = nn.Sequential(
            model.blocks[5].pool,
            model.blocks[5].dropout
        )

        # self.linear = model.blocks[5].proj
        self.linear = nn.Linear(self.model_num_features, self.num_class)

        # 学習させるパラメータ名
        self.update_param_names = ["adapter0.bn1.weight", "adapter0.bn1.bias",
                                   "adapter0.conv1.weight", "adapter0.conv1.bias",
                                   "adapter0.bn2.weight", "adapter0.bn2.bias",
                                   "adapter1.bn1.weight", "adapter1.bn1.bias",
                                   "adapter1.conv1.weight", "adapter1.conv1.bias",
                                   "adapter1.bn2.weight", "adapter1.bn2.bias",
                                   "linear.weight", "linear.bias"]
        # self.update_param_names = ["adapter1.bn1.weight", "adapter1.bn1.bias",
        #                            "adapter1.conv1.weight", "adapter1.conv1.bias",
        #                            "adapter1.bn2.weight", "adapter1.bn2.bias",
        #                            "linear.weight", "linear.bias"]
        # 学習させるパラメータ以外は勾配計算をなくし、変化しないように設定
        for name, param in self.named_parameters():
            if name in self.update_param_names:
                param.requires_grad = True
                # print(name)
            else:
                param.requires_grad = False
               

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x = x.permute(0,2,1,3,4)
        x = self.net_bottom(x)
        x = self.adapter0(x)
        x = self.blocks4(x)
        x = self.adapter1(x)
        x = self.net_top(x)
        x = x.permute(0,2,3,4,1)
        x = self.linear(x)
        x = x.view(-1,self.num_class)
        return x




In [22]:
# model_new = ReconstructNetTemporal()

# torchinfo.summary(
#     model_new,
#     input_size=(1,3,16,224,224),
#     depth=2,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

In [23]:
def train_temporal_adapter():
    args = Args()
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

    train_dataset = get_ucf101("train")
    val_dataset = get_ucf101("val")
    train_loader = make_loader(train_dataset)
    val_loader = make_loader(val_dataset)

    model = ReconstructNetTemporal()
    model = model.to(device)
    # model = torch.nn.DataParallel(model)
    torch.backends.cudnn.benchmark = True

    lr = 0.001
    weight_decay = 5e-4
    # optimizer = torch.optim.SGD(
    #     model.parameters(),
    #     lr=lr,
    #     momentum=0.9,
    #     weight_decay=5e-4)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=lr,
        betas=(0.9, 0.999),
        weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    hyper_params = {
        "Dataset": "UCF101",
        "epoch": args.NUM_EPOCH,
        "batch_size": args.BATCH_SIZE,
        "num_frame": args.VIDEO_NUM_SUBSAMPLED,
        "optimizer": "Adam(0.9, 0.999)",
        "learning late": lr,
        "weight decay": weight_decay,
        "mode": "train temporal adapter",
        "Adapter": "adp:0, adp:1",
    }

    experiment = Experiment(
        api_key="TawRAwNJiQjPaSMvBAwk4L4pF",
        project_name="feeature-extract",
        workspace="kazukiomi",
    )

    experiment.add_tag('pytorch')
    experiment.log_parameters(hyper_params)

    num_epochs = args.NUM_EPOCH

    step = 0
    best_acc = 0

    with tqdm(range(num_epochs)) as pbar_epoch:
        for epoch in pbar_epoch:
            pbar_epoch.set_description("[Epoch %d]" % (epoch))

            """Training mode"""

            train_loss = AverageMeter()
            train_acc = AverageMeter()

            with tqdm(enumerate(train_loader),
                      total=len(train_loader),
                      leave=True) as pbar_train_batch:

                model.train()

                for batch_idx, batch in pbar_train_batch:
                    pbar_train_batch.set_description(
                        "[Epoch :{}]".format(epoch))

                    inputs = batch['video'].to(device)
                    labels = batch['label'].to(device)

                    bs = inputs.size(0)

                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    train_loss.update(loss, bs)
                    train_acc.update(top1(outputs, labels), bs)

                    pbar_train_batch.set_postfix_str(
                        ' | loss_avg={:6.04f} , top1_avg={:6.04f}'
                        ' | batch_loss={:6.04f} , batch_top1={:6.04f}'
                        ''.format(
                            train_loss.avg, train_acc.avg,
                            train_loss.val, train_acc.val,
                        ))

                    experiment.log_metric(
                        "batch_accuracy", train_acc.val, step=step)
                    experiment.log_metric(
                        "batch_loss", train_loss.val, step=step)
                    step += 1

            """Val mode"""
            model.eval()
            val_loss = AverageMeter()
            val_acc = AverageMeter()

            with torch.no_grad():
                for batch_idx, val_batch in enumerate(val_loader):
                    inputs = val_batch['video'].to(device)
                    labels = val_batch['label'].to(device)

                    bs = inputs.size(0)

                    val_outputs = model(inputs)
                    loss = criterion(val_outputs, labels)

                    val_loss.update(loss, bs)
                    val_acc.update(top1(val_outputs, labels), bs)
            """Finish Val mode"""

            pbar_epoch.set_postfix_str(
                ' train_loss={:6.04f} , val_loss={:6.04f}, train_acc={:6.04f}, val_acc={:6.04f}'
                ''.format(
                    train_loss.avg,
                    val_loss.avg,
                    train_acc.avg,
                    val_acc.avg)
            )

            experiment.log_metric("train_accuracy",
                                  train_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("train_loss",
                                  train_loss.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_accuracy",
                                  val_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_loss",
                                  val_loss.avg,
                                  step=epoch + 1)
    
    experiment.end()



In [24]:
# train_temporal_adapter()

## space temporal adapter

In [25]:
class SpaceTemporalAdapter(nn.Module):
    def __init__(self, channel_dim, frame_dim):
        super().__init__()
        self.bn1 = nn.BatchNorm3d(channel_dim)
        self.conv1 = nn.Conv3d(channel_dim, channel_dim*frame_dim, (frame_dim, 1, 1))
        self.bn2 = nn.BatchNorm3d(channel_dim)
        self.channel_dim = channel_dim
        self.frame_dim = frame_dim

    def reshape_dim(self, inputs):
        batch_size = inputs.size(0)
        output = inputs.reshape(batch_size, self.channel_dim, self.frame_dim, inputs.size(3), inputs.size(4))
        return output
    
    def forward(self, x):
        residual = x

        out = self.bn1(x)
        out = self.conv1(out)
        out = self.reshape_dim(out)
        out += residual
        out = self.bn2(out)

        return out

In [26]:
class ReconstructNetSpaceTemporal(nn.Module):
    def __init__(self):
        super().__init__()
        model = torch.hub.load(
        'facebookresearch/pytorchvideo', "x3d_m", pretrained=True)
        self.model_num_features = model.blocks[5].proj.in_features
        self.num_class = 101
        self.num_frame = 16

        self.net_bottom = nn.Sequential(
            model.blocks[0],
            model.blocks[1],
            model.blocks[2],
            model.blocks[3],
        )
        
        self.adapter0 = SpaceTemporalAdapter(96, self.num_frame)

        self.blocks4 = model.blocks[4]

        self.adapter1 = SpaceTemporalAdapter(192, self.num_frame)

        self.net_top = nn.Sequential(
            model.blocks[5].pool,
            model.blocks[5].dropout
        )

        # self.linear = model.blocks[5].proj
        self.linear = nn.Linear(self.model_num_features, self.num_class)

        # 学習させるパラメータ名
        self.update_param_names = ["adapter0.bn1.weight", "adapter0.bn1.bias",
                                   "adapter0.conv1.weight", "adapter0.conv1.bias",
                                   "adapter0.bn2.weight", "adapter0.bn2.bias",
                                   "adapter1.bn1.weight", "adapter1.bn1.bias",
                                   "adapter1.conv1.weight", "adapter1.conv1.bias",
                                   "adapter1.bn2.weight", "adapter1.bn2.bias",
                                   "linear.weight", "linear.bias"]
        # self.update_param_names = ["adapter1.bn1.weight", "adapter1.bn1.bias",
        #                            "adapter1.conv1.weight", "adapter1.conv1.bias",
        #                            "adapter1.bn2.weight", "adapter1.bn2.bias",
        #                            "linear.weight", "linear.bias"]
        # 学習させるパラメータ以外は勾配計算をなくし、変化しないように設定
        for name, param in self.named_parameters():
            if name in self.update_param_names:
                param.requires_grad = True
                # print(name)
            else:
                param.requires_grad = False
               

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x = x.permute(0,2,1,3,4)
        x = self.net_bottom(x)
        x = self.adapter0(x)
        x = self.blocks4(x)
        x = self.adapter1(x)
        x = self.net_top(x)
        x = x.permute(0,2,3,4,1)
        x = self.linear(x)
        x = x.view(-1,self.num_class)
        return x




In [27]:
# model_new = ReconstructNetSpaceTemporal()

# torchinfo.summary(
#     model_new,
#     input_size=(1,3,16,224,224),
#     depth=2,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

# model_new = SpaceTemporalAdapter(96, 16)

# torchinfo.summary(
#     model_new,
#     input_size=(1,96,16,7,7),
#     depth=2,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

In [28]:
def train_spacce_temporal_adapter():
    args = Args()
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

    train_dataset = get_ucf101("train")
    val_dataset = get_ucf101("val")
    train_loader = make_loader(train_dataset)
    val_loader = make_loader(val_dataset)

    model = ReconstructNetSpaceTemporal()
    model = model.to(device)
    # model = torch.nn.DataParallel(model)
    torch.backends.cudnn.benchmark = True

    lr = 0.001
    weight_decay = 5e-4
    # optimizer = torch.optim.SGD(
    #     model.parameters(),
    #     lr=lr,
    #     momentum=0.9,
    #     weight_decay=5e-4)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=lr,
        betas=(0.9, 0.999),
        weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    hyper_params = {
        "Dataset": "UCF101",
        "epoch": args.NUM_EPOCH,
        "batch_size": args.BATCH_SIZE,
        "num_frame": args.VIDEO_NUM_SUBSAMPLED,
        "optimizer": "Adam(0.9, 0.999)",
        "learning late": lr,
        "weight decay": weight_decay,
        "mode": "train space-temporal adapter",
        "Adapter": "adp:0, adp:1",
    }

    experiment = Experiment(
        api_key="TawRAwNJiQjPaSMvBAwk4L4pF",
        project_name="feeature-extract",
        workspace="kazukiomi",
    )

    experiment.add_tag('pytorch')
    experiment.log_parameters(hyper_params)

    num_epochs = args.NUM_EPOCH

    step = 0
    best_acc = 0

    with tqdm(range(num_epochs)) as pbar_epoch:
        for epoch in pbar_epoch:
            pbar_epoch.set_description("[Epoch %d]" % (epoch))

            """Training mode"""

            train_loss = AverageMeter()
            train_acc = AverageMeter()

            with tqdm(enumerate(train_loader),
                      total=len(train_loader),
                      leave=True) as pbar_train_batch:

                model.train()

                for batch_idx, batch in pbar_train_batch:
                    pbar_train_batch.set_description(
                        "[Epoch :{}]".format(epoch))

                    inputs = batch['video'].to(device)
                    labels = batch['label'].to(device)

                    bs = inputs.size(0)

                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    train_loss.update(loss, bs)
                    train_acc.update(top1(outputs, labels), bs)

                    pbar_train_batch.set_postfix_str(
                        ' | loss_avg={:6.04f} , top1_avg={:6.04f}'
                        ' | batch_loss={:6.04f} , batch_top1={:6.04f}'
                        ''.format(
                            train_loss.avg, train_acc.avg,
                            train_loss.val, train_acc.val,
                        ))

                    experiment.log_metric(
                        "batch_accuracy", train_acc.val, step=step)
                    experiment.log_metric(
                        "batch_loss", train_loss.val, step=step)
                    step += 1

            """Val mode"""
            model.eval()
            val_loss = AverageMeter()
            val_acc = AverageMeter()

            with torch.no_grad():
                for batch_idx, val_batch in enumerate(val_loader):
                    inputs = val_batch['video'].to(device)
                    labels = val_batch['label'].to(device)

                    bs = inputs.size(0)

                    val_outputs = model(inputs)
                    loss = criterion(val_outputs, labels)

                    val_loss.update(loss, bs)
                    val_acc.update(top1(val_outputs, labels), bs)
            """Finish Val mode"""

            pbar_epoch.set_postfix_str(
                ' train_loss={:6.04f} , val_loss={:6.04f}, train_acc={:6.04f}, val_acc={:6.04f}'
                ''.format(
                    train_loss.avg,
                    val_loss.avg,
                    train_acc.avg,
                    val_acc.avg)
            )

            experiment.log_metric("train_accuracy",
                                  train_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("train_loss",
                                  train_loss.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_accuracy",
                                  val_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_loss",
                                  val_loss.avg,
                                  step=epoch + 1)
    
    experiment.end()



In [29]:
# train_spacce_temporal_adapter()

## efficient space-temporal adapter

In [30]:
class EfficientSpaceTemporalAdapter(nn.Module):
    def __init__(self, channel_dim, frame_dim):
        super().__init__()
        self.video2frame_adapter = Adapter2D(channel_dim)
        self.temporal_adapter = TemporalAdapter(channel_dim, frame_dim)
        self.relu = nn.ReLU()


    def forward(self, x):
        out = self.video2frame_adapter(x)
        out = self.relu(out)
        out = self.temporal_adapter(x)
        return out

In [31]:
class ReconstructNetEfficientSpaceTemporal(nn.Module):
    def __init__(self):
        super().__init__()
        model = torch.hub.load(
        'facebookresearch/pytorchvideo', "x3d_m", pretrained=True)
        self.model_num_features = model.blocks[5].proj.in_features
        self.num_class = 101
        self.num_frame = 16

        self.net_bottom = nn.Sequential(
            model.blocks[0],
            model.blocks[1],
            model.blocks[2],
            model.blocks[3],
        )
        
        self.adapter0 = EfficientSpaceTemporalAdapter(96, self.num_frame)

        self.blocks4 = model.blocks[4]

        self.adapter1 = EfficientSpaceTemporalAdapter(192, self.num_frame)

        self.net_top = nn.Sequential(
            model.blocks[5].pool,
            model.blocks[5].dropout
        )

        # self.linear = model.blocks[5].proj
        self.linear = nn.Linear(self.model_num_features, self.num_class)

        # 学習させるパラメータ名
        self.update_param_names = ["adapter0.bn1.weight", "adapter0.bn1.bias",
                                   "adapter0.conv1.weight", "adapter0.conv1.bias",
                                   "adapter0.bn2.weight", "adapter0.bn2.bias",
                                   "adapter1.bn1.weight", "adapter1.bn1.bias",
                                   "adapter1.conv1.weight", "adapter1.conv1.bias",
                                   "adapter1.bn2.weight", "adapter1.bn2.bias",
                                   "linear.weight", "linear.bias"]
        # self.update_param_names = ["adapter1.bn1.weight", "adapter1.bn1.bias",
        #                            "adapter1.conv1.weight", "adapter1.conv1.bias",
        #                            "adapter1.bn2.weight", "adapter1.bn2.bias",
        #                            "linear.weight", "linear.bias"]
        # 学習させるパラメータ以外は勾配計算をなくし、変化しないように設定
        for name, param in self.named_parameters():
            if name in self.update_param_names:
                param.requires_grad = True
                # print(name)
            else:
                param.requires_grad = False
               

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x = x.permute(0,2,1,3,4)
        x = self.net_bottom(x)
        x = self.adapter0(x)
        x = self.blocks4(x)
        x = self.adapter1(x)
        x = self.net_top(x)
        x = x.permute(0,2,3,4,1)
        x = self.linear(x)
        x = x.view(-1,self.num_class)
        return x




In [32]:
# model_new = ReconstructNetEfficientSpaceTemporal()

# torchinfo.summary(
#     model_new,
#     input_size=(1,3,16,224,224),
#     depth=2,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

# model_new = EfficientSpaceTemporalAdapter(96, 16)

# torchinfo.summary(
#     model_new,
#     input_size=(1,96,16,7,7),
#     depth=2,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

In [33]:
def train_efficient_spacce_temporal_adapter():
    args = Args()
    device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

    train_dataset = get_ucf101("train")
    val_dataset = get_ucf101("val")
    train_loader = make_loader(train_dataset)
    val_loader = make_loader(val_dataset)

    model = ReconstructNetEfficientSpaceTemporal()
    model = model.to(device)
    # model = torch.nn.DataParallel(model)
    torch.backends.cudnn.benchmark = True

    lr = 0.001
    weight_decay = 5e-4
    # optimizer = torch.optim.SGD(
    #     model.parameters(),
    #     lr=lr,
    #     momentum=0.9,
    #     weight_decay=5e-4)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=lr,
        betas=(0.9, 0.999),
        weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    hyper_params = {
        "Dataset": "UCF101",
        "epoch": args.NUM_EPOCH,
        "batch_size": args.BATCH_SIZE,
        "num_frame": args.VIDEO_NUM_SUBSAMPLED,
        "optimizer": "Adam(0.9, 0.999)",
        "learning late": lr,
        "weight decay": weight_decay,
        "mode": "train efficient space-temporal adapter",
        "Adapter": "adp:0, adp:1",
    }

    experiment = Experiment(
        api_key="TawRAwNJiQjPaSMvBAwk4L4pF",
        project_name="feeature-extract",
        workspace="kazukiomi",
    )

    experiment.add_tag('pytorch')
    experiment.log_parameters(hyper_params)

    num_epochs = args.NUM_EPOCH

    step = 0
    best_acc = 0

    with tqdm(range(num_epochs)) as pbar_epoch:
        for epoch in pbar_epoch:
            pbar_epoch.set_description("[Epoch %d]" % (epoch))

            """Training mode"""

            train_loss = AverageMeter()
            train_acc = AverageMeter()

            with tqdm(enumerate(train_loader),
                      total=len(train_loader),
                      leave=True) as pbar_train_batch:

                model.train()

                for batch_idx, batch in pbar_train_batch:
                    pbar_train_batch.set_description(
                        "[Epoch :{}]".format(epoch))

                    inputs = batch['video'].to(device)
                    labels = batch['label'].to(device)

                    bs = inputs.size(0)

                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    train_loss.update(loss, bs)
                    train_acc.update(top1(outputs, labels), bs)

                    pbar_train_batch.set_postfix_str(
                        ' | loss_avg={:6.04f} , top1_avg={:6.04f}'
                        ' | batch_loss={:6.04f} , batch_top1={:6.04f}'
                        ''.format(
                            train_loss.avg, train_acc.avg,
                            train_loss.val, train_acc.val,
                        ))

                    experiment.log_metric(
                        "batch_accuracy", train_acc.val, step=step)
                    experiment.log_metric(
                        "batch_loss", train_loss.val, step=step)
                    step += 1

            """Val mode"""
            model.eval()
            val_loss = AverageMeter()
            val_acc = AverageMeter()

            with torch.no_grad():
                for batch_idx, val_batch in enumerate(val_loader):
                    inputs = val_batch['video'].to(device)
                    labels = val_batch['label'].to(device)

                    bs = inputs.size(0)

                    val_outputs = model(inputs)
                    loss = criterion(val_outputs, labels)

                    val_loss.update(loss, bs)
                    val_acc.update(top1(val_outputs, labels), bs)
            """Finish Val mode"""

            pbar_epoch.set_postfix_str(
                ' train_loss={:6.04f} , val_loss={:6.04f}, train_acc={:6.04f}, val_acc={:6.04f}'
                ''.format(
                    train_loss.avg,
                    val_loss.avg,
                    train_acc.avg,
                    val_acc.avg)
            )

            experiment.log_metric("train_accuracy",
                                  train_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("train_loss",
                                  train_loss.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_accuracy",
                                  val_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_loss",
                                  val_loss.avg,
                                  step=epoch + 1)
    
    experiment.end()



In [34]:
train_efficient_spacce_temporal_adapter()

Using cache found in /home/omi/.cache/torch/hub/facebookresearch_pytorchvideo_main
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/kazukiomi/feeature-extract/8ca89972828d46d7a0e64a67b9e722d4



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=298.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=298.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=298.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=298.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=298.0), HTML(value='')))




COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/kazukiomi/feeature-extract/8ca89972828d46d7a0e64a67b9e722d4
COMET INFO:   Metrics [count] (min, max):
COMET INFO:     batch_accuracy [1490] : (0.0, 1.0)
COMET INFO:     batch_loss [1490]     : (0.05116062983870506, 4.717940807342529)
COMET INFO:     loss [149]            : (0.05116062983870506, 4.579037666320801)
COMET INFO:     train_accuracy [5]    : (0.7861786912751678, 0.9397021812080537)
COMET INFO:     train_loss [5]        : (0.24570786686701662, 1.1339569405321306)
COMET INFO:     val_accuracy [5]      : (0.8654661016949152, 0.8792372881355932)
COMET INFO:     val_loss [5]          : (0.4095650094924337, 0.5039882695270796)
COMET INFO:   Parameters:
COMET INFO:     Adapter       : adp:0, adp:1
COMET INFO:     Dataset       : UCF101
COMET I




COMET INFO: Uploading metrics, params, and assets to Comet before program termination (may take several seconds)
COMET INFO: The Python SDK has 3600 seconds to finish before aborting...
