In [1]:
# NOTE(review): comet_ml is imported ahead of torch; Comet's documentation asks for
# this ordering so that its auto-logging can hook the framework - confirm before reordering.
from comet_ml import Experiment
import torch
# Fetch the pretrained X3D-M video model through torch.hub (the cell output shows
# the local hub cache being reused when it is already present).
model = torch.hub.load('facebookresearch/pytorchvideo', 'x3d_m', pretrained=True)

Using cache found in /home/omi/.cache/torch/hub/facebookresearch_pytorchvideo_main


In [2]:
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from torch.utils.data import DistributedSampler, RandomSampler, SequentialSampler


from torchvision import transforms


from pytorchvideo.models import x3d
from pytorchvideo.data import (
    Ucf101, 
    RandomClipSampler, 
    UniformClipSampler, 
    Kinetics,
    SSv2
)


from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
)


import torchinfo

import numpy as np
from tqdm.notebook import tqdm
import itertools
import os
import pickle
import random
import matplotlib.pyplot as plt
import shutil
from sklearn import mixture
from sklearn import svm
from sklearn import decomposition
import os.path as osp
import argparse

In [3]:
class Args:
    def __init__(self):
        self.NUM_EPOCH = 2
        self.FRAMES_PER_CLIP = 16
        self.STEP_BETWEEN_CLIPS = 16
        self.BATCH_SIZE = 16
        self.NUM_WORKERS = 32
        # self.CLIP_DURATION = 16 / 25
        # (num_frames * sampling_rate)/fps
        self.kinetics_clip_duration = (8 * 8) / 30
        self.ucf101_clip_duration = 16 / 25
        self.VIDEO_NUM_SUBSAMPLED = 16
        self.UCF101_NUM_CLASSES = 101
        self.KINETIC400_NUM_CLASSES = 400

In [4]:
class Adapter2D(nn.Module):
    """Frame-wise residual 1x1-convolution adapter for 5-D video feature maps.

    ``forward`` takes a ``(batch, channel, frame, height, width)`` tensor,
    unfolds it into a batch of 2-D frames, applies
    ``BatchNorm2d -> 1x1 Conv2d -> (+ residual) -> BatchNorm2d`` and folds the
    result back into the original 5-D layout.

    Args:
        dim: number of channels of the incoming feature map (kept unchanged).
    """
    expansion = 1

    def __init__(self, dim):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(dim)
        self.conv1 = nn.Conv2d(dim, dim, 1)
        self.bn2 = nn.BatchNorm2d(dim)

    def video_to_frame(self, inputs):
        """Reshape ``(B, C, T, H, W)`` into ``(B*T, C, H, W)``."""
        batch_size = inputs.size(0)
        num_frame = inputs.size(2)

        # Bring the frame axis next to the batch axis before merging the two.
        inputs = inputs.permute(0, 2, 1, 3, 4)
        outputs = inputs.reshape(batch_size * num_frame,
                                 inputs.size(2),
                                 inputs.size(3),
                                 inputs.size(4))

        return outputs

    def frame_to_video(
            self, input: torch.Tensor, batch_size, num_frame, channel, height, width) -> torch.Tensor:
        """Inverse of ``video_to_frame``: ``(B*T, C, H, W)`` back to ``(B, C, T, H, W)``."""
        output = input.reshape(batch_size, num_frame, channel, height, width)
        output = output.permute(0, 2, 1, 3, 4)
        return output

    def forward(self, x):
        """Apply the adapter to every frame of ``x`` and return a tensor of the same shape."""
        # The width is read from the tensor itself; reusing the height here
        # (as the code used to) breaks on non-square feature maps.
        batch_size, channel, num_frame, height, width = x.shape

        x = self.video_to_frame(x)

        residual = x
        out = self.bn1(x)
        out = self.conv1(out)
        out += residual
        out = self.bn2(out)

        out = self.frame_to_video(out, batch_size, num_frame, channel, height, width)

        return out

In [5]:
class ReconstructNet(nn.Module):
    """Pretrained X3D-M backbone with an ``Adapter2D`` and a new classifier.

    The hub model's ``blocks[0]`` .. ``blocks[4]`` form ``net_bottom``; their
    output goes through ``adapter``, then through the pooling and dropout
    layers of ``blocks[5]`` (``net_top``), and finally through a newly
    initialised ``nn.Linear`` that replaces ``blocks[5].proj``.

    Args:
        num_class: number of output classes of the new linear layer
            (default 101, the value that used to be hard-coded).
        adapter_dim: channel count given to ``Adapter2D`` (default 192, the
            value that used to be hard-coded; presumably the channel width of
            ``blocks[4]``'s output - TODO confirm).
    """

    def __init__(self, num_class=101, adapter_dim=192):
        super().__init__()
        model = torch.hub.load(
            'facebookresearch/pytorchvideo', "x3d_m", pretrained=True)
        self.model_num_features = model.blocks[5].proj.in_features
        self.num_class = num_class

        # Indexing block by block keeps the child names "0".."4" exactly as
        # before, so existing checkpoints still load.
        self.net_bottom = nn.Sequential(*(model.blocks[i] for i in range(5)))

        self.adapter = Adapter2D(adapter_dim)

        self.net_top = nn.Sequential(
            model.blocks[5].pool,
            model.blocks[5].dropout
        )

        # self.linear = model.blocks[5].proj
        self.linear = nn.Linear(self.model_num_features, self.num_class)

        # Parameter freezing is intentionally disabled in this class; the
        # snippet is kept for reference.
        # # names of the parameters to train
        # self.update_param_names = ["adapter.bn1.weight", "adapter.bn1.bias",
        #                            "adapter.conv1.weight", "adapter.conv1.bias",
        #                            "adapter.bn2.weight", "adapter.bn2.bias",
        #                            "linear.weight", "linear.bias"]
        # # disable gradients for everything else so it stays fixed
        # for name, param in self.named_parameters():
        #     param.requires_grad = name in self.update_param_names

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class scores reshaped to ``(-1, num_class)``."""
        x = self.net_bottom(x)
        x = self.adapter(x)
        x = self.net_top(x)
        # Move the channel axis last so nn.Linear acts on the feature dimension.
        x = x.permute(0, 2, 3, 4, 1)
        x = self.linear(x)
        x = x.reshape(-1, self.num_class)
        return x




In [6]:
# adapter = Adapter2D(192)

# torchinfo.summary(
#     adapter,
#     input_size=(1,192,16,7,7),
#     depth=4,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

In [7]:
# model_new = ReconstructNet()

# torchinfo.summary(
#     model_new,
#     input_size=(1,3,16,224,224),
#     depth=4,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

## 学習させてみる

In [8]:
def get_kinetics(subset):
    """
    Build the Kinetics400 dataset for one split.

    The video transform always subsamples ``Args().VIDEO_NUM_SUBSAMPLED``
    frames, divides by 255 and normalises. For ``"train"`` it then applies
    random short-side scaling, a random 224 crop and a random horizontal flip;
    for every other split it applies the deterministic short-side-256 /
    center-crop-224 pipeline. (Previously ``"test"`` received the random
    training augmentation, unlike ``get_ucf101``.)

    Args:
        subset (str): "train" or "val" or "test". For "test" the list file
            ``test_list.txt`` and the ``test/`` prefix are used; for any other
            value ``subset`` itself is appended to the dataset root for both
            the data path and the video path prefix.

    Returns:
        pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset: the dataset.
    """
    args = Args()

    # Steps shared by every split.
    shared_steps = [
        UniformTemporalSubsample(args.VIDEO_NUM_SUBSAMPLED),
        transforms.Lambda(lambda x: x / 255.),
        Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
    ]
    if subset == "train":
        spatial_steps = [
            RandomShortSideScale(min_size=256, max_size=320,),
            RandomCrop(224),
            RandomHorizontalFlip(),
        ]
    else:
        spatial_steps = [
            ShortSideScale(256),
            CenterCrop(224),
        ]

    transform = Compose([
        ApplyTransformToKey(
            key="video",
            transform=Compose(shared_steps + spatial_steps),
        ),
        RemoveKey("audio"),
    ])

    root_kinetics = '/mnt/dataset/Kinetics400/'

    if subset == "test":
        data_path = root_kinetics + "test_list.txt"
        video_path_prefix = root_kinetics + 'test/'
    else:
        data_path = root_kinetics + subset
        video_path_prefix = root_kinetics + subset

    return Kinetics(
        data_path=data_path,
        video_path_prefix=video_path_prefix,
        clip_sampler=RandomClipSampler(
            clip_duration=args.kinetics_clip_duration),
        video_sampler=RandomSampler,
        decode_audio=False,
        transform=transform,
    )


In [9]:
def get_ucf101(subset):
    """
    Build the UCF101 dataset for one split.

    Args:
        subset (str): "train" selects ``trainlist01.txt`` and the augmenting
            transform; any other value (e.g. "test" or "val") selects
            ``testlist.txt`` and the deterministic transform.

    Returns:
        pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset: the dataset.
    """
    is_train = subset == "train"
    if is_train:
        subset_root_Ucf101 = 'ucfTrainTestlist/trainlist01.txt'
    else:
        subset_root_Ucf101 = 'ucfTrainTestlist/testlist.txt'

    args = Args()

    # Temporal subsampling, scaling to [0, 1] and normalisation come first in both modes.
    video_steps = [
        UniformTemporalSubsample(args.VIDEO_NUM_SUBSAMPLED),
        transforms.Lambda(lambda x: x / 255.),
        Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
    ]
    if is_train:
        video_steps += [
            RandomShortSideScale(min_size=256, max_size=320,),
            RandomCrop(224),
            RandomHorizontalFlip(),
        ]
    else:
        video_steps += [
            ShortSideScale(256),
            CenterCrop(224),
        ]

    transform = Compose([
        ApplyTransformToKey(key="video", transform=Compose(video_steps)),
        # Labels are shifted down by one (presumably the list files use
        # 1-based class ids - TODO confirm).
        ApplyTransformToKey(
            key="label",
            transform=transforms.Lambda(lambda x: x - 1),
        ),
        RemoveKey("audio"),
    ])

    root_ucf101 = '/mnt/dataset/UCF101/'
    # root_ucf101 = '/mnt/NAS-TVS872XT/dataset/UCF101/'

    return Ucf101(
        data_path=root_ucf101 + subset_root_Ucf101,
        video_path_prefix=root_ucf101 + 'video/',
        clip_sampler=RandomClipSampler(
            clip_duration=args.ucf101_clip_duration),
        video_sampler=RandomSampler,
        decode_audio=False,
        transform=transform,
    )


In [10]:
class LimitDataset(torch.utils.data.Dataset):
    """Map-style wrapper that serves items of an iterable dataset through ``next``.

    ``__getitem__`` ignores the requested index and simply returns the next
    item of an internal iterator, and ``__len__`` reports the wrapped
    dataset's ``num_videos`` attribute.

    Args:
        dataset: iterable object exposing a ``num_videos`` attribute.
    """

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset
        # ``itertools.repeat(iter(dataset), 2)`` yields the *same* iterator
        # object twice, and ``chain.from_iterable`` calls ``iter()`` on it
        # again for the second pass.
        # NOTE(review): the second pass therefore only produces items if the
        # wrapped dataset's iterator restarts itself when re-iterated -
        # confirm against the dataset implementation.
        self.dataset_iter = itertools.chain.from_iterable(
            itertools.repeat(iter(dataset), 2)
        )

    def __getitem__(self, index):
        # ``index`` is not used: items come out in iterator order.
        return next(self.dataset_iter)

    def __len__(self):
        return self.dataset.num_videos

In [11]:
def make_loader(dataset):
    """
    Wrap a dataset in ``LimitDataset`` and build a ``DataLoader`` for it.

    Batch size and worker count come from ``Args``; incomplete final batches
    are dropped and ``shuffle=True`` is requested.

    Args:
        dataset (pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset):
            dataset returned by one of the ``get_*`` helpers.

    Returns:
        torch.utils.data.DataLoader: the configured data loader.
    """
    cfg = Args()
    wrapped = LimitDataset(dataset)
    return DataLoader(
        wrapped,
        shuffle=True,
        drop_last=True,
        batch_size=cfg.BATCH_SIZE,
        num_workers=cfg.NUM_WORKERS,
    )

In [12]:
class AverageMeter(object):
    """
    Computes and stores the average and current value
    Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    https://github.com/machine-perception-robotics-group/attention_branch_network/blob/ced1d97303792ac6d56442571d71bb0572b3efd8/utils/misc.py#L59

    Attributes:
        val: the most recent value passed to ``update``.
        sum: weighted sum of all values seen so far.
        count: total weight (number of samples) seen so far.
        avg: ``sum / count``, or 0 while ``count`` is 0.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples.

        Args:
            val: a number or a one-element ``torch.Tensor`` (converted with ``.item()``).
            n: weight of the observation, typically the batch size.
        """
        if isinstance(val, torch.Tensor):
            val = val.item()
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against an empty batch (n == 0) arriving before any sample,
        # which previously raised ZeroDivisionError.
        self.avg = self.sum / self.count if self.count else 0


def top1(outputs, targets):
    """Return the fraction of rows of ``outputs`` whose arg-max along dim 1 equals ``targets``."""
    predicted = outputs.argmax(dim=1)
    num_correct = (predicted == targets).sum().item()
    return num_correct / outputs.size(0)



In [13]:
import os.path as osp
import shutil

def save_checkpoint(state, is_best, filename='checkpoint.pth', dir_data_name='UCF101'):
    """Save a module's ``state_dict`` and optionally copy it as the best model.

    Args:
        state: object exposing ``state_dict()`` (the callers pass the model itself).
        is_best: when true, the saved file is also copied to ``model_best.pth``
            inside the same directory.
        filename: file name of the checkpoint inside ``dir_data_name``.
        dir_data_name: target directory; created (with parents) when missing.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(dir_data_name, exist_ok=True)
    file_path = osp.join(dir_data_name, filename)
    torch.save(state.state_dict(), file_path)
    if is_best:
        shutil.copyfile(file_path, osp.join(dir_data_name, 'model_best.pth'))

In [14]:
# save_checkpoint()メソッドのテスト

# test_net = ReconstructNet()
# test_net.to("cuda")
# print(test_net)
# save_checkpoint(test_net, True)

In [15]:
def train():
    """Fine-tune ``ReconstructNet`` on UCF101 and log the run to Comet.

    Runs ``Args().NUM_EPOCH`` epochs of SGD training, evaluates after every
    epoch on the loader built from ``get_ucf101("val")``, writes a checkpoint
    each epoch through ``save_checkpoint`` (flagged as best whenever the
    validation accuracy improves) and sends per-batch and per-epoch metrics to
    a Comet experiment.

    The Comet API key is read from the ``COMET_API_KEY`` environment variable
    instead of being embedded in the notebook.
    """
    args = Args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_dataset = get_ucf101("train")
    val_dataset = get_ucf101("val")
    train_loader = make_loader(train_dataset)
    val_loader = make_loader(val_dataset)

    model = ReconstructNet()
    model = model.to(device)
    # model = torch.nn.DataParallel(model)
    torch.backends.cudnn.benchmark = True

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss()

    hyper_params = {
        "Dataset": "UCF101",
        "epoch": args.NUM_EPOCH,
        "batch_size": args.BATCH_SIZE,
        "num_frame": args.VIDEO_NUM_SUBSAMPLED,
        # "Adapter": "adp:1",
    }

    # SECURITY: never commit the API key. The key that used to be hard-coded
    # here should be treated as leaked and rotated.
    # NOTE(review): when COMET_API_KEY is unset, None is passed and Comet is
    # expected to fall back to its own configuration lookup - confirm.
    experiment = Experiment(
        api_key=os.environ.get("COMET_API_KEY"),
        project_name="feeature-extract",
        workspace="kazukiomi",
    )

    experiment.add_tag('pytorch')
    experiment.log_parameters(hyper_params)

    num_epochs = args.NUM_EPOCH

    step = 0
    best_acc = 0

    with tqdm(range(num_epochs)) as pbar_epoch:
        for epoch in pbar_epoch:
            pbar_epoch.set_description("[Epoch %d]" % (epoch))

            # ---- training phase ----
            train_loss = AverageMeter()
            train_acc = AverageMeter()

            with tqdm(enumerate(train_loader),
                      total=len(train_loader),
                      leave=True) as pbar_train_batch:

                model.train()

                for _, batch in pbar_train_batch:
                    pbar_train_batch.set_description(
                        "[Epoch :{}]".format(epoch))

                    inputs = batch['video'].to(device)
                    labels = batch['label'].to(device)

                    bs = inputs.size(0)

                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    train_loss.update(loss, bs)
                    train_acc.update(top1(outputs, labels), bs)

                    pbar_train_batch.set_postfix_str(
                        ' | loss_avg={:6.04f} , top1_avg={:6.04f}'
                        ' | batch_loss={:6.04f} , batch_top1={:6.04f}'
                        ''.format(
                            train_loss.avg, train_acc.avg,
                            train_loss.val, train_acc.val,
                        ))

                    experiment.log_metric(
                        "batch_accuracy", train_acc.val, step=step)
                    step += 1

            # ---- validation phase ----
            model.eval()
            val_loss = AverageMeter()
            val_acc = AverageMeter()

            with torch.no_grad():
                for val_batch in val_loader:
                    inputs = val_batch['video'].to(device)
                    labels = val_batch['label'].to(device)

                    bs = inputs.size(0)

                    val_outputs = model(inputs)
                    loss = criterion(val_outputs, labels)

                    val_loss.update(loss, bs)
                    val_acc.update(top1(val_outputs, labels), bs)

            # ---- checkpointing ----
            is_best = best_acc < val_acc.avg
            if is_best:
                best_acc = val_acc.avg

            save_checkpoint(model, is_best)

            # The last field is the validation accuracy; it used to be
            # filled with val_loss.avg by mistake.
            pbar_epoch.set_postfix_str(
                ' train_loss={:6.04f} , val_loss={:6.04f}, train_acc={:6.04f}, val_acc={:6.04f}'
                ''.format(
                    train_loss.avg,
                    val_loss.avg,
                    train_acc.avg,
                    val_acc.avg)
            )

            # metrics = {"train_accuracy": train_acc.avg,
            #            "val_accuracy": val_acc.avg
            #            }
            # experiment.log_multiple_metrics(metrics, epoch + 1)
            experiment.log_metric("epoch_train_accuracy",
                                  train_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("epoch_train_loss",
                                  train_loss.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_accuracy",
                                  val_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_loss",
                                  val_loss.avg,
                                  step=epoch + 1)


In [16]:
# train()

## 上で学習したモデルを読み込む

In [17]:
# called_model = ReconstructNet()
# model_path = "UCF101/model_best.pth"
# called_model.load_state_dict(torch.load(model_path))

In [18]:
# 呼び出したモデルに1バッチだけ流して精度とロスをテスト

# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# called_model = called_model.to(device)
# test_loader = make_loader(get_ucf101("val"))
# criterion = nn.CrossEntropyLoss()

# test_batch = iter(test_loader).__next__()

# model.eval()
# test_loss = AverageMeter()
# test_acc = AverageMeter()

# test_inputs = test_batch["video"].to(device)
# test_labels = test_batch["label"].to(device)
# bs = test_inputs.size(0)

# test_out = called_model(test_inputs)
# loss = criterion(test_out, test_labels)

# test_loss.update(loss, bs)
# test_acc.update(top1(test_out, test_labels), bs)

# print(test_acc.avg)
# print(test_loss.avg)

In [19]:
# valの精度をテスト

# args = Args()
# device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")

# val_dataset = get_ucf101("val")
# val_loader = make_loader(val_dataset)
# called_model = called_model.to(device)

# criterion = nn.CrossEntropyLoss()

# # called_model = called_model.to(device)
# called_model.eval()
# val_loss = AverageMeter()
# val_acc = AverageMeter()
# with tqdm(enumerate(val_loader),
#                       total=len(val_loader),
#                       leave=True) as pbar_val_batch:
#     with torch.no_grad():
#         for batch_idx, val_batch in pbar_val_batch:
#             inputs = val_batch['video'].to(device)
#             labels = val_batch['label'].to(device)

#             bs = inputs.size(0)

#             val_outputs = called_model(inputs)
#             loss = criterion(val_outputs, labels)

#             val_loss.update(loss, bs)
#             val_acc.update(top1(val_outputs, labels), bs)
            
#             pbar_val_batch.set_postfix_str(
#                         ' | loss_avg={:6.04f} , top1_avg={:6.04f}'
#                         ' | batch_loss={:6.04f} , batch_top1={:6.04f}'
#                         ''.format(
#                             val_loss.avg, val_acc.avg,
#                             val_loss.val, val_acc.val,
#                         ))
# print(val_acc.avg)
# print(val_loss.avg)

## 新しくクラスを定義して特徴量を保存する
- 新しくクラスを定義するのではなくアダプタのオプション（引数）で行いたい
  - アダプタ内で行うとアダプタごとにファイルの保存場所を変更できない
    - さらにアダプタにオプション与える？
  - やはり新しくクラスを定義する？

In [20]:
class Adapter2DExtract(nn.Module):
    """``Adapter2D`` variant that also dumps its intermediate features to disk.

    Same computation as ``Adapter2D``
    (``BatchNorm2d -> 1x1 Conv2d -> (+ residual) -> BatchNorm2d`` applied
    frame by frame), but every ``forward`` call additionally saves two
    ``.npy`` files below ``path_feature``:

    * ``<adapter_name>_wh/<data_name>``: the output of ``bn1`` (input of ``conv1``)
    * ``<adapter_name>_rc/<data_name>``: the output of ``conv1``

    Each file holds a ``(frames * height * width, channel)`` matrix, one
    channel vector per spatial position of every frame.

    Args:
        dim: number of channels of the incoming feature map.
        adapter_name: prefix of the two output directories.
    """
    expansion = 1

    def __init__(self, dim, adapter_name: str):
        super().__init__()
        self.bn1 = nn.BatchNorm2d(dim)
        self.conv1 = nn.Conv2d(dim, dim, 1)
        self.bn2 = nn.BatchNorm2d(dim)

        self.adapter_name = adapter_name
        self.num_of_dataset = 9472  # not read anywhere in this class
        self.path_feature = "UCF101/features/"

    def video_to_frame(self, inputs):
        """Reshape ``(B, C, T, H, W)`` into ``(B*T, C, H, W)``."""
        batch_size = inputs.size(0)
        num_frame = inputs.size(2)

        inputs = inputs.permute(0, 2, 1, 3, 4)
        outputs = inputs.reshape(batch_size * num_frame,
                                 inputs.size(2),
                                 inputs.size(3),
                                 inputs.size(4))

        return outputs

    def frame_to_video(
            self, input: torch.Tensor, batch_size, num_frame, channel, height, width) -> torch.Tensor:
        """Inverse of ``video_to_frame``: ``(B*T, C, H, W)`` back to ``(B, C, T, H, W)``."""
        output = input.reshape(batch_size, num_frame, channel, height, width)
        output = output.permute(0, 2, 1, 3, 4)
        return output

    def _save_features(self, frames, suffix, data_name):
        """Save ``frames`` (``(N, C, H, W)``) as an ``(N*H*W, C)`` array under ``<adapter_name><suffix>``."""
        channel = frames.size(1)
        # The channel axis must be moved last *before* flattening. A plain
        # reshape(C, -1) of an (N, C, H, W) array only lines rows up with
        # channels when N == 1; with several frames it mixes channels and frames.
        features = (frames.detach()
                    .permute(0, 2, 3, 1)
                    .reshape(-1, channel)
                    .cpu()
                    .numpy())
        dir_name = osp.join(self.path_feature, self.adapter_name + suffix)
        os.makedirs(dir_name, exist_ok=True)
        np.save(osp.join(dir_name, data_name), features)

    def forward(self, x, data_name):
        """Run the adapter on ``x`` and save the intermediate features.

        Args:
            x: tensor of shape ``(batch, channel, frame, height, width)``.
            data_name: file name (without extension) used for both saved arrays.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Width is taken from the tensor; reusing the height breaks on
        # non-square feature maps.
        batch_size, channel, num_frame, height, width = x.shape

        x = self.video_to_frame(x)

        residual = x
        out = self.bn1(x)

        # Features entering the 1x1 convolution.
        self._save_features(out, '_wh', data_name)

        out = self.conv1(out)

        # Features leaving the 1x1 convolution; saved before the in-place
        # residual addition below modifies ``out``.
        self._save_features(out, '_rc', data_name)

        out += residual
        out = self.bn2(out)

        out = self.frame_to_video(out, batch_size, num_frame, channel, height, width)

        return out

In [21]:
class FeatureExtractor(nn.Module):
    """X3D-M backbone with an ``Adapter2DExtract`` that dumps features on every forward pass.

    Mirrors ``ReconstructNet`` (same sub-module names, so its checkpoints can
    be loaded with ``load_state_dict``), but uses ``Adapter2DExtract`` as the
    adapter and freezes every parameter except those of the adapter and the
    final linear layer.

    Args:
        num_class: number of output classes of the linear layer (default 101,
            the value that used to be hard-coded).
        adapter_dim: channel count given to the adapter (default 192, the
            value that used to be hard-coded).
        adapter_name: name passed to ``Adapter2DExtract``; it prefixes the
            feature output directories (default "adapter0", as before).
    """

    def __init__(self, num_class=101, adapter_dim=192, adapter_name="adapter0"):
        super().__init__()
        model = torch.hub.load(
            'facebookresearch/pytorchvideo', "x3d_m", pretrained=True)
        self.model_num_features = model.blocks[5].proj.in_features
        self.num_class = num_class

        # Indexing block by block keeps the child names "0".."4" unchanged,
        # so state-dict keys are the same as before.
        self.net_bottom = nn.Sequential(*(model.blocks[i] for i in range(5)))

        self.adapter = Adapter2DExtract(adapter_dim, adapter_name)

        self.net_top = nn.Sequential(
            model.blocks[5].pool,
            model.blocks[5].dropout
        )

        # self.linear = model.blocks[5].proj
        self.linear = nn.Linear(self.model_num_features, self.num_class)

        # Names of the parameters that stay trainable.
        self.update_param_names = ["adapter.bn1.weight", "adapter.bn1.bias",
                                   "adapter.conv1.weight", "adapter.conv1.bias",
                                   "adapter.bn2.weight", "adapter.bn2.bias",
                                   "linear.weight", "linear.bias"]
        # Everything else gets requires_grad=False so it is never updated.
        trainable = set(self.update_param_names)
        for name, param in self.named_parameters():
            param.requires_grad = name in trainable

    def forward(self, x: torch.Tensor, data_name) -> torch.Tensor:
        """Return class scores reshaped to ``(-1, num_class)``.

        ``data_name`` is forwarded to the adapter, which uses it as the file
        name of the features it saves.
        """
        x = self.net_bottom(x)
        x = self.adapter(x, data_name)
        x = self.net_top(x)
        # Move the channel axis last so nn.Linear acts on the feature dimension.
        x = x.permute(0, 2, 3, 4, 1)
        x = self.linear(x)
        x = x.reshape(-1, self.num_class)
        return x




In [22]:
# ビデオの名前を取得するテスト

# dataset = get_ucf101("val")
# test_loader = DataLoader(LimitDataset(dataset),
#                         batch_size=1,
#                         drop_last=False,
#                         num_workers=32,
#                         shuffle=False)
# test_batch = iter(test_loader).__next__()

# video_name = test_batch["video_name"]
# print(video_name)
# video_name = video_name[0].split(".")[0]
# print(video_name)

In [23]:
# 新しく定義したモデルが保存済みのモデルのパラメータになっているかどうかを確認

# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# test_model = FeatureExtractor()
# model_path = "UCF101/model_best.pth"
# test_model.load_state_dict(torch.load(model_path))
# test_model = test_model.to(device)
# test_loader = make_loader(get_ucf101("val"))
# criterion = nn.CrossEntropyLoss()

# test_batch = iter(test_loader).__next__()

# model.eval()
# test_loss = AverageMeter()
# test_acc = AverageMeter()

# test_inputs = test_batch["video"].to(device)
# test_labels = test_batch["label"].to(device)
# video_name = test_batch["video_name"][0].split(".")[0]
# bs = test_inputs.size(0)

# test_out = test_model(test_inputs, video_name)
# loss = criterion(test_out, test_labels)

# test_loss.update(loss, bs)
# test_acc.update(top1(test_out, test_labels), bs)

# print(test_acc.avg)
# print(test_loss.avg)

In [24]:
def feature_extract():
    """Run the best checkpoint over the UCF101 training list and dump adapter features.

    Loads ``UCF101/model_best.pth`` into a ``FeatureExtractor`` and feeds it
    one clip at a time (``batch_size=1``). The features are written to disk as
    a side effect of ``FeatureExtractor.forward``, using the clip's
    ``video_name`` (without its extension) as the file name; the network
    output itself is discarded.
    """
    device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")

    train_dataset = get_ucf101("train")
    train_loader = DataLoader(LimitDataset(train_dataset),
                              batch_size=1,
                              drop_last=False,
                              num_workers=32,
                              shuffle=False)

    model = FeatureExtractor()
    best_model_path = "UCF101/model_best.pth"
    # Deserialise onto the CPU first so the checkpoint loads regardless of the
    # device it was saved from; the model is moved to ``device`` right after.
    model.load_state_dict(torch.load(best_model_path, map_location="cpu"))
    model = model.to(device)

    torch.backends.cudnn.benchmark = True

    model.eval()

    # Only the forward pass is needed, so skip building the autograd graph.
    with torch.no_grad(), tqdm(train_loader,
                               total=len(train_loader),
                               leave=True) as pbar_train_batch:
        for batch in pbar_train_batch:
            inputs = batch['video'].to(device)
            video_name = batch["video_name"][0].split(".")[0]

            model(inputs, video_name)
            


In [25]:
# feature_extract()

## 中間特徴量を主成分分析にかけ結果を保存する

In [26]:
# これはテスト，下で関数で定義した

# channel_dim = [192]
# exact_list = ["adapter0"]
# pre_dir_feature = "UCF101/features"

# for k in range(len(channel_dim)):
#     # print(k)
#     dim = channel_dim[k]

    
#     dir_feature_in = osp.join(pre_dir_feature, exact_list[k] + '_wh')
#     # print(dir_feature_in)
#     files_wh = os.listdir(dir_feature_in)
#     # print("file length: {}".format(len(files_wh)))

#     for i in range(len(files_wh)):
#         if i == 0:
#             num = np.load(osp.join(dir_feature_in, files_wh[i])).shape[0]
#             print(num)
#             features_in = np.zeros((len(files_wh)*num, dim))
#             print(features_in.shape)
#         features_in[i*num:(i+1)*num] = np.load(osp.join(dir_feature_in, files_wh[i]))
#     pca = decomposition.PCA(n_components=dim)
#     pca.fit(features_in)
#     pickle.dump(pca, open(osp.join(pre_dir_feature, exact_list[k] + '_wh') + '_pca.sav', 'wb'))

#     dir_feature_out = osp.join(pre_dir_feature, exact_list[k] + '_rc')
#     files_rc = os.listdir(dir_feature_out)
#     features_out = np.zeros((len(files_rc)*num, dim))
#     for i in range(len(files_rc)):
#         features_out[i*num:(i+1)*num] = np.load(osp.join(dir_feature, files_rc[i]))
#     pca = decomposition.PCA(n_components=dim)
#     pca.fit(features_out)
#     pickle.dump(pca, open(osp.join(pre_dir_feature, exact_list[k] + '_rc') + '_pca.sav', 'wb'))



### 上の処理で思ったこと
- 画像の場合
  - 特徴量のshapeを(B, C, W, H) -> (C, B*W*H) -> (B*W*H, C)に変更してからセーブ（B=1）
  - よってデータ全ての特徴量を格納したものは（データ数 x W x H, dim(チャンネル数)）
- 動画の場合
  - フレームをバッチ方向につなげているのでB=T(フレーム数)になってしまう

### 処理の違い
動画を複数の画像（複数のデータに分けて）としてみて主成分分析するか動画を1つのデータとして主成分分析するか違いが出るかと思ったけど関係なさそう
（チャネル数とそれ以外に分けて主成分分析するから，features[0:8]まで一気に格納されるか(動画の場合)，features[0]から順に逐一格納されるかの違いしかないと思う）
  
<!-- ### 処理の方法による違い
- (データ数*T*W*H, dim)のまま主成分分析
  - 1つの動画 -->

In [27]:
def save_pca():
    """Fit a PCA on the saved adapter features and pickle the fitted models.

    For every adapter in ``exact_list`` and for both feature kinds (``_wh``
    and ``_rc``), all ``.npy`` files of ``UCF101/features/<adapter><kind>/``
    are stacked into one matrix, a ``decomposition.PCA`` with
    ``n_components`` equal to the adapter's channel count is fitted, and the
    model is pickled to ``UCF101/features/<adapter><kind>_pca.sav``.

    Raises:
        FileNotFoundError: if a feature directory contains no ``.npy`` file.
    """
    channel_dim = [192]
    exact_list = ["adapter0"]
    pre_dir_feature = "UCF101/features"

    for dim, adapter_name in zip(channel_dim, exact_list):
        for suffix in ('_wh', '_rc'):
            feature_dir = osp.join(pre_dir_feature, adapter_name + suffix)
            features = _load_feature_matrix(feature_dir, dim)

            pca = decomposition.PCA(n_components=dim)
            pca.fit(features)
            # ``with`` closes the file even if pickling fails.
            with open(feature_dir + '_pca.sav', 'wb') as pca_file:
                pickle.dump(pca, pca_file)
            # print(pca.explained_variance_ratio_)


def _load_feature_matrix(feature_dir, dim):
    """Stack every ``.npy`` file of ``feature_dir`` into one ``(files * rows, dim)`` float64 array."""
    # Sorted so the row order does not depend on the file-system listing order.
    file_names = sorted(
        name for name in os.listdir(feature_dir) if name.endswith('.npy'))
    if not file_names:
        raise FileNotFoundError(
            "no .npy feature files found in {}".format(feature_dir))

    first = np.load(osp.join(feature_dir, file_names[0]))
    # Every file is expected to hold the same number of rows as the first one.
    num = first.shape[0]
    # Pre-allocated so only one copy of the full matrix is ever in memory.
    features = np.zeros((len(file_names) * num, dim))
    features[:num] = first
    for i, name in enumerate(file_names[1:], start=1):
        features[i * num:(i + 1) * num] = np.load(osp.join(feature_dir, name))
    return features



In [28]:
# save_pca()

## 主成分分析の結果を呼び出して次元数を決定する（アダプタのチャネル数の決定）

In [29]:
# これはテスト，下に関数で定義した

# channel_dim = [192]
# exact_list = ["adapter0"]
# pre_dir_feature = "UCF101/features"
# pca_ratio = 0.995 # 次元削減を累積寄与率で指定するために用意

# pca_dim = []
# pca_dir = {}

# for k in range(len(channel_dim)):
#     dim = channel_dim[k]

#     dir_feature_in = osp.join(pre_dir_feature, exact_list[k] + '_wh' + '_pca.sav')
#     # print(dir_feature_in)
#     # print(osp.exists(dir_feature_in))
#     pca_wh = pickle.load(open(dir_feature_in, 'rb'))
#     ratio_wh = pca_wh.explained_variance_ratio_  # 因子寄与率を計算
#     # print(ratio_wh.shape)
#     cum_wh = [np.sum(ratio_wh[0:i+1]) for i in range(dim)]  # 累積寄与率を計算 
#     # print(cum_wh)
#     cum_wh_dim = np.sum(np.array(cum_wh)<pca_ratio)
#     # print(cum_wh_dim)
#     pca_dim.append(cum_wh_dim)
#     pca_dir[osp.join(pre_dir_feature, exact_list[k] + "_wh")] = osp.join(pre_dir_feature, exact_list[k] + "_wh") + "_pca.sav"

#     dir_feature_out = osp.join(pre_dir_feature, exact_list[k] + '_rc' + '_pca.sav')
#     pca_rc = pickle.load(open(dir_feature_out, 'rb'))
#     ratio_rc = pca_rc.explained_variance_ratio_
#     cum_rc = [np.sum(ratio_rc[0:i+1]) for i in range(dim)]
#     # print(cum_rc)
#     cum_rc_dim = np.sum(np.array(cum_rc)<pca_ratio)
#     pca_dim.append(cum_rc_dim)
#     pca_dir[osp.join(pre_dir_feature, exact_list[k] + "_rc")] = osp.join(pre_dir_feature, exact_list[k] + "_rc") + "_pca.sav"

#     print(pca_dim)
#     print(pca_dir)


In [30]:
def compute_dim():
    """Choose the number of PCA components to keep for each saved PCA model.

    For every adapter in ``exact_list`` and for both feature kinds (``_wh``
    then ``_rc``), the pickled PCA ``UCF101/features/<adapter><kind>_pca.sav``
    is loaded and the number of leading components whose cumulative explained
    variance ratio is still below ``pca_ratio`` (0.995) is counted. The count
    is clamped to at least 2.

    Returns:
        tuple: ``(pca_dim, pca_dir)`` where ``pca_dim`` is a list of ints
        (order: ``_wh``, ``_rc`` for each adapter) and ``pca_dir`` maps each
        feature directory path to the path of its pickled PCA model.
    """
    channel_dim = [192]
    exact_list = ["adapter0"]
    pre_dir_feature = "UCF101/features"
    pca_ratio = 0.995  # cumulative explained-variance threshold for the reduction

    pca_dim = []
    pca_dir = {}

    for dim, adapter_name in zip(channel_dim, exact_list):
        for suffix in ("_wh", "_rc"):
            feature_dir = osp.join(pre_dir_feature, adapter_name + suffix)
            pca_path = feature_dir + "_pca.sav"

            # NOTE: pickle.load executes arbitrary code from the file; only
            # load models written by save_pca() on a trusted machine.
            with open(pca_path, 'rb') as pca_file:
                pca = pickle.load(pca_file)

            # Cumulative explained-variance ratio in a single pass.
            cumulative = np.cumsum(pca.explained_variance_ratio_[:dim])
            num_below = int(np.sum(cumulative < pca_ratio))

            pca_dim.append(max(num_below, 2))
            pca_dir[feature_dir] = pca_path

    return pca_dim, pca_dir

### ~~上の結果から同じ結果が出たので以下でどこで間違えたか確かめる~~
- ~~保存した特徴量は異なる~~
  - ~~`diff adapter0_wh/v_ApplyEyeMakeup_g08_c01.npy adapter0_rc/v_ApplyEyeMakeup_g08_c01.npy` で確認~~
- ~~特徴量を主成分分析にかけた結果が一致している~~
  - ~~`diff adapter0_wh_pca.sav adapter0_rc_pca.sav` で確認~~


### 主成分分析の結果をアダプタに適用する

In [31]:
# pca_dim, pca_dir = compute_dim()
# print(pca_dim)
# print(pca_dir)

In [32]:
class PcaAdapter2D(nn.Module):
    """Frame-wise residual adapter built from 1x1 convolutions.

    The 5-D input is unfolded into individual frames, passed through
    ``bn1 -> conv_wh -> [conv_MAL] -> conv_rc``, added to the unfolded input
    (residual connection), normalised by ``bn2`` and folded back to 5-D.

    Args:
        dim: channel count of the input (and of the output).
        in_dim: output channels of ``conv_wh``.
        out_dim: input channels of ``conv_rc``.
        pca: when truthy, insert ``conv_MAL`` (``in_dim -> out_dim``) between
            ``conv_wh`` and ``conv_rc``.

    NOTE(review): with ``pca=False`` the output of ``conv_wh`` (``in_dim``
    channels) is fed straight into ``conv_rc`` (which expects ``out_dim``
    channels), so that mode only works when ``in_dim == out_dim``.
    """
    expansion = 1

    def __init__(self, dim, in_dim, out_dim, pca=True):
        super().__init__()
        self.pca = pca
        self.bn1 = nn.BatchNorm2d(dim)
        self.conv_wh = nn.Conv2d(dim, in_dim, 1)
        if pca:
            self.conv_MAL = nn.Conv2d(in_dim, out_dim, 1)
        self.conv_rc = nn.Conv2d(out_dim, dim, 1)
        self.bn2 = nn.BatchNorm2d(dim)

    def video_to_frame(self, inputs):
        """Swap axes 1 and 2 of a 5-D tensor and merge axes 0 and 1.

        A tensor of shape ``(B, C, T, H, W)`` becomes ``(B * T, C, H, W)``.
        """
        batch_size = inputs.size(0)
        num_frame = inputs.size(2)

        inputs = inputs.permute(0, 2, 1, 3, 4)
        outputs = inputs.reshape(batch_size * num_frame,
                                 inputs.size(2),
                                 inputs.size(3),
                                 inputs.size(4))
        return outputs

    def frame_to_video(
            self, input: torch.Tensor, batch_size, num_frame, channel, height, width) -> torch.Tensor:
        """Inverse of :meth:`video_to_frame`.

        Reshapes ``input`` to ``(batch_size, num_frame, channel, height,
        width)`` and swaps axes 1 and 2 again.
        """
        output = input.reshape(batch_size, num_frame, channel, height, width)
        output = output.permute(0, 2, 1, 3, 4)
        return output

    def forward(self, x):
        """Apply the adapter to a 5-D tensor and return a tensor of the same shape."""
        batch_size = x.size(0)
        channel = x.size(1)
        num_frame = x.size(2)
        height = x.size(3)
        # Read the width explicitly; reusing the height here breaks the final
        # reshape whenever the feature map is not square.
        width = x.size(4)

        x = self.video_to_frame(x)
        residual = x

        out = self.bn1(x)
        out = self.conv_wh(out)
        if self.pca:
            out = self.conv_MAL(out)
        out = self.conv_rc(out)

        out = out + residual
        out = self.bn2(out)

        out = self.frame_to_video(out, batch_size, num_frame, channel, height, width)

        return out

In [33]:
# 主成分分析の結果をアダプタの重みにのせるテスト

# pca_dim, pca_dir = compute_dim()
# print(pca_dir)

# adapter_params = {}
# exact_list = ["adapter0"]

# # print(pca_dir.keys())
# # print(type(pca_dir.keys()))
# # print(len(pca_dir))

# for i, key in enumerate(pca_dir):
#     print(key)
#     print(pca_dir[key])
#     pca = pickle.load(open(pca_dir[key], "rb"))
#     print(pca)
#     print(exact_list[int(i/2)])
#     print(pca.components_.shape)
#     print(pca.components_[:pca_dim[i]].shape)
#     print(pca.components_[:pca_dim[i]][:,:,np.newaxis,np.newaxis].shape)
#     if i % 2 == 0:
#         adapter_params[exact_list[int(i/2)]+".conv_wh.weight"] = torch.tensor(pca.components_[:pca_dim[i]][:,:,np.newaxis,np.newaxis])
#         adapter_params[exact_list[int(i/2)]+".conv_wh.bias"] = torch.tensor(-1 * np.dot(pca.components_[:pca_dim[i]], pca.mean_))
#     else:
#         adapter_params[exact_list[int(i/2)]+".conv_rc.weight"] = torch.tensor(pca.components_[:pca_dim[i]][:,:,np.newaxis,np.newaxis])
#         adapter_params[exact_list[int(i/2)]+".conv_rc.bias"] = torch.tensor(-1 * np.dot(pca.components_[:pca_dim[i]], pca.mean_))
        
# print(adapter_params.keys())
# # print(adapter_params['adapter0.conv_wh.weight'])


In [34]:
class MultiDomainNetwork(nn.Module):
    """Pretrained X3D-M trunk with one ``PcaAdapter2D`` and a new linear head.

    Blocks 0-4 of the hub model form ``net_bottom``; ``adapter0`` follows;
    the pooling and dropout modules of block 5 form ``net_top``; ``linear``
    produces the class scores. Only the parameters listed in
    ``update_param_names`` keep ``requires_grad=True``.

    Args:
        pca_dim: sequence whose first two entries are used as the adapter's
            ``in_dim`` and ``out_dim``; ``apply_weights`` also uses entry ``i``
            to slice the i-th PCA's components.
        pca_dir: mapping whose values are paths to pickled PCA objects,
            iterated in insertion order by ``apply_weights`` (even index ->
            ``conv_wh``, odd index -> ``conv_rc``).
        num_class: number of outputs of the linear head.
    """

    def __init__(self, pca_dim, pca_dir, num_class=101):
        super().__init__()
        model = torch.hub.load(
            'facebookresearch/pytorchvideo', "x3d_m", pretrained=True)
        self.model_num_features = model.blocks[5].proj.in_features
        self.num_class = num_class

        self.pca_dir = pca_dir
        self.pca_dim = pca_dim

        self.net_bottom = nn.Sequential(
            model.blocks[0],
            model.blocks[1],
            model.blocks[2],
            model.blocks[3],
            model.blocks[4]
        )

        self.adapter0 = PcaAdapter2D(192, pca_dim[0], pca_dim[1])

        self.net_top = nn.Sequential(
            model.blocks[5].pool,
            model.blocks[5].dropout
        )

        self.linear = nn.Linear(self.model_num_features, self.num_class)

        # Names of the parameters that are trained.
        self.update_param_names = ["adapter0.bn1.weight", "adapter0.bn1.bias",
                                   "adapter0.conv_wh.weight", "adapter0.conv_wh.bias",
                                   "adapter0.conv_MAL.weight", "adapter0.conv_MAL.bias",
                                   "adapter0.conv_rc.weight", "adapter0.conv_rc.bias",
                                   "adapter0.bn2.weight", "adapter0.bn2.bias",
                                   "linear.weight", "linear.bias"]
        # Every other parameter is frozen so that it does not change.
        trainable = set(self.update_param_names)
        for name, param in self.named_parameters():
            param.requires_grad = name in trainable

    def apply_weights(self):
        """Overwrite the adapter's ``conv_wh`` / ``conv_rc`` weights with PCA components.

        For the i-th entry of ``self.pca_dir`` the first ``self.pca_dim[i]``
        rows of ``pca.components_`` are used: as-is for ``conv_wh`` (even
        ``i``) and transposed for ``conv_rc`` (odd ``i``). Biases and all
        other parameters are left as they are (``strict=False``).
        """
        adapter_names = ["adapter0"]
        adapter_params = {}

        for i, key in enumerate(self.pca_dir):
            # NOTE(review): pickle can execute arbitrary code while loading;
            # only load .sav files produced by this notebook.
            with open(self.pca_dir[key], "rb") as f:
                pca = pickle.load(f)

            components = pca.components_[:self.pca_dim[i]]
            prefix = adapter_names[i // 2]
            if i % 2 == 0:
                # Append two singleton axes to match a 1x1 Conv2d weight.
                weight = components[:, :, np.newaxis, np.newaxis]
                adapter_params[prefix + ".conv_wh.weight"] = torch.tensor(weight)
            else:
                # Transposed so the leading axis is the conv's output channel.
                weight = components.T[:, :, np.newaxis, np.newaxis]
                adapter_params[prefix + ".conv_rc.weight"] = torch.tensor(weight)

        self.load_state_dict(adapter_params, strict=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return a 2-D tensor with ``num_class`` columns."""
        x = self.net_bottom(x)
        x = self.adapter0(x)
        x = self.net_top(x)
        # Move the channel axis last so nn.Linear acts on it.
        x = x.permute(0, 2, 3, 4, 1)
        x = self.linear(x)
        x = x.view(-1, self.num_class)
        return x

In [35]:
# アダプタの重みパラメータに主成分分析の結果が載っているかを確認
# test_net.apply_weights()をなくすと毎回異なる重みになるがこの関数を使うと重みが一意に決まるのでアダプタの重みは問題ない(確認はupdate_param_names1)
# アダプタ以外の重みが変化していないかはupdate_param_namesにしてapply_weight()がある時のない時で変化しないことより確認

# test_net = MultiDomainNetwork(pca_dim, pca_dir)
# # test_net.apply_weights()
# update_param_names = ["net_bottom.1.res_blocks.0.branch1_conv.weight"]

# update_param_names1 = ["adapter0.conv_wh.weight"]

# for name, params in test_net.named_parameters():
#     if name in update_param_names:
#         print(name)
#         print(params)
#         print(params.shape)
    

In [36]:
# torchinfoでモデルの構造を確認

# pca_dim, pca_dir = compute_dim()
# test_net = MultiDomainNetwork(pca_dim,pca_dir)

# torchinfo.summary(
#     test_net,
#     input_size=(1,3,16,224,224),
#     depth=4,
#     col_names=["input_size",
#                "output_size"],
#     row_settings=("var_names",)
# )

### 主成分分析の結果から決めたアダプタの重みで初期化したモデルでアダプタと出力層を再学習（最終的なモデル）

In [39]:
def train_covnorm():
    """Train the PCA-initialised adapter and the linear head on UCF101.

    Builds the train/val loaders, derives the adapter dimensions with
    ``compute_dim()``, creates a ``MultiDomainNetwork`` whose adapter weights
    are loaded from the PCA files, then runs ``Args().NUM_EPOCH`` epochs of
    SGD training followed by validation. Metrics are logged to Comet and a
    checkpoint is written after every epoch via ``save_checkpoint``.

    The Comet API key is taken from the ``COMET_API_KEY`` environment
    variable instead of being stored in the notebook.
    """
    args = Args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_dataset = get_ucf101("train")
    val_dataset = get_ucf101("val")
    train_loader = make_loader(train_dataset)
    val_loader = make_loader(val_dataset)

    pca_dim, pca_dir = compute_dim()

    # Pass the PCA file mapping (not pca_dim twice) and actually load the PCA
    # components into the adapter before training starts.
    model = MultiDomainNetwork(pca_dim, pca_dir)
    model.apply_weights()
    model = model.to(device)
    torch.backends.cudnn.benchmark = True

    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=0.01,
        momentum=0.9,
        weight_decay=5e-4)
    criterion = nn.CrossEntropyLoss()

    hyper_params = {
        "Dataset": "UCF101",
        "epoch": args.NUM_EPOCH,
        "batch_size": args.BATCH_SIZE,
        "num_frame": args.VIDEO_NUM_SUBSAMPLED,
        "Adapter": "adp:0",
    }

    # Never hard-code credentials in a notebook: read the key from the
    # environment (None lets comet_ml fall back to its own configuration).
    experiment = Experiment(
        api_key=os.environ.get("COMET_API_KEY"),
        project_name="feeature-extract",
        workspace="kazukiomi",
    )

    experiment.add_tag('pytorch')
    experiment.log_parameters(hyper_params)

    num_epochs = args.NUM_EPOCH

    step = 0
    best_acc = 0

    with tqdm(range(num_epochs)) as pbar_epoch:
        for epoch in pbar_epoch:
            pbar_epoch.set_description("[Epoch %d]" % (epoch))

            # ---- Training ----
            train_loss = AverageMeter()
            train_acc = AverageMeter()

            with tqdm(enumerate(train_loader),
                      total=len(train_loader),
                      leave=True) as pbar_train_batch:

                model.train()

                for batch_idx, batch in pbar_train_batch:
                    pbar_train_batch.set_description(
                        "[Epoch :{}]".format(epoch))

                    inputs = batch['video'].to(device)
                    labels = batch['label'].to(device)

                    bs = inputs.size(0)

                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    loss.backward()
                    optimizer.step()

                    # Store a plain float so the meter does not keep tensors
                    # (and their autograd history) alive across iterations.
                    train_loss.update(loss.item(), bs)
                    train_acc.update(top1(outputs, labels), bs)

                    pbar_train_batch.set_postfix_str(
                        ' | loss_avg={:6.04f} , top1_avg={:6.04f}'
                        ' | batch_loss={:6.04f} , batch_top1={:6.04f}'
                        ''.format(
                            train_loss.avg, train_acc.avg,
                            train_loss.val, train_acc.val,
                        ))

                    experiment.log_metric(
                        "batch_accuracy", train_acc.val, step=step)
                    step += 1

            # ---- Validation ----
            model.eval()
            val_loss = AverageMeter()
            val_acc = AverageMeter()

            with torch.no_grad():
                for batch_idx, val_batch in enumerate(val_loader):
                    inputs = val_batch['video'].to(device)
                    labels = val_batch['label'].to(device)

                    bs = inputs.size(0)

                    val_outputs = model(inputs)
                    loss = criterion(val_outputs, labels)

                    val_loss.update(loss.item(), bs)
                    val_acc.update(top1(val_outputs, labels), bs)

            # ---- Checkpoint: flag the epoch with the best validation accuracy ----
            is_best = best_acc < val_acc.avg
            if is_best:
                best_acc = val_acc.avg

            save_checkpoint(model, is_best)

            # The last field is the validation accuracy (it used to print the
            # validation loss a second time).
            pbar_epoch.set_postfix_str(
                ' train_loss={:6.04f} , val_loss={:6.04f}, train_acc={:6.04f}, val_acc={:6.04f}'
                ''.format(
                    train_loss.avg,
                    val_loss.avg,
                    train_acc.avg,
                    val_acc.avg)
            )

            experiment.log_metric("epoch_train_accuracy",
                                  train_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("epoch_train_loss",
                                  train_loss.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_accuracy",
                                  val_acc.avg,
                                  step=epoch + 1)
            experiment.log_metric("val_loss",
                                  val_loss.avg,
                                  step=epoch + 1)

    # Close the Comet experiment explicitly so it is finalised in a notebook.
    experiment.end()


In [40]:
# Run the UCF101 training/validation loop defined in train_covnorm().
train_covnorm()

Using cache found in /home/omi/.cache/torch/hub/facebookresearch_pytorchvideo_main
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/kazukiomi/feeature-extract/e6ed346a2d414307878f4314bad310c3



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=596.0), HTML(value='')))