## Import

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import datetime
import logging
import random
import pickle
from copy import deepcopy
from pprint import pprint
from pathlib import Path

import pandas as pd
import numpy as np

import cv2
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.models.video as vmodels
import timm

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models
import hdf5plugin

from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from skimage.transform import resize
import matplotlib.pyplot as plt
import mmseg
# from mmseg.apis import init_segmentor

import imageio
from IPython.display import Image
import pytorchvideo
import pytorchvideo.data
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)
from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)

2023-03-12 21:52:22.299761: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-12 21:52:22.959453: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/djlee/anaconda3/envs/djlee/lib/python3.10/site-packages/cv2/../../lib64::/usr/local/cuda-12.0/lib64/
2023-03-12 21:52:22.959591: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/djlee/anaconda3/e

## Hyperparameters

In [2]:
config = {
    "model": "MViT",
    "data_path": "/home/djlee/deep/datasets/open",
    "video_length": 50, # 10프레임 * 5초
    "image_size": 224,
    "epochs": 10,
    "learning_rate": 1e-15,
    "max_lr": 1e-3,
    "weight_decay": 5e-2,
    "batch_size": 2,
    "seed": 42,
    "device": torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu'),
    "num_workers": 4,
    "accumulation_steps": 4,
    "h5_name": "car.h5",
    "methods": ["crash", "ego_involve", "weather", "timing"],
    "early_stop": 10,
}
now = datetime.datetime.now()
config["save_path"] = Path(f"results/{now}")
config["save_path"].mkdir(parents=True)

## Logging

In [3]:
# 로거 생성
logger = logging.getLogger()

# 레벨 설정 - 'INFO' 레벨부터 출력
logger.setLevel(logging.INFO)

# 출력 포매팅 설정 - 시간, 로거이름, 로깅레벨, 메세지
formatter = logging.Formatter("%(asctime)s - %(message)s")

# 스트림 핸들러 설정 - 콘솔에 출력
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# 파일 핸들러 설정 - 파일에 출력
file_handler = logging.FileHandler(config["save_path"].joinpath("result.log"))
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

logger.info(config)

2023-03-12 21:52:25,169 - {'model': 'MViT', 'data_path': '/home/djlee/deep/datasets/open', 'video_length': 50, 'image_size': 224, 'epochs': 10, 'learning_rate': 1e-15, 'max_lr': 0.001, 'weight_decay': 0.05, 'batch_size': 2, 'seed': 42, 'device': device(type='cuda'), 'num_workers': 4, 'accumulation_steps': 4, 'h5_name': 'car.h5', 'methods': ['crash', 'ego_involve', 'weather', 'timing'], 'early_stop': 10, 'save_path': PosixPath('results/2023-03-12 21:52:25.158308')}


## Fix Seed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(config["seed"])

## Load Data

In [5]:
num_frames_to_sample = 16
mean = (0.45, 0.45, 0.45)
std = (0.225, 0.225, 0.225)
resize_to = (224, 224)

# Training dataset transformations.
train_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    RandomShortSideScale(min_size=256, max_size=320),
                    RandomCrop(resize_to),
                    RandomHorizontalFlip(p=0.5),
                ]
            ),
        ),
    ]
)

# Validation and evaluation datasets' transformations.
val_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),
                    Normalize(mean, std),
                    Resize(resize_to),
                ]
            ),
        ),
    ]
)

def unnormalize_img(img):
    """Un-normalizes the image pixels."""
    img = (img * std) + mean
    img = (img * 255).astype("uint8")
    return img.clip(0, 255)

def create_gif(video_tensor, filename="sample.gif"):
    """Prepares a GIF from a video tensor.
    
    The video tensor is expected to have the following shape:
    (num_frames, num_channels, height, width).
    """
    frames = []
    for video_frame in video_tensor:
        frame_unnormalized = unnormalize_img(video_frame.permute(1, 2, 0).numpy())
        frames.append(frame_unnormalized)
    kargs = {"duration": 0.25}
    imageio.mimsave(filename, frames, "GIF", **kargs)
    return filename

def display_gif(video_tensor, gif_name="sample.gif"):
    """Prepares and displays a GIF from a video tensor."""
    video_tensor = video_tensor.permute(1, 0, 2, 3)
    print(video_tensor.shape)
    gif_filename = create_gif(video_tensor, gif_name)
    return Image(filename=gif_filename)


In [6]:
def get_df(method):
    train_df = pd.read_csv(os.path.join(config["data_path"], "train.csv"))
    
    if method == "crash":
        # 차량 충돌 여부 f1 90 이상
        train_df.loc[train_df["label"]!=0, "label"] = 1
        train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=config["seed"], stratify=train_df['label'])
    
    elif method == "crash_all":
        train_df.drop(train_df[train_df["label"]==0].index, inplace=True)
        train_df.loc[train_df["label"]==1, "label"] = 0
        train_df.loc[train_df["label"]==2, "label"] = 1
        train_df.loc[train_df["label"]==3, "label"] = 2
        train_df.loc[train_df["label"]==4, "label"] = 3
        train_df.loc[train_df["label"]==5, "label"] = 4
        train_df.loc[train_df["label"]==6, "label"] = 5
        train_df.loc[train_df["label"]==7, "label"] = 6
        train_df.loc[train_df["label"]==8, "label"] = 7
        train_df.loc[train_df["label"]==9, "label"] = 8
        train_df.loc[train_df["label"]==10, "label"] = 9
        train_df.loc[train_df["label"]==11, "label"] = 10
        train_df.loc[train_df["label"]==12, "label"] = 11
        
        train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=config["seed"], stratify=train_df['label'])
        
    elif method == "ego_involve":
        # 차량 충돌 연관 여부 f1 68 이상
        train_df.drop(train_df[train_df["label"]==0].index, inplace=True)
        train_df.loc[(train_df["label"]==1)|(train_df["label"]==2)|(train_df["label"]==3)|(train_df["label"]==4)|(train_df["label"]==5)|(train_df["label"]==6), "label"] = 0 # yes
        train_df.loc[(train_df["label"]==7)|(train_df["label"]==8)|(train_df["label"]==9)|(train_df["label"]==10)|(train_df["label"]==11)|(train_df["label"]==12), "label"] = 1 # no
        
        train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=config["seed"], stratify=train_df['label'])
        
    elif method == "weather":
        # 날씨 구분 f1 49 이상
        train_df.drop(train_df[(train_df["label"]==0)].index, inplace=True)
        
        # train_df.loc[train_df["label"]==0, "label"] = -1
        train_df.loc[(train_df["label"]==1)|(train_df["label"]==2)|(train_df["label"]==7)|(train_df["label"]==8), "label"] = 0 # normal
        train_df.loc[(train_df["label"]==3)|(train_df["label"]==4)|(train_df["label"]==9)|(train_df["label"]==10), "label"] = 1 # snowy
        train_df.loc[(train_df["label"]==5)|(train_df["label"]==6)|(train_df["label"]==11)|(train_df["label"]==12), "label"] = 2 # rainy
        # train_df.loc[train_df["label"]==-1, "label"] = 3
        # print(train_df["label"].value_counts())
        train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=config["seed"], stratify=train_df['label'])
        # val_df.drop(val_df[val_df["label"]==3].index, inplace=True)
        
    elif method == "timing":
        # 낮/밤 구분 f1 90 이상
        train_df.drop(train_df[train_df["label"]==0].index, inplace=True)
        train_df.loc[(train_df["label"]==1)|(train_df["label"]==3)|(train_df["label"]==5)|(train_df["label"]==7)|(train_df["label"]==9)|(train_df["label"]==11), "label"] = 0 # day
        train_df.loc[(train_df["label"]==2)|(train_df["label"]==4)|(train_df["label"]==6)|(train_df["label"]==8)|(train_df["label"]==10)|(train_df["label"]==12), "label"] = 1 # night
        
        train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=config["seed"], stratify=train_df['label'])
    
    elif method == "all":
        train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=config["seed"], stratify=train_df['label'])
    
    
    return train_df, val_df

def apply_transform(image):
    # Define the augmentations to apply to each video frame
    transform = A.Compose([
        A.Normalize(mean=(0.45, 0.45, 0.45), std=(0.225, 0.225, 0.225)),
        # A.Crop (x_min=0, y_min=100, x_max=1280, y_max=620),
        A.Resize(height=256, width=256, interpolation=cv2.INTER_AREA),
        A.CenterCrop(height=config["image_size"], width=config["image_size"]),
        # A.Crop (x_min=640-235, y_min=100, x_max=640+235, y_max=570), # 470
        # A.Resize(height=int(config["image_size"]/0.875), width=int(config["image_size"]/0.875), interpolation=cv2.INTER_AREA),
        # A.HorizontalFlip(p=0.5),
        # A.RandomSnow(p=1),
        # A.RandomFog(p=1),
        # A.RandomRain(p=1),
        # A.RandomGamma(p=1),
        # A.CLAHE(p=1),
        # A.RandomBrightnessContrast(p=0.2),
        ToTensorV2()
    ])
    return transform(image=image)["image"]

def create_hdf5_dataset():

    # Create the HDF5 file
    with h5py.File(config["h5_name"], 'w') as hf:
    # with h5py.File("tmp.h5", 'w') as hf:
        
        train_df = pd.read_csv(os.path.join(config["data_path"], "train.csv"))
        video_group = hf.create_group("videos")
        video_shape = (3, config["video_length"], config["image_size"], config["image_size"])
        
        for idx, (sample_id, path, label) in enumerate(tqdm(train_df[["sample_id", "video_path", "label"]].values)):
            path = os.path.join(config["data_path"], path[2:])
            video = []
            cap = cv2.VideoCapture(path)
            for i in range(config["video_length"]):
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = apply_transform(frame)
                # plt.imshow(frame.moveaxis(0, -1))
                # plt.show()
                # break
                video.append(frame)
            video = np.stack(video).transpose(1, 0, 2, 3) # (channels, frames, height, width)
            video_group.create_dataset(sample_id, data=video, dtype=np.float16)#, **hdf5plugin.Blosc2(cname='blosclz', clevel=5))
        
        for method in config["methods"]:
            train_df, val_df = get_df(method=method)
            group = hf.create_group(method)
            
            for phase, df in [("train", train_df), ("val", val_df)]:
                labels_dataset = group.create_dataset(f"{phase}/labels", shape=(len(df),), dtype=np.uint8)
                filenames_dataset = group.create_dataset(f"{phase}/filenames", shape=(len(df),), dtype=h5py.special_dtype(vlen=str))
                
                for idx, (sample_id, label) in enumerate(tqdm(df[["sample_id", "label"]].values)):
                    filenames_dataset[idx] = sample_id
                    labels_dataset[idx] = int(label)

# create_hdf5_dataset()
# get_df("weather")

## Visualize

In [7]:
# with h5py.File(config["h5_name"], 'r') as hf:
#     image = hf["videos/TRAIN_0000"][:][:, 0]
#     print(image.shape)
#     print(image.transpose(1, 2, 0).shape)
#     print(image.dtype)
    
#     import matplotlib.pyplot as plt

#     plt.imshow(image.transpose(1, 2, 0).astype(np.float32))
#     plt.show()
# z

## Class

>- ### CustomDataset

In [8]:
class CustomDataset(Dataset):
    def __init__(self, hdf5_file, method, phase):
        self.hdf5_file = h5py.File(hdf5_file, 'r')
        self.length = self.hdf5_file[f"{method}/{phase}/labels"].shape[0]
        self.method = method
        self.phase = phase

    def __getitem__(self, index):
        label = self.hdf5_file[f"{self.method}/{self.phase}/labels"][index]
        filename = self.hdf5_file[f"{self.method}/{self.phase}/filenames"][index].decode('utf-8')
        video = self.hdf5_file[f"videos/{filename}"][:]
        
#         # Initialize an EncodedVideo helper class
#         video = EncodedVideo.from_path(Path(config["data_path"], "train", f"{filename}.mp4"))

#         # Load the desired clip
#         video = video.get_clip(start_sec=0, end_sec=5)
        
#         # Apply a transform to normalize the video input
#         if self.phase == "train":
#             video = train_transform(video)
#         else:
#             video = val_transform(video)

#         # Move the inputs to the desired device
#         video = video["video"]
        return video, label, filename

    def __len__(self):
        return self.length
    
    def _apply_transform(self, image, r):
        transform_list = [
            A.Resize(height=256, width=256, interpolation=cv2.INTER_AREA),
            A.CenterCrop(height=config["image_size"], width=config["image_size"]),
        ]
        if r < 0.5:
            transform_list.append(A.RandomSnow(p=1))
        else:
            transform_list.append(A.RandomRain(p=1))
        transform_list.append(A.Normalize(mean=(0.45, 0.45, 0.45), std=(0.225, 0.225, 0.225)))
        transform_list.append(ToTensorV2())
        transform = A.Compose(transform_list)
        return transform(image=image)["image"]

In [9]:
class TestDataset(Dataset):
    def __init__(self, video_path_list, label_list, filename_list=None):
        self.video_path_list = video_path_list
        self.label_list = label_list
        self.filename_list = filename_list
        
    def __getitem__(self, index):
        frames = self.get_video(self.video_path_list[index])
        
        if self.label_list is not None:
            label = self.label_list[index]
            filename = self.filename_list[index]
            return frames, label, filename
        else:
            return frames
        
    def __len__(self):
        return len(self.video_path_list)
    
    def get_video(self, path):
        path = os.path.join(config["data_path"], path[2:])
#         # Initialize an EncodedVideo helper class
#         video = EncodedVideo.from_path(path)

#         # Load the desired clip
#         video = video.get_clip(start_sec=0, end_sec=5)
        
#         # Apply a transform to normalize the video input
#         video = val_transform(video)

#         # Move the inputs to the desired device
#         video = video["video"]
        video = []
        cap = cv2.VideoCapture(path)
        
        for i in range(config["video_length"]):
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = apply_transform(frame)
            video.append(frame)
        video = np.stack(video).transpose(1, 0, 2, 3) # (channels, frames, height, width)
        return video

>- ### Network

In [10]:
class ResNet(nn.Module):
    def __init__(self, num_classes=13):
        super(ResNet, self).__init__()
        self.backbone = vmodels.r3d_18(weights=vmodels.R3D_18_Weights.DEFAULT)
        self.backbone.fc = nn.Linear(in_features=512, out_features=num_classes)
        # self.classifier = nn.Linear(in_features=400, out_features=num_classes)
        
    def forward(self, x):
        x = self.backbone(x)
        return x

In [11]:
class S3D(nn.Module):
    def __init__(self, num_classes=2):
        super(S3D, self).__init__()
        self.backbone = vmodels.slowfast_4x16(num_classes=num_classes)
        
    def forward(self, x):
        x = self.backbone(x)
        return x

In [12]:
# class MViT(nn.Module):
#     def __init__(self, num_classes=2):
#         super(MViT, self).__init__()
#         self.backbone = vmodels.mvit_v2_s(weights=vmodels.MViT_V2_S_Weights.DEFAULT)
#         self.backbone.head = nn.Sequential(
#             nn.Dropout(p=0.5, inplace=True),
#             nn.Linear(in_features=768, out_features=num_classes, bias=True)
#         )
        
#     def forward(self, x):
#         # x = torch.cat([x[:, :, 0:1], x[:, :, 3:4], x[:, :, 6:7], x[:, :, 9:10], x[:, :, 12:13], x[:, :, 15:16], x[:, :, 18:19], x[:, :, 21:22], x[:, :, 24:25], x[:, :, 27:28], x[:, :, 30:31], x[:, :, 33:34], x[:, :, 36:37], x[:, :, 39:40], x[:, :, 42:43], x[:, :, -1:]], dim=2)
#         # x = x[:, :, -16:]
#         x = self.backbone(x)
#         return x

In [13]:
class MViT(nn.Module):
    def __init__(self, num_classes=2):
        super(MViT, self).__init__()
        self.num_experts = 3
        
        self.backbone = vmodels.mvit_v2_s(weights=vmodels.MViT_V2_S_Weights.DEFAULT)
        # for i, (name, params) in enumerate(self.backbone.named_parameters()):
        #     if i < 160:
        #         params.requires_grad = False
            # print(i, name, params.requires_grad)
        
        blocks = nn.ModuleList([self.backbone.blocks.pop(-1) for _ in range(4)][::-1])
        self.last_block = nn.ModuleList([blocks for _ in range(self.num_experts)])
        self.norm = nn.ModuleList([self.backbone.norm for _ in range(self.num_experts)])
        self.head = nn.ModuleList([nn.Sequential(
            nn.Dropout(p=0.5, inplace=True),
            nn.Linear(in_features=768, out_features=num_classes, bias=True)
        ) for _ in range(self.num_experts)])
        
        
        # self.expert_block = nn.ModuleList([
        #     nn.Sequential(
        #         last_block,
        #         self.backbone.norm,
        #         nn.Dropout(p=0.5, inplace=True),
        #         nn.Linear(in_features=768, out_features=num_classes, bias=True)
        #     )
        # for _ in range(self.num_experts)])
        # print(self.backbone)
        del self.backbone.norm
        del self.backbone.head
        
        # self.backbone.head = nn.Sequential(
        #     nn.Dropout(p=0.5, inplace=True),
        #     nn.Linear(in_features=768, out_features=num_classes, bias=True)
        # )
        
    def forward(self, x):
        x = torch.cat([x[:, :, 0:1], x[:, :, 3:4], x[:, :, 6:7], x[:, :, 9:10], x[:, :, 12:13], x[:, :, 15:16], x[:, :, 18:19], x[:, :, 21:22], x[:, :, 24:25], x[:, :, 27:28], x[:, :, 30:31], x[:, :, 33:34], x[:, :, 36:37], x[:, :, 39:40], x[:, :, 42:43], x[:, :, -1:]], dim=2)
        # x = x[:, :, -16:]
        # x = self.backbone(x)
        
        # Convert if necessary (B, C, H, W) -> (B, C, 1, H, W)
        x = self._unsqueeze(x, 5, 2)[0]
        # patchify and reshape: (B, C, T, H, W) -> (B, embed_channels[0], T', H', W') -> (B, THW', embed_channels[0])
        x = self.backbone.conv_proj(x)
        x = x.flatten(2).transpose(1, 2)

        # add positional encoding
        x = self.backbone.pos_encoding(x)

        # pass patches through the encoder
        thw = (self.backbone.pos_encoding.temporal_size,) + self.backbone.pos_encoding.spatial_size
        for block in self.backbone.blocks:
            x, thw = block(x, thw)
        
        outs = []
        for idx in range(self.num_experts):
            outs.append(self._separate_part(x, thw, idx))
        
        final_out = torch.stack(outs, dim=1).mean(dim=1)
        return {
            "output": final_out, 
            "logits": torch.stack(outs, dim=1)
        }
    
    def _separate_part(self, x, thw, idx):
        for block in self.last_block[idx]:
            x, thw = block(x, thw)
        
        x = self.norm[idx](x)

        # classifier "token" as used by standard language architectures
        x = x[:, 0]
        x = self.head[idx](x)
        return x
    
    def _unsqueeze(self, x: torch.Tensor, target_dim: int, expand_dim: int):
        tensor_dim = x.dim()
        if tensor_dim == target_dim - 1:
            x = x.unsqueeze(expand_dim)
        elif tensor_dim != target_dim:
            raise ValueError(f"Unsupported input dimension {x.shape}")
        return x, tensor_dim
    
# MViT(2)(torch.randn(4, 3, 50, 224, 224))
# z

In [14]:
class Swin3d(nn.Module):
    def __init__(self, num_classes=2):
        super(Swin3d, self).__init__()
        self.backbone = vmodels.swin3d_s(weights=vmodels.Swin3D_S_Weights.DEFAULT)
        self.backbone.head = nn.Linear(in_features=768, out_features=num_classes, bias=True)
        
    def forward(self, x):
        # x = x[:, :, -16:]
        x = torch.cat([x[:, :, 0:1], x[:, :, 3:4], x[:, :, 6:7], x[:, :, 9:10], x[:, :, 12:13], x[:, :, 15:16], x[:, :, 18:19], x[:, :, 21:22], x[:, :, 24:25], x[:, :, 27:28], x[:, :, 30:31], x[:, :, 33:34], x[:, :, 36:37], x[:, :, 39:40], x[:, :, 42:43], x[:, :, -1:]], dim=2)
        x = self.backbone(x)
        return x

In [15]:
class HRNet(nn.Module):
    def __init__(self, num_classes=2):
        super(HRNet, self).__init__()
        config_file = "fcn_hr18s_512x1024_40k_cityscapes.py"
        checkpoint = "fcn_hr18s_512x1024_40k_cityscapes_20200601_014216-93db27d0.pth"
        self.backbone = init_segmentor(config_file, checkpoint=checkpoint, device=config["device"]).backbone
        self.classifier = nn.Linear(in_features=144, out_features=num_classes)
        
    def forward(self, x):
        x = x[:, :, 0]
        x = self.backbone.forward(x)
        x = F.adaptive_avg_pool2d(x[-1], 1)
        x = x.flatten(1)
        x = self.classifier(x)
        return x

In [16]:
class WeatherNet(nn.Module):
    def __init__(self, backbone, num_classes=2):
        super(WeatherNet, self).__init__()
        self.backbone = model = timm.create_model(backbone, num_classes=num_classes, pretrained=True) # drop_path_rate=args.drop_path,
        
    def forward(self, x):
        x = x[:, :, 0]
        x = self.backbone(x)
        return x

In [17]:
"""
Inspired by positional_encoding in [pytorchvideo](https://github.com/facebookresearch/pytorchvideo/blob/f7e7a88a9a04b70cb65a564acfc38538fe71ff7b/pytorchvideo/layers/positional_encoding.py).
Convert to pytorch version.
"""

from typing import Tuple
import torch


def get_3d_sincos_pos_embed(embed_dim: int,
                            tube_shape: Tuple[int, int, int],
                            stride,
                            offset,
                            kernel_size,
                            cls_token: bool = False) -> torch.Tensor:
    """
    Get 3D sine-cosine positional embedding.
    Args:
        tube_shape: (t_size, grid_h_size, grid_w_size)
        kernel_size:
        offset:
        stride:
        embed_dim:
        cls_token: bool, whether to contain CLS token
    Returns:
        (torch.Tensor): [t_size*grid_size*grid_size, embed_dim] or [1+t_size*grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    assert embed_dim % 4 == 0
    embed_dim_spatial = embed_dim // 3 * 2
    embed_dim_temporal = embed_dim // 3

    # spatial
    grid_h_size = tube_shape[1]
    grid_h = torch.arange(grid_h_size, dtype=torch.float)
    grid_h = grid_h * stride[1] + offset[1] + kernel_size[1] // 2

    grid_w_size = tube_shape[2]
    grid_w = torch.arange(tube_shape[2], dtype=torch.float)
    grid_w = grid_w * stride[2] + offset[2] + kernel_size[2] // 2
    grid = torch.meshgrid(grid_w, grid_h, indexing='ij')
    grid = torch.stack(grid, dim=0)

    grid = grid.reshape([2, 1, grid_h_size, grid_w_size])
    pos_embed_spatial = get_2d_sincos_pos_embed_from_grid(embed_dim_spatial, grid)

    # temporal
    t_size = tube_shape[0]
    grid_t = torch.arange(t_size, dtype=torch.float)
    grid_t = grid_t * stride[0] + offset[0] + kernel_size[0] // 2
    pos_embed_temporal = get_1d_sincos_pos_embed_from_grid(embed_dim_temporal, grid_t)

    pos_embed_temporal = pos_embed_temporal[:, None, :]
    pos_embed_temporal = torch.repeat_interleave(pos_embed_temporal, grid_h_size * grid_w_size, dim=1)
    pos_embed_spatial = pos_embed_spatial[None, :, :]
    pos_embed_spatial = torch.repeat_interleave(pos_embed_spatial, t_size, dim=0)

    pos_embed = torch.cat([pos_embed_temporal, pos_embed_spatial], dim=-1)
    pos_embed = pos_embed.reshape([-1, embed_dim])

    if cls_token:
        pos_embed = torch.cat([torch.zeros([1, embed_dim]), pos_embed], dim=0)
    return pos_embed


def get_2d_sincos_pos_embed(embed_dim: int, grid_size: int, cls_token: bool = False) -> torch.Tensor:
    """
    Get 2D sine-cosine positional embedding.
    Args:
        grid_size: int of the grid height and width
        cls_token: bool, whether to contain CLS token
    Returns:
        (torch.Tensor): [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    grid_h = torch.arange(grid_size, dtype=torch.float)
    grid_w = torch.arange(grid_size, dtype=torch.float)
    grid = torch.meshgrid(grid_w, grid_h, indexing='ij')
    grid = torch.stack(grid, dim=0)

    grid = grid.reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = torch.cat([torch.zeros([1, embed_dim]), pos_embed], dim=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim: int, grid: torch.Tensor) -> torch.Tensor:
    """
    Get 2D sine-cosine positional embedding from grid.
    Args:
        embed_dim: embedding dimension.
        grid: positions
    Returns:
        (torch.Tensor): [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    assert embed_dim % 2 == 0

    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])

    emb = torch.cat([emb_h, emb_w], dim=1)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim: int, pos: torch.Tensor) -> torch.Tensor:
    """
    Get 1D sine-cosine positional embedding.
    Args:
        embed_dim: output dimension for each position
        pos: a list of positions to be encoded: size (M,)
    Returns:
        (torch.Tensor): tensor of shape (M, D)
    """
    assert embed_dim % 2 == 0
    omega = torch.arange(embed_dim // 2, dtype=torch.float)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000 ** omega

    pos = pos.reshape(-1)
    out = torch.einsum("m,d->md", pos, omega)

    emb_sin = torch.sin(out)
    emb_cos = torch.cos(out)

    emb = torch.cat([emb_sin, emb_cos], dim=1)
    return emb

In [18]:
from functools import partial
from typing import Callable, Any
from typing import List, Union

import numpy as np
import torch
from torch import nn, Tensor, optim
from torch.nn import functional as F
from torchvision.models.vision_transformer import EncoderBlock
from typing_extensions import OrderedDict

class Encoder(nn.Module):
    """
    Transformer Model Encoder for sequence to sequence translation.
    Code from torch.
    Move pos_embedding to TubeViT
    """
    def __init__(
            self,
            num_layers: int,
            num_heads: int,
            hidden_dim: int,
            mlp_dim: int,
            dropout: float,
            attention_dropout: float,
            norm_layer: Callable[..., nn.Module] = partial(nn.LayerNorm, eps=1e-6),
    ):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        layers: OrderedDict[str, nn.Module] = OrderedDict()
        for i in range(num_layers):
            layers[f"encoder_layer_{i}"] = EncoderBlock(
                num_heads,
                hidden_dim,
                mlp_dim,
                dropout,
                attention_dropout,
                norm_layer,
            )
        self.layers = nn.Sequential(layers)
        self.ln = norm_layer(hidden_dim)

    def forward(self, x: Tensor):
        torch._assert(x.dim() == 3, f"Expected (batch_size, seq_length, hidden_dim) got {x.shape}")
        return self.ln(self.layers(self.dropout(x)))


class SparseTubesTokenizer(nn.Module):
    def __init__(self, hidden_dim, kernel_sizes, strides, offsets):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.kernel_sizes = kernel_sizes
        self.strides = strides
        self.offsets = offsets

        self.conv_proj_weight = nn.Parameter(torch.empty((self.hidden_dim, 3, *self.kernel_sizes[0])).normal_(),
                                             requires_grad=True)

        self.register_parameter('conv_proj_weight', self.conv_proj_weight)

        self.conv_proj_bias = nn.Parameter(torch.zeros(len(self.kernel_sizes), self.hidden_dim), requires_grad=True)
        self.register_parameter('conv_proj_bias', self.conv_proj_bias)

    def forward(self, x: Tensor) -> Tensor:
        n, c, t, h, w = x.shape  # CTHW
        tubes = []
        for i in range(len(self.kernel_sizes)):
            if i == 0:
                weight = self.conv_proj_weight
            else:
                weight = F.interpolate(self.conv_proj_weight, self.kernel_sizes[i], mode='trilinear')

            tube = F.conv3d(
                x[:, :, self.offsets[i][0]:, self.offsets[i][1]:, self.offsets[i][2]:],
                weight,
                bias=self.conv_proj_bias[i],
                stride=self.strides[i],
            )

            tube = tube.reshape((n, self.hidden_dim, -1))

            tubes.append(tube)

        x = torch.cat(tubes, dim=-1)
        x = x.permute(0, 2, 1).contiguous()
        return x


class TubeViT(nn.Module):
    def __init__(
        self,
        num_classes: int,
        video_shape: Union[List[int], np.ndarray],  # CTHW
        num_layers: int,
        num_heads: int,
        hidden_dim: int,
        mlp_dim: int,
        dropout: float = 0.0,
        attention_dropout: float = 0.0,
        representation_size=None,
    ):
        super(TubeViT, self).__init__()
        self.video_shape = np.array(video_shape)  # CTHW
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.kernel_sizes = (
            (8, 8, 8),
            (16, 4, 4),
            (4, 12, 12),
            (1, 16, 16),
        )

        self.strides = (
            (16, 32, 32),
            (6, 32, 32),
            (16, 32, 32),
            (32, 16, 16),
        )

        self.offsets = (
            (0, 0, 0),
            (4, 8, 8),
            (0, 16, 16),
            (0, 0, 0),
        )
        self.sparse_tubes_tokenizer = SparseTubesTokenizer(self.hidden_dim, self.kernel_sizes, self.strides,
                                                           self.offsets)

        self.pos_embedding = self._generate_position_embedding()
        self.pos_embedding = torch.nn.Parameter(self.pos_embedding, requires_grad=False)
        self.register_parameter('pos_embedding', self.pos_embedding)

        # Add a class token
        self.class_token = nn.Parameter(torch.zeros(1, 1, self.hidden_dim), requires_grad=True)
        self.register_parameter('class_token', self.class_token)

        self.encoder = Encoder(
            num_layers=num_layers,
            num_heads=num_heads,
            hidden_dim=self.hidden_dim,
            mlp_dim=mlp_dim,
            dropout=dropout,
            attention_dropout=attention_dropout,
        )

        heads_layers: OrderedDict[str, nn.Module] = OrderedDict()
        if representation_size is None:
            heads_layers["head"] = nn.Linear(self.hidden_dim, self.num_classes)
        else:
            heads_layers["pre_logits"] = nn.Linear(self.hidden_dim, representation_size)
            heads_layers["act"] = nn.Tanh()
            heads_layers["head"] = nn.Linear(representation_size, self.num_classes)

        self.heads = nn.Sequential(heads_layers)

    def forward(self, x):
        x = self.sparse_tubes_tokenizer(x)
        n = x.shape[0]

        # Expand the class token to the full batch
        batch_class_token = self.class_token.expand(n, -1, -1)
        x = torch.cat([batch_class_token, x], dim=1)

        x = x + self.pos_embedding

        x = self.encoder(x)

        # Classifier "token" as used by standard language architectures
        x = x[:, 0]

        x = self.heads(x)

        return x

    def _calc_conv_shape(self, kernel_size, stride, offset) -> np.ndarray:
        kernel_size = np.array(kernel_size)
        stride = np.array(stride)
        offset = np.array(offset)
        output = np.ceil((self.video_shape[[1, 2, 3]] - offset - kernel_size + 1) / stride).astype(int)
        return output

    def _generate_position_embedding(self) -> torch.nn.Parameter:
        position_embedding = [torch.zeros(1, self.hidden_dim)]

        for i in range(len(self.kernel_sizes)):
            tube_shape = self._calc_conv_shape(self.kernel_sizes[i], self.strides[i], self.offsets[i])
            pos_embed = get_3d_sincos_pos_embed(
                embed_dim=self.hidden_dim,
                tube_shape=tube_shape,
                kernel_size=self.kernel_sizes[i],
                stride=self.strides[i],
                offset=self.offsets[i],
            )
            position_embedding.append(pos_embed)

        position_embedding = torch.cat(position_embedding, dim=0).contiguous()
        return position_embedding

In [19]:
# class HRNet(nn.Module):
#     def __init__(self, num_classes=2):
#         super(HRNet, self).__init__()
#         self.backbone = timm.create_model('hrnet_w64', pretrained=True, num_classes=num_classes)
        
#     def forward(self, x):
#         x = self.backbone(x)
#         return x

In [20]:
# from mmseg.models import HRNet as mmhr
# class HRNet(nn.Module):
#     def __init__(self, num_classes=2):
#         super(HRNet, self).__init__()
#         extra = dict(
#             stage1=dict(
#                 num_modules=1,
#                 num_branches=1,
#                 block='BOTTLENECK',
#                 num_blocks=(4, ),
#                 num_channels=(64, )),
#             stage2=dict(
#                 num_modules=1,
#                 num_branches=2,
#                 block='BASIC',
#                 num_blocks=(4, 4),
#                 num_channels=(32, 64)),
#             stage3=dict(
#                 num_modules=4,
#                 num_branches=3,
#                 block='BASIC',
#                 num_blocks=(4, 4, 4),
#                 num_channels=(32, 64, 128)),
#             stage4=dict(
#                 num_modules=3,
#                 num_branches=4,
#                 block='BASIC',
#                 num_blocks=(4, 4, 4, 4),
#                 num_channels=(32, 64, 128, 256)))
#         self.backbone = mmhr(extra, in_channels=3, pretrained="fcn_hr18s_512x1024_40k_cityscapes_20200601_014216-93db27d0.pth")
#         self.classifier = nn.Linear(in_features=256, out_features=num_classes)
        
#     def forward(self, x):
#         x = self.backbone(x)
#         x = F.adaptive_avg_pool2d(x[-1], 1).flatten(1)
#         x = self.classifier(x)
#         return x

>- ### Focal Loss

In [21]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
    
    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1-pt)**self.gamma * ce_loss
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

>- ### Diverse Expert Loss

In [22]:
class DiverseExpertLoss(nn.Module):
    def __init__(self, method, cls_num_list=None, max_m=0.5, s=30, tau=2):
        super().__init__()
        self.base_loss = F.cross_entropy 
        if method == "crash":
            cls_num_list = [0] * 2
            cls_num_list[0] = 1783
            cls_num_list[1] = 915
            
        elif method == "ego_involve":
            cls_num_list = [0] * 2
            cls_num_list[0] = 491
            cls_num_list[1] = 424
            
        elif method == "weather":
            cls_num_list = [0] * 3
            cls_num_list[0] = 716
            cls_num_list[1] = 129
            cls_num_list[2] = 70
            
        elif method == "timing":
            cls_num_list = [0] * 2
            cls_num_list[0] = 808
            cls_num_list[1] = 107
            
        prior = np.array(cls_num_list) / np.sum(cls_num_list)
        self.prior = torch.tensor(prior).float().cuda()
        self.C_number = len(cls_num_list)  # class number
        self.s = s
        self.tau = tau 

    def inverse_prior(self, prior): 
        value, idx0 = torch.sort(prior)
        _, idx1 = torch.sort(idx0)
        idx2 = prior.shape[0]-1-idx1 # reverse the order
        inverse_prior = value.index_select(0,idx2)
        
        return inverse_prior

    def forward(self, output_logits, target, extra_info=None):
        if extra_info is None:
            return self.base_loss(extra_info['output'], target)  # output_logits indicates the final prediction

        loss = 0
        
        # Obtain logits from each expert  
        expert1_logits = extra_info['logits'][:, 0]
        expert2_logits = extra_info['logits'][:, 1] 
        expert3_logits = extra_info['logits'][:, 2]  
 
        # Softmax loss for expert 1 
        loss += self.base_loss(expert1_logits, target)
        
        # Balanced Softmax loss for expert 2 
        expert2_logits = expert2_logits + torch.log(self.prior + 1e-9) 
        loss += self.base_loss(expert2_logits, target)
        
        # Inverse Softmax loss for expert 3
        inverse_prior = self.inverse_prior(self.prior)
        expert3_logits = expert3_logits + torch.log(self.prior + 1e-9) - self.tau * torch.log(inverse_prior+ 1e-9) 
        loss += self.base_loss(expert3_logits, target)
   
        return loss

## Functions

>- ### collate_fn

In [23]:
def collate_fn(batch):
    data = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    filenames = [item[2] for item in batch]
    print(type(labels[0]))
    print(type(data[0]))
    print(type(filenames[0]))
    return torch.stack(data), torch.tensor(labels), filenames

>- ### Train

In [24]:
def train(model, optimizer, train_loader, val_loader, scheduler, device, method):
    model.to(device)
    # criterion = nn.CrossEntropyLoss(label_smoothing=0.1).to(device)
    # criterion = FocalLoss().to(device)
    criterion = DiverseExpertLoss(method=method).to(device)
    
    best_epoch = 0
    best_val_score = 0
    best_model = None
    early_stop = 0
    
    scaler = torch.cuda.amp.GradScaler()
    for epoch in range(1, config["epochs"]+1):
        model.train()
        train_loss = []
        preds = []
        trues = []
        
        for i, batch in enumerate(tqdm(iter(train_loader))):
            # optimizer.zero_grad()
            
            videos = batch[0].to(device=device, dtype=torch.float32, non_blocking=True)
            labels = batch[1].to(device=device, dtype=torch.long, non_blocking=True)
            with torch.cuda.amp.autocast():
                output = model(videos)
                loss = criterion(output, labels, extra_info=output)
                loss = loss / config["accumulation_steps"]
            
            scaler.scale(loss).backward()
            if (i+1) % config["accumulation_steps"] == 0 or i + 1 == len(train_loader):
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
            
            train_loss.append(loss.item())
            preds += output['output'].argmax(1).detach().cpu().numpy().tolist()
            trues += labels.detach().cpu().numpy().tolist()
            
            if scheduler is not None:
                scheduler.step()
                    
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        train_f1_score = f1_score(trues, preds, average='macro')
        # if scheduler is not None:
            # scheduler.step(_val_score)
            
        if best_val_score < _val_score:
            best_epoch = epoch
            best_val_score = _val_score
            best_model = deepcopy(model)
            early_stop = 0
        
        logger.info(f'Method: [{method}] Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Train F1: [{train_f1_score:.5f}] Val Loss : [{_val_loss:.5f}] Val F1 : [{_val_score:.5f}] Best F1 : [{best_val_score}] Best Epoch : [{best_epoch}]')
        
        if best_val_score == 1.0: break
        
        early_stop += 1
        if early_stop > config["early_stop"]: break
    
    with open(f"{config['save_path']}/{config['model']}_{method}.pkl","wb") as f:
        pickle.dump(best_model, f)

>- ### Validation

In [25]:
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss = []
    preds, trues = [], []
    
    with torch.no_grad():
        for batch in tqdm(iter(val_loader)):
            videos = batch[0].to(device=device, dtype=torch.float32, non_blocking=True)
            labels = batch[1].to(device=device, dtype=torch.long, non_blocking=True)
            
            with torch.cuda.amp.autocast():
                logit = model(videos)
                loss = criterion(logit, labels, logit)
            
            val_loss.append(loss.item())
            preds += logit['output'].argmax(1).detach().cpu().numpy().tolist()
            trues += labels.detach().cpu().numpy().tolist()
        
        _val_loss = np.mean(val_loss)
    
    _val_score = f1_score(trues, preds, average='macro')
    return _val_loss, _val_score

>- ### Run

In [26]:
def run(method, backbone):
    num_class = {"crash": 2, "ego_involve": 2, "weather": 3, "timing": 2, "all": 13}
    if method == "crash":
        # model = TubeViT(num_classes=num_class[method], video_shape=[3, 50, 224, 224], num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072)
        model = MViT(num_classes=num_class[method])
        # config["model"] == "MViT"
    elif method == "ego_involve":
        # model = TubeViT(num_classes=num_class[method], video_shape=[3, 50, 224, 224], num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072)
        model = MViT(num_classes=num_class[method])
        # config["model"] == "MViT"
    elif method == "weather":
        # model = TubeViT(num_classes=num_class[method], video_shape=[3, 50, 224, 224], num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072)
        model = MViT(num_classes=num_class[method])
        # model = WeatherNet(backbone="deit_base_distilled_patch16_224", num_classes=num_class[method])
        # config["model"] == "WeatherNet"
    elif method == "timing":
        # model = TubeViT(num_classes=num_class[method], video_shape=[3, 50, 224, 224], num_layers=12, num_heads=12, hidden_dim=768, mlp_dim=3072)
        model = MViT(num_classes=num_class[method])
        # model = WeatherNet(backbone="deit_base_distilled_patch16_224", num_classes=num_class[method])
        # model = MViT(num_classes=num_class[method])
    #     config["model"] == "MViT"
    # if config["model"] == "S3D":
    #     model = S3D(num_classes=num_class[method])
    # elif config["model"] == "ResNet":
    #     model = ResNet(num_classes=num_class[method])
    # elif config["model"] == "MViT":
    #     model = MViT(num_classes=num_class[method])
    # elif config["model"] == "HRNet":
    #     model = HRNet(num_classes=num_class[method])
    # elif config["model"] == "Swin3d":
    #     model = Swin3d(num_classes=num_class[method])
    # else:
    #     model = WeatherNet(num_classes=num_class[method])
    model.eval()
    
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-8, verbose=True)
    
    train_df, val_df = get_df(method=method)
    train_dataset = CustomDataset(config["h5_name"], method=method, phase="train")
    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, num_workers=config["num_workers"])

    val_dataset = CustomDataset(config["h5_name"], method=method, phase="val")
    val_loader = DataLoader(val_dataset, batch_size = config["batch_size"], shuffle=False, num_workers=config["num_workers"])
    
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"])
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        max_lr=config["max_lr"],
        steps_per_epoch=len(train_loader),
        epochs=config["epochs"],
    )
    
    train(model, optimizer, train_loader, val_loader, scheduler, config["device"], method)

>- ### Evaluate

In [27]:
def evaluate(model_dict):
    test = pd.read_csv(os.path.join(config["data_path"], "train.csv"))
    test_dataset = TestDataset(test['video_path'].values, test['label'].values, test['sample_id'].values)
    test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False, num_workers=0)
    
    for value in model_dict.values():
        value.eval()
        
    preds = []
    targets = []
    filenames = []
    with torch.no_grad():
        for batch in tqdm(iter(test_loader)):
            videos = batch[0].to(device=config["device"], dtype=torch.float32, non_blocking=True)
            targets.extend(batch[1].numpy())
            filenames.extend(batch[2])
            
            with torch.cuda.amp.autocast():
                crash_pred = model_dict["crash"](videos).argmax(1)
                crash_all_pred = model_dict["crash_all"](videos).argmax(1)
                # ego_involve_pred = model_dict["ego_involve"](videos).argmax(1)
                # weather_pred = model_dict["weather"](videos).argmax(1)
                # timing_pred = model_dict["timing"](videos).argmax(1)
            
            for c, a in zip(crash_pred, crash_all_pred):
                if c == 0:
                    preds.append(0)
                else:
                    preds.append(a)
            # for c, e, w, t in zip(crash_pred, ego_involve_pred, weather_pred, timing_pred):
            #     if c == 0:
            #         preds.append(0)
            #     else:
            #         if e == 0 and w == 0 and t == 0:
            #             preds.append(1)
            #         elif e == 0 and w == 0 and t == 1:
            #             preds.append(2)
            #         elif e == 0 and w == 1 and t == 0:
            #             preds.append(3)
            #         elif e == 0 and w == 1 and t == 1:
            #             preds.append(4)
            #         elif e == 0 and w == 2 and t == 0:
            #             preds.append(5)
            #         elif e == 0 and w == 2 and t == 1:
            #             preds.append(6)
            #         elif e == 1 and w == 0 and t == 0:
            #             preds.append(7)
            #         elif e == 1 and w == 0 and t == 1:
            #             preds.append(8)
            #         elif e == 1 and w == 1 and t == 0:
            #             preds.append(9)
            #         elif e == 1 and w == 1 and t == 1:
            #             preds.append(10)
            #         elif e == 1 and w == 2 and t == 0:
            #             preds.append(11)
            #         elif e == 1 and w == 2 and t == 1:
            #             preds.append(12)

    return preds, targets, filenames

>- ### Predict

In [28]:
def predict(model, test_loader, device):
    model.to(device)
    model.eval()
    preds = []
    with torch.no_grad():
        for videos in tqdm(iter(test_loader)):
            videos = videos.to(device)
            videos = videos[:, :, -16:]
            
            logit = model(videos)

            preds += logit.argmax(1).detach().cpu().numpy().tolist()
    return preds

>- ### Run Predict

In [29]:
def run_predict():
    test = pd.read_csv('./test.csv')
    test_dataset = TestDataset(test['video_path'].values, None)
    test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], shuffle=False, num_workers=0)
    preds = inference(model, test_loader, device)

>- ### Submission

In [30]:
def submit(preds):
    submit = pd.read_csv(os.path.join(config["data_path"], "sample_submission.csv"))
    submit['label'] = preds
    submit.head()
    submit.to_csv(f'{config["save_path"]}/baseline_submit.csv', index=False)

## Run

In [None]:
# "swinv2_base_window8_256": 0.6630420066708513
# "regnetz_040": 0.7489193005322038
# "regnety_040": 0.7055432963288931
# "regnetv_040": 0.7563289952368452
# 'resnetv2_50': 0.6496801796077435,
# 'tresnet_m': 0.7588267287207852,
# 'hrnet_w64': 0.6522576577751691,
# 'wide_resnet50_2': 0.6562548562548562
# "tf_efficientnetv2_s_in21ft1k": 0.712865681031771
# mobilenetv2_140': 0.7344136123064748,
#  'deit3_small_patch16_224_in21ft1k': 0.6744444444444445,
#  'jx_nest_small': 0.6763762242485646,
#  'twins_svt_base': 0.7335441231363767,
#  'swsl_resnext101_32x4d': 0.7493798145972059,
#  'swin_small_patch4_window7_224': 0.6703382689298182,
#  'tf_efficientnet_b4_ap': 0.6841474826876287,
#  'xcit_small_12_p8_224': 0.7360193556828488,
# 'deit_base_distilled_patch16_224': 0.774743351886209,
# "deit3_base_patch16_224": 0.7636604774535809
# 'deit_tiny_patch16_224': 0.7077855477855478,
# 'deit_tiny_distilled_patch16_224': 0.6552881502579692,
# 'deit_small_patch16_224': 0.7524221632244941,
# 'deit3_small_patch16_224': 0.6744444444444445,
# 'deit_base_patch16_224': 0.6538752253037967,
# 'deit3_small_patch16_224_in21ft1k': 0.7077171954052939
#  'cait_s24_224': 0.688900438900439,
#  'resmlp_big_24_distilled_224': 0.29243353783231085,
#  'resnet152d': 0.6889550264550265
# "beit_base_patch16_224": 0.4883536768520795
# "convnext_small_in22ft1k": 0.7031188314251876

for method in config["methods"]:
# for method in ["weather"]:
    run(method=method, backbone=None)

  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 21:57:29,850 - Method: [crash] Epoch [1], Train Loss : [0.18274] Train F1: [0.90065] Val Loss : [0.15329] Val F1 : [0.98333] Best F1 : [0.9833333333333334] Best Epoch : [1]


  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 22:02:32,212 - Method: [crash] Epoch [2], Train Loss : [0.13261] Train F1: [0.92976] Val Loss : [3.32992] Val F1 : [0.46998] Best F1 : [0.9833333333333334] Best Epoch : [1]


  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 22:07:35,819 - Method: [crash] Epoch [3], Train Loss : [0.37642] Train F1: [0.72614] Val Loss : [1.69075] Val F1 : [0.65886] Best F1 : [0.9833333333333334] Best Epoch : [1]


  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 22:12:36,315 - Method: [crash] Epoch [4], Train Loss : [0.29566] Train F1: [0.80627] Val Loss : [0.69544] Val F1 : [0.91817] Best F1 : [0.9833333333333334] Best Epoch : [1]


  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 22:17:40,033 - Method: [crash] Epoch [5], Train Loss : [0.21713] Train F1: [0.86185] Val Loss : [0.88761] Val F1 : [0.86200] Best F1 : [0.9833333333333334] Best Epoch : [1]


  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 22:22:39,679 - Method: [crash] Epoch [6], Train Loss : [0.19459] Train F1: [0.88282] Val Loss : [0.56857] Val F1 : [0.91662] Best F1 : [0.9833333333333334] Best Epoch : [1]


  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 22:27:37,676 - Method: [crash] Epoch [7], Train Loss : [0.07815] Train F1: [0.95714] Val Loss : [0.20964] Val F1 : [0.97324] Best F1 : [0.9833333333333334] Best Epoch : [1]


  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 22:32:37,569 - Method: [crash] Epoch [8], Train Loss : [0.03937] Train F1: [0.98046] Val Loss : [0.13071] Val F1 : [0.98559] Best F1 : [0.985592260985901] Best Epoch : [8]


  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 22:37:34,872 - Method: [crash] Epoch [9], Train Loss : [0.02186] Train F1: [0.98919] Val Loss : [0.12305] Val F1 : [0.97753] Best F1 : [0.985592260985901] Best Epoch : [8]


  0%|          | 0/1079 [00:00<?, ?it/s]

  0%|          | 0/270 [00:00<?, ?it/s]

2023-03-12 22:42:32,576 - Method: [crash] Epoch [10], Train Loss : [0.00766] Train F1: [0.99587] Val Loss : [0.09468] Val F1 : [0.98157] Best F1 : [0.985592260985901] Best Epoch : [8]


  0%|          | 0/366 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

2023-03-12 22:44:15,292 - Method: [ego_involve] Epoch [1], Train Loss : [0.43110] Train F1: [0.70897] Val Loss : [1.47616] Val F1 : [0.75940] Best F1 : [0.7594035594035593] Best Epoch : [1]


  0%|          | 0/366 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

2023-03-12 22:45:58,526 - Method: [ego_involve] Epoch [2], Train Loss : [0.35256] Train F1: [0.76914] Val Loss : [1.86833] Val F1 : [0.31716] Best F1 : [0.7594035594035593] Best Epoch : [1]


  0%|          | 0/366 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

2023-03-12 22:47:40,691 - Method: [ego_involve] Epoch [3], Train Loss : [0.46726] Train F1: [0.66329] Val Loss : [1.76332] Val F1 : [0.65404] Best F1 : [0.7594035594035593] Best Epoch : [1]


  0%|          | 0/366 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

2023-03-12 22:49:22,870 - Method: [ego_involve] Epoch [4], Train Loss : [0.48441] Train F1: [0.66337] Val Loss : [1.87621] Val F1 : [0.70385] Best F1 : [0.7594035594035593] Best Epoch : [1]


  0%|          | 0/366 [00:00<?, ?it/s]

  0%|          | 0/92 [00:00<?, ?it/s]

2023-03-12 22:51:06,935 - Method: [ego_involve] Epoch [5], Train Loss : [0.48096] Train F1: [0.65161] Val Loss : [2.12807] Val F1 : [0.32845] Best F1 : [0.7594035594035593] Best Epoch : [1]


  0%|          | 0/366 [00:00<?, ?it/s]

In [None]:
# z
# ## Save pickle
# with open(f"{config['save_path']}/{config['model']}.pkl","wb") as f:
#     pickle.dump(model_dict, f)

In [None]:
# # Load pickle
# with open("MViT.pkl","rb") as f:
#     model_dict = pickle.load(f)
# print(model_dict)

## Eval

In [None]:
# preds, targets, filenames = evaluate(model_dict)

In [None]:
# cnt = 0
# error_dict = {t: {} for t in targets}
# for p, t, f in zip(preds, targets, filenames):
#     if p != t:
#         print(f"pred: {p}, target: {t}, filename: {f}")
#         if error_dict[t].get(p) is not None:
#             error_dict[t][p] += 1
#         else:
#             error_dict[t][p] = 1
#         cnt += 1
        
# f1 = f1_score(targets, preds, average='macro')
# print("count:", cnt)
# print("error dict:", error_dict)
# print(f"f1 score: {f1}")

# p = Path(config["save_path"].joinpath("error_dict.txt"))
# p.write_text(f"{str(error_dict)}\n f1 score: {str(f1)}")

## Inference

In [None]:
# config["save_path"] = Path("results/2023-03-09 18:32:24.788752")
# test = pd.read_csv(os.path.join(config["data_path"], "test.csv"))
# test_dataset = TestDataset(test['video_path'].values, None)
# test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)


# pred_dict = {"crash": [], "ego": [], "weather": [], "timing": []}
# for name, method in zip([f"{config['model']}_crash.pkl", f"{config['model']}_ego_involve.pkl", f"{config['model']}_weather.pkl", f"{config['model']}_timing.pkl"], ["crash", "ego", "weather", "timing"]):
#     with open(config["save_path"].joinpath(name),"rb") as f:
#         model = pickle.load(f)
#     model.eval()
    
#     with torch.no_grad():
#         for i, videos in enumerate(tqdm(iter(test_loader))):
#             videos = videos.to(device=config["device"], dtype=torch.float32, non_blocking=True)

#             with torch.cuda.amp.autocast():
#                 pred = model(videos)
                
#                 print(f"{i}, {method}: {pred.softmax(-1).detach().cpu().numpy()}")
#                 pred = pred.argmax(-1)
#                 pred_dict[method].append(pred)

# pred_list = []
# for c, e, w, t in zip(pred_dict["crash"], pred_dict["ego"], pred_dict["weather"], pred_dict["timing"]):
#     if c == 0:
#         preds_list.append(0)
#     else:
#         if e == 0 and w == 0 and t == 0:
#             preds_list.append(1)
#         elif e == 0 and w == 0 and t == 1:
#             preds_list.append(2)
#         elif e == 0 and w == 1 and t == 0:
#             preds_list.append(3)
#         elif e == 0 and w == 1 and t == 1:
#             preds_list.append(4)
#         elif e == 0 and w == 2 and t == 0:
#             preds_list.append(5)
#         elif e == 0 and w == 2 and t == 1:
#             preds_list.append(6)
#         elif e == 1 and w == 0 and t == 0:
#             preds_list.append(7)
#         elif e == 1 and w == 0 and t == 1:
#             preds_list.append(8)
#         elif e == 1 and w == 1 and t == 0:
#             preds_list.append(9)
#         elif e == 1 and w == 1 and t == 1:
#             preds_list.append(10)
#         elif e == 1 and w == 2 and t == 0:
#             preds_list.append(11)
#         elif e == 1 and w == 2 and t == 1:
#             preds_list.append(12)

# submit(preds_list)

In [None]:
test = pd.read_csv(os.path.join(config["data_path"], "test.csv"))
test_dataset = TestDataset(test['video_path'].values, None)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)

# for method in config["methods"]:
#     model_dict[method].eval()
with open(config["save_path"].joinpath("MViT_crash.pkl"),"rb") as f:
    crash_model = pickle.load(f)
with open(config["save_path"].joinpath("MViT_ego_involve.pkl"),"rb") as f:
    ego_model = pickle.load(f)
with open(config["save_path"].joinpath("MViT_weather.pkl"),"rb") as f:
    weather_model = pickle.load(f)
with open(config["save_path"].joinpath("MViT_timing.pkl"),"rb") as f:
    timing_model = pickle.load(f)
    
preds_list = []
crash_list = []
crash_all_list = []

crash_model.eval()
ego_model.eval()
timing_model.eval()
weather_model.eval()

with torch.no_grad():
    for i, videos in enumerate(tqdm(iter(test_loader))):
        videos = videos.to(device=config["device"], dtype=torch.float32, non_blocking=True)
            
        with torch.cuda.amp.autocast():
            # outputs = model_dict["crash"](videos).argmax(1)
            
            # if outputs.item() == 0:
            #     preds_list.append(0)
            # else:
            #     outputs = model_dict["crash_all"](videos).argmax(1)
            #     preds_list.append(outputs.item())
            
            crash_pred = crash_model(videos)['output']
            ego_involve_pred = ego_model(videos)['output']
            weather_pred = weather_model(videos)['output']
            timing_pred = timing_model(videos)['output']
        
        # print(f"{i} crash: {crash_pred.softmax(-1).detach().cpu().numpy()}, ego: {ego_involve_pred.softmax(-1).detach().cpu().numpy()}, weather: {weather_pred.softmax(-1).detach().cpu().numpy()}, timing: {timing_pred.softmax(-1).detach().cpu().numpy()}")
        crash_pred = crash_pred.argmax(-1)
        ego_involve_pred = ego_involve_pred.argmax(-1)
        weather_pred = weather_pred.argmax(-1)
        timing_pred = timing_pred.argmax(-1)
        
        for c, e, w, t in zip(crash_pred, ego_involve_pred, weather_pred, timing_pred):
            if c == 0:
                preds_list.append(0)
            else:
                if e == 0 and w == 0 and t == 0:
                    preds_list.append(1)
                elif e == 0 and w == 0 and t == 1:
                    preds_list.append(2)
                elif e == 0 and w == 1 and t == 0:
                    preds_list.append(3)
                elif e == 0 and w == 1 and t == 1:
                    preds_list.append(4)
                elif e == 0 and w == 2 and t == 0:
                    preds_list.append(5)
                elif e == 0 and w == 2 and t == 1:
                    preds_list.append(6)
                elif e == 1 and w == 0 and t == 0:
                    preds_list.append(7)
                elif e == 1 and w == 0 and t == 1:
                    preds_list.append(8)
                elif e == 1 and w == 1 and t == 0:
                    preds_list.append(9)
                elif e == 1 and w == 1 and t == 1:
                    preds_list.append(10)
                elif e == 1 and w == 2 and t == 0:
                    preds_list.append(11)
                elif e == 1 and w == 2 and t == 1:
                    preds_list.append(12)

submit(preds_list)

In [None]:
z

In [None]:
output = []
for p in preds_list:
    if isinstance(p, torch.Tensor):
        p = p.detach().cpu().numpy()
    output.append(p)

In [None]:
submit(output)

In [None]:
z

In [None]:
def get_df(method):
    train_df = pd.read_csv(os.path.join(config["data_path"], "train.csv"))
    
    if method == "crash":
        # 차량 충돌 여부 f1 90 이상
        train_df.loc[train_df["label"]!=0, "label"] = 1
    
    elif method == "ego_involve":
        # 차량 충돌 연관 여부 f1 68 이상
        train_df.drop(train_df[train_df["label"]==0].index, inplace=True)
        train_df.loc[(train_df["label"]==1)|(train_df["label"]==2)|(train_df["label"]==3)|(train_df["label"]==4)|(train_df["label"]==5)|(train_df["label"]==6), "label"] = 0 # yes
        train_df.loc[(train_df["label"]==7)|(train_df["label"]==8)|(train_df["label"]==9)|(train_df["label"]==10)|(train_df["label"]==11)|(train_df["label"]==12), "label"] = 1 # no
        
    elif method == "weather":
        # 날씨 구분 f1 49 이상
        train_df.drop(train_df[train_df["label"]==0].index, inplace=True)
        train_df.loc[(train_df["label"]==1)|(train_df["label"]==2)|(train_df["label"]==7)|(train_df["label"]==8), "label"] = 0 # normal
        train_df.loc[(train_df["label"]==3)|(train_df["label"]==4)|(train_df["label"]==9)|(train_df["label"]==10), "label"] = 1 # snowy
        train_df.loc[(train_df["label"]==5)|(train_df["label"]==6)|(train_df["label"]==11)|(train_df["label"]==12), "label"] = 2 # rainy
        
    elif method == "timing":
        # 낮/밤 구분 f1 90 이상
        train_df.drop(train_df[train_df["label"]==0].index, inplace=True)
        train_df.loc[(train_df["label"]==1)|(train_df["label"]==3)|(train_df["label"]==5)|(train_df["label"]==7)|(train_df["label"]==9)|(train_df["label"]==11), "label"] = 0 # day
        train_df.loc[(train_df["label"]==2)|(train_df["label"]==4)|(train_df["label"]==6)|(train_df["label"]==8)|(train_df["label"]==10)|(train_df["label"]==12), "label"] = 1 # night
    
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=config["seed"], stratify=train_df['label'])
        
    return train_df, val_df

train_df, val_df = get_df(method="weather")
print(train_df)
print(train_df["label"].value_counts())

In [None]:
timm.list_models(pretrained=True)

In [None]:
# !mim download mmsegmentation --config fcn_hr18s_512x1024_40k_cityscapes --dest .

In [None]:
config_file = "fcn_hr18s_512x1024_40k_cityscapes.py"
checkpoint = "fcn_hr18s_512x1024_40k_cityscapes_20200601_014216-93db27d0.pth"
model = init_segmentor(config_file, checkpoint=checkpoint, device='cuda:0')
model(torch.randn(1, 3, 224, 224).to('cuda:0'))
# del model.init_cfg
# model.backbone.forward(torch.randn(1, 3, 224, 224).to('cuda:0'))