In [1]:
# !pip install torchsampler
#https://github.com/ufoym/imbalanced-dataset-sampler


# train_loader = torch.utils.data.DataLoader(
#     train_dataset,
#     sampler=ImbalancedDatasetSampler(train_dataset),
#     batch_size=args.batch_size,
#     **kwargs
# )
#https://www.kaggle.com/competitions/dfl-bundesliga-data-shootout/discussion/360236
#model kinetic version
#label seperate
#video augmentation
#imbalance data
#focal loss
#https://github.com/HHTseng/video-classification
#https://huggingface.co/models?other=video-classification
#앙상블

In [2]:
# !pip install torchsummary

In [3]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torchsampler import ImbalancedDatasetSampler
from sklearn.model_selection import KFold,StratifiedKFold
import albumentations as A

# from einops import rearrange
# from decord import VideoReader
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from segmentation_models_pytorch.losses import FocalLoss
# from transformers import AutoModel, AutoImageProcessor, AutoConfig
# from skmultilearn.model_selection import iterative_train_test_split
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorchvideo.transforms.transforms_factory import create_video_transform
import random
import os
import cv2
from tqdm import tqdm

from transformers import AutoModel, AutoImageProcessor, AutoConfig



In [4]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [5]:
import pandas as pd
all_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

In [6]:
all_df

Unnamed: 0,sample_id,video_path,label
0,TRAIN_0000,./train/TRAIN_0000.mp4,7
1,TRAIN_0001,./train/TRAIN_0001.mp4,7
2,TRAIN_0002,./train/TRAIN_0002.mp4,0
3,TRAIN_0003,./train/TRAIN_0003.mp4,0
4,TRAIN_0004,./train/TRAIN_0004.mp4,1
...,...,...,...
2693,TRAIN_2693,./train/TRAIN_2693.mp4,3
2694,TRAIN_2694,./train/TRAIN_2694.mp4,5
2695,TRAIN_2695,./train/TRAIN_2695.mp4,0
2696,TRAIN_2696,./train/TRAIN_2696.mp4,0


In [7]:

CFG = {
    'VIDEO_LENGTH':40, # 10프레임 * 5초
    'IMG_SIZE':115,
    'EPOCHS':30,
    'LEARNING_RATE':3e-5,
    'BATCH_SIZE':1,
    'SEED':2023,
    'SPLIT':5,
    'ROOT':'./data',
    'MODEL':"facebook/timesformer-base-finetuned-k400"
    
#     'MCG-NJU/videomae-base-finetuned-ssv2'
    
}

In [8]:
skf = StratifiedKFold(n_splits = CFG['SPLIT'])

In [9]:
all_df['video_path'] = all_df['video_path'].apply(lambda x:CFG['ROOT']+x[1:])
test_df['video_path'] = test_df['video_path'].apply(lambda x:CFG['ROOT']+x[1:])



In [10]:
all_df

Unnamed: 0,sample_id,video_path,label
0,TRAIN_0000,./data/train/TRAIN_0000.mp4,7
1,TRAIN_0001,./data/train/TRAIN_0001.mp4,7
2,TRAIN_0002,./data/train/TRAIN_0002.mp4,0
3,TRAIN_0003,./data/train/TRAIN_0003.mp4,0
4,TRAIN_0004,./data/train/TRAIN_0004.mp4,1
...,...,...,...
2693,TRAIN_2693,./data/train/TRAIN_2693.mp4,3
2694,TRAIN_2694,./data/train/TRAIN_2694.mp4,5
2695,TRAIN_2695,./data/train/TRAIN_2695.mp4,0
2696,TRAIN_2696,./data/train/TRAIN_2696.mp4,0


In [11]:
crash_df = all_df.copy()
crash_df['label'] = crash_df['label'].apply(lambda x: 1 if x != 0 else 0)

In [12]:
crash_df

Unnamed: 0,sample_id,video_path,label
0,TRAIN_0000,./data/train/TRAIN_0000.mp4,1
1,TRAIN_0001,./data/train/TRAIN_0001.mp4,1
2,TRAIN_0002,./data/train/TRAIN_0002.mp4,0
3,TRAIN_0003,./data/train/TRAIN_0003.mp4,0
4,TRAIN_0004,./data/train/TRAIN_0004.mp4,1
...,...,...,...
2693,TRAIN_2693,./data/train/TRAIN_2693.mp4,1
2694,TRAIN_2694,./data/train/TRAIN_2694.mp4,1
2695,TRAIN_2695,./data/train/TRAIN_2695.mp4,0
2696,TRAIN_2696,./data/train/TRAIN_2696.mp4,0


In [13]:
# for train_idx,test_idx in skf.split(crash_df['video_path'],crash_df['label']):
#     print(train_idx)
#     print("==")
#     print(test_idx)

In [14]:
# from sklearn.model_selection import train_test_split
# train_df, val_df = train_test_split(
#     all_df, test_size=0.2, stratify=all_df['label'])

In [15]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['SEED']) # Seed 고정

In [16]:
from transformers import VideoMAEConfig, VideoMAEModel
from transformers import AutoImageProcessor, VideoMAEForVideoClassification

image_processor_config = AutoImageProcessor.from_pretrained(CFG['MODEL'])
# configuration = VideoMAEConfig()
configuration = AutoConfig.from_pretrained(CFG['MODEL'])

# configuration.image_size=CFG['IMG_SIZE']
# configuration.num_frames = CFG['VIDEO_LENGTH']
# image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
# AutoModel
# model = VideoMAEModel.from_pretrained(CFG['MODEL'],config=configuration,ignore_mismatched_sizes=True)
# crash_model = VideoMAEForVideoClassification.from_pretrained(CFG['MODEL'],config=configuration,ignore_mismatched_sizes=True)
configuration.num_frames=CFG['VIDEO_LENGTH']
configuration.image_size=CFG['IMG_SIZE']
configuration.id2label = {0:'no crash',1:'crash'}
configuration.label2id = {'no crash':0,'crash':1}
crash_model = AutoModel.from_pretrained(CFG['MODEL'],config=configuration,ignore_mismatched_sizes=True)



Some weights of the model checkpoint at facebook/timesformer-base-finetuned-k400 were not used when initializing TimesformerModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing TimesformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TimesformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TimesformerModel were not initialized from the model checkpoint at facebook/timesformer-base-finetuned-k400 and are newly initialized because the shapes did not match:
- timesformer.embeddings.position_embeddings: found shape torch.Size([1, 197, 768]) in the checkpoint and torch.Size([1, 50, 768]) in the model instantiated
- timesformer.embedd

In [17]:
crash_model.config

TimesformerConfig {
  "_name_or_path": "facebook/timesformer-base-finetuned-k400",
  "architectures": [
    "TimesformerForVideoClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "divided_space_time",
  "drop_path_rate": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "no crash",
    "1": "crash"
  },
  "image_size": 115,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "crash": 1,
    "no crash": 0
  },
  "layer_norm_eps": 1e-06,
  "model_type": "timesformer",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_frames": 40,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1"
}

In [18]:
# image_processor_config

In [19]:
# #https://torchvideo.readthedocs.io/en/latest/transforms.html#examples
# train_transform = create_video_transform(
#     mode='train',
#     num_samples=model.config.num_frames,
#     video_mean = tuple(image_processor_config.image_mean),
#     video_std = tuple(image_processor_config.image_std),
#     crop_size = tuple(image_processor_config.crop_size.values())
# )

# val_transform = create_video_transform(
#     mode='val',
#     num_samples=model.config.num_frames,
#     video_mean = tuple(image_processor_config.image_mean),
#     video_std = tuple(image_processor_config.image_std),
#     crop_size = tuple(image_processor_config.crop_size.values())
# )

In [20]:
Alb = A.Compose([
        A.Resize(width=CFG['IMG_SIZE'], height=CFG['IMG_SIZE']),
        A.HorizontalFlip(p=0.5),
        A.ShiftScaleRotate(rotate_limit=15, p=0.3),
#         A.ChannelDropout(p=0.1),
#         A.RandomRain(p=0.1),
#         A.GridDistortion(p=0.3),
        A.Normalize()
    ], p=1)


def aug_video(vid, tfms):
#     seed = random.randint(0,99999)
    aug_vid = []
    for x in vid:
#         random.seed(seed)
        aug_vid.append((tfms(image = np.asarray(x)))['image'])
    return torch.from_numpy(np.stack(aug_vid))

In [21]:

class VideoDataset(Dataset):
    def __init__(self, video_path_list, label_list,transform=None):
        self.video_path_list = video_path_list
        self.label_list = label_list
#         self.transform=transform
        self.Alb = transform
    
    def get_labels(self):   
        return self.label_list  
    
    def __getitem__(self, index):
        frames = self.get_video(self.video_path_list[index])
        
#         vr = VideoReader(self.video_path_list[index])
#         video = torch.from_numpy(vr.get_batch(range(CFG['VIDEO_LENGTH'])).asnumpy())
#         video = rearrange(video, 't h w c -> c t h w')
        if self.label_list is not None:
#             if self.transform:
#                 frames = transform(frames)
            label = self.label_list[index]
            return frames, label
        else:
#             if self.transform:
#                 frames = transform(frames)
            return frames
        
    def __len__(self):
        return len(self.video_path_list)
    
    def get_video(self, path):
        frames = []
        cap = cv2.VideoCapture(path)
        for _ in range(5,45):
            _, img = cap.read()
            img = cv2.resize(img, (CFG['IMG_SIZE'], CFG['IMG_SIZE']))
#             img = img / 255.
            frames.append(img)
        if self.Alb is not None:
            frames = aug_video(frames, tfms=self.Alb)
#         8, 224, 224, 3
        return torch.FloatTensor(np.array(frames)).permute(0, 3, 1, 2)


In [22]:
# model

In [23]:
# import torch.nn as nn
# import torch.nn.functional as F

class CrashClsModel(nn.Module):
    def __init__(self,pretrained_model):
        super().__init__()
        self.model = pretrained_model
        self.classifier = nn.LazyLinear(2)
    def forward(self, x):
#         print(x.size())
        batch_size = x.size(0)
        x = self.model(x).last_hidden_state.mean(dim=1)
        x_out = self.classifier(x)
#         x = x[0].view(batch_size,-1)
#         x = self.classifier(x)
#         print(x.size())
#         print(x.size())
#         print(x)
        return x_out

In [24]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    model.to(device)
#     criterion = nn.CrossEntropyLoss().to(device)
#     criterion = nn.BCELoss().to(device)
    criterion = FocalLoss('multiclass')
    best_val_score = 0
    best_model = None
    
    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for videos, labels in tqdm(iter(train_loader)):
            videos = videos.to(device)
            labels = labels.to(device)
#             print(labels)
            optimizer.zero_grad()
            
            output = model(videos)
#             print(output.armax(-1).item())
            
#             output = F.softmax(output,dim=1)[:1]
#             print(output)
#             print("==")
#             print(labels)
#             print(output)
            loss = criterion(output, labels)
           
#             print(output.logits)
#             print(labels)
#             print(loss)
#             loss =FocalLoss(gamma=0)(output,labels)
            loss.backward()
            optimizer.step()
#             print(loss.item())
            train_loss.append(loss.item())
        _val_loss, _val_score = validation(model, criterion, val_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Val Loss : [{_val_loss:.5f}] Val F1 : [{_val_score:.5f}]')
        
        if scheduler is not None:
            scheduler.step(_val_score)
            
        if best_val_score < _val_score:
            best_val_score = _val_score
            best_model = model
            torch.save(best_model.state_dict(), './'+str(epoch)+'_best_model.pth')
    
    return best_model

In [25]:
def validation(model, criterion, val_loader, device):
    model.eval()
    val_loss = []
    preds, trues = [], []
    
    with torch.no_grad():
        for videos, labels in tqdm(iter(val_loader)):
            videos = videos.to(device)
            labels = labels.to(device)
            
            output = model(videos)
            
            loss = criterion(output, labels)
            
            val_loss.append(loss.item())
            
            preds += output.argmax(1).detach().cpu().numpy().tolist()
            trues += labels.detach().cpu().numpy().tolist()
        
        _val_loss = np.mean(val_loss)
    
    _val_score = f1_score(trues, preds, average='macro')
    return _val_loss, _val_score

In [26]:
# for train_idx,test_idx in skf.split(crash_df['video_path'],crash_df['label']):
#     print(crash_df['video_path'][train_idx].values)

In [27]:
# from sklearn.model_selection import train_test_split
# crash_train_df, val_df = train_test_split(
#     crash_df, test_size=0.2, stratify=crash_df['label'])
crash_cls_model = CrashClsModel(crash_model)
optimizer = torch.optim.Adam(params = crash_cls_model.parameters(), lr = CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-10, verbose=True)

#  sampler=ImbalancedDatasetSampler(train_dataset),
for train_idx,val_idx in skf.split(crash_df['video_path'],crash_df['label']):
    train_dataset = VideoDataset(crash_df['video_path'][train_idx].values, crash_df['label'][train_idx].values,transform=Alb)
    val_dataset = VideoDataset(crash_df['video_path'][val_idx].values, crash_df['label'][val_idx].values, transform=Alb)
    train_loader = DataLoader(train_dataset,shuffle=False,batch_size = CFG['BATCH_SIZE'],  num_workers=12)
    val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], num_workers=12)
    infer_model = train(crash_cls_model, optimizer, train_loader, val_loader, scheduler, device)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 2158/2158 [07:31<00:00,  4.78it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 540/540 [00:33<00:00, 16.25it/s]


Epoch [1], Train Loss : [0.11593] Val Loss : [0.42433] Val F1 : [0.26560]


100%|██████████████████████████████████████████████████████████████████████████████████████████| 2158/2158 [07:31<00:00,  4.78it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 540/540 [00:33<00:00, 16.21it/s]

Epoch [2], Train Loss : [0.04214] Val Loss : [0.45003] Val F1 : [0.25939]



100%|██████████████████████████████████████████████████████████████████████████████████████████| 2158/2158 [07:31<00:00,  4.78it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 540/540 [00:33<00:00, 16.29it/s]


Epoch [3], Train Loss : [0.03420] Val Loss : [0.37307] Val F1 : [0.40447]


100%|██████████████████████████████████████████████████████████████████████████████████████████| 2158/2158 [07:32<00:00,  4.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 540/540 [00:33<00:00, 16.16it/s]


Epoch [4], Train Loss : [0.02478] Val Loss : [0.35302] Val F1 : [0.48769]


100%|██████████████████████████████████████████████████████████████████████████████████████████| 2158/2158 [07:31<00:00,  4.78it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 540/540 [00:33<00:00, 16.20it/s]

Epoch [5], Train Loss : [0.02443] Val Loss : [0.41397] Val F1 : [0.43866]



100%|██████████████████████████████████████████████████████████████████████████████████████████| 2158/2158 [07:32<00:00,  4.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 540/540 [00:33<00:00, 16.24it/s]

Epoch [6], Train Loss : [0.01966] Val Loss : [0.38489] Val F1 : [0.29585]



100%|██████████████████████████████████████████████████████████████████████████████████████████| 2158/2158 [07:32<00:00,  4.77it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 540/540 [00:33<00:00, 16.26it/s]

Epoch [7], Train Loss : [0.01803] Val Loss : [0.54248] Val F1 : [0.25939]
Epoch     7: reducing learning rate of group 0 to 1.5000e-05.



 25%|███████████████████████▏                                                                   | 550/2158 [01:55<05:37,  4.77it/s]


KeyboardInterrupt: 

In [None]:
# video_model = CrashClsModel(model).to(device)

# optimizer = torch.optim.Adam(params = model.parameters(), lr = CFG["LEARNING_RATE"])
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2,threshold_mode='abs',min_lr=1e-10, verbose=True)


In [None]:
# infer_model = train(video_model, optimizer, train_loader, val_loader, scheduler, device)