In [1]:
import os
import random
import time

import albumentations as alb
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import efficientnet_v2_s, EfficientNet_V2_S_Weights

In [2]:
# Метод для считывания ролика в массив, (from baseline)

def read_clip(f_name: str, start: int = 0, transposed: bool = False):
    """Прочесть ролик в массив."""

    cpr = cv2.VideoCapture(f_name)
    has_frame = True
    frames = []

    while has_frame:
        has_frame, frame = cpr.read()
        if has_frame:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if transposed:
                frame = np.moveaxis(frame, -1, 0).copy()

            frames.append(frame)
    cpr.release()
    return np.array(frames)[start:]

In [None]:
# Жесткое разделение данных для обучения и валидации

base_dir = 'ds/train/'
os.mkdir('ds/val')

split = 0.3
for class_name in os.listdir(base_dir):

    target_dir = f"ds/val/{class_name}/"
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    class_clips = [
        clip_file for clip_file in
        os.listdir(base_dir + class_name)
        if clip_file.endswith('.mp4')
    ]
    split_value = int(len(class_clips) * split)
    for i, clip_file in enumerate(class_clips):
        if i < split_value:
            os.rename(
                f"{base_dir}{class_name}/{clip_file}",
                f"{target_dir}{clip_file}"
            )

In [3]:
# Формирование и считывание наборов данных

classes_dict = {
    'no_action': 0,
    'train_in_out': 1,
    'bridge_up': 2,
    'bridge_down': 3,
}

_x_train, _x_val = [], []
_y_train, _y_val = [], []

ds_root = 'ds/'
for ds_part in ('train', 'val'):

    x_set, y_set = (_x_train, _y_train) if ds_part == 'train' else (_x_val, _y_val)

    for class_name in os.listdir(f"{ds_root}{ds_part}"):
        class_clips_files = [
            clip_file for clip_file in
            os.listdir(f"{ds_root}{ds_part}/{class_name}")
            if clip_file.endswith('.mp4')
        ]

        x_set += [
            read_clip(f"{ds_root}{ds_part}/{class_name}/{clip_file}")
            for clip_file in class_clips_files
        ]
        y_set += [classes_dict[class_name]] * len(class_clips_files)

In [4]:
# Аугментации. 

train_transforms = alb.Compose(transforms=[
    alb.HorizontalFlip(p=0.5),
    alb.ShiftScaleRotate(rotate_limit=15, scale_limit=0.1, shift_limit=0, p=0.5),
    alb.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
    alb.RandomBrightnessContrast(brightness_limit=0.15, contrast_limit=0.15, p=0.5),
    alb.HueSaturationValue(hue_shift_limit=15, sat_shift_limit=15, val_shift_limit=15, p=0.5),
],
    additional_targets={f'image{i}': 'image' for i in range(1, 300)}
)

In [5]:
# Предобработка Х в батче:
# Случайная подрезка ролика до 300 кадров + вырезание куска в 50 кадров в случайном месте + аугментации
# Затем ролик превращается в изображение со средними значениями от всех кадров

def prepare_batch_x(data, train=True):
    x = []
    for clip in data:

        if len(clip) > 300:
            split_idx = random.randint(0, len(clip) - 300)
            clip = clip[split_idx: split_idx + 300]

        mask_start = random.randint(0, len(clip))
        clip = np.concatenate([clip[:mask_start], clip[mask_start + 50:]], 0)

        if train:
            images = {f'image{frame_number}': frame for frame_number, frame in enumerate(clip[1:], start=1)}
            images['image'] = clip[0]
            clip = train_transforms(**images)
            clip.pop('image', None)
            clip = np.array(list(clip.values()))

        clip_image = np.sum(clip, 0) // len(clip)

        clip_image = np.transpose(clip_image, [-1, 0, 1])

        x.append(clip_image / 255.)

    return np.array(x)


In [6]:
# Метод для итерирования батчей

def iterate_batches(x, y, batch_size, dev, train=True):
    indices = np.random.permutation(np.arange(len(x)))
    for start in range(0, len(indices), batch_size):
        ix = indices[start: start + batch_size]
        x_i = prepare_batch_x([x[ii] for ii in ix], train=train)
        x_i = torch.as_tensor(x_i, dtype=torch.float32).to(dev)
        y_i = torch.as_tensor([y[ii] for ii in ix], dtype=torch.int64).to(dev)
        yield x_i, y_i

In [7]:
# Трейнер

def train_loop(
        model, opt, dev,
        x_train, y_train,
        x_val=None, y_val=None,
        num_epochs=15, batch_size=50
):
    train_loss = []
    val_accuracy = []

    for _epoch in range(num_epochs):
        start_time = time.time()

        model.train(True)

        for x_batch, y_batch in iterate_batches(x_train, y_train, batch_size, dev):
            loss = F.cross_entropy(model(x_batch), y_batch).mean()
            loss.backward()
            opt.step()
            opt.zero_grad()
            train_loss.append(loss.data.cpu())

        if x_val and y_val:
            model.train(False)
            for x_batch, y_batch in iterate_batches(x_val, y_val, batch_size, dev, train=False):
                logits = model(x_batch).max(1)[1].data
                val_accuracy.append((y_batch == logits).cpu().numpy().mean())

        print("Epoch {} of {} took {:.3f}s".format(
            _epoch + 1, num_epochs, time.time() - start_time))
        print("    training loss (in-iteration): \t{:.6f}".format(
            np.mean(train_loss[-len(x_train) // batch_size:])))
        if x_val and y_val:
            print("    validation accuracy: \t\t\t{:.4f} %".format(
                np.mean(val_accuracy[-len(x_val) // batch_size:]) * 100))

    model.train(False)

In [9]:
# Создание и подготовка модели

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_h1 = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.DEFAULT)
model_h1.classifier[1] = nn.Linear(in_features=1280, out_features=4)

for i, param in enumerate(model_h1.children()):
    if i < 2:
        param.requires_grad = False

model_h1.to(device)
optimizer = torch.optim.Adam(model_h1.parameters())

In [10]:
# Трансферное обучение модели

train_loop(
    model_h1, optimizer, device,
    _x_train, _y_train, _x_val, _y_val,
    num_epochs=10, batch_size=32
)

Epoch 1 of 10 took 111.247s
    training loss (in-iteration): 	0.579955
    validation accuracy: 			78.8194 %
Epoch 2 of 10 took 102.611s
    training loss (in-iteration): 	0.099521
    validation accuracy: 			95.7639 %
Epoch 3 of 10 took 103.676s
    training loss (in-iteration): 	0.179242
    validation accuracy: 			96.2500 %
Epoch 4 of 10 took 104.443s
    training loss (in-iteration): 	0.156894
    validation accuracy: 			97.0139 %
Epoch 5 of 10 took 103.591s
    training loss (in-iteration): 	0.124425
    validation accuracy: 			89.5139 %
Epoch 6 of 10 took 104.985s
    training loss (in-iteration): 	0.083474
    validation accuracy: 			94.0278 %
Epoch 7 of 10 took 98.795s
    training loss (in-iteration): 	0.040349
    validation accuracy: 			98.7500 %
Epoch 8 of 10 took 104.933s
    training loss (in-iteration): 	0.017985
    validation accuracy: 			98.2639 %
Epoch 9 of 10 took 102.288s
    training loss (in-iteration): 	0.087043
    validation accuracy: 			100.0000 %
Epoch 10 o

In [11]:
model_name = 'h3'
epoch = 10
torch.save(model_h1, f'models/classifier-{model_name}-{epoch}.ckpt')

In [12]:
# Продолжение обучения модели, теперь обучаются все слои, но с уменьшенным lr оптимизатора

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_h1 = torch.load('models/classifier-h3-10.ckpt')
for i, param in enumerate(model_h1.children()):
    param.requires_grad = True

torch.cuda.empty_cache()
model_h1.to(device)

optimizer = torch.optim.Adam(model_h1.parameters(), lr=0.0004)
train_loop(
    model_h1, optimizer, device,
    _x_train, _y_train, _x_val, _y_val,
    num_epochs=10, batch_size=32
)

Epoch 1 of 10 took 103.513s
    training loss (in-iteration): 	0.091735
    validation accuracy: 			100.0000 %
Epoch 2 of 10 took 101.393s
    training loss (in-iteration): 	0.038647
    validation accuracy: 			100.0000 %
Epoch 3 of 10 took 102.170s
    training loss (in-iteration): 	0.045316
    validation accuracy: 			100.0000 %
Epoch 4 of 10 took 95.304s
    training loss (in-iteration): 	0.015004
    validation accuracy: 			99.3750 %
Epoch 5 of 10 took 108.631s
    training loss (in-iteration): 	0.041417
    validation accuracy: 			99.3750 %
Epoch 6 of 10 took 100.798s
    training loss (in-iteration): 	0.006608
    validation accuracy: 			99.3750 %
Epoch 7 of 10 took 105.109s
    training loss (in-iteration): 	0.036377
    validation accuracy: 			99.3750 %
Epoch 8 of 10 took 101.181s
    training loss (in-iteration): 	0.032112
    validation accuracy: 			98.7500 %
Epoch 9 of 10 took 101.680s
    training loss (in-iteration): 	0.024183
    validation accuracy: 			99.3750 %
Epoch 10

In [14]:
model_name = 'h3'
epoch = 20
torch.save(model_h1, f'models/classifier-{model_name}-{epoch}.ckpt')

In [15]:
# Продолжение обучения модели, теперь на обучение отправляются все данные и усиливается аугментация 

train_transforms = alb.Compose(transforms=[
    alb.HorizontalFlip(p=0.5),
    alb.ShiftScaleRotate(rotate_limit=45, scale_limit=0.2, shift_limit=0, p=0.7),
    alb.RGBShift(r_shift_limit=35, g_shift_limit=35, b_shift_limit=35, p=0.7),
    alb.RandomBrightnessContrast(brightness_limit=0.35, contrast_limit=0.35, p=0.7),
    alb.HueSaturationValue(hue_shift_limit=35, sat_shift_limit=35, val_shift_limit=35, p=0.7),
    alb.RandomFog(fog_coef_lower=0.1, fog_coef_upper=0.7, p=0.3),
],
    additional_targets={f'image{i}': 'image' for i in range(1, 300)}
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_h1 = torch.load('models/classifier-h3-20.ckpt')
for i, param in enumerate(model_h1.children()):
    param.requires_grad = True

torch.cuda.empty_cache()
model_h1.to(device)

optimizer = torch.optim.Adam(model_h1.parameters(), lr=0.0004)
train_loop(
    model_h1, optimizer, device,
    _x_train + _x_val, _y_train + _y_val,
    num_epochs=10, batch_size=32
)

Epoch 1 of 10 took 240.409s
    training loss (in-iteration): 	0.349335
Epoch 2 of 10 took 241.111s
    training loss (in-iteration): 	0.239774
Epoch 3 of 10 took 242.701s
    training loss (in-iteration): 	0.126809
Epoch 4 of 10 took 244.129s
    training loss (in-iteration): 	0.121535
Epoch 5 of 10 took 244.349s
    training loss (in-iteration): 	0.110417
Epoch 6 of 10 took 234.874s
    training loss (in-iteration): 	0.085850
Epoch 7 of 10 took 241.122s
    training loss (in-iteration): 	0.094086
Epoch 8 of 10 took 236.140s
    training loss (in-iteration): 	0.031532
Epoch 9 of 10 took 239.824s
    training loss (in-iteration): 	0.055752
Epoch 10 of 10 took 238.216s
    training loss (in-iteration): 	0.050000


In [16]:
model_name = 'h3'
epoch = 30
torch.save(model_h1, f'models/classifier-{model_name}-{epoch}.ckpt')