## Импорт библиотек

In [1]:
# %pip install kornia

In [2]:
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.models.video import r3d_18
from torchvision.transforms import Compose, Resize, CenterCrop, Normalize, ToTensor
from sklearn.model_selection import train_test_split
from PIL import Image
import pandas as pd
from pathlib import Path
from torchvision.models.video import r3d_18, R3D_18_Weights
from tqdm import tqdm
import kornia as K
import kornia.augmentation as KAug
from torch.utils.tensorboard import SummaryWriter
import gc

2023-11-12 07:16:18.326036: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-12 07:16:18.373898: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# %load_ext tensorboard
# %tensorboard --logdir runs

## Создание классов из обучаемых данных

In [4]:
# Загрузка данных из CSV-файла
csv_data = pd.read_csv('classes.csv')

# Создание словаря для сопоставления названия класса и индекса класса
class_name_to_idx = pd.Series(csv_data.class_idx.values, index=csv_data.class_name).to_dict()

# Предположим, что ваши видеофайлы находятся в папке 'videos'
videos_path = Path('./dataset/videos')


In [5]:
# Генерируем список путей к файлам и соответствующих меток классов
file_paths = []
labels = []

for class_name, class_idx in class_name_to_idx.items():
    class_path = videos_path / class_name
    for video_file in class_path.iterdir():
        file_paths.append(str(video_file))
        labels.append(class_idx)

## Нахождение и настройка весов

In [6]:
#функция для извлечения кадров из видео
def extract_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames

In [7]:
# Вычисление средних значений и стандартных отклонений
def compute_mean_std(videos_path):
    mean = np.zeros(3)
    std = np.zeros(3)
    n_frames = 0

    for video_path in tqdm(videos_path):
        frames = extract_frames(video_path)
        for frame in frames:
            mean += frame.mean(axis=(0, 1))
            std += frame.std(axis=(0, 1))
            n_frames += 1

    mean /= n_frames
    std /= n_frames
    return mean / 255, std / 255  # Нормализация значений

In [8]:
# mean, std = compute_mean_std(file_paths)
# print(f'Mean: {mean}, Std: {std}')

In [9]:
# Определите параметры
num_classes = 24  # Замените на количество классов в вашем датасете
num_epochs = 10
batch_size = 16
learning_rate = 1e-4
frame_count = 3  # Количество кадров в каждом видеосемпле

In [10]:
# Трансформации для предобработки видеофреймов
transform = Compose([
    Resize([128, 171], antialias=True),
    CenterCrop([112, 112]),
    ToTensor(),
    Normalize(mean=[0.3545011, 0.4070217, 0.416456], std=[0.20798995, 0.21007156, 0.21544088])
])

In [11]:
augmentation = KAug.AugmentationSequential(
    KAug.RandomHorizontalFlip(p=0.5),
    KAug.RandomRotation(degrees=10),
    data_keys=['input']
)

## Обработка видеофайлов

### Класс создания датасета из видео

In [12]:
class VideoDataset(Dataset):
    def __init__(self, file_paths, labels, transform=None, augmentation=None):
        self.file_paths = file_paths
        self.labels = labels
        self.transform = transform
        self.augmentation = augmentation

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        frame_count = 3
        video_path = self.file_paths[idx]
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            while len(frames) < frame_count:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = Image.fromarray(frame)

                if self.transform:
                    frame = self.transform(frame)

                frames.append(frame)
        finally:
            cap.release()

        # Применяем аугментацию к каждому кадру в последовательности
        if self.augmentation:
            seed = torch.randint(0, 1000000, (1,)).item()  # Генерируем случайное число для сидирования
            torch.manual_seed(seed)  # Устанавливаем сид для аугментации
            frames = [self.augmentation(f) for f in frames]

        frames = frames[:frame_count] + [frames[-1]] * (frame_count - len(frames))
        frames = torch.stack(frames)
        label = torch.tensor(self.labels[idx])
        return frames, label


### Разделение на обучающий и валидационный датасеты

In [13]:
# Разделение на обучающий и валидационный датасеты
train_paths, valid_paths, train_labels, valid_labels = train_test_split(
    file_paths, labels, test_size=0.2, random_state=42, stratify=labels)

# Создание экземпляров датасета
train_dataset = VideoDataset(train_paths, train_labels, transform=transform, augmentation=augmentation)
valid_dataset = VideoDataset(valid_paths, valid_labels, transform=transform, augmentation=augmentation)

In [14]:
# Создание DataLoader'ов
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

## Создание модели и обучение

### Создание модели

In [15]:
# Создание модели
# model = r3d_18(pretrained=True)
model = r3d_18(weights=R3D_18_Weights.KINETICS400_V1)
model.fc = nn.Linear(model.fc.in_features, num_classes)


# Перенесите модель на GPU, если доступно
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Функция потерь и оптимизатор
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


Downloading: "https://download.pytorch.org/models/r3d_18-b3b3357e.pth" to /tmp/xdg_cache/torch/hub/checkpoints/r3d_18-b3b3357e.pth
100%|██████████| 127M/127M [00:02<00:00, 54.0MB/s] 


### Обучение модели

In [16]:
# Обучение модели
writer = SummaryWriter('runs/experiment_name')

best_accuracy = 0.0  # Инициализация переменной для лучшей точности

for epoch in range(num_epochs):
    model.train()
    for i, (videos, labels) in enumerate(train_loader):
        videos = videos.to(device)
        labels = labels.to(device)

        # Обнуляем градиенты
        optimizer.zero_grad()
        
        videos = videos.squeeze(2)
        # Прямой проход
        outputs = model(videos)
        
        # Расчет потерь
        loss = criterion(outputs, labels)
        
        # Обратное распространение
        loss.backward()
        
        # Оптимизация
        optimizer.step()
    
    # Валидация модели
    model.eval()
    correct = 0
    total = 0
    for videos, labels in valid_loader:
        videos = videos.to(device)
        labels = labels.to(device)
        videos = videos.squeeze(2)
        outputs = model(videos)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        if (100 * correct / total) > best_accuracy:
            best_accuracy = 100 * correct / total
            torch.save(model.state_dict(), f'model_epoch_kornia_{epoch+1}.pth')
            print(f'Model saved at epoch {epoch+1}')
        
        # Для записи потерь обучения
    writer.add_scalar('Training Loss', loss.item(), epoch)

    # Для записи точности валидации
    writer.add_scalar('Validation Accuracy', 100 * correct / total, epoch)

    # Выводим статистику после каждой эпохи
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {loss.item():.4f}, Validation Accuracy: {100 * correct / total:.2f}%')


writer.close()

In [17]:
# Сохранение модели
torch.save(model.state_dict(), 'action_recognition_kornia_model.pth')


## Проверка модели

In [18]:
model = r3d_18(weights=None)
model.fc = nn.Linear(model.fc.in_features, num_classes)
model.load_state_dict(torch.load('action_recognition_model.pth'))
model.to(device)
model.eval()

VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

### Функции для распознавания действий в видео

In [19]:
import matplotlib.pyplot as plt
import cv2

def display_image(image):
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis('off')  # Отключить оси координат для чистого отображения
    plt.show()

In [20]:
def process_frame(frame, model, transform, device, class_name_to_idx, frame_number, fps):
    # Обработка одного кадра в секунду
    if frame_number % int(fps) == 0:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(frame)
        frame = transform(frame).unsqueeze(0).to(device)

        with torch.no_grad():
            outputs = model(frame)
            _, predicted = torch.max(outputs, 1)
            class_name = [name for name, idx in class_name_to_idx.items() if idx == predicted.item()][0]
            timestamp = frame_number / fps
            
            return class_name, timestamp
    else:
        return None


In [21]:
import gc

def process_frame_group(frame_group, model, transform, device, class_name_to_idx, timestamp, fps):
    processed_frames = []
    for frame in frame_group:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = Image.fromarray(frame)
        frame = transform(frame)
        processed_frames.append(frame)

    # Стекирование кадров и добавление батч-размерности
    frames_tensor = torch.stack(processed_frames).unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(frames_tensor)
        _, predicted = torch.max(outputs, 1)
        class_name = [name for name, idx in class_name_to_idx.items() if idx == predicted.item()][0]
    
    del frames_tensor
    gc.collect()
    
    return class_name, timestamp


In [22]:
import concurrent.futures

def process_video_parallel(video_path, model, transform, device, class_name_to_idx):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    action_timestamps = {class_name: [] for class_name in class_name_to_idx.keys()}
    
    max_workers = 4  # Ограничение количества одновременных задач
    
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()

    # Выборка трех кадров каждую секунду
    frames_to_process = [frames[i:i+3] for i in range(0, len(frames), int(fps)) if i+3 <= len(frames)]
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_frame = {executor.submit(process_frame_group, frame_group, model, transform, device, class_name_to_idx, idx / fps, fps): idx for idx, frame_group in enumerate(frames_to_process)}
        for future in concurrent.futures.as_completed(future_to_frame):
            result = future.result()
            if result is not None:
                class_name, timestamp = result
                print(result)
                display_image(frames_to_process[future_to_frame[future]][0])
                action_timestamps[class_name].append(timestamp)

    return action_timestamps

In [24]:
video_path = 'zx,c.mp4'
# Убедитесь, что модель находится в режиме .eval() и загружена на устройство перед вызовом этой функции
timestamps = process_video_parallel(video_path, model, transform, device, class_name_to_idx)

for action, times in timestamps.items():
    print(f"Действие '{action}' обнаружено в следующих временных метках: {times}")


ValueError: range() arg 3 must not be zero