# Introduction
This repository contains the codebase for an advanced video analysis pipeline that combines multiple state-of-the-art models and methods (YOLOv8, ByteTrack, MoveNet, and Transformer encoders) for high-accuracy activity recognition. The pipeline is trained on the Human Activity Recognition (HAR) Video Dataset.

Reference:
Datasets: https://www.kaggle.com/datasets/sharjeelmazhar/human-activity-recognition-video-dataset?resource=download-directory

In [None]:
# Shell out to nvidia-smi to show the GPU/driver status visible to this kernel.
!nvidia-smi



# Install Dependencies

In [None]:
import os
import subprocess
import sys

def install_package(package_name, pip_command=None):
    """Install a package with pip when it cannot be imported.

    Args:
        package_name: Importable module name used to probe for an existing
            installation (e.g. ``"cv2"``).
        pip_command: Requirement specifier handed to ``pip install`` when the
            import fails (e.g. ``"opencv-python"``). Falls back to
            ``package_name`` when omitted or empty.

    Raises:
        subprocess.CalledProcessError: If ``pip install`` exits non-zero.
    """
    # Function-scope import: the cell only imports os, subprocess and sys.
    import importlib

    try:
        # Resolve the module by name instead of exec()-ing a built string.
        importlib.import_module(package_name)
    except ImportError:
        # Not importable: install with the interpreter that runs this kernel.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", pip_command or package_name]
        )

# Install torch, torchvision and torchaudio from the PyTorch cu118 wheel index.
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Usage
# Maps an importable module name to the pip requirement that provides it
# (None: the pip name equals the module name).
# Every key appears exactly once. The previous literal repeated 'cv2',
# 'tensorflow' and 'matplotlib'; a dict literal keeps only the last value per
# key, so the entries below are the ones that actually took effect.
# NOTE(review): the tensorflow pin is only installed when `import tensorflow`
# fails - confirm this version is still the intended one.
packages_to_install = {
    'cv2': 'opencv-python',
    'tensorflow': 'tensorflow-gpu==2.4.1',
    'matplotlib': None,
    'imageio': None,
    'tfdocs': 'git+https://github.com/tensorflow/docs',
    'transformers': 'git+https://github.com/huggingface/transformers',
    'sklearn': 'scikit-learn',
    'scipy': None,
    'tensorflow_hub': None,
}

# Install whatever is not importable yet, in declaration order.
for package, command in packages_to_install.items():
    install_package(package, command)

# Install timm with pip.
!pip install timm

# Change into the local ultralytics package folder and install its requirements file.
%cd "C:/Users/Duc Anh/Desktop/#ActivityRecognition/packages/ultralytics"
!pip install -r requirements.txt

# Import Dependencies

In [None]:
# Core framework imports (PyTorch, TensorFlow, NumPy) plus stdlib helpers
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import tensorflow as tf
import numpy as np
import random
import gc
gc.enable()

# Third-party Libraries
import ultralytics
from ultralytics import YOLO
from sklearn.model_selection import train_test_split
from tensorflow import keras

# Custom Module Imports
from train import initialize_dense_model, initialize_lstm_model, initialize_cnn_model, train_model
from datasets import create_datasets, create_cnn_datasets

# Pretrained-weight enum for torchvision's ResNet-152
from torchvision.models.resnet import ResNet152_Weights

In [None]:
seed_constant = 27
np.random.seed(seed_constant)
random.seed(seed_constant)
tf.random.set_seed(seed_constant)

SEQUENCE_LENGTH = 50

# Datasets
DATASET_DIR = 'C:/Users/Duc Anh/Desktop/#ActivityRecognition/datasets/Human Activity Recognition - Video Dataset'
CLASSES_LIST = sorted([entry.name for entry in os.scandir(DATASET_DIR) if entry.is_dir()])
print(f'{CLASSES_LIST}')
print(f'LENGTH: {len(CLASSES_LIST)}')

# Setting Up Saving Folder
final = "C:/Users/Duc Anh/Desktop/#ActivityRecognition/final"
if not os.path.exists(final):
    os.makedirs(final)
%cd "C:/Users/Duc Anh/Desktop/#ActivityRecognition/final"

# Initialize Model


In [None]:
# Initialize YOLO model
yolo_model = YOLO('yolov8x.pt')

# For spatial features: ImageNet-pretrained ResNet-152 with its last child
# module dropped, switched to inference mode.
spatial_resnet = models.resnet152(weights=ResNet152_Weights.IMAGENET1K_V1)
spatial_resnet = nn.Sequential(*list(spatial_resnet.children())[:-1])
spatial_resnet.eval()

# For temporal features: a second, independent copy built the same way.
temporal_resnet = models.resnet152(weights=ResNet152_Weights.IMAGENET1K_V1)
temporal_resnet = nn.Sequential(*list(temporal_resnet.children())[:-1])
temporal_resnet.eval()


# Define Transformer Encoders
# Neither encoder is trained in this notebook, so both are put in eval mode
# like the ResNets above; otherwise their dropout layers stay active and the
# extracted features are not deterministic.
# NOTE(review): d_model=2048 presumably matches the ResNet feature width - confirm.

# fewer attention heads for spatial model
spatial_encoder_layers = TransformerEncoderLayer(d_model=2048, nhead=4)
spatial_transformer = TransformerEncoder(spatial_encoder_layers, num_layers=2).eval()

# more attention heads for temporal model
temporal_encoder_layers = TransformerEncoderLayer(d_model=2048, nhead=16)
temporal_transformer = TransformerEncoder(temporal_encoder_layers, num_layers=2).eval()

# Data Pre-processing

In [None]:
# Datasets for Dense + LSTM
%cd "C:/Users/Duc Anh/Desktop/#ActivityRecognition/final/create_datasets"
# Preparing Datasets for Training
dl_features, dl_labels, dl_video_files_paths = create_datasets(yolo_model,
                                                      spatial_resnet,
                                                      spatial_transformer,
                                                      temporal_resnet,
                                                      temporal_transformer,
                                                      DATASET_DIR,
                                                      CLASSES_LIST,
                                                      SEQUENCE_LENGTH,
                                                      NUM_VIDEOS_TO_PROCESS=25)

# Drop any NaN
if np.isnan(dl_features).any():
  # Identify sequences that contain any nan values
  sequences_with_nan = np.any(np.isnan(dl_features), axis=(1, 2))

  # Filter out those sequences
  dl_features = dl_features[~sequences_with_nan]
  dl_labels = dl_labels[~sequences_with_nan]

  print(f"Original number of sequences: {dl_features.shape[0]}")
  print(f"Number of sequences after removal: {dl_labels.shape[0]}")
  print(f'{dl_features.shape}')
  print(f'{dl_labels.shape}')

# Data Augmented, this might be a good testing to scale the datasets for training
from datasets import augment_features

# # Generate the augmented data
all_augmented_features = augment_features(dl_features)

# # Combine original and all augmented versions
all_features = np.concatenate((dl_features, all_augmented_features), axis=0)

# # Create labels for the augmented data
all_labels = np.tile(dl_labels, (all_features.shape[0] // dl_features.shape[0],))

from tensorflow.keras.utils import to_categorical
dl_one_hot_encoded_labels = to_categorical(all_labels)
dl_features_train, dl_features_test, dl_labels_train, dl_labels_test = train_test_split(all_features, dl_one_hot_encoded_labels,
                                                                            test_size = 0.2, shuffle = True,
                                                                            random_state = seed_constant)

In [None]:
# Datasets for CNN Model
%cd "C:/Users/Duc Anh/Desktop/#ActivityRecognition/final/create_cnn_datasets"
cnn_features, cnn_labels, cnn_video_files_paths = create_cnn_datasets(DATASET_DIR,
                                                         CLASSES_LIST,
                                                         SEQUENCE_LENGTH,
                                                         NUM_VIDEOS_TO_PROCESS=25)

# Drop any NaN
if np.isnan(cnn_features).any():
  # Identify sequences that contain any nan values
  sequences_with_nan = np.any(np.isnan(cnn_features), axis=(1, 2))

  # Filter out those sequences
  cnn_features = features[~sequences_with_nan]
  cnn_labels = cnn_labels[~sequences_with_nan]

  print(f"Original number of sequences: {cnn_features.shape[0]}")
  print(f"Number of sequences after removal: {cnn_labels.shape[0]}")
  print(f'{cnn_features.shape}')
  print(f'{cnn_labels.shape}')

from tensorflow.keras.utils import to_categorical
cnn_one_hot_encoded_labels = to_categorical(cnn_labels)
# Please run this if you want to split the data without Augment!
cnn_features_train, cnn_features_test, cnn_labels_train, cnn_labels_test = train_test_split(cnn_features, cnn_one_hot_encoded_labels,
                                                                            test_size = 0.2, shuffle = True,
                                                                            random_state = seed_constant)

In [None]:
# Report the shape of every train/test array, one section per model family.
print('Dense + LSTM')
for caption, array in (
    ('Features Train', dl_features_train),
    ('Labels Train', dl_labels_train),
    ('Features Test', dl_features_test),
    ('Labels Test', dl_labels_test),
):
    print(f'{caption}: {array.shape}')
# Blank separator line between the two sections
print()
print('CNN')
for caption, array in (
    ('Features Train', cnn_features_train),
    ('Labels Train', cnn_labels_train),
    ('Features Test', cnn_features_test),
    ('Labels Test', cnn_labels_test),
):
    print(f'{caption}: {array.shape}')


# Model Training

In [None]:
# Return to the saving folder before training.
%cd "C:/Users/Duc Anh/Desktop/#ActivityRecognition/final/"

In [None]:
# Per-sample input shape: sizes of axes 1 and 2 of the Dense/LSTM features
input_shape = tuple(dl_features.shape[axis] for axis in (1, 2))
num_classes = len(CLASSES_LIST)

# Build the dense model and show its layer summary
dense_model = initialize_dense_model(input_shape, num_classes)
dense_model.summary()

# Train it. The epoch cap is effectively unbounded; the patience argument is
# presumably what ends training - confirm in train.train_model.
dense_model_name = 'dense_model'
dense_model = train_model(
    dense_model_name,
    dense_model,
    dl_features_train,
    dl_labels_train,
    batch_size=1,
    epochs=1_000_000_000_000,
    early_stopping_patience=20,
)

In [None]:
# Per-sample input shape: sizes of axes 1 and 2 of the Dense/LSTM features
input_shape = tuple(dl_features.shape[axis] for axis in (1, 2))
num_classes = len(CLASSES_LIST)

# Build the LSTM model and show its layer summary
lstm_model = initialize_lstm_model(input_shape, num_classes)
lstm_model.summary()

# Train it. The epoch cap is effectively unbounded; the patience argument is
# presumably what ends training - confirm in train.train_model.
lstm_model_name = 'lstm_model'
lstm_model = train_model(
    lstm_model_name,
    lstm_model,
    dl_features_train,
    dl_labels_train,
    batch_size=1,
    epochs=1_000_000_000_000,
    early_stopping_patience=20,
)

In [None]:
# Per-sample input shape: sizes of axes 1 through 4 of the CNN features
input_shape = tuple(cnn_features.shape[axis] for axis in (1, 2, 3, 4))
num_classes = len(CLASSES_LIST)

# Build the CNN model and show its layer summary
cnn_model = initialize_cnn_model(input_shape, num_classes)
cnn_model.summary()


# Train it. The epoch cap is effectively unbounded; the patience argument is
# presumably what ends training - confirm in train.train_model.
cnn_model_name = 'cnn_model'
cnn_model = train_model(
    cnn_model_name,
    cnn_model,
    cnn_features_train,
    cnn_labels_train,
    batch_size=4,
    epochs=1_000_000_000_000,
    early_stopping_patience=20,
)

# Predict on video

In [None]:
%cd "C:/Users/Duc Anh/Desktop/#ActivityRecognition/final/"

# Load the state dict previously saved 
dense_model_name = 'dense_model.h5'
dense_model = keras.models.load_model(dense_model_name)

lstm_model_name = 'lstm_model.h5'
lstm_model = keras.models.load_model(lstm_model_name)

cnn_model_name = 'cnn_model.h5'
cnn_model = keras.models.load_model(cnn_model_name)

In [None]:
from predict import predict_on_video

# Input clip and the file name of the output video.
# NOTE(review): every other path in this notebook sits under
# 'Desktop/#ActivityRecognition/'; this one has no '#' - confirm it is intended.
video_path = 'C:/Users/Duc Anh/Desktop/ActivityRecognition/test/videos/#TEST2.mp4'
output_video = '#TEST2.mp4'

# Run the three classifiers and the feature extractors on the clip.
# The sequence length comes from the notebook-wide constant instead of a
# repeated literal, so prediction cannot drift from the value used to build
# the training datasets.
predict_on_video(
    video_path,
    output_video,
    CLASSES_LIST,
    dense_model,
    lstm_model,
    cnn_model,
    yolo_model,
    spatial_resnet,
    spatial_transformer,
    temporal_resnet,
    temporal_transformer,
    SEQUENCE_LENGTH=SEQUENCE_LENGTH,
    debug=False,
)