# Action Recognition Model for TRACKO - VideoMAE

Notebook ini merupakan notebook development untuk mencoba dan memvalidasi model secara lokal.

Pipeline:
1. Input video → YOLO detection → create bounding boxes for each person
2. Extract person clips → predict actions using trained model  
3. Post-processing: Filter out short duration events (<0.4s) to remove unstable predictions
4. Export results to CSV and annotated video

Requirements:
- Input video: sample_video/video.mp4
- Output: annotated video + action summary CSV

## Setup & Instalasi

In [None]:
import torch
from transformers import AutoImageProcessor, AutoModelForVideoClassification
from ultralytics import YOLO
from decord import VideoReader, cpu
import numpy as np
import cv2
import os
from huggingface_hub import snapshot_download
from IPython.display import HTML, display
from base64 import b64encode
from collections import defaultdict
import csv

# Load utilities
from action_utils import *

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [3]:
# Generate path for saving results
current_run = get_next_run_number()
run_folder = f"hasil/{current_run:03d}"
os.makedirs(run_folder, exist_ok=True)

input_video_path = "../sample_video/sample.mp4"
output_video_path = f"{run_folder}/videomae_prediction.mp4"
output_csv_path = f"{run_folder}/videomae_rekap.csv"

print(f"Run #{current_run:03d}")
print(f"Output folder: {run_folder}")
print(f"Video output: {output_video_path}")
print(f"CSV output: {output_csv_path}")

Run #001
Output folder: hasil/001
Video output: hasil/001/videomae_prediction.mp4
CSV output: hasil/001/videomae_rekap.csv


In [None]:
# VideoMAE Action Recognition Model for TRACKO
MODEL_PATH = "haipradana/tracko-videomae-action-detection"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

yolo_model = YOLO("yolo11n.pt")
yolo_model.to(device)

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt'...


100%|██████████| 5.35M/5.35M [00:00<00:00, 103MB/s]


YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C3k2(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_

In [5]:
action_model = AutoModelForVideoClassification.from_pretrained(MODEL_PATH)
image_processor = AutoImageProcessor.from_pretrained(MODEL_PATH)
action_model.to(device)
action_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/936 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/485M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


TimesformerForVideoClassification(
  (timesformer): TimesformerModel(
    (embeddings): TimesformerEmbeddings(
      (patch_embeddings): TimesformerPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (pos_drop): Dropout(p=0.0, inplace=False)
      (time_drop): Dropout(p=0.0, inplace=False)
    )
    (encoder): TimesformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x TimesformerLayer(
          (drop_path): Identity()
          (attention): TimeSformerAttention(
            (attention): TimesformerSelfAttention(
              (qkv): Linear(in_features=768, out_features=2304, bias=True)
              (attn_drop): Dropout(p=0.0, inplace=False)
            )
            (output): TimesformerSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): TimesformerIntermediate(
            (dense

## Main Execution Block

In [None]:
if 'action_model' in locals() and os.path.exists(input_video_path):
    predict_multiperson_video(
        video_path=input_video_path,
        output_path=output_video_path,
        yolo=yolo_model,
        action_classifier=action_model,
        processor=image_processor,
        device=device,
        output_csv_path=output_csv_path
    )

    print("\n--- Hasil Akhir ---")
    print(f"Video output disimpan di: {output_video_path}")
    print(f"Rekap aksi disimpan di: {output_csv_path}")

else:
    print("Eksekusi dibatalkan. Pastikan model telah dimuat dan path video input benar.")


Memulai pelacakan orang...
[31m[1mrequirements:[0m Ultralytics requirement ['lap>=0.5.12'] not found, attempting AutoUpdate...

[31m[1mrequirements:[0m AutoUpdate success ✅ 0.8s


video 1/1 (frame 1/261) /content/drive/MyDrive/datathon_2025/videos/multiperson.mp4: 384x640 10 persons, 240.0ms
video 1/1 (frame 2/261) /content/drive/MyDrive/datathon_2025/videos/multiperson.mp4: 384x640 10 persons, 34.7ms
video 1/1 (frame 3/261) /content/drive/MyDrive/datathon_2025/videos/multiperson.mp4: 384x640 10 persons, 29.7ms
video 1/1 (frame 4/261) /content/drive/MyDrive/datathon_2025/videos/multiperson.mp4: 384x640 11 persons, 53.0ms
video 1/1 (frame 5/261) /content/drive/MyDrive/datathon_2025/videos/multiperson.mp4: 384x640 11 persons, 33.5ms
video 1/1 (frame 6/261) /content/drive/MyDrive/datathon_2025/videos/multiperson.mp4: 384x640 11 persons, 21.9ms
video 1/1 (frame 7/261) /content/drive/MyDrive/datathon_2025/videos/multiperson.mp4: 384x640 11 persons, 19.9ms
video 1/1 (frame 8/261) /cont

In [None]:
# JIKA PAKAI COLAB: Tampilkan video hasil
print("\nMenampilkan video hasil:")
mp4 = open(output_video_path,'rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

# Tampilkan isi file CSV
if os.path.exists(output_csv_path):
    print(f"\nIsi file rekap aksi ({output_csv_path}):")
    with open(output_csv_path, 'r') as f:
        print(f.read())