In [None]:
import os
import urllib.request
import zipfile

# Example for Actor 01 (adjust URL for other actors)
url = "https://zenodo.org/record/1188976/files/Video_Speech_Actor_01.zip?download=1"
zip_path = "ravdess_video/Actor01.zip"
extract_dir = "ravdess_video/Actor01"
os.makedirs("ravdess_video", exist_ok=True)

# Skip the download when the archive is already on disk so re-running the
# notebook does not fetch the file again.
if not os.path.exists(zip_path):
    # Download to a temporary name and rename afterwards: an interrupted
    # transfer then never leaves a truncated file at zip_path.
    partial_path = zip_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, zip_path)

# Pure-Python extraction: overwrites existing files without prompting, so the
# cell can be re-run, and does not depend on an `unzip` binary being installed.
with zipfile.ZipFile(zip_path) as archive:
    archive.extractall(extract_dir)


In [None]:
!pip install timm einops opencv-python decord
!pip install -q \
    peft==0.10.0 \
    transformers==4.37.2 \
    tokenizers==0.15.2 \
    huggingface_hub==0.33.4 \
    datasets==2.19.1 \
    sentence-transformers==2.6.1 \
    accelerate==0.28.0 \
    torchaudio



In [None]:
import os
import numpy as np
from PIL import Image
from datasets import Dataset, DatasetDict
from decord import VideoReader, cpu
from transformers import (
    VideoMAEImageProcessor,
    TimesformerForVideoClassification,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
import torch


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
def extract_label_from_path(path):
    """Return the emotion name encoded in a RAVDESS clip's file name.

    File names are dash-separated two-digit fields, for example
    ``01-01-01-01-01-01-01.mp4``; the third field (index 2) is the emotion
    code looked up in the table below.

    Args:
        path: Path to the clip (``str`` or ``os.PathLike``). Only the final
            path component is inspected.

    Returns:
        The emotion name, or ``"unknown"`` when the code is not in the table
        or the file name has fewer than three dash-separated fields.
    """
    emotion_map = {
        "01": "neutral", "02": "calm", "03": "happy", "04": "sad",
        "05": "angry", "06": "fearful", "07": "disgust", "08": "surprised"
    }
    # basename understands the platform's path separators, unlike split("/").
    filename = os.path.basename(os.fspath(path))
    fields = filename.split("-")
    # A recursive glob can pick up unrelated .mp4 files; report them as
    # "unknown" (which callers already filter out) instead of raising IndexError.
    if len(fields) < 3:
        return "unknown"
    return emotion_map.get(fields[2], "unknown")


In [None]:
from glob import glob

# Clips are searched below the folder the download cell extracts into. The
# pattern is relative to the working directory, exactly like the download
# cell's own paths, so the two cells agree wherever the notebook is run.
VIDEO_GLOB = "ravdess_video/**/*.mp4"
# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

# glob returns matches in arbitrary filesystem order; sorting fixes the row
# order, which the seeded split below depends on.
video_paths = sorted(glob(VIDEO_GLOB, recursive=True))

# Keep only clips whose file name carries a known emotion code.
data = {"video_path": [], "label": []}
for path in video_paths:
    label = extract_label_from_path(path)
    if label != "unknown":
        data["video_path"].append(path)
        data["label"].append(label)

# Fail early with a readable message instead of an obscure error from the
# split when nothing was downloaded/extracted yet.
if not data["video_path"]:
    raise FileNotFoundError(
        f"No labelled RAVDESS clips matched {VIDEO_GLOB!r}; run the download cell first."
    )

# Emotion names -> contiguous integer ids, assigned in alphabetical order.
label2id = {label: idx for idx, label in enumerate(sorted(set(data["label"])))}
data["label"] = [label2id[label] for label in data["label"]]

dataset = Dataset.from_dict(data).train_test_split(test_size=0.2, seed=SPLIT_SEED)


In [None]:
# Frame preprocessor configured from the `facebook/timesformer-base-finetuned-k400`
# checkpoint; `transform` calls it on a list of PIL frames and reads the
# resulting "pixel_values".
# NOTE(review): presumably this handles resizing/normalisation to what the
# checkpoint expects — confirm against the checkpoint's preprocessor config.
image_processor = VideoMAEImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")

def load_video_frames(video_path, num_frames=8):
    """Sample evenly spaced frames from a video and return them as PIL images.

    Args:
        video_path: Location of the video file handed to ``VideoReader``.
        num_frames: How many frames to sample between the first and the last
            frame of the video (inclusive).

    Returns:
        A list with one ``PIL.Image`` per sampled frame, in temporal order.
    """
    reader = VideoReader(video_path, ctx=cpu(0))
    last_index = len(reader) - 1
    # Positions spread uniformly over [0, last_index], truncated to integers.
    sample_positions = np.linspace(0, last_index, num=num_frames, dtype=int)
    batch = reader.get_batch(sample_positions).asnumpy()  # shape: (num_frames, H, W, 3)
    return list(map(Image.fromarray, batch))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def transform(example):
    """Add preprocessed frame tensors to a single dataset row.

    Reads the clip referenced by ``example["video_path"]``, runs the sampled
    frames through ``image_processor`` and stores the result under
    ``example["pixel_values"]``. The same (mutated) row dict is returned.
    """
    sampled_frames = load_video_frames(example["video_path"])
    processed = image_processor(sampled_frames, return_tensors="pt")
    pixel_values = processed["pixel_values"]
    # squeeze(0) removes axis 0 when it has size 1 — presumably the batch axis
    # the processor adds for a single video.
    example["pixel_values"] = pixel_values.squeeze(0)  # (T, 3, H, W)
    return example

# Run `transform` over the dataset and drop the raw path column afterwards.
# NOTE(review): this rebinds `dataset` to a value without "video_path", so
# re-running this cell on the already-mapped dataset would presumably fail —
# re-run the dataset-building cell first.
dataset = dataset.map(transform, remove_columns=["video_path"])


Map:   0%|          | 0/96 [00:00<?, ? examples/s]

  return torch.tensor(value)


Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [None]:
from transformers import TimesformerModel, TimesformerConfig, TimesformerForVideoClassification, TrainingArguments, Trainer
import torch.nn as nn
import numpy as np

# Load base model configuration
# Start from the configuration of the `facebook/timesformer-base-finetuned-k400`
# checkpoint and re-target its label metadata to the classes in `label2id`.
config = TimesformerConfig.from_pretrained("facebook/timesformer-base-finetuned-k400")
# NOTE(review): num_labels is assigned before the two mappings; presumably
# setting it regenerates placeholder mappings, which the next two lines then
# overwrite with the real ones — confirm before reordering.
config.num_labels = len(label2id)
config.label2id = label2id
config.id2label = {v: k for k, v in label2id.items()}  # inverse of label2id: class id -> emotion name

# Load base model (without classification head)
# NOTE(review): presumably loading the checkpoint through TimesformerModel keeps
# only the backbone weights and discards the checkpoint's own head — confirm
# against the load-time messages.
base_model = TimesformerModel.from_pretrained("facebook/timesformer-base-finetuned-k400", config=config)

# Wrap it into a new classifier model manually
class CustomTimesformerForClassification(TimesformerForVideoClassification):
    """Video classifier that reuses the module-level pretrained ``base_model``.

    After the parent constructor has run, ``self.timesformer`` is replaced by
    ``base_model`` and ``self.classifier`` by a new linear layer mapping
    ``config.hidden_size`` features to ``config.num_labels`` outputs.

    NOTE(review): every instance references the same ``base_model`` object, so
    creating a second instance would share (and co-train) the backbone.
    NOTE(review): presumably the parent constructor already builds its own
    backbone and head, which are discarded here — confirm.
    """
    def __init__(self, config):
        super().__init__(config)
        # Swap in the pretrained backbone loaded above (a global, not a copy).
        self.timesformer = base_model
        # New head sized for this task's label count.
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

# Initialize the model
model = CustomTimesformerForClassification(config)

# Training args
# Evaluation and checkpointing both happen once per epoch, and the checkpoint
# with the best "accuracy" is reloaded when training finishes.
# NOTE(review): `evaluation_strategy` is the argument name used with the
# transformers==4.37.2 pin from the install cell; other releases may name it
# differently — check before changing the pin.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    load_best_model_at_end=True,
    # Must match the key returned by compute_metrics ("accuracy").
    metric_for_best_model="accuracy",
    logging_steps=10,
)

# Accuracy metric
def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            sample) and ``label_ids`` (the reference class ids).

    Returns:
        ``{"accuracy": fraction of samples whose highest-scoring class equals
        the reference label}``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    hits = predicted_ids == p.label_ids
    return {"accuracy": np.mean(hits)}

# Trainer
# No data_collator is passed, so the Trainer's default collation is used.
# NOTE(review): dataset["test"] serves as the per-epoch evaluation set and,
# through load_best_model_at_end, also drives checkpoint selection — there is
# no separate held-out split in this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

# Train
trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Epoch,Training Loss,Validation Loss,Accuracy
1,2.0642,2.115082,0.125
2,0.9957,0.62927,0.791667
3,0.363,0.389059,0.916667


TrainOutput(global_step=144, training_loss=1.3957309739457235, metrics={'train_runtime': 7273.0569, 'train_samples_per_second': 0.04, 'train_steps_per_second': 0.02, 'total_flos': 2.523400261997691e+17, 'train_loss': 1.3957309739457235, 'epoch': 3.0})

In [None]:
# Run evaluation once more and print the accuracy entry of the returned metrics.
# NOTE(review): presumably this evaluates on the eval_dataset given to the
# Trainer (dataset["test"]), i.e. the same split used to pick the best
# checkpoint, so the printed figure is not an independent test estimate.
metrics = trainer.evaluate()
print("Test Accuracy:", metrics["eval_accuracy"])


Test Accuracy: 0.9166666666666666
