In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:

import torch
from transformers import  TimesformerConfig, TimesformerForVideoClassification,AutoImageProcessor
import imageio
import cv2
import numpy as np
import os
import time

In [None]:
# Load the image-processor settings; its `size`, `image_mean` and `image_std`
# are read later to build the frame transforms.
# NOTE(review): this processor comes from the "MCG-NJU/videomae-base" checkpoint
# while the model loaded further down is "facebook/timesformer-base-finetuned-k400"
# — confirm both expect the same input size and normalization statistics.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
import  torchvision.transforms.v2
class ToPILImage:
    """Clip-level wrapper that converts each frame to a PIL image."""

    def __init__(self):
        # One converter instance is shared by all frames.
        self.to_pil = torchvision.transforms.ToPILImage()

    def __call__(self, frames):
        """Return a list with the PIL version of every element of ``frames``."""
        return list(map(self.to_pil, frames))
class Resize:
    """Clip-level wrapper that resizes each frame to a fixed ``(h, w)``."""

    def __init__(self, h, w):
        self.h, self.w = h, w
        self.resize = torchvision.transforms.v2.Resize((h, w))

    def __call__(self, frames):
        """Return a list with every frame of ``frames`` resized."""
        resized = []
        for frame in frames:
            resized.append(self.resize(frame))
        return resized

class ToTensor:
    """Clip-level wrapper that converts each frame with ``v2.ToTensor``."""

    def __init__(self):
        self.to_tensor = torchvision.transforms.v2.ToTensor()

    def __call__(self, frames):
        """Return a list with the converted version of every frame."""
        return list(map(self.to_tensor, frames))

class SampleFrames:
    """Turn a list of frame tensors into a clip of exactly ``num_frames`` frames.

    Clips that are too short are tiled cyclically; longer clips are reduced
    with ``UniformTemporalSubsample``.
    """

    def __init__(self, num_frames=8):
        self.num_frames = num_frames

    def __call__(self, frames):
        """Stack ``frames`` and return a tensor with ``num_frames`` entries on dim 0."""
        clip = torch.stack(frames)
        available = clip.shape[0]

        if available >= self.num_frames:
            # Enough material: pick evenly spaced frames along the time axis.
            sampler = torchvision.transforms.v2.UniformTemporalSubsample(self.num_frames)
            return sampler(clip.unsqueeze(0)).squeeze(0)

        # Too few frames: repeat the clip whole, then top up from its start.
        whole, leftover = divmod(self.num_frames, available)
        tiled = clip.repeat((whole, 1, 1, 1))
        if leftover == 0:
            return tiled
        return torch.cat((tiled, clip[:leftover]), dim=0)
class Normalize:
    """Clip-level wrapper that normalizes each frame with a fixed mean/std."""

    def __init__(self, mean, std):
        self.normalize = torchvision.transforms.v2.Normalize(mean=mean, std=std)

    def __call__(self, frames):
        """Return a list with every frame of ``frames`` normalized."""
        return list(map(self.normalize, frames))
class RandomCrop:
    """Randomly crop a whole clip to ``size`` using one shared crop window.

    The crop location is drawn once per clip so that every frame shows the
    same region; cropping frames independently would make the content jump
    from frame to frame.

    Args:
        size: Output size forwarded to ``torchvision.transforms.v2.RandomCrop``.
    """

    def __init__(self, size):
        self.size = size
        self.random_crop = torchvision.transforms.v2.RandomCrop(size)

    def __call__(self, frames):
        """Crop all frames with the same randomly chosen window.

        Args:
            frames: Sequence of frames of one clip (PIL images, or plain
                tensors that all share the same shape).

        Returns:
            List of cropped frames.
        """
        frames = list(frames)
        if not frames:
            return frames
        if all(type(frame) is torch.Tensor for frame in frames):
            # A v2 transform only touches the first plain tensor of an input
            # structure, so batch the frames along a leading time axis instead.
            return list(self.random_crop(torch.stack(frames)).unbind(0))
        # One call on the whole list: the v2 transform samples its random
        # parameters once and reuses them for every frame.
        return self.random_crop(frames)
class RandomHorizontalFlip:
    """Horizontally flip a whole clip with probability ``p``.

    The flip decision is drawn once per clip, so either every frame is
    flipped or none is; deciding per frame would mirror only part of the clip.

    Args:
        p: Probability of flipping the clip.
    """

    def __init__(self, p=0.5):
        self.p = p
        self.random_horizontal_flip = torchvision.transforms.v2.RandomHorizontalFlip(p=p)

    def __call__(self, frames):
        """Apply one shared flip decision to all frames.

        Args:
            frames: Sequence of frames of one clip (PIL images, or plain
                tensors that all share the same shape).

        Returns:
            List of frames, all flipped or all untouched.
        """
        frames = list(frames)
        if not frames:
            return frames
        if all(type(frame) is torch.Tensor for frame in frames):
            # A v2 transform only touches the first plain tensor of an input
            # structure, so batch the frames along a leading time axis instead.
            return list(self.random_horizontal_flip(torch.stack(frames)).unbind(0))
        # One call on the whole list: the v2 transform samples its random
        # parameters once and reuses them for every frame.
        return self.random_horizontal_flip(frames)
class RandomVerticalFlip:
    """Vertically flip a whole clip with probability ``p``.

    The flip decision is drawn once per clip, so either every frame is
    flipped or none is; deciding per frame would turn only part of the clip
    upside down.

    Args:
        p: Probability of flipping the clip.
    """

    def __init__(self, p=0.5):
        self.p = p
        self.random_vertical_flip = torchvision.transforms.v2.RandomVerticalFlip(p=p)

    def __call__(self, frames):
        """Apply one shared flip decision to all frames.

        Args:
            frames: Sequence of frames of one clip (PIL images, or plain
                tensors that all share the same shape).

        Returns:
            List of frames, all flipped or all untouched.
        """
        frames = list(frames)
        if not frames:
            return frames
        if all(type(frame) is torch.Tensor for frame in frames):
            # A v2 transform only touches the first plain tensor of an input
            # structure, so batch the frames along a leading time axis instead.
            return list(self.random_vertical_flip(torch.stack(frames)).unbind(0))
        # One call on the whole list: the v2 transform samples its random
        # parameters once and reuses them for every frame.
        return self.random_vertical_flip(frames)
class RandomRotation:
    """Rotate a whole clip by one randomly drawn angle.

    The angle is sampled once per clip so that all frames stay aligned;
    rotating each frame by its own angle would make the clip wobble.

    Args:
        degrees: Angle range forwarded to
            ``torchvision.transforms.v2.RandomRotation``.
    """

    def __init__(self, degrees):
        self.degrees = degrees
        self.random_rotation = torchvision.transforms.v2.RandomRotation(degrees)

    def __call__(self, frames):
        """Rotate all frames by the same randomly chosen angle.

        Args:
            frames: Sequence of frames of one clip (PIL images, or plain
                tensors that all share the same shape).

        Returns:
            List of rotated frames.
        """
        frames = list(frames)
        if not frames:
            return frames
        if all(type(frame) is torch.Tensor for frame in frames):
            # A v2 transform only touches the first plain tensor of an input
            # structure, so batch the frames along a leading time axis instead.
            return list(self.random_rotation(torch.stack(frames)).unbind(0))
        # One call on the whole list: the v2 transform samples its random
        # parameters once and reuses them for every frame.
        return self.random_rotation(frames)
class RandomResizedCrop:
    """Crop a random region of a whole clip and resize it to ``size``.

    The crop region is sampled once per clip so that every frame shows the
    same part of the scene; sampling per frame would destroy the motion cues
    between consecutive frames.

    Args:
        size: Output size of each frame.
        scale: Range of the cropped area relative to the input area.
        ratio: Range of aspect ratios of the cropped region.
    """

    def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.)):
        self.size = size
        self.scale = scale
        self.ratio = ratio
        self.random_resized_crop = torchvision.transforms.v2.RandomResizedCrop(size, scale=scale, ratio=ratio)

    def __call__(self, frames):
        """Crop and resize all frames using the same randomly chosen region.

        Args:
            frames: Sequence of frames of one clip (PIL images, or plain
                tensors that all share the same shape).

        Returns:
            List of cropped and resized frames.
        """
        frames = list(frames)
        if not frames:
            return frames
        if all(type(frame) is torch.Tensor for frame in frames):
            # A v2 transform only touches the first plain tensor of an input
            # structure, so batch the frames along a leading time axis instead.
            return list(self.random_resized_crop(torch.stack(frames)).unbind(0))
        # One call on the whole list: the v2 transform samples its random
        # parameters once and reuses them for every frame.
        return self.random_resized_crop(frames)
class CenterCrop:
    """Clip-level wrapper that center-crops each frame to ``size``."""

    def __init__(self, size):
        self.size = size
        self.center_crop = torchvision.transforms.v2.CenterCrop(size)

    def __call__(self, frames):
        """Return a list with every frame of ``frames`` center-cropped."""
        cropped = []
        for frame in frames:
            cropped.append(self.center_crop(frame))
        return cropped
class ColorJitter:
    """Apply one randomly drawn color jitter to a whole clip.

    Brightness, contrast, saturation and hue factors are sampled once per
    clip; jittering each frame independently would make the clip flicker.

    Args:
        brightness: Brightness jitter strength.
        contrast: Contrast jitter strength.
        saturation: Saturation jitter strength.
        hue: Hue jitter strength.
    """

    def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
        self.color_jitter = torchvision.transforms.v2.ColorJitter(
            brightness=brightness, contrast=contrast, saturation=saturation, hue=hue)

    def __call__(self, frames):
        """Jitter all frames with the same randomly chosen factors.

        Args:
            frames: Sequence of frames of one clip (PIL images, or plain
                tensors that all share the same shape).

        Returns:
            List of color-jittered frames.
        """
        frames = list(frames)
        if not frames:
            return frames
        if all(type(frame) is torch.Tensor for frame in frames):
            # A v2 transform only touches the first plain tensor of an input
            # structure, so batch the frames along a leading time axis instead.
            return list(self.color_jitter(torch.stack(frames)).unbind(0))
        # One call on the whole list: the v2 transform samples its random
        # parameters once and reuses them for every frame.
        return self.color_jitter(frames)

In [None]:
from torchvision.datasets import DatasetFolder
import random

class CustomDatasetFolder(DatasetFolder):
    """``DatasetFolder`` that keeps a random subset of files for each class.

    For every class directory, at most ``max_per_class`` matching files are
    drawn at random. Both knobs below are class attributes because
    ``make_dataset`` runs inside ``DatasetFolder.__init__``; override them in
    a subclass (or on the class) before instantiating.
    """

    # Upper bound on the number of files kept per class; ``None`` keeps all.
    max_per_class = 175
    # Seed for the per-class sampling; ``None`` draws from the global
    # ``random`` state (not reproducible unless that state is seeded).
    sample_seed = None

    def make_dataset(self, directory, class_to_idx, extensions=None, is_valid_file=None, allow_empty=False):
        """Build the ``(path, class_index)`` list for the dataset.

        Args:
            directory: Root directory containing one sub-directory per class.
            class_to_idx: Mapping from class name to class index.
            extensions: Allowed file suffix or suffixes, matched
                case-insensitively. Falls back to ``".gif"`` when neither
                ``extensions`` nor ``is_valid_file`` is given.
            is_valid_file: Optional predicate called with the file path; takes
                precedence over ``extensions`` when provided.
            allow_empty: Accepted for signature compatibility; classes without
                matching files are simply skipped either way.

        Returns:
            List of ``(path, class_index)`` tuples.
        """
        if is_valid_file is not None:
            accept = is_valid_file
        else:
            if extensions is None:
                suffixes = ('.gif',)
            elif isinstance(extensions, str):
                suffixes = (extensions,)
            else:
                suffixes = tuple(extensions)
            suffixes = tuple(suffix.lower() for suffix in suffixes)

            def accept(path):
                return path.lower().endswith(suffixes)

        # A dedicated generator keeps seeded sampling independent of other
        # users of the global ``random`` state.
        rng = random if self.sample_seed is None else random.Random(self.sample_seed)

        instances = []
        for target_class in sorted(class_to_idx.keys()):
            class_idx = class_to_idx[target_class]
            target_dir = os.path.join(directory, target_class)
            if not os.path.isdir(target_dir):
                continue
            # ``os.listdir`` order is arbitrary; sort so that a fixed seed
            # always selects the same files.
            candidates = sorted(
                path
                for path in (os.path.join(target_dir, name) for name in os.listdir(target_dir))
                if accept(path)
            )
            if self.max_per_class is None:
                keep = len(candidates)
            else:
                keep = min(self.max_per_class, len(candidates))
            for path in rng.sample(candidates, keep):
                instances.append((path, class_idx))
        return instances

In [None]:
from torchvision.datasets import DatasetFolder
from torchvision.transforms.v2 import Compose

def gif_loader(path):
    """Read a GIF file and return its frames as ``uint8`` NumPy arrays.

    Grayscale and RGBA frames are converted to 3-channel RGB; frames with any
    other layout are passed through unchanged (they are assumed to be RGB
    already).

    Args:
        path: Path of the GIF file to read.

    Returns:
        List with one NumPy array per frame, in file order.
    """
    frames = []
    for frame in imageio.mimread(path, memtest=False):
        frame_np = np.array(frame, dtype=np.uint8)

        # Test the rank first: for a 2-D frame ``shape[-1]`` is the image
        # width, so a 4- or 1-pixel-wide grayscale frame would otherwise be
        # mistaken for RGBA / single-channel data.
        if frame_np.ndim == 2:
            frame_np = cv2.cvtColor(frame_np, cv2.COLOR_GRAY2RGB)
        elif frame_np.shape[-1] == 4:
            # RGBA -> RGB (drops the alpha channel).
            frame_np = cv2.cvtColor(frame_np, cv2.COLOR_RGBA2RGB)
        elif frame_np.shape[-1] == 1:
            # Grayscale with an explicit channel axis -> RGB.
            frame_np = cv2.cvtColor(frame_np, cv2.COLOR_GRAY2RGB)

        frames.append(frame_np)

    return frames
# Target frame size is taken from the image processor so that the pipelines
# below stay in sync with it.
if "shortest_edge" in image_processor.size:
    height = width = image_processor.size["shortest_edge"]
else:
    height = image_processor.size["height"]
    width = image_processor.size["width"]
resize_to = (height, width)

# Training pipeline: each step receives and returns the list of frames of one
# clip; SampleFrames finally stacks them into a fixed-length tensor.
transform = Compose([
    ToPILImage(),                      # NumPy frames -> PIL images
    RandomHorizontalFlip(),
    Resize(height, width),             # bring every frame to the processor size
    RandomResizedCrop(resize_to),
    ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
    ToTensor(),                        # PIL images -> tensors
    Normalize(image_processor.image_mean, image_processor.image_std),
    SampleFrames(),                    # list of frames -> fixed-length clip tensor
])

# Evaluation pipeline: same as above without the random augmentations.
test_transform = Compose([
    ToPILImage(),
    Resize(height, width),
    ToTensor(),
    Normalize(image_processor.image_mean, image_processor.image_std),
    SampleFrames(),
])



# Training split: GIF clips read with gif_loader and augmented with `transform`.
train_dataset = CustomDatasetFolder(
    "/content/kinetics2/train",
    extensions=("gif",),
    loader=gif_loader,
    transform=transform,
)

# Evaluation split: same loader, deterministic `test_transform`.
test_dataset = CustomDatasetFolder(
    "/content/kinetics2/test",
    extensions=("gif",),
    loader=gif_loader,
    transform=test_transform,
)

# Report how many clips ended up in each split.
print(f"Custom dataset created with {len(train_dataset)} samples.")
print(f"Test dataset created with {len(test_dataset)} samples.")

Custom dataset created with 3426 samples.
Test dataset created with 1159 samples.




In [None]:
# Start from the configuration of the Kinetics-400 TimeSformer checkpoint.
config = TimesformerConfig.from_pretrained("facebook/timesformer-base-finetuned-k400")

# Raise the dropout rates for fine-tuning on a small dataset.
config.hidden_dropout_prob = 0.2  # Dropout after intermediate (hidden) layers
config.attention_probs_dropout_prob = 0.2 # Dropout in attention mechanism
# NOTE(review): `classifier_dropout_prob` is not a documented TimesformerConfig
# field — verify that it has any effect on this model.
config.classifier_dropout_prob = 0.2

# Describe the label set of *this* dataset. Without it the model keeps the
# 400-way Kinetics head, which does not match the classes being trained on.
config.label2id = dict(train_dataset.class_to_idx)
config.id2label = {idx: name for name, idx in config.label2id.items()}

# `ignore_mismatched_sizes=True` lets the checkpoint load even though the
# classification head now has a different number of outputs; the head is
# freshly initialized and learned during fine-tuning.
model = TimesformerForVideoClassification.from_pretrained(
    "facebook/timesformer-base-finetuned-k400",
    config=config,
    ignore_mismatched_sizes=True,
)


config.json:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

In [None]:
# FSDP settings, laid out like an Accelerate launch configuration.
# NOTE(review): `mixed_precision` is "bf16" here while the TrainingArguments
# cell requests fp16 — confirm which precision is intended.
fsdp_config = {
    "compute_environment": "LOCAL_MACHINE",
    "debug": False,
    "distributed_type": "FSDP",
    "downcast_bf16": "no",
    "fsdp_config": {
        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
        "fsdp_backward_prefetch_policy": "BACKWARD_PRE",
        "fsdp_forward_prefetch": False,
        "fsdp_cpu_ram_efficient_loading": True,
        "fsdp_offload_params": False,
        "fsdp_sharding_strategy": "FULL_SHARD",
        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
        "fsdp_sync_module_states": True,
        # Must name the transformer block class of the model being trained;
        # TimeSformer has no "BertLayer".
        "fsdp_transformer_layer_cls_to_wrap": "TimesformerLayer",
        "fsdp_use_orig_params": True
    },
    "machine_rank": 0,
    "main_training_function": "main",
    "mixed_precision": "bf16",
    "num_machines": 1,
    "num_processes": 2,
    "rdzv_backend": "static",
    "same_network": True,
    "tpu_env": [],
    "tpu_use_cluster": False,
    "tpu_use_sudo": False,
    "use_cpu": False,
}

In [None]:
from transformers import TrainingArguments, Trainer,EarlyStoppingCallback
model_name ='timesformer'
new_model_name = f"{model_name}-finetuned-workouts"
num_epochs = 10

# Training configuration. Checkpoints are written to the `model_name` directory.
args = TrainingArguments(
    model_name,
    remove_unused_columns=False,  # keep "pixel_values"/"labels" produced by the collator
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    torch_empty_cache_steps=4,
    per_device_eval_batch_size=5,
    warmup_ratio=0.01,
    lr_scheduler_type="cosine", # Use cosine annealing scheduler
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,      # For accuracy, higher is better
    push_to_hub=False,
    # Train for `num_epochs` passes over the data. The previous
    # `max_steps=(len(train_dataset) // 5) * num_epochs` divided by 5 instead
    # of the effective batch size (8 * 4), so training ran for many more
    # epochs than `num_epochs`.
    num_train_epochs=num_epochs,
    report_to="none",
    # NOTE(review): no `fsdp` strategy is passed, so confirm whether this
    # configuration is actually picked up by the Trainer.
    fsdp_config=fsdp_config,
    save_total_limit=5,
    fp16=True,
)

model.safetensors:   0%|          | 0.00/486M [00:00<?, ?B/s]

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls `metric.compute` on it.
metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    """Score predictions with the module-level ``metric``.

    The predicted class of each sample is the arg-max of
    ``eval_pred.predictions`` along axis 1; it is compared against
    ``eval_pred.label_ids``.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
def collate_fn(examples):
    """Collate ``(clip, label)`` pairs into one model-ready batch.

    Args:
        examples: Sequence of ``(clip_tensor, class_index)`` pairs; all clip
            tensors must share the same shape so they can be stacked.

    Returns:
        Dict with ``"pixel_values"`` (clips stacked along a new leading batch
        dimension) and ``"labels"`` (1-D tensor of class indices).
    """
    pixel_values = torch.stack([clip for clip, _ in examples])
    labels = torch.tensor([label for _, label in examples])
    # No per-batch printing here: this runs for every training and evaluation
    # batch and would flood the notebook output.
    return {"pixel_values": pixel_values, "labels": labels}

In [None]:
from transformers import default_data_collator

# Wire model, data, metric and callbacks together.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated for this purpose; `processing_class` is the
    # supported way to attach the image processor so it is saved with the model.
    processing_class=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
    # Stop when the monitored metric has not improved for 7 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=7)],
)

  trainer = Trainer(


In [None]:
# Run fine-tuning and keep the object returned by `Trainer.train()`.
train_results = trainer.train()

tensor([ 2, 20, 16, 12, 18, 14,  6, 13])
tensor([18, 10,  6,  3, 18,  6, 16, 19])
tensor([ 2, 12, 17, 15,  1, 20,  6,  7])
tensor([15,  6,  4,  1,  3,  0,  3,  6])
tensor([ 0,  0,  4,  0, 13, 13,  3,  4])
