In [None]:
import numpy as np
import torch
from transformers import VideoMAEForPreTraining, VideoMAEImageProcessor

# 1. 加载预训练的 VideoMAE 模型和图像处理器
model_name = "MCG-NJU/videomae-base"
model = VideoMAEForPreTraining.from_pretrained(model_name)
processor = VideoMAEImageProcessor.from_pretrained(model_name)

# 2. 准备数据和掩码
num_frames = 16
channels = 3
height = 224
width = 224

# 创建示例数据和掩码
data = np.random.randn(num_frames, channels, height, width).astype(np.float32) #* 16,3,224,224

# 强制数据在 [0, 1] 范围内
data = np.clip(data, 0, 1)

# 确保数据通道顺序为 [frames, height, width, channels]
data = data.transpose(0, 2, 3, 1)#* 16,224,224,3

# 将数据和掩码转换为 PyTorch 的 Tensor
pixel_values = processor(images=list(data), return_tensors="pt", do_rescale=False).pixel_values
#* 16,[3,224,224]
# 计算掩码的形状和序列长度
num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2 #* (224//16)**2=196
tubelet_seq_length = num_frames//model.config.tubelet_size#! tubelet_size:The number of tubelets

# 随机生成掩码（0表示缺失区域）
bool_masked_pos = torch.randint(0, 2, (1,num_patches_per_frame*tubelet_seq_length)).bool()# generate a array which shape is (1,seq_lem) and its range between (0,1)

# 3. 使用模型进行推理，得到编码后的特征表示
model.eval()
with torch.no_grad():
    outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) # Patches are flattened and embedded into vectors.
    #! model pre-processing pixel_values [1, 16, 3, 224, 224] to [1,1568,768] ==[1,1568,16*16*3]==[batch_size,tubelet_size,patch_size]
    encoded_features = outputs.logits #* batch_size,embedded_seq_length,hidden_size -> 1,variant,1536
    #! the embedded_seq_length would change depending on how many patches were masked.
#! Spatiotemporal Consistency: The model does not shuffle or lose the order of patches during processing, 
#! so any reconstruction attempt (e.g., decoding or projecting back to the pixel space) 
#! should preserve the original spatial and temporal relationships between patches.
# 4. 调整输入形状，确保其形状为 [batch_size, seq_length, feature_dim]
# encoded_features = encoded_features.view(1, -1, model.config.hidden_size) # model.config.hidden_size=768
projection_layer = torch.nn.Linear(outputs.logits.shape[2], 384)# hidden_size decoder typically epected is 384
#! Linear Layer allows for learning and adjusting weights to best fit the data.
# 5. 使用 decoder 进行解码
#! the model's decoding process handles the reconstruction.
decoded_frames = model.decoder(projection_layer(outputs.logits), return_token_num=tubelet_seq_length*num_patches_per_frame)
#* 1, 712, 1536
linear_layer = torch.nn.Linear(1536, 3 * 14 * 14)  # Project to patch size
projected_patches = linear_layer(decoded_frames.logits)  # Shape: (1, 748, 768)
projected_patches = projected_patches.view(1,-1, 3, 14, 14)  # Shape: (1, 773, 3, 16, 16)
num_frames_reconstructed = projected_patches.shape[1] // tubelet_seq_length
reconstructed_video = torch.zeros(1, num_frames_reconstructed, 3, 224, 224)


In [None]:
outputs.logits.shape,model.config.hidden_size,decoded_frames.logits.shape,projected_patches.shape,tubelet_seq_length,bool_masked_pos.shape

In [1]:
import torch
from transformers import VideoMAEConfig, VideoMAEForPreTraining, VideoMAEImageProcessor

# Create a dummy input tensor
input_tensor = torch.randn(1, 24, 3, 224, 224)

# Initialize the model
config = VideoMAEConfig(
    image_size=224,
    patch_size=16,
    num_channels=3,
    num_frames=24,
    tubelet_size=3,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.0,
    attention_probs_dropout_prob=0.0,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
    qkv_bias=True,
    use_mean_pooling=True
)
model = VideoMAEForPreTraining(config)
configuration = model.config
# Create a mask for missing values
num_patches_per_frame = (config.image_size // config.patch_size) ** 2
seq_length = (config.num_frames // config.tubelet_size) * num_patches_per_frame#!patch number of a tubelet
bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

# Prepare the input for the image processor
input_np = input_tensor.squeeze(0).permute(0, 2, 3, 1).numpy()
input_np = (input_np - input_np.min()) / (input_np.max() - input_np.min())

# Initialize the image processor with custom size
image_processor = VideoMAEImageProcessor(
    do_resize=False, 
    size={"height": 224, "width": 224},
    do_normalize=True,
    image_mean=[0.5, 0.5, 0.5],
    image_std=[0.5, 0.5, 0.5]
)

# Process the input
processed_input = image_processor(
    images=list(input_np),
    return_tensors="pt"
)

# Forward pass through the model
outputs = model(
    pixel_values=processed_input.pixel_values,
    bool_masked_pos=bool_masked_pos
)

# The output contains the loss and reconstructed patches
loss = outputs.loss
reconstructed_patches = outputs.logits

print("Loss:", loss.item())
print("Reconstructed patches shape:", reconstructed_patches.shape)


It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.
  return torch.tensor(value)


Loss: 1.1391116380691528
Reconstructed patches shape: torch.Size([1, 753, 2304])


In [2]:
import torch
import math

# Unflatten the logits to shape (batch_size, masked_seq, tubelet_size, patch_size, patch_size, num_channels)
batch_size, num_masked_patches, flattened_patch_dim = reconstructed_patches.shape
patch_size = config.patch_size
num_channels = config.num_channels
tubelet_size = config.tubelet_size

# Compute the number of patches per frame and number of frames
num_patches_per_frame = (config.image_size // patch_size) ** 2
num_frames = config.num_frames

#TODO OR Reshape the logits with different position of tubelet_size 
reconstructed_patches = reconstructed_patches.view(
    batch_size, num_masked_patches, tubelet_size, patch_size, patch_size, num_channels
)

# Create an empty tensor for the output of shape (batch_size, num_frames, num_channels, image_size, image_size)
reconstructed_video = torch.zeros(
    batch_size, num_frames,  config.image_size, config.image_size,num_channels
)

# Get the indices of the masked patches
masked_indices = torch.nonzero(bool_masked_pos[0]).flatten()

# Loop through each masked patch and place it into the correct position in the original video grid
for i, idx in enumerate(masked_indices):
    frame_idx = idx // num_patches_per_frame  # Find the corresponding frame
    patch_idx = idx % num_patches_per_frame  # Find the corresponding patch within the frame
    
    # Compute the top-left corner of the patch in the frame
    y = (patch_idx // (config.image_size // patch_size)) * patch_size
    x = (patch_idx % (config.image_size // patch_size)) * patch_size
    
    for j in range(tubelet_size):
    # Place the reconstructed patch back into the original video grid
        reconstructed_video[:, frame_idx+j*num_frames//tubelet_size,  y:y+patch_size, x:x+patch_size,:] = reconstructed_patches[:, i,j]

# The reconstructed_video now has the same spatial and temporal dimensions as the original input
print("Reconstructed video shape:", reconstructed_video.shape)


Reconstructed video shape: torch.Size([1, 24, 224, 224, 3])


In [None]:
from transformers import AutoImageProcessor, VideoMAEForPreTraining
import numpy as np
import torch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_frames = 16
video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base")
# Move the model to GPU
# model.to(device)
pixel_values = image_processor(video, return_tensors="pt").pixel_values
# Move the inputs to GPU
# pixel_values = pixel_values.to(device)
num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame#!patch number of a tubelet
bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
loss = outputs.loss
print("Reconstructed patches shape:", outputs.logits.shape)

In [None]:
import torch.nn.functional as F

# Assume `input_tensor` is the original input video tensor with shape (batch_size, num_frames, num_channels, height, width)
# Squeeze to remove the batch dimension and permute to match the shape of the reconstructed video
original_video = input_tensor.squeeze(0).permute(0, 2, 3, 1)  # Shape: (num_frames, height, width, num_channels)

# Permute reconstructed_video to match the original video shape
reconstructed_video = reconstructed_video.permute(0, 1, 4, 2, 3)  # Shape: (batch_size, num_frames, num_channels, height, width)

# Compute the MSE loss
loss = F.mse_loss(reconstructed_video, original_video)

print("Manual Reconstruction Loss:", loss.item())


In [1]:
import av
import numpy as np

from transformers import AutoImageProcessor, VideoMAEModel
from huggingface_hub import hf_hub_download

np.random.seed(0)


def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.
    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
container = av.open(file_path)

# sample 16 frames
indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)

image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

# prepare video for the model
inputs = image_processor(list(video), return_tensors="pt")

# forward pass
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

eating_spaghetti.mp4:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  return torch.tensor(value)


[1, 1568, 768]