In [1]:
import os
import cv2
import pandas as pd
import torch
from transformers import ViTFeatureExtractor
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load CSV file
df = pd.read_csv('movie_description.csv')

In [3]:
# Function to extract frames from video
def extract_frames(video_path, num_frames=16):
    frames = []
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"Error opening video file {video_path}")
        return frames

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(total_frames // num_frames, 1)
    for i in range(0, total_frames, interval):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        if ret:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(cv2.resize(frame, (224, 224)))
        if len(frames) == num_frames:
            break
    cap.release()
    return frames

In [4]:
# Load feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")



In [5]:
# Extract and save features
feature_dir = 'features'
os.makedirs(feature_dir, exist_ok=True)

In [6]:
for idx, row in tqdm(df.iterrows(), total=len(df)):
    video_path = row['video_path']
    frames = extract_frames(video_path)
    if not frames:
        print(f"No frames extracted for video {video_path}")
        continue

    features = feature_extractor(images=frames, return_tensors="pt").pixel_values
    feature_path = os.path.join(feature_dir, f"{row['video_id']}.pt")
    torch.save(features, feature_path)

 80%|███████▉  | 799/1000 [03:35<00:41,  4.88it/s]

Error opening video file videos_1000\3365243.mp4
No frames extracted for video videos_1000\3365243.mp4


100%|██████████| 1000/1000 [04:28<00:00,  3.73it/s]


## Data Preparation

In [7]:
from torch.utils.data import Dataset, DataLoader

In [8]:
class VideoDataset(Dataset):
    def __init__(self, csv_file, feature_dir, tokenizer):
        self.df = pd.read_csv(csv_file)
        self.feature_dir = feature_dir
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        video_id = row['video_id']
        feature_path = os.path.join(self.feature_dir, f"{video_id}.pt")
        features = torch.load(feature_path)
        description = row['description']
        inputs = self.tokenizer(description, return_tensors="pt", padding='max_length', truncation=True, max_length=512)
        return features, inputs['input_ids'], inputs['attention_mask']

In [9]:
import torch
import torch.nn as nn
from transformers import VisionEncoderDecoderModel, ViTForImageClassification, GPT2Model, GPT2Config
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer
import os
import pandas as pd
from tqdm import tqdm

In [10]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [11]:
# Create dataset and dataloader
dataset = VideoDataset('movie_description.csv', 'features', tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

## Fine-Tune the ViT Model

In [12]:
# Load the ViT model
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

In [21]:
batch_size = 2048
seq_length_encoder = 197
seq_length_decoder = 512
hidden_size = 768
num_epochs = 1

In [22]:
# Modify the configuration of the ViT model
config = model.config
config.num_labels = seq_length_decoder
config.max_length = seq_length_decoder

In [23]:
# Define optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

In [24]:
# Define loss function
loss_fn = nn.CrossEntropyLoss()

In [25]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [26]:
# features = features.view(-1, features.size(2), features.size(3), features.size(4))

In [27]:
# for epoch in range(num_epochs):
#     model.train()
#     total_loss = 0.0
    
#     for features, input_ids, attention_mask in tqdm(dataloader):
#         # Move inputs to device
#         features = features.reshape(-1, features.size(2), features.size(3), features.size(4))
#         features = features.to(device)
#         input_ids = input_ids.to(device)
#         attention_mask = attention_mask.to(device)
        
#         # Forward pass
#         outputs = model(pixel_values=features, decoder_input_ids=input_ids, decoder_attention_mask=attention_mask)
#         logits = outputs.logits
        
#         # Calculate loss
#         loss = loss_fn(logits, input_ids[:, 1:])
        
#         # Backward pass
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
        
#         total_loss += loss.item()
    
#     # Adjust learning rate
#     scheduler.step()
    
#     # Print loss after each epoch
#     print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

In [28]:
import torch
from tqdm import tqdm

# Assuming you have defined your model, dataloader, loss function, optimizer, scheduler, device, and num_epochs

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    
    for features, input_ids, attention_mask in tqdm(dataloader):
        # Move inputs to device
        features = features.squeeze(1)  # Squeeze the batch dimension
        features = features.to(device)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        
        # Given pixel values shape: torch.Size([4, 16, 3, 224, 224])
        # Reshape it to have 3 channels (RGB)
        batch_size, num_channels, _, height, width = features.shape
        features = features.view(batch_size, -1, height, width)[:,:3,:,:]
        
        # Now pixel_values has the correct shape: torch.Size([4, 3, 224, 224])

        # Forward pass
        outputs = model(
            pixel_values=features,  # Add pixel_values as input
            decoder_input_ids=input_ids,  # Pass input_ids directly
            decoder_attention_mask=attention_mask,  # Use attention_mask from dataloader
            labels=input_ids,  # Pass labels for calculating loss
        )
        logits = outputs.logits
        
        # Calculate loss
        input_ids_target = input_ids[:, 1:]
        loss = loss_fn(logits.view(-1, logits.shape[-1]), input_ids_target.view(-1))
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Adjust learning rate
    scheduler.step()
    
    # Print loss after each epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

  0%|          | 0/250 [00:13<?, ?it/s]


ValueError: Expected input batch_size (2048) to match target batch_size (0).

In [None]:
print("Pixel values shape:", features.shape)