## **Download the VideoCLIP Encoder and Install Other Dependencies**

In [None]:
%%capture
!git lfs install
!git clone https://huggingface.co/AskYoutube/AskVideos-VideoCLIP-v0.2
!git clone https://github.com/AskYoutubeAI/AskVideos-VideoCLIP
%cd AskVideos-VideoCLIP/
%mkdir models
%cp ../AskVideos-VideoCLIP-v0.2/askvideos_clip_v0.2.pth ./models

!pip install omegaconf
!pip install iopath
!pip install timm
!pip install decord
!pip install webdataset
!pip install einops
!pip install yt_dlp
!pip install peft
!pip install huggingface-hub
!pip install transformers torch bitsandbytes accelerate
%%capture
!pip install bert-score

## **Import Necessary Libraries**

In [None]:
# Imports
from argparse import Namespace
from IPython.display import HTML, display
import matplotlib.pyplot as plt
import numpy as np
import os
import torch
import video_clip
import bert_score
import pandas as pd

from typing import List, Optional
from tqdm import tqdm
from transformers import AutoModel , LlamaForCausalLM , AutoTokenizer , DataCollatorWithPadding , DistilBertTokenizer
from transformers import get_scheduler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader , Dataset , default_collate
import pandas as pd
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
)
from torch.nn.utils.rnn import pad_sequence

# Data and Tokenizer Setup

In [None]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
filtered_df = pd.read_csv('/content/drive/MyDrive/DataDownload/filtered_dataframe.csv')
path = '/content/drive/MyDrive/DataDownload/Videos'

In [None]:
len(filtered_df)

18433

In [None]:
# Read the Hugging Face access token from the environment instead of embedding it in the notebook.
# When HF_TOKEN is unset, `None` is passed and the library falls back to its own stored credentials.
# NOTE(review): the token that used to be hard-coded on this line should be treated as leaked and revoked.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.2-3B-Instruct', token=os.environ.get('HF_TOKEN'))



tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
tokenizer.eos_token

'<|eot_id|>'

In [None]:
tokenizer.bos_token

'<|begin_of_text|>'

In [None]:
tokenizer.vocab_size

128000

In [None]:
tokenizer.eos_token_id

128009

In [None]:
# Use the token that already has id 128010 ('<|python_tag|>') as the padding token.
# Nothing new is registered with the tokenizer, so no unused '<|pad_token|>' entry is
# left behind with an id the model was never given an embedding row for.
tokenizer.pad_token_id = 128010

In [None]:

tokenizer.pad_token

'<|python_tag|>'

In [None]:

tokenizer.pad_token_id

128010

In [None]:
tokenizer.vocab_size

128000

In [None]:
data_collator_for_padding = DataCollatorWithPadding(tokenizer=tokenizer)

## **VQA Dataset**

In [None]:
class VQA_DATASET(Dataset):
    """Video question-answering dataset pairing a clip path with a tokenized text prompt.

    Rows of ``df`` are read positionally: column 0 is the clip id (``".mp4"`` is
    appended), column 1 is the question and column 2 is the answer.

    Args:
        df: DataFrame with one question/answer pair per row.
        path: Directory containing the ``.mp4`` files.
        tokenizer: Callable returning a mapping with ``input_ids`` and ``attention_mask``.
        testing: When truthy, the prompt ends after ``"answer : "`` and the raw
            question/answer strings are returned together with the tensors.
        max_length: Length every prompt is padded/truncated to (default 32, the
            value that was previously hard-coded).
    """

    def __init__(self, df, path, tokenizer, testing=False, max_length=32):
        super().__init__()
        self.df = df
        self.path = path
        self.tokenizer = tokenizer
        self.testing = testing
        self.max_length = max_length

    def __len__(self):
        """Return the number of question/answer rows."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample dict for row ``idx``.

        Training samples hold ``gif_path``, ``input_ids`` and ``mask``; testing
        samples additionally hold the raw ``question`` and ``answer``.
        """
        gif_id = self.df.iloc[idx, 0] + ".mp4"
        question = self.df.iloc[idx, 1]
        answer = self.df.iloc[idx, 2]
        gif_path = os.path.join(self.path, gif_id)

        if self.testing:
            input_ids, mask = self.create_sequence(question)
            return {
                'gif_path': gif_path,
                'input_ids': input_ids,
                'mask': mask,
                'question': question,
                'answer': answer,
            }

        input_ids, mask = self.create_sequence(question, answer)
        return {'gif_path': gif_path, 'input_ids': input_ids, 'mask': mask}

    def create_sequence(self, question=None, answer=None):
        """Build and tokenize the prompt; returns ``(input_ids, attention_mask)``.

        In testing mode the answer is left out so the model can generate it.
        """
        # NOTE(review): the prompt spells out '<|begin_of_text|>' itself; if the tokenizer
        # also prepends a BOS token the sequence starts with two of them — confirm.
        if self.testing:
            prompt = f"<|begin_of_text|> question : {question} answer : "
        else:
            prompt = f"<|begin_of_text|> question : {question} answer : {answer} <|eot_id|>"

        # NOTE(review): padding='max_length' pads every prompt to a fixed length; in testing
        # mode generation then continues after the padding if the tokenizer pads on the
        # right — confirm the tokenizer's padding side.
        sequence = self.tokenizer(
            prompt,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return sequence['input_ids'], sequence['attention_mask']

def custom_collator(batch):
    """Collate training samples into padded tensors plus their video paths.

    Args:
        batch: List of sample dicts with ``gif_path``, ``input_ids`` and ``mask`` keys.

    Returns:
        ``(padded_tensors, gif_paths)`` where ``padded_tensors`` is whatever the
        module-level ``data_collator_for_padding`` returns for the token ids and
        attention masks, and ``gif_paths`` is the list of video paths in batch order.
    """
    video_paths = []
    token_ids = []
    attention_masks = []
    for sample in batch:
        video_paths.append(sample['gif_path'])
        token_ids.append(sample['input_ids'])
        attention_masks.append(sample['mask'])

    # The padding collator expects the key 'attention_mask', not the dataset's 'mask'.
    padded = data_collator_for_padding({
        'input_ids': token_ids,
        'attention_mask': attention_masks,
    })
    return padded, video_paths

def test_collator(batch):
    """Collate testing samples, keeping the raw question/answer strings.

    Args:
        batch: List of sample dicts with ``gif_path``, ``input_ids``, ``mask``,
            ``question`` and ``answer`` keys.

    Returns:
        ``(padded_tensors, gif_paths, questions, answers)``; the three lists are in
        batch order and ``padded_tensors`` comes from the module-level
        ``data_collator_for_padding``.
    """
    video_paths, token_ids, attention_masks = [], [], []
    question_texts, answer_texts = [], []
    for sample in batch:
        video_paths.append(sample['gif_path'])
        token_ids.append(sample['input_ids'])
        attention_masks.append(sample['mask'])
        question_texts.append(sample['question'])
        answer_texts.append(sample['answer'])

    # The padding collator expects the key 'attention_mask', not the dataset's 'mask'.
    padded = data_collator_for_padding({
        'input_ids': token_ids,
        'attention_mask': attention_masks,
    })
    return padded, video_paths, question_texts, answer_texts

## **Loading the Model and Marking Weights as Trainable or Frozen**

In [None]:
eval_config = 'eval_configs/video_clip_v0.2.yaml'
video_model2, vis_processor = video_clip.load_model(eval_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Loading VIT


100%|██████████| 1.89G/1.89G [00:11<00:00, 184MB/s]
  state_dict = torch.load(cached_file, map_location="cpu")


Loading VIT Done
Loading Q-Former


100%|██████████| 413M/413M [00:02<00:00, 203MB/s]
  checkpoint = torch.load(cached_file, map_location="cpu")


Load first Checkpoint: ./models/askvideos_clip_v0.2.pth


  ckpt = torch.load(ckpt_path, map_location="cpu")


In [None]:
for param in video_model2.parameters():
    param.requires_grad = False

In [None]:
def video_model(video_paths_list):
    """Encode a list of video files into one stacked embedding tensor.

    Calls ``video_clip.get_all_video_embeddings`` with the module-level
    ``video_model2`` and ``vis_processor``, drops a leading size-1 dimension from each
    per-video result and stacks the results along a new first dimension.

    Args:
        video_paths_list: Paths of the videos to encode.

    Returns:
        A tensor with one entry per input video.
    """
    # Per the original note, each list element is expected to be (1, 32, 1024) — TODO confirm.
    per_video = video_clip.get_all_video_embeddings(video_paths_list, video_model2, vis_processor)
    return torch.stack([emb.squeeze(0) for emb in per_video], dim=0)

In [None]:
class VQAModel(nn.Module):
    """Video question-answering model built on a LoRA-adapted LLaMA causal LM.

    Video features are projected by an MLP into the language model's embedding space
    and prepended to the text-token embeddings; the fused sequence is fed to the
    language model through ``inputs_embeds``.

    Args:
        video_model: Callable mapping a list of video paths to a feature tensor whose
            last dimension is 1024 (the MLP input size).
        llama_model_path: Hub id or local path of the LLaMA checkpoint.
        r: LoRA rank.
        lora_alpha: LoRA alpha.
        lora_dropout: Dropout applied inside the LoRA adapters.
        tgt: Names of the modules that receive LoRA adapters.
        device: Device for the adapters and the projection MLP. Defaults to CUDA when
            available, otherwise CPU.
    """

    def __init__(self, video_model, llama_model_path, r, lora_alpha, lora_dropout=0.1,
                 tgt=("q_proj", "k_proj"), device=None):
        super().__init__()
        # Resolve the device here so the class does not rely on a notebook-level
        # `device` variable that is defined in a different cell.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self._device = torch.device(device)

        self.video_encoder = video_model
        # The access token comes from the environment; never hard-code it in the notebook.
        self.llama_model = LlamaForCausalLM.from_pretrained(
            llama_model_path,
            token=os.environ.get('HF_TOKEN'),
            load_in_8bit=True,
            device_map='auto',
        )
        self.peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            target_modules=list(tgt),
        )
        self.peft_model = get_peft_model(self.llama_model, self.peft_config).to(self._device)

        # Projects 1024-d video features to the 3072-d embedding size used by the LM.
        self.MLP = nn.Sequential(
            nn.Linear(1024, 2048),
            nn.Dropout(p=0.1),
            nn.GELU(),
            nn.Linear(2048, 3072),
        ).to(self._device)

    def _fuse_video_and_text(self, video_paths, input_ids, attention_mask):
        """Return ``(fused_embeddings, fused_mask)`` with video tokens placed before text tokens."""
        input_ids = input_ids.to(self._device)
        if input_ids.dim() == 3:
            # The collators deliver (batch, 1, seq); drop the singleton dimension.
            input_ids = input_ids.squeeze(dim=1)
        if attention_mask is None:
            # Without a mask, every text token is attended to.
            attention_mask = torch.ones_like(input_ids)
        attention_mask = attention_mask.to(self._device)
        if attention_mask.dim() == 3:
            attention_mask = attention_mask.squeeze(dim=1)

        # Use the encoder passed to the constructor rather than a notebook global.
        video_tokens = self.MLP(self.video_encoder(video_paths).to(self._device))
        text_embeddings = self.peft_model.base_model.model.model.embed_tokens(input_ids)

        # Every video token is attended to; the mask size follows the encoder output.
        video_mask = torch.ones(video_tokens.shape[:2], device=self._device)
        fused_embeddings = torch.cat([video_tokens, text_embeddings], dim=1)  # b x (video + text) x hidden
        fused_mask = torch.cat([video_mask, attention_mask], dim=1)           # b x (video + text)
        return fused_embeddings, fused_mask

    def forward(self, input_ids,
                attention_mask=None,
                token_type_ids=None,
                video_paths_list=None):
        """Run the language model on the fused video + text sequence.

        Args:
            input_ids: Text token ids, ``(batch, seq)``.
            attention_mask: Mask for the text tokens; all ones when omitted.
            token_type_ids: Accepted for interface compatibility; not used.
            video_paths_list: Paths of the videos, one per batch row.

        Returns:
            ``{"logits": ...}`` with the language model's logits over the fused
            sequence (video positions first, then text positions).
        """
        fused_embeddings, fused_mask = self._fuse_video_and_text(
            video_paths_list, input_ids, attention_mask
        )
        outputs = self.peft_model(
            inputs_embeds=fused_embeddings.half(),
            attention_mask=fused_mask.half(),
        )
        return {"logits": outputs.logits}

    def generate(self, gif_path, input_ids, mask):   ##for testing and generation
        """Build the fused embeddings and mask to hand to ``peft_model.generate``.

        No gradients are recorded. ``input_ids`` and ``mask`` may carry the
        collator's extra singleton dimension, ``(batch, 1, seq)``.

        Returns:
            ``(fused_embeddings, mask)``.
        """
        with torch.no_grad():
            fused_embeddings, mask = self._fuse_video_and_text(gif_path, input_ids, mask)
        return fused_embeddings, mask


In [None]:
batch_size = 16

train_dataset = VQA_DATASET(filtered_df,path , tokenizer)
train_dataloader = DataLoader(train_dataset ,batch_size=batch_size, shuffle=True , collate_fn = custom_collator , num_workers = 8 )

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
llama_model_path = "meta-llama/Llama-3.2-3B-Instruct"
lora_rank = 16
lora_alpha = 16

In [None]:
model = VQAModel(video_model, llama_model_path, lora_rank, lora_alpha).to(device)



config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

## **Count the Number of Trainable and Non-Trainable Parameters**

In [None]:

# Assuming 'model' is your PyTorch model
def count_parameters(model):
    """Count the parameters of ``model``.

    Args:
        model: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).

    Returns:
        ``(total_params, trainable_params)``: the number of elements over all
        parameters, and over those with ``requires_grad`` set.
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    return total_params, trainable_params

total_params, trainable_params = count_parameters(model)

print(f'Total Parameters: {total_params}')
print(f'Trainable Parameters: {trainable_params}')


Total Parameters: 3225731072
Trainable Parameters: 12981248


## **Define the Optimizer and LR Scheduler**

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

## lr_scheduler
num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
# Warm up over the first 10% of the steps; the scheduler takes a whole number of steps.
num_warmup_steps = int(0.1 * num_training_steps)

# NOTE(review): the schedule is sized for `num_train_epochs` epochs. The scheduler is stepped
# once per batch inside train(), so calling train() more than once with this same scheduler
# runs past `num_training_steps` — confirm that is intended or rebuild the scheduler.
lr_scheduler = get_scheduler(
    name="linear",  ##Cosine Annealing or any other
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
scaler = torch.amp.GradScaler()

In [None]:
import time

# Time how long it takes to produce the first batch. The timer has to wrap the fetch
# itself; timing only the tuple unpacking measures nothing.
start_time = time.time()
data, gif_paths = next(iter(train_dataloader))
end_time = time.time()
print(end_time - start_time)

7.867813110351562e-06


## Training Setup

In [None]:
train_dataset = VQA_DATASET(filtered_df,path , tokenizer)
train_dataloader = DataLoader(train_dataset ,batch_size=batch_size, shuffle=True , collate_fn = custom_collator , num_workers = 8 )

**LOAD ANY PRETRAINED MODEL IF IT EXISTS**

In [None]:
# Path to the checkpoint file (the same path the training loop saves to)
checkpoint_path = '/content/drive/MyDrive/DataDownload/checkpoint_videoclip_llama.pth'

# Restore weights only when a checkpoint is actually present, so a first run does not fail here.
if os.path.exists(checkpoint_path):
    # map_location lets the load succeed even if the file was saved from a different device.
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # Accept both {'model_state_dict': ...} wrappers and bare state dicts.
    model_state_dict = checkpoint.get('model_state_dict', checkpoint)

    # strict=False tolerates key mismatches; report them rather than discarding them silently.
    load_result = model.load_state_dict(model_state_dict, strict=False)
    print(f"Loaded checkpoint: {len(load_result.missing_keys)} missing keys, "
          f"{len(load_result.unexpected_keys)} unexpected keys")
else:
    print(f"No checkpoint found at {checkpoint_path}; keeping the current model weights.")

# Assigning the result keeps the cell from printing the full module tree.
model = model.to(device)

  checkpoint = torch.load(checkpoint_path )


VQAModel(
  (llama_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 3072)
      (layers): ModuleList(
        (0-27): 28 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): lora.Linear8bitLt(
              (base_layer): Linear8bitLt(in_features=3072, out_features=3072, bias=False)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.1, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=3072, out_features=16, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=16, out_features=3072, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
              (lora_magnitude_vector): ModuleDict()
            )
            (k_proj): lora.Linear8bitLt(
              (base_layer): Linear8bitLt(in_features=3

## **Training Loop**

In [None]:
import torch.nn.functional as F
from tqdm import tqdm

def train(model, optimizer, device, epochs,
          checkpoint_path='/content/drive/MyDrive/DataDownload/checkpoint_videoclip_llama.pth'):
    """Train ``model`` for ``epochs`` epochs and save a checkpoint after each one.

    Uses the notebook-level ``train_dataloader``, ``scaler`` and ``lr_scheduler``.
    A batch that raises (for example an unreadable video file) is reported and
    skipped; it still advances the progress bar but is left out of the loss average.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``video_paths_list``; must return a dict with ``"logits"``.
        optimizer: Optimizer stepped through the gradient scaler.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_dataloader``.
        checkpoint_path: Where the model state dict is written after every epoch.
    """
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        completed_batches = 0
        with tqdm(total=len(train_dataloader), desc='Training Epoch {}'.format(epoch + 1)) as pbar:
            for batch in train_dataloader:
                try:
                    data, gif_paths = batch
                    data = {k: v.to(device) for k, v in data.items()}
                    # The collator yields (batch, 1, seq); drop the singleton dimension.
                    input_ids = data['input_ids'].squeeze(dim=1)
                    attention_mask = data['attention_mask'].squeeze(dim=1)
                    optimizer.zero_grad()

                    with torch.amp.autocast(device_type='cuda'):
                        final_output = model(
                            input_ids=input_ids,
                            attention_mask=attention_mask,
                            video_paths_list=gif_paths
                        )
                        logits = final_output['logits'].to(device)

                        # The fused sequence is [video tokens | text tokens]. The logit at
                        # text position i predicts text token i + 1, so take the text
                        # positions except the last and compare with the ids shifted by one.
                        num_video_tokens = logits.size(1) - input_ids.size(1)
                        shift_logits = logits[:, num_video_tokens:-1, :].contiguous()
                        shift_labels = input_ids[:, 1:].clone()
                        # Padding positions carry no target; -100 is cross_entropy's
                        # default ignore_index, so they drop out of the loss.
                        shift_labels[attention_mask[:, 1:] == 0] = -100

                        loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)),
                                               shift_labels.view(-1))

                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                    lr_scheduler.step()

                    total_loss += loss.item()
                    completed_batches += 1

                except Exception as e:
                    # Best effort: report the problematic batch and move on to the next one.
                    print(f"Error in batch: {e}")

                finally:
                    # Advance for skipped batches too, so the bar reaches 100%.
                    pbar.update(1)

        # Average over the batches that actually contributed a loss.
        avg_loss = total_loss / max(completed_batches, 1)
        skipped_batches = len(train_dataloader) - completed_batches
        print(f'Epoch {epoch + 1} || Train Loss: {avg_loss:.4f}')
        if skipped_batches:
            print(f'Epoch {epoch + 1} || Skipped batches: {skipped_batches}')
        torch.save({
                      'model_state_dict': model.state_dict()
                  }, checkpoint_path)


## Train Iteration 1

In [None]:
train(model , optimizer , device , num_train_epochs )

  return torch.cuda.amp.autocast(dtype=dtype)
Training Epoch 1:   4%|▎         | 43/1153 [11:57<4:39:11, 15.09s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 1:  26%|██▌       | 302/1153 [1:12:31<2:45:53, 11.70s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_no3g4x6AxO1utn2b7o1_400.mp4...


Training Epoch 1:  65%|██████▍   | 744/1153 [2:20:06<45:32,  6.68s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 1:  96%|█████████▌| 1108/1153 [2:55:38<03:26,  4.58s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_n9l64fs33g1rga6mqo1_250.mp4...


Training Epoch 1: 100%|█████████▉| 1149/1153 [2:58:43<00:37,  9.33s/it]


Epoch 1 || Train Loss: 1.5270


Training Epoch 2:  25%|██▍       | 286/1153 [13:12<39:23,  2.73s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 2:  35%|███▍      | 403/1153 [18:36<34:25,  2.75s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_no3g4x6AxO1utn2b7o1_400.mp4...


Training Epoch 2:  42%|████▏     | 479/1153 [22:06<31:05,  2.77s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 2:  50%|█████     | 581/1153 [26:50<26:39,  2.80s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_n9l64fs33g1rga6mqo1_250.mp4...


Training Epoch 2:  98%|█████████▊| 1133/1153 [52:17<00:54,  2.72s/it]

## Train Iteration 2

In [None]:
train(model , optimizer , device , num_train_epochs )

  return torch.cuda.amp.autocast(dtype=dtype)
Training Epoch 1:   7%|▋         | 85/1153 [12:29<2:20:17,  7.88s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 1:  34%|███▎      | 387/1153 [47:34<1:17:57,  6.11s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_no3g4x6AxO1utn2b7o1_400.mp4...


Training Epoch 1:  81%|████████▏ | 938/1153 [1:32:58<14:02,  3.92s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 1:  93%|█████████▎| 1077/1153 [1:41:52<04:47,  3.78s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_n9l64fs33g1rga6mqo1_250.mp4...


Training Epoch 1: 100%|█████████▉| 1149/1153 [1:46:12<00:22,  5.55s/it]


Epoch 1 || Train Loss: 0.7013


Training Epoch 2:  12%|█▏        | 137/1153 [06:22<48:57,  2.89s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 2:  51%|█████     | 586/1153 [27:03<25:52,  2.74s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_no3g4x6AxO1utn2b7o1_400.mp4...


Training Epoch 2:  73%|███████▎  | 847/1153 [39:05<14:23,  2.82s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 2:  90%|████████▉ | 1033/1153 [47:40<05:38,  2.82s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_n9l64fs33g1rga6mqo1_250.mp4...


Training Epoch 2: 100%|█████████▉| 1149/1153 [52:58<00:11,  2.77s/it]


Epoch 2 || Train Loss: 0.6373


Training Epoch 3:  38%|███▊      | 440/1153 [20:21<32:48,  2.76s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 3:  43%|████▎     | 499/1153 [23:05<30:16,  2.78s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_n9l64fs33g1rga6mqo1_250.mp4...


Training Epoch 3:  71%|███████   | 816/1153 [37:41<15:26,  2.75s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_nqlaipLluX1unyhx2o1_400.mp4...


Training Epoch 3:  82%|████████▏ | 947/1153 [43:41<09:30,  2.77s/it]

Error in batch: Error reading /content/drive/MyDrive/DataDownload/Videos/tumblr_no3g4x6AxO1utn2b7o1_400.mp4...


Training Epoch 3: 100%|█████████▉| 1149/1153 [52:58<00:11,  2.77s/it]


Epoch 3 || Train Loss: 0.5957


In [None]:
import torch

# Assuming `model` is your trained model
torch.save(model.state_dict() , 'vqa_state_dict.pth')  # Saves the model's state dictionary to 'vqa_state_dict.pth'


In [None]:
# Save the model's parameters and optimizer's state
# NOTE(review): 'epoch' and 'loss' below are literal values typed into this cell; they are
# not read from the training run, so confirm they describe the weights being saved.
torch.save({
    'epoch': 1,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': 0.04753347113728523,
}, '/content/checkpoint.pth')


## Evaluation Loop

In [None]:
# Build the evaluation dataset and loader, collated the same way as the training data.
# NOTE(review): `eval_df` is not defined in any cell shown in this notebook — define it
# (e.g. a held-out split) before this cell so the notebook runs on a fresh kernel.
eval_dataset = VQA_DATASET(eval_df,path , tokenizer )
eval_dataloader = DataLoader(eval_dataset ,batch_size=batch_size, shuffle=True , collate_fn = custom_collator)

In [None]:
def eval(model,optimizer,device,epochs , dataloader):
    """Compute the average next-token loss of ``model`` over ``dataloader``.

    The name shadows Python's built-in ``eval``; it is kept because the notebook
    calls it by this name.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``video_paths_list``; must return a dict with ``"logits"``.
        optimizer: Not used; kept so existing calls keep working.
        device: Device the batch tensors are moved to.
        epochs: Number of times the whole loader is evaluated.
        dataloader: Yields ``(padded_tensors, gif_paths)`` batches.
    """
    for epoch in range(epochs):
        model.eval()
        val_loss = 0.0
        num_batches = 0
        with torch.no_grad():
            with tqdm(total=len(dataloader), desc='Validation Epoch {}'.format(epoch + 1)) as pbar:
                for batch in dataloader:
                    data, gif_paths = batch
                    data = {k: v.to(device) for k, v in data.items()}
                    # The collator yields (batch, 1, seq); drop the singleton dimension.
                    input_ids = data['input_ids'].squeeze(dim=1)
                    attention_mask = data['attention_mask'].squeeze(dim=1)

                    final_output = model(input_ids=input_ids, attention_mask=attention_mask,
                                         video_paths_list=gif_paths)
                    logits = final_output['logits'].to(device)

                    # The fused sequence is [video tokens | text tokens]. The logit at text
                    # position i predicts text token i + 1, so the slice starts at the first
                    # text position and stops before the last — the same alignment train() uses.
                    num_video_tokens = logits.size(1) - input_ids.size(1)
                    shift_logits = logits[:, num_video_tokens:-1, :].contiguous()
                    shift_labels = input_ids[:, 1:].clone()
                    # Padding positions carry no target; -100 is cross_entropy's default ignore_index.
                    shift_labels[attention_mask[:, 1:] == 0] = -100

                    loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)),
                                           shift_labels.view(-1))
                    val_loss += loss.item()
                    num_batches += 1
                    pbar.update(1)

        # Guard against an empty loader.
        avg_val_loss = val_loss / max(num_batches, 1)
        print(f'epoch no: {epoch + 1} ||eval_loss : {avg_val_loss}')   #batch avg loss in every epoch

In [None]:
eval(model,optimizer,device , 3 , eval_dataloader)

Validation Epoch 1: 100%|██████████| 13/13 [00:18<00:00,  1.40s/it]


epoch no: 1 ||eval_loss : 0.0019153660037913001


Validation Epoch 2: 100%|██████████| 13/13 [00:18<00:00,  1.39s/it]


epoch no: 2 ||eval_loss : 0.0018976551677601843


Validation Epoch 3: 100%|██████████| 13/13 [00:18<00:00,  1.39s/it]

epoch no: 3 ||eval_loss : 0.0019375680533882517





## Generation Loop and BERTScore Evaluation

In [None]:
test_dataset = VQA_DATASET(filtered_df , path , tokenizer , testing=True)
test_dataloader = DataLoader(test_dataset ,batch_size=3, shuffle=True , collate_fn = test_collator)

Generation

In [None]:
for batch in test_dataloader:
  data,gif_path,question,answer = batch
  break

In [None]:
input_ids = data['input_ids']
mask = data['attention_mask']

In [None]:
# Get input embeddings by passing token IDs to the model's embedding layer
llama_embeddings , mask = model.generate(gif_path,input_ids,mask)

  return torch.cuda.amp.autocast(dtype=dtype)


In [None]:
output = model.peft_model.generate(
    inputs_embeds=llama_embeddings.half(),
    attention_mask=mask.half(),
    max_new_tokens=8,
    do_sample=True,
    top_p=0.9,
    num_return_sequences=3,
    no_repeat_ngram_size=2,
    pad_token_id = 128010
)

In [None]:
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
question[1]

'What does the man in red shirt do before shout ?'

In [None]:
answer[1]

'fall down'

In [None]:
generated_text

'assistant <|eot_id|><|python_tag|><|python_tag|><|python_tag|><|python_tag|><|python_tag|>'

## **BERTScore Evaluation**

In [None]:
def bert_score_evaluation(model,tokenizer,dataloader):
  """Generate an answer for every sample and score the answers with BERTScore.

  Args:
      model: Model exposing ``generate(gif_paths, input_ids, mask)`` (returns fused
          embeddings and mask) and ``peft_model.generate``.
      tokenizer: Tokenizer used to decode the generated ids.
      dataloader: Yields ``(padded_tensors, gif_paths, questions, answers)`` batches.

  Returns:
      ``(precision, recall, f1)``: mean BERTScore values over all samples.

  Raises:
      ValueError: If the dataloader yields no samples.
  """
  model.eval()
  candidate = []
  reference = []

  for batch in tqdm(dataloader, desc="Testing"):
      data,gif_paths,questions,answers = batch
      data = {k:v.to(device) for k,v in data.items()}
      input_ids = data['input_ids']
      # The collator stores the mask under 'attention_mask'.
      mask = data['attention_mask']
      with torch.no_grad():
        # Use this batch's own video paths, not a variable left over from another cell.
        llama_embeddings , mask = model.generate(gif_paths,input_ids,mask)
        output = model.peft_model.generate(
          inputs_embeds=llama_embeddings.half(),  # Pass the embeddings instead of input_ids
          attention_mask=mask.half(),
          max_new_tokens=50,                   # Maximum length of the generated sequence
          num_beams=5,                     # Beam search with 5 beams
          early_stopping=True,             # Stop when all beams reach the end
          num_return_sequences=1,          # Return the best sequence
          pad_token_id=tokenizer.pad_token_id
          )
      candidate.extend(tokenizer.batch_decode(output, skip_special_tokens=True))
      # Reference answers come from the current batch; BERTScore works on strings.
      reference.extend(str(ans) for ans in answers)

  if not candidate:
      raise ValueError("dataloader produced no samples to score")

  P , R , F1 = bert_score.score(candidate, reference, lang="en", verbose=True)
  avg_P_score = P.mean().item()
  avg_R_score = R.mean().item()
  avg_F1_score = F1.mean().item()
  return avg_P_score , avg_R_score , avg_F1_score