In [1]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import Necessary Libraries**

In [64]:
%%capture
!pip install peft
!pip install bitsandbytes
!pip install bert-score

In [65]:
import os
import pandas as pd
import torch
from bert_score import score
from PIL import Image
import numpy as np
import bitsandbytes as bnb
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import Dataset , DataLoader
from transformers import AutoProcessor , XCLIPVisionModel , LlamaForCausalLM , LlamaTokenizer , AutoTokenizer , DataCollatorWithPadding ,  default_data_collator
from peft import get_peft_model, LoraConfig, TaskType
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [6]:
x_clip_model_path = "microsoft/xclip-base-patch16"
llama_model_path = 'meta-llama/Llama-3.2-3B-Instruct'
lora_rank = 16
lora_alpha = 16

In [7]:
df = pd.read_csv('/content/test_df_with_link.csv')

In [8]:
import asyncio
import aiohttp
import os
import pathlib
from urllib.parse import urlparse

GIF_PATH = pathlib.Path('gifs')
if not GIF_PATH.exists():
    GIF_PATH.mkdir()

def is_valid_url(url):
    """Check if the URL is a valid string and properly formatted."""
    if not isinstance(url, str):
        return False
    parsed = urlparse(url)
    return all([parsed.scheme, parsed.netloc])

async def download_file(url):
    filename = url.split("/")[-1]
    filepath = GIF_PATH / filename

    # Check if the GIF file already exists
    if filepath.exists():
#         print(f"File {filename} already exists, skipping download.")
        return

    max_retries = 3  # Set the maximum number of retries
    retries = 0
    while retries < max_retries:
        try:
            async with aiohttp.ClientSession(trust_env=True) as session:
                async with session.get(url) as response:
                    if response.status == 200:
                        with open(filepath, mode="wb") as file:
                            while True:
                                chunk = await response.content.read(1024)
                                if not chunk:
                                    break
                                file.write(chunk)
#                         print(f"Downloaded file {filename}")
                        return  # Exit the loop if successful
                    else:
#                         print(f"Failed to download {url}: HTTP Status {response.status}")
                        return
        except (aiohttp.client_exceptions.ClientConnectorError, ConnectionResetError) as e:
#             print(f"Error downloading {url}: {e}")
            retries += 1
            await asyncio.sleep(1)  # Wait for a second before retrying

async def safe_request(semaphore, url):
    async with semaphore:
        return await download_file(url)

async def main(url_col, parallel_processes):
    # Filter out invalid URLs
    valid_urls = [url for url in url_col if is_valid_url(url)]

    semaphore = asyncio.Semaphore(parallel_processes)
    tasks = [asyncio.ensure_future(safe_request(semaphore, url)) for url in valid_urls]
    await asyncio.gather(*tasks)

**Downloading the Data**

In [9]:
await main(df['full_link'][70000:70100], 50)

In [None]:
df_new = df[:20000]

In [10]:
df_test = df[70000:70100]

**DEFINE THE PRETRAINED TEXT TOKENIZER : SINCE THIS TOKENIZER DID NOT HAVE PRE-BUILT PAD TOKEN WE INCLUDED OUR CUSTOM PAD TOKEN**

In [11]:
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_path , use_auth_token = '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')



tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

**SETTING THE PAD TOKEN**

In [12]:
llama_tokenizer.add_special_tokens({'pad_token': '<|pad_token|>'})
llama_tokenizer.pad_token_id = 128010

In [14]:
llama_tokenizer.pad_token

'<|python_tag|>'

In [15]:
data_collator_for_padding = DataCollatorWithPadding(tokenizer=llama_tokenizer)

In [16]:
# # XCLIPProcesser
image_processor = AutoProcessor.from_pretrained(x_clip_model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/927 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]



**MAKE THE DATASET**

Frame Sampler

In [17]:
def sample_gif_frames( gif_path, num_frames):

    gif = Image.open(gif_path)

    frames = []
    try:
        while True:
            frame = gif.copy().convert("RGB")
            frames.append(np.array(frame))
            gif.seek(gif.tell() + 1)
    except EOFError:
        pass

    total_frames = len(frames)

    # If there are fewer than num_frames, pad with the last frame
    if total_frames < num_frames:
        last_frame = frames[-1]
        for _ in range(num_frames - total_frames):
            frames.append(last_frame)

    # If there are more than num_frames, sample the frames evenly
    indices = np.linspace(0, len(frames) - 1, num=num_frames, dtype=int)
    sampled_frames = [frames[i] for i in indices]

    result = np.stack(sampled_frames)

    return result

In [18]:

class VQA_DATASET(Dataset):
    def __init__(self,df,gif_dir,tokenizer,testing = False , num_frames = 16):
        super().__init__()
        self.df = df
        self.gif_dir = gif_dir
        self.num_frames = num_frames
        self.tokenizer = tokenizer
        self.testing = testing

    def __len__(self):
        return len(self.df)


    def __getitem__(self,idx):
      try:
        gif_path = str(self.df['full_link'].iloc[idx]).split('/')[-1]
        gif_path = os.path.join(self.gif_dir, gif_path)

        if os.path.exists(gif_path):
          frames = sample_gif_frames(gif_path, self.num_frames)
          inputs = image_processor(videos=list(frames), return_tensors="pt")
          inputs = inputs.pixel_values
          question = self.df['question'].iloc[idx]
          answer = self.df['answer'].iloc[idx]

          if self.testing:
            input_ids, mask = self.create_sequence(question)
            sample = {
                'gif_embed': inputs,
                'input_ids': input_ids,
                'mask': mask,
                'question': question,
                'answer': answer
            }
            return sample
          else:
              input_ids, mask = self.create_sequence(question, answer)
              sample = sample = {
                  'gif_embed': inputs,
                  'input_ids': input_ids,
                  'mask': mask
              }
              return sample
        else:

          return None
      except Exception as e:

        return None

    def create_sequence(self,question = None ,answer=None ):
      if self.testing == True:
        prompt = f"<|begin_of_text|> The question is  {question} The answer is "
        sequence = self.tokenizer(str(prompt)  ,max_length = 32 ,padding = 'max_length' , truncation = True ,return_tensors='pt' )
        input_ids = sequence['input_ids']
        mask = sequence['attention_mask']
        return input_ids , mask

      else:
        prompt = f"<|begin_of_text|> The question is {question} The answer is  {answer} <|eot_id|>"
        sequence = self.tokenizer(str(prompt)  ,max_length = 32 ,padding = 'max_length' , truncation = True ,return_tensors='pt' )
        input_ids = sequence['input_ids']
        mask = sequence['attention_mask']
        return input_ids , mask

from torch.utils.data import default_collate

def custom_collator(batch):
    batch = [item for item in batch if item is not None]

    if len(batch) == 0:
        return None

    gif_embed = [item['gif_embed'] for item in batch]
    input_ids = [item['input_ids'] for item in batch]
    masks = [item['mask'] for item in batch]

    features_to_pad = {
        'input_ids': input_ids,
        'attention_mask': masks,
    }

    padded_tensors = data_collator_for_padding(features_to_pad)

    gif_embed_collated = default_collate(gif_embed)

    return padded_tensors, gif_embed_collated


def test_collator(batch):
    gif_embed = [item['gif_embed'] for item in batch]
    input_ids = [item['input_ids'] for item in batch]
    masks = [item['mask'] for item in batch]
    questions = [item['question'] for item in batch]
    answers = [item['answer'] for item in batch]

    features_to_pad = {
        'input_ids': input_ids,
        'attention_mask': masks,
    }

    padded_tensors = data_collator_for_padding(features_to_pad)

    gif_embed_collated = default_collate(gif_embed)

    return padded_tensors , gif_embed_collated , questions , answers

**MAKE THE MODEL**

In [19]:
class VQAModel(nn.Module):
    def __init__(self  , x_clip_model_path , llama_model_path, r ,lora_alpha ,lora_dropout = 0.1 ,tgt = ["q_proj",'k_proj']):
        super(VQAModel , self).__init__()
        self.video_encoder = XCLIPVisionModel.from_pretrained(x_clip_model_path)
        for params in self.video_encoder.parameters():
            params.requires_grad = False

        self.llama_model  = LlamaForCausalLM.from_pretrained(llama_model_path , use_auth_token = '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'   , device_map = 'auto', load_in_4bit=True , bnb_4bit_compute_dtype=torch.bfloat16 , bnb_4bit_quant_type="nf4" )
        self.peft_config = LoraConfig(
          task_type=TaskType.CAUSAL_LM, inference_mode=False,
          r=r,
          lora_alpha=lora_alpha, lora_dropout=lora_dropout,
          target_modules = tgt
        )
        self.peft_model = get_peft_model(self.llama_model, self.peft_config).to(device)

        self.MLP = nn.Sequential( nn.Linear(768, 2048),
                                  nn.Dropout(p=0.1),
                                  nn.GELU(),
                                  nn.Linear(2048, 3072),
                              ).to(device)

    def forward(self , input_ids ,
                attention_mask = None,
                video_pixel_values = None
                ):
        input_ids = input_ids.squeeze(dim = 1)
        attention_mask = attention_mask.squeeze(dim = 1)
        video_pixel_values = video_pixel_values.squeeze(dim = 1)
        batch_size, frames, channels ,height, width = video_pixel_values.shape
        video_pixel_values = video_pixel_values.view(batch_size * frames, channels, height, width)
        encoded_videos = self.video_encoder(video_pixel_values) # B*Num_Frames , CLS , 768
        encoded_videos = encoded_videos.pooler_output # B*Num_Frames , 768
        encoded_videos = encoded_videos.view(-1,16,768).to(device) # B , 16 , 768

        embeddings = self.peft_model.base_model.model.model.embed_tokens(input_ids)
        encoded_videos = self.MLP(encoded_videos) # B x 16 x 3072
        # b x seq
        new_mask = (torch.ones((attention_mask.shape[0] , 16))).to(device)
        attention_mask = torch.cat([
            new_mask , attention_mask
        ] , dim = 1)

        fused_embeddings = torch.cat([
           encoded_videos , embeddings] , dim = 1) # b x 48 x 3072

        outputs = self.peft_model(inputs_embeds = fused_embeddings.half() , attention_mask = attention_mask.half() )
        final_output = {
            "logits" : outputs.logits
            }

        return final_output

    def generate(self,gif_embeds,input_ids,mask):   ##for testing and generation
        with torch.no_grad():
          input_ids = input_ids.squeeze(dim = 1)
          mask = mask.squeeze(dim = 1)
          gif_embeds = gif_embeds.squeeze(dim = 1)
          batch_size, frames, channels ,height, width = gif_embeds.shape
          gif_embeds = gif_embeds.view(batch_size * frames, channels, height, width)
          encoded_videos = (self.video_encoder(gif_embeds))
          encoded_videos = encoded_videos.pooler_output
          encoded_videos = encoded_videos.view(-1,16,768).to(device)
          embeddings = self.peft_model.base_model.model.model.embed_tokens(input_ids)
          encoded_videos = self.MLP(encoded_videos)
          fused_embeddings = torch.cat([
              encoded_videos , embeddings] , dim = 1)  # b x 48 x 3072
          mask = mask.to(device)
          new_mask = (torch.ones((mask.shape[0] , 16))).to(device)
          mask = torch.cat([
              new_mask , mask
          ] , dim = 1).to(device)

        return fused_embeddings , mask


**INSTANTIATE THE MODEL**

In [20]:
model = VQAModel(x_clip_model_path, llama_model_path, lora_rank, lora_alpha).to(device)

config.json:   0%|          | 0.00/8.90k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/780M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

**LOADING THE MODEL STATE DICT**

In [21]:
import torch

checkpoint_path = '/content/drive/MyDrive/DataDownload/checkpoint_x_clip_llama_6.pth'

checkpoint = torch.load(checkpoint_path)

model_state_dict = checkpoint.get('model_state_dict', checkpoint)
model.load_state_dict(model_state_dict, strict=False)


model.to(device)


  checkpoint = torch.load(checkpoint_path)


VQAModel(
  (video_encoder): XCLIPVisionModel(
    (vision_model): XCLIPVisionTransformer(
      (embeddings): XCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
        (position_embedding): Embedding(197, 768)
      )
      (pre_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (encoder): XCLIPVisionEncoder(
        (layers): ModuleList(
          (0-11): 12 x XCLIPVisionEncoderLayer(
            (message_fc): Linear(in_features=768, out_features=768, bias=True)
            (message_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (message_attn): XCLIPAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
          

**INSTANTIATE A TRAIN DATASET AND A TRAIN DATALOADER**

In [None]:
train_dataset = VQA_DATASET(df_new , gif_dir = '/content/gifs' , tokenizer = tokenizer)
train_dataloader = DataLoader(train_dataset , batch_size = 16 , shuffle = True , collate_fn = custom_collator)

**DEFINE THE OPTIMIZERS AND LR_SCHEDULER**

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
from transformers import get_scheduler
## lr_scheduler
num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0.1*num_training_steps,
    num_training_steps=num_training_steps,
)

**MAKE A TRAINING LOOP AND SAVE THE MODEL**

In [None]:
import torch.nn.functional as F
def train(model,optimizer,device,epochs):
        train_loss = []
        progress_bar = tqdm(train_dataloader, desc="Training", leave=False)
        for epoch in range(epochs):
            model.train()
            loss = 0
            for batch in progress_bar:
                data,gif_embeds = batch
                gif_embeds = gif_embeds.to(device)
                data = {k:v.to(device) for k,v in data.items()}
                optimizer.zero_grad()
                final_output = model(input_ids = data['input_ids'], attention_mask = data['attention_mask'] , video_pixel_values = gif_embeds )
                logits = final_output['logits'].to(device)
                trimmed_logits = logits[:, 16:47, :].contiguous()   ##Loss computation only for the logits computed from the question and answer
                trimmed_labels = data['input_ids'].squeeze(dim=1)[:, 1:].contiguous()
                loss = F.cross_entropy(trimmed_logits.view(-1, trimmed_logits.size(-1)), trimmed_labels.view(-1))
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                loss+= loss.item()
                progress_bar.set_postfix(loss=loss.item())
            avg_loss = loss/len(train_dataloader)
            train_loss.append(avg_loss)
            print(f'epoch no: {epoch + 1} ||Train_loss : {avg_loss}')   #batch avg loss in every epoch
            torch.save({
                      'model_state_dict': model.state_dict()
                  }, '/content/drive/MyDrive/checkpoint/checkpoint_x_clip_llama.pth') ##Saving the model parameters

In [None]:
train(model,optimizer,device,num_train_epochs)

**EVALUATION LOOP**

In [None]:
def eval(model,optimizer,device,epochs):
  model.eval()
  val_loss = 0
  with torch.no_grad():
    with tqdm(total=len(eval_dataloader), desc='Validation Epoch {}'.format(epoch + 1)) as pbar:
      for batch in eval_dataloader:
          data,gif_embeds = batch
          data = {k:v.to(device) for k,v in data.items()}
          optimizer.zero_grad()
          final_output = model(input_ids = data['input_ids'], attention_mask = data['attention_mask'] , video_pixel_values = gif_embeds )
          logits = final_output['logits'].to(device)
          trimmed_logits = logits[:, 16:47, :].contiguous()
          trimmed_labels = data['input_ids'].squeeze(dim=1)[:, 1:].contiguous()
          loss = F.cross_entropy(trimmed_logits.view(-1, trimmed_logits.size(-1)), trimmed_labels.view(-1))
          val_loss+= loss.item()
          pbar.update(1)

  avg_val_loss = val_loss/len(eval_dataloader)
  print(f'epoch no: {epoch + 1} ||eval_loss : {avg_val_loss}')


In [None]:
eval(model,optimizer,device,num_train_epochs)

**GENERATION LOOP**

In [24]:
test_dataset = VQA_DATASET(df_test , gif_dir = '/content/gifs' , tokenizer = llama_tokenizer , testing = True)
test_dataloader = DataLoader(test_dataset , batch_size = 4 , shuffle = True , collate_fn = test_collator)

In [26]:
with torch.no_grad():
  for batch in test_dataloader:
    data,gif_embeds,question,answer = batch
    input_ids = data['input_ids'].to(device)
    mask = data['attention_mask'].to(device)
    gif_embeds = gif_embeds.to(device)
    llama_embeddings , mask = model.generate(gif_embeds,input_ids,mask)
    output = model.peft_model.generate(
                            inputs_embeds=llama_embeddings.half(),
                            attention_mask=mask.half(),
                            max_new_tokens=16,
                            num_beams=5,
                            early_stopping=True,
                            num_return_sequences=1,
                            no_repeat_ngram_size=2
                        )
    for i in range(output.shape[0]):
      generated_text = llama_tokenizer.decode(output[i], skip_special_tokens=True)
      print('-------------------------------------------------------------------------------------------------------------------------------')
      print(f"question : {question[i]}")
      print(f"predicted_answer : {generated_text}")
      print(f"actual_answer : {answer[i]}")
      print('-------------------------------------------------------------------------------------------------------------------------------')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the dog do before lift leg up ?
predicted_answer :  The dog licks face 
actual_answer : stand up
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the man spin finger ?
predicted_answer :  The number  2 
actual_answer : 3
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man wearing sweater do before extend arm ?
predicted_answer :  The man wear a sweater 
actual_answer : put hand

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do before get touched by another man ?
predicted_answer :  The man smile 
actual_answer : open arms
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do after hold a woman's head ?
predicted_answer :  The woman touches the man's face 
actual_answer : kiss woman
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the reunion do before put down a child ?
predicted_answer :  The man 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the man wipe nose ?
predicted_answer :  The woman 
actual_answer : 2
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : what is combing the another man 's hair ?
predicted_answer :  The  woman 
actual_answer : suit
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the cat do before fall asleep ?
predicted_answer :  The cat sit up 
actual_answer : lay
---------------------------------------------

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The dog runs up the stairs and jumps on the bed. 
actual_answer : a dog is jumping up to switch of the light .
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the man chew food ?
predicted_answer :  The man is chewing food 
actual_answer : 6
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the boy in red shirt do 3 times ?
predicted_answ

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the bird do 2 times ?
predicted_answer :  The bird flaps wings 
actual_answer : throw treat
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : what is the color of the hair ?
predicted_answer :  The 
actual_answer : brown
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man on right do after walk down the street ?
predicted_answer :  The man on left push him 
actual_answer : take their shirt off
----

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the child raise her eyebrows ?
predicted_answer :  The 3 
actual_answer : 3
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the men do before jump ?
predicted_answer :  The man in white jumps 
actual_answer : stand
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do after spin ?
predicted_answer :  The woman 
actual_answer : hit ball
----------------------------------------

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do after pull out a match ?
predicted_answer :  The man put the match in his mouth 
actual_answer : strike the match
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the woman do after look ahead ?
predicted_answer :  The woman blow smoke 
actual_answer : blow out smoke
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the woman do before wave at camera ?
predicted_answer :  The woman 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the woman do before stand on hand ?
predicted_answer :  The woman stand on knee 
actual_answer : squat down
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the man swing hips ?
predicted_answer :  (2) 
actual_answer : 6
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do 3 times ?
predicted_answer :  The man wave hand 
actual_answer : wave hands
----------------------------

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do after walk up ?
predicted_answer :  The man walk down 
actual_answer : clap face
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do 10 or more than 10 times ?
predicted_answer :  The man strum guitar 
actual_answer : play guitar
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the man with Velcro suit step ?
predicted_answer :  The 4 
actual_answer : 5
----------

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do after neutral ?
predicted_answer :  The smile 
actual_answer : look unamused
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The ballerina is dancing in a studio. 
actual_answer : a woman is dancing on stage in a black tank top and shorts .
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : where is the skateboarder performing tricks . ?
predicted_answer :  The street 
actual_answer : pipe
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the man sprint ?
predicted_answer :  3 
actual_answer : 4
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man on right do 2 times ?
predicted_answer :  The man on right hug another man 
actual_answer : rub another man's back
---------------

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the boy on the left do before take off headset ?
predicted_answer :  The boy on the right smile 
actual_answer : bow down
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do after brush hair ?
predicted_answer :  The man smile 
actual_answer : look up
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : what is the color of the t-shirt ?
predicted_answer :  The color is  white 
actual_answer : whit

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do before jump kick ?
predicted_answer :  The man run 
actual_answer : look forward
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The cat is wearing a red hat and sitting next to a pumpkin. 
actual_answer : six cats are wearing hats and one walks off .
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do 3 times ?
predicted_answer

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do after walk toward another man ?
predicted_answer :  The other man walk away 
actual_answer : hug man
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : what are the man driving wearing a hat and a hood ?
predicted_answer :  The car 
actual_answer : car
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the woman on left do 3 times ?
predicted_answer :  The woman on right dance 
actual_answer : s

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the man with guitar do before look forward ?
predicted_answer :  The man with guitar look down 
actual_answer : envelop woman
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : what is the hand holding ?
predicted_answer :  The puppy 
actual_answer : puppy
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The woman is wearing sunglasses and her hair is blowing 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : what is the woman wearing ?
predicted_answer :  The dress 
actual_answer : gown
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the woman do before smile ?
predicted_answer :  The woman looks down 
actual_answer : wink
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The man in a black jacket is hugging another man. 
actual_answer : two young man in a cape 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the dog chew carrot ?
predicted_answer :  The dog chews the carrot 5 
actual_answer : 5
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : what is the color of the jacket ?
predicted_answer :  The color of the jacket 
actual_answer : black
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do 2 times ?
predicted_answer :  The woman 
actual_answer : lift arm
-------------------------------

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : where is the man dancing . ?
predicted_answer :  The room 
actual_answer : room
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The man in a black jacket is talking to another man. 
actual_answer : a guy with black coat is showing a gun to a man in suit .
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do before jump on platform ?
predicted_answer 

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the girl do 2 times ?
predicted_answer :  The girl shake head 
actual_answer : lick lips
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man in yellow uniform do before jump ?
predicted_answer :  The man in white uniform dribble ball 
actual_answer : run
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the woman do 3 times ?
predicted_answer :  The woman turn her head 
actual_answer : pu

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the man do 2 times ?
predicted_answer :  The man take a sip 
actual_answer : raise eyebrow
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The race cars are driving on a race track. 
actual_answer : some red cars are running in a race .
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : how many wrestler lifts another wrestler and slams him down on to a mat ?

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : what is trying to get in through the window ?
predicted_answer :  The cat 
actual_answer : cat
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : what is the color of the stiletto ?
predicted_answer :  The color 
actual_answer : red
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the man take bites of noodles ?
predicted_answer :  The answer is  0 
actual_answer : 0
-------------------------------------

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the woman in ring step ?
predicted_answer :  The woman in ring steps 3 
actual_answer : 3
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the man rub head ?
predicted_answer :  The 3 
actual_answer : 3
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What do the person wearing high heels do 3 times ?
predicted_answer :  The person wearing high heels walk 
actual_answer : step
-----

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : What does the woman do after move hand ?
predicted_answer :  The woman move hand 
actual_answer : cover mouth with both hand
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the woman do 2 times ?
predicted_answer :  The woman run 
actual_answer : step
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the man on left do before toss it behind him ?
predicted_answer :  The man on right kick the ball 
actu

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : what does the person move ?
predicted_answer :  The hand 
actual_answer : bear
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : What does the woman do before land on mat ?
predicted_answer :  The woman flip 
actual_answer : jump off beam
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : How many times does the dog paw ground ?
predicted_answer :  The dog paws the ground 5 
actual_answer : 10
-------------------------------

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


-------------------------------------------------------------------------------------------------------------------------------
question : what falls into the bath and tries to jump off ?
predicted_answer :  The cat 
actual_answer : cat
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The man in a black shirt is making hand gestures. 
actual_answer : two hands are reaching for each other .
-------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------------------------------------------------------------------------------------
question : Give a description about the clip?
predicted_answer :  The dog is s

In [50]:
def generator(question ,gif_dir , gif_link):
  gif_path = os.path.join(gif_dir , str(gif_link).split('/')[-1])
  if os.path.exists(gif_path):
    frames = sample_gif_frames(gif_path, 16)
    inputs = image_processor(videos=list(frames), return_tensors="pt")
    inputs = inputs.pixel_values
    prompt = f"<|begin_of_text|> The question is {question} The answer is "
    sequence = llama_tokenizer(str(prompt)  ,max_length = 32 ,padding = 'max_length' , truncation = True ,return_tensors='pt' )
    input_ids = sequence['input_ids'].to(device)
    mask = sequence['attention_mask'].to(device)
    inputs = inputs.to(device)
    print(inputs.size)
    print(mask.size)
    print(input_ids.size)
    llama_embeddings , mask = model.generate(inputs,input_ids,mask)
    output = model.peft_model.generate(
                            inputs_embeds=llama_embeddings.half(),
                            attention_mask=mask.half(),
                            max_new_tokens=16,
                            num_beams=5,
                            early_stopping=True,
                            num_return_sequences=1,
                            no_repeat_ngram_size=2
                        )
    generated_text = llama_tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"question : {question}")
    print(f"predicted_answer : {generated_text}")


  else:
    print("gif doesn't exsist")



In [54]:
link1 = df['full_link'][70050]

In [59]:
link1

'https://31.media.tumblr.com/5534ad5bec2134d390b6570c27712359/tumblr_nafdsueQbr1tjebzno1_400.gif'

In [55]:
question1 = "What's the description of this clip ?"

In [56]:
generator(question = question1 ,gif_dir = '/content/gifs', gif_link =  link1 )

<built-in method size of Tensor object at 0x7ddcfc705fd0>
<built-in method size of Tensor object at 0x7ddcfcdf83b0>
<built-in method size of Tensor object at 0x7ddcfcdf8900>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


question : What's the description of this clip ?
predicted_answer :  The man is riding a skateboard on the stairs. 


In [57]:
link2 = df['full_link'][70051]

In [58]:
link2

'https://31.media.tumblr.com/3a49fe4885acb4089e5aa484b4ad7be1/tumblr_noxovlt9FT1uuvxsao1_400.gif'

In [60]:
question2 = 'how many people are there in the clip ?'

In [61]:
generator(question = question2 ,gif_dir = '/content/gifs', gif_link =  link2 )

<built-in method size of Tensor object at 0x7ddcfcdf8040>
<built-in method size of Tensor object at 0x7ddcfd445d00>
<built-in method size of Tensor object at 0x7ddcfcdebc40>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


question : how many people are there in the clip ?
predicted_answer :  The 2 


In [62]:
generator(question = 'what is the ethnicity of people in the clip ?' ,gif_dir = '/content/gifs', gif_link =  link2 )

<built-in method size of Tensor object at 0x7ddcfcdeade0>
<built-in method size of Tensor object at 0x7ddcfcdf83b0>
<built-in method size of Tensor object at 0x7ddcfcdf8590>


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


question : what is the ethnicity of people in the clip ?
predicted_answer :  The  Asian 


In [68]:
def bert_score_evaluation(model,tokenizer,dataloader):
  model.eval()
  candidate = []
  reference = []

  for batch in tqdm(dataloader, desc="Testing"):
      data,gif_embeds,questions,answers = batch
      data = {k:v.to(device) for k,v in data.items()}
      input_ids = data['input_ids']
      mask = data['attention_mask']
      gif_embeds = gif_embeds.to(device)

      with torch.no_grad():
        llama_embeddings , mask = model.generate(gif_embeds,input_ids,mask)
        output = model.peft_model.generate(
          inputs_embeds=llama_embeddings.half(),  # Pass the embeddings instead of input_ids
          attention_mask=mask.half(),
          max_new_tokens=50,                   # Maximum length of the generated sequence
          num_beams=5,                     # Beam search with 5 beams
          early_stopping=True,             # Stop when all beams reach the end
          num_return_sequences=1           # Return the best sequence
          )
      output_text = []
      for i in range(output.shape[0]):
        output_text.append(tokenizer.decode(output[i], skip_special_tokens=True))
      candidate.extend(output_text)
      reference.extend(answer)
  P , R , F1 = score(candidate, reference, lang="en", verbose=True)
  avg_P_score = P.mean().item()
  avg_R_score = R.mean().item()
  avg_F1_score = F1.mean().item()
  return avg_P_score , avg_R_score , avg_F1_score

In [69]:
bert_score_evaluation(model ,llama_tokenizer , test_dataloader)

Testing:   0%|          | 0/25 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Testing:   4%|▍         | 1/25 [00:11<04:32, 11.35s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Testing:   8%|▊         | 2/25 [00:22<04:16, 11.16s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Testing:  12%|█▏        | 3/25 [00:34<04:15, 11.62s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Testing:  16%|█▌        | 4/25 [00:42<03:33, 10.16s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Testing:  20%|██        | 5/25 [00:55<03:44, 11.22s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Testing:  24%|██▍       | 6/25 [01:03<03:09, 10.00s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Testing:  28%|██▊       | 7/25 [01:11<02:50,  9.47s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 0.45 seconds, 224.58 sentences/sec


(0.838026225566864, 0.8239507079124451, 0.8304803371429443)