In [None]:
# Mount Google Drive at /content/drive (Colab only); the training loop later
# writes its checkpoint under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Import Necessary Libraries**

In [None]:
%%capture
!pip install peft
%%capture
!pip install bitsandbytes

In [None]:
import os
import warnings

import bitsandbytes as bnb
import torch
import torch.nn as nn
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import Dataset , DataLoader
from tqdm import tqdm
from transformers import AutoProcessor , XCLIPVisionModel , LlamaForCausalLM , LlamaTokenizer , AutoTokenizer , DataCollatorWithPadding
from transformers import BitsAndBytesConfig
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Hugging Face model identifiers for the video encoder and the language model.
x_clip_model_path = "microsoft/xclip-base-patch16"
llama_model_path = 'meta-llama/Llama-3.2-3B-Instruct'
# LoRA hyper-parameters handed to VQAModel (`r` and `lora_alpha`).
lora_rank = 16
lora_alpha = 16

In [None]:
import pandas as pd
# Table of samples; later cells read its 'full_link', 'question' and 'answer' columns.
df = pd.read_csv('/content/test_df_with_link.csv')

In [None]:
import asyncio
import aiohttp
import os
import pathlib
from urllib.parse import urlparse

# Directory (relative to the working directory) that downloaded GIFs are written to.
GIF_PATH = pathlib.Path('gifs')
# Create it in one call; exist_ok avoids the separate check-then-create step.
GIF_PATH.mkdir(parents=True, exist_ok=True)

def is_valid_url(url):
    """Return True when `url` is a string that has both a scheme and a network location.

    Non-string values (for example missing cells read from the CSV) yield False.
    """
    if isinstance(url, str):
        parts = urlparse(url)
        return bool(parts.scheme) and bool(parts.netloc)
    return False

async def download_file(url):
    """Download `url` into GIF_PATH, naming the file after the last path segment of the URL.

    Nothing is downloaded when the target file already exists. A non-200 response
    gives up immediately; connection-level failures are retried up to three times,
    one second apart. The function returns None in every case.

    The body is streamed into a task-private temporary file and renamed into place
    only after the whole response was written, so an interrupted download cannot
    leave a truncated GIF behind that a later run would treat as already downloaded.
    """
    filename = url.split("/")[-1]
    filepath = GIF_PATH / filename

    # Skip files fetched by an earlier run.
    if filepath.exists():
        return

    # Unique per running task, so two tasks fetching the same URL never share a temp file.
    partial_path = GIF_PATH / f"{filename}.{id(asyncio.current_task()):x}.part"

    max_retries = 3
    for _ in range(max_retries):
        try:
            async with aiohttp.ClientSession(trust_env=True) as session:
                async with session.get(url) as response:
                    if response.status != 200:
                        return
                    with open(partial_path, mode="wb") as file:
                        async for chunk in response.content.iter_chunked(1024):
                            file.write(chunk)
            # Atomic on the same filesystem: readers see no file or the full file.
            os.replace(partial_path, filepath)
            return
        except (aiohttp.ClientError, asyncio.TimeoutError, ConnectionResetError):
            # aiohttp.ClientError also covers connector and payload errors raised
            # mid-transfer; drop the partial data and try again after a short pause.
            partial_path.unlink(missing_ok=True)
            await asyncio.sleep(1)

async def safe_request(semaphore, url):
    """Run `download_file(url)` while holding one slot of `semaphore`.

    The slot is released whether the download returns or raises; the download's
    return value is passed through.
    """
    await semaphore.acquire()
    try:
        outcome = await download_file(url)
    finally:
        semaphore.release()
    return outcome

async def main(url_col, parallel_processes):
    """Download every valid, distinct URL in `url_col` with bounded concurrency.

    Args:
        url_col: iterable of URL values; entries rejected by `is_valid_url` are skipped.
        parallel_processes: maximum number of downloads allowed to run at once.
    """
    # dict.fromkeys drops repeated links while keeping first-seen order, so the same
    # file is never fetched (and written) by several tasks at the same time.
    valid_urls = list(dict.fromkeys(url for url in url_col if is_valid_url(url)))

    semaphore = asyncio.Semaphore(parallel_processes)
    tasks = [asyncio.ensure_future(safe_request(semaphore, url)) for url in valid_urls]
    await asyncio.gather(*tasks)

**Downloading the Data**

In [None]:
# Fetch the GIFs for the first 20000 rows, with at most 50 downloads in flight.
await main(df['full_link'][:20000], 50)

In [None]:
# Working subset: the same first 20000 rows whose links were passed to the downloader.
df_new = df[:20000]

**DEFINE THE PRETRAINED TEXT TOKENIZER: THIS TOKENIZER SHIPS WITHOUT A PAD TOKEN, SO ONE IS ASSIGNED IN THE NEXT CELL**

In [None]:
# The access token is read from the HF_TOKEN environment variable instead of being
# written into the notebook; when it is unset, None is passed and the library falls
# back to its own credential lookup.
# NOTE(review): a literal token was previously committed on this line — revoke it.
tokenizer = AutoTokenizer.from_pretrained(llama_model_path, token=os.environ.get("HF_TOKEN"))



**SETTING THE PAD TOKEN**

In [None]:
# Point the pad token at an id that already exists in the vocabulary. Registering a
# brand-new '<|pad_token|>' entry first (as was done before) only grew the tokenizer
# by one id that the model's embedding matrix is never resized for in this notebook,
# and the assignment below replaced it as the pad token anyway.
# NOTE(review): in this tokenizer id 128010 decodes to '<|python_tag|>'. The id is
# kept unchanged so padding stays identical to earlier runs — confirm it is the
# intended pad token.
tokenizer.pad_token_id = 128010

In [None]:
# Show which token string the pad id resolves to.
tokenizer.pad_token

'<|python_tag|>'

In [None]:
# Padding collator built on the tokenizer; called by custom_collator and test_collator.
data_collator_for_padding = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Load the processor that ships with the X-CLIP checkpoint.
image_processor = AutoProcessor.from_pretrained(x_clip_model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


**MAKE THE DATASET**

In [None]:
from PIL import Image
import numpy as np
class VQA_DATASET(Dataset):
    """Video question-answering samples built from a DataFrame and a folder of GIFs.

    Each row of `df` must provide 'full_link', 'question' and 'answer'. The GIF for a
    row is looked up in `gif_dir` under the last path segment of its 'full_link'.

    Args:
        df: DataFrame with the columns listed above.
        gif_dir: directory containing the downloaded GIF files.
        tokenizer: callable tokenizer used to encode the text prompt.
        testing: when truthy, prompts omit the answer and samples also carry the raw
            question/answer strings.
        num_frames: number of frames sampled from every GIF.
        processor: video processor turning the sampled frames into pixel values.
            When None, the notebook-level `image_processor` is used.
    """

    def __init__(self, df, gif_dir, tokenizer, testing=False, num_frames=16, processor=None):
        super().__init__()
        self.df = df
        self.gif_dir = gif_dir
        self.num_frames = num_frames
        self.tokenizer = tokenizer
        self.testing = testing
        self.processor = processor

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _select_frames(frames, num_frames):
        """Pad `frames` with its last entry and pick `num_frames` evenly spaced ones."""
        frames = list(frames)
        if not frames:
            raise ValueError("GIF contains no decodable frames")
        if len(frames) < num_frames:
            frames.extend([frames[-1]] * (num_frames - len(frames)))
        indices = np.linspace(0, len(frames) - 1, num=num_frames, dtype=int)
        return np.stack([frames[i] for i in indices])

    def sample_gif_frames(self, gif_path, num_frames=32):
        """Decode the GIF at `gif_path` and return `num_frames` RGB frames stacked in one array.

        Short GIFs are padded by repeating their last frame; longer ones are sampled
        at evenly spaced positions. Raises ValueError when no frame can be decoded.
        """
        frames = []
        # The context manager closes the underlying file once decoding is finished.
        with Image.open(gif_path) as gif:
            try:
                while True:
                    frames.append(np.array(gif.convert("RGB")))
                    gif.seek(gif.tell() + 1)
            except EOFError:
                pass  # seek past the last frame signals the end of the GIF
        return self._select_frames(frames, num_frames)

    def __getitem__(self, idx):
        """Return the sample dict for position `idx`, or None when its GIF is unusable.

        None is returned when the GIF file is missing or cannot be decoded; the
        collators are expected to drop such entries. Any other failure propagates.
        """
        # Positional lookup, so the dataset also works on frames whose index labels
        # are not 0..len-1 (filtered or shuffled frames).
        row = self.df.iloc[idx]
        gif_name = str(row['full_link']).split('/')[-1]
        gif_path = os.path.join(self.gif_dir, gif_name)
        if not os.path.isfile(gif_path):
            return None

        try:
            frames = self.sample_gif_frames(gif_path, self.num_frames)
        except Exception as exc:
            # Best effort is limited to decoding: a corrupt download is skipped with
            # a visible warning instead of silently hiding every kind of error.
            warnings.warn(f"Skipping unreadable GIF {gif_path}: {exc}")
            return None

        # Convert the sampled frames into model-ready pixel values.
        processor = self.processor if self.processor is not None else image_processor
        inputs = processor(videos=list(frames), return_tensors="pt").pixel_values

        question = row['question']
        answer = row['answer']

        if self.testing:
            input_ids, mask = self.create_sequence(question)
            return {
                'gif_embed': inputs,
                'input_ids': input_ids,
                'mask': mask,
                'question': question,
                'answer': answer
            }

        input_ids, mask = self.create_sequence(question, answer)
        return {
            'gif_embed': inputs,
            'input_ids': input_ids,
            'mask': mask
        }

    def create_sequence(self, question=None, answer=None):
        """Tokenize the prompt for one sample and return `(input_ids, attention_mask)`.

        In testing mode the prompt stops after "answer : "; otherwise the answer and
        an end-of-turn marker are appended. The text is padded/truncated to 32 tokens.
        NOTE(review): the prompt spells out '<|begin_of_text|>' itself — confirm the
        tokenizer does not prepend a second one.
        """
        if self.testing:
            prompt = f"<|begin_of_text|> question : {question} answer : "
        else:
            prompt = f"<|begin_of_text|> question : {question} answer : {answer} <|eot_id|>"
        sequence = self.tokenizer(str(prompt), max_length=32, padding='max_length', truncation=True, return_tensors='pt')
        return sequence['input_ids'], sequence['attention_mask']

from torch.utils.data import default_collate

def custom_collator(batch):
    """Collate training samples, dropping entries the dataset returned as None.

    Returns None when nothing usable is left. Otherwise returns a tuple
    `(padded_tensors, gif_embed_collated)`: the output of `data_collator_for_padding`
    for the token ids and masks, and the `default_collate` result for the video inputs.
    """
    usable = list(filter(lambda item: item is not None, batch))
    if not usable:
        return None

    video_inputs, token_ids, attention_masks = [], [], []
    for sample in usable:
        video_inputs.append(sample['gif_embed'])
        token_ids.append(sample['input_ids'])
        attention_masks.append(sample['mask'])

    # Text fields go through the tokenizer-backed padding collator.
    padded_tensors = data_collator_for_padding({
        'input_ids': token_ids,
        'attention_mask': attention_masks,
    })

    # Video inputs are stacked along a new leading batch dimension.
    return padded_tensors, default_collate(video_inputs)


def test_collator(batch):
    gif_embed = [item['gif_embed'] for item in batch]
    input_ids = [item['input_ids'] for item in batch]
    masks = [item['mask'] for item in batch]
    questions = [item['question'] for item in batch]
    answers = [item['answer'] for item in batch]

    features_to_pad = {
        'input_ids': input_ids,
        'attention_mask': masks,
    }

    padded_tensors = data_collator_for_padding(features_to_pad)

    return padded_tensors , gif_embed , questions , answers

**MAKE THE MODEL**

In [None]:
class VQAModel(nn.Module):
    """Frozen X-CLIP vision encoder + projection MLP + LoRA-adapted 4-bit Llama.

    Per-frame pooled features from the vision encoder are projected by `self.MLP`
    and prepended to the Llama token embeddings; the fused sequence is fed to the
    PEFT-wrapped language model.

    Args:
        x_clip_model_path: identifier/path of the X-CLIP checkpoint.
        llama_model_path: identifier/path of the Llama checkpoint.
        r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        lora_dropout: dropout applied inside the LoRA layers.
        tgt: names of the modules that receive LoRA adapters (a sequence of names,
            or a single string which is passed through unchanged).
        num_frames: number of frames per video clip.

    NOTE(review): the model card of the default "microsoft/xclip-base-patch16"
    checkpoint describes 8-frame clips — confirm that feeding 16 frames is intended.
    """

    def __init__(self, x_clip_model_path, llama_model_path, r, lora_alpha, lora_dropout=0.1,
                 tgt=("q_proj", "k_proj"), num_frames=16):
        super(VQAModel, self).__init__()
        self.num_frames = num_frames

        # The vision encoder is used as a fixed feature extractor.
        self.video_encoder = XCLIPVisionModel.from_pretrained(x_clip_model_path)
        for params in self.video_encoder.parameters():
            params.requires_grad = False

        # Quantisation options go through BitsAndBytesConfig; passing `load_in_4bit`
        # and friends straight to from_pretrained is deprecated.
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
        )
        # The access token comes from the HF_TOKEN environment variable.
        # NOTE(review): a literal token was previously committed here — revoke it.
        self.llama_model = LlamaForCausalLM.from_pretrained(
            llama_model_path,
            token=os.environ.get("HF_TOKEN"),
            device_map='auto',
            quantization_config=quantization_config,
        )
        self.peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM, inference_mode=False,
            r=r,
            lora_alpha=lora_alpha, lora_dropout=lora_dropout,
            target_modules=tgt if isinstance(tgt, str) else list(tgt),
        )
        self.peft_model = get_peft_model(self.llama_model, self.peft_config).to(device)

        # Projects pooled frame features (768) to the width used for the Llama
        # input embeddings (3072).
        self.MLP = nn.Sequential(nn.Linear(768, 2048),
                                 nn.Dropout(p=0.1),
                                 nn.GELU(),
                                 nn.Linear(2048, 3072),
                                 ).to(device)

    def _encode_videos(self, pixel_values):
        """Return projected frame features of shape (batch, num_frames, 3072).

        Accepts a tensor whose last three dimensions are (channels, height, width),
        or a list/tuple of such tensors (one per clip); leading dimensions are
        flattened into a single frame axis before encoding.
        """
        if isinstance(pixel_values, (list, tuple)):
            pixel_values = torch.cat(
                [clip.reshape(-1, *clip.shape[-3:]) for clip in pixel_values], dim=0
            )
        pixel_values = pixel_values.reshape(-1, *pixel_values.shape[-3:]).to(device)
        pooled = self.video_encoder(pixel_values).pooler_output  # (batch * num_frames, 768)
        pooled = pooled.view(-1, self.num_frames, 768).to(device)
        return self.MLP(pooled)

    def forward(self, input_ids,
                attention_mask=None,
                video_pixel_values=None
                ):
        """Run the fused video+text sequence through the language model.

        Args:
            input_ids: (batch, seq) token ids.
            attention_mask: (batch, seq) mask for the text tokens; all ones when None.
            video_pixel_values: frames for the whole batch (see `_encode_videos`).

        Returns:
            dict with key "logits": language-model logits over the fused sequence
            (video positions first, then text positions).
        """
        encoded_videos = self._encode_videos(video_pixel_values)
        embeddings = self.peft_model.base_model.model.model.embed_tokens(input_ids)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        attention_mask = attention_mask.to(device)
        # Every video position is attended to.
        video_mask = torch.ones((attention_mask.shape[0], encoded_videos.shape[1])).to(device)
        attention_mask = torch.cat([video_mask, attention_mask], dim=1)

        fused_embeddings = torch.cat([encoded_videos, embeddings], dim=1)

        outputs = self.peft_model(inputs_embeds=fused_embeddings.half(), attention_mask=attention_mask.half())
        final_output = {
            "logits": outputs.logits
        }

        return final_output

    def generate(self, gif_embeds, input_ids, mask):
        """Build the fused embeddings and attention mask used for text generation.

        Args:
            gif_embeds: frames for the batch, as a tensor or a list of per-clip tensors.
            input_ids: (batch, seq) or (batch, 1, seq) token ids.
            mask: attention mask with the same layout as `input_ids`.

        Returns:
            `(fused_embeddings, mask)`, ready to be passed to `self.peft_model.generate`.
        """
        with torch.no_grad():
            # The encoder returns an output object; only its pooled tensor is used.
            encoded_videos = self._encode_videos(gif_embeds)
            embeddings = self.peft_model.base_model.model.model.embed_tokens(input_ids.to(device))
            if embeddings.dim() == 4:
                # (batch, 1, seq, hidden) coming from the padding collator
                embeddings = embeddings.squeeze(dim=1)
            fused_embeddings = torch.cat([encoded_videos, embeddings], dim=1)

            mask = mask.to(device)
            if mask.dim() == 3:
                mask = mask.squeeze(dim=1)
            video_mask = torch.ones((mask.shape[0], encoded_videos.shape[1])).to(device)
            mask = torch.cat([video_mask, mask], dim=1).to(device)

        return fused_embeddings, mask


**INSTANTIATE THE MODEL**

In [None]:
# Build the model from the configured checkpoints and LoRA settings, then move it to `device`.
model = VQAModel(x_clip_model_path, llama_model_path, lora_rank, lora_alpha).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import torch

# Optional warm start: set this to a checkpoint written by the training loop.
checkpoint_path = 'your_checkpoint_path'

if os.path.exists(checkpoint_path):
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    # map_location keeps a GPU-saved checkpoint loadable on the current device.
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # Accept both {'model_state_dict': ...} wrappers and bare state dicts.
    model_state_dict = checkpoint.get('model_state_dict', checkpoint)
    load_result = model.load_state_dict(model_state_dict, strict=False)
    # strict=False skips mismatched keys silently, so report what was not matched.
    print(f"Missing keys: {len(load_result.missing_keys)} | Unexpected keys: {len(load_result.unexpected_keys)}")
else:
    print(f"No checkpoint found at {checkpoint_path!r}; continuing without loading weights.")

model.to(device);


In [None]:
# Assuming 'model' is your PyTorch model
def count_parameters(model):
    """Return `(total, trainable)` element counts over `model.parameters()`.

    A parameter counts as trainable when its `requires_grad` flag is set.
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    return total_params, trainable_params

# Report how many parameters the model holds and how many of them will be updated.
total_params, trainable_params = count_parameters(model)

print(f'Total Parameters: {total_params}')
print(f'Trainable Parameters: {trainable_params}')

Total Parameters: 1937174016
Trainable Parameters: 12456960


**INSTANTIATE A TRAIN DATASET AND A TRAIN DATALOADER**

In [None]:
# Training samples come from df_new and the downloaded GIFs; custom_collator drops
# samples the dataset returned as None before batching.
train_dataset = VQA_DATASET(df_new , gif_dir = '/content/gifs' , tokenizer = tokenizer)
train_dataloader = DataLoader(train_dataset , batch_size = 8 , shuffle = True , collate_fn = custom_collator)

**DEFINE THE OPTIMIZERS AND LR_SCHEDULER**

In [None]:
from transformers import get_scheduler

# Only parameters that receive gradients (LoRA adapters and the projection MLP) are
# handed to the optimizer; the frozen weights are left out.
optimizer = torch.optim.AdamW([p for p in model.parameters() if p.requires_grad], lr=3e-4)

## lr_scheduler
num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)  # one optimizer step per batch
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",  ##Cosine Annealing or any other
    optimizer=optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # warm up over the first 10% of steps
    num_training_steps=num_training_steps,
)

**MAKE A TRAINING LOOP AND SAVE THE MODEL**

In [None]:
import torch.nn.functional as F
def train(model, optimizer, device, epochs):
    """Train `model` for `epochs` passes over the notebook-level `train_dataloader`.

    Each step computes next-token cross-entropy on the text part of the fused
    sequence, steps `optimizer` and the notebook-level `lr_scheduler`, and the mean
    batch loss is printed per epoch. A checkpoint is written after every epoch.

    Args:
        model: VQAModel-style module returning {"logits": ...}.
        optimizer: optimizer over the trainable parameters.
        device: device the batch tensors are moved to.
        epochs: number of epochs to run.
    """
    for epoch in range(epochs):
        model.train()
        # Separate accumulator: the per-batch `loss` tensor must not overwrite it.
        running_loss = 0.0
        num_batches = 0
        with tqdm(total=len(train_dataloader), desc='Training Epoch {}'.format(epoch + 1)) as pbar:
            for batch in train_dataloader:
                if batch is None:
                    # The collator returns None when every sample of the batch was unusable.
                    pbar.update(1)
                    continue

                data, gif_embeds = batch
                gif_embeds = gif_embeds.squeeze(dim=1).to(device)
                batch_size, frames, channels, height, width = gif_embeds.shape
                gif_embeds = gif_embeds.view(batch_size * frames, channels, height, width)
                data = {k: v.to(device) for k, v in data.items()}
                input_ids = data['input_ids'].squeeze(dim=1)
                attention_mask = data['attention_mask'].squeeze(dim=1)

                optimizer.zero_grad()
                final_output = model(input_ids=input_ids, attention_mask=attention_mask, video_pixel_values=gif_embeds)
                logits = final_output['logits'].to(device)

                # Logits cover [video tokens | text tokens]. The logit at text position t
                # predicts text token t + 1, so drop the video part and the last position.
                num_video_tokens = logits.size(1) - input_ids.size(1)
                shifted_logits = logits[:, num_video_tokens:-1, :].contiguous()
                shifted_labels = input_ids[:, 1:].clone()
                # Padding positions carry no training signal.
                shifted_labels[attention_mask[:, 1:] == 0] = -100

                loss = F.cross_entropy(
                    shifted_logits.view(-1, shifted_logits.size(-1)),
                    shifted_labels.reshape(-1),
                    ignore_index=-100,
                )
                loss.backward()
                optimizer.step()
                lr_scheduler.step()

                running_loss += loss.item()
                num_batches += 1
                pbar.update(1)

        avg_loss = running_loss / max(num_batches, 1)
        print(f'epoch no: {epoch + 1} ||Train_loss : {avg_loss}')   #batch avg loss in every epoch

        torch.save({
                  'model_state_dict': model.state_dict()
              }, '/content/drive/MyDrive/DataDownload/checkpoint_x_clip_llama.pth')

In [None]:
# Run the training loop for the configured number of epochs.
train(model,optimizer,device,num_train_epochs)

Training Epoch 1: 100%|██████████| 2500/2500 [46:01<00:00,  1.10s/it]


epoch no: 1 ||Train_loss : 0.0007003115606494248


Training Epoch 2:  95%|█████████▌| 2383/2500 [43:50<02:11,  1.13s/it]

**EVALUATION LOOP**

In [None]:
def evaluate(model, optimizer=None, device=None, epochs=None, dataloader=None, epoch=0):
    """Compute the mean next-token loss of `model` over a validation dataloader.

    Named `evaluate` so that it no longer replaces the training function `train`.
    The first four parameters keep the original positional order; `optimizer` and
    `epochs` are accepted for compatibility and not used.

    Args:
        model: VQAModel-style module returning {"logits": ...}.
        optimizer: unused.
        device: device the batch tensors are moved to (CUDA when available if None).
        epochs: unused.
        dataloader: batches shaped like the training batches; when None, the
            notebook-level `eval_dataloader` is used.
        epoch: zero-based epoch index, used only for the progress/print labels.

    Returns:
        The mean batch loss as a float (0.0 when no usable batch was seen).
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if dataloader is None:
        dataloader = eval_dataloader

    model.eval()
    val_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        with tqdm(total=len(dataloader), desc='Validation Epoch {}'.format(epoch + 1)) as pbar:
            for batch in dataloader:
                if batch is None:
                    # Every sample of this batch was unusable.
                    pbar.update(1)
                    continue

                data, gif_embeds = batch
                # Flatten to (batch * frames, channels, height, width), as in training.
                gif_embeds = gif_embeds.reshape(-1, *gif_embeds.shape[-3:]).to(device)
                data = {k: v.to(device) for k, v in data.items()}
                input_ids = data['input_ids'].squeeze(dim=1)
                attention_mask = data['attention_mask'].squeeze(dim=1)

                final_output = model(input_ids=input_ids, attention_mask=attention_mask, video_pixel_values=gif_embeds)
                logits = final_output['logits'].to(device)

                # Keep the text positions only; the logit at t predicts token t + 1.
                num_video_tokens = logits.size(1) - input_ids.size(1)
                shifted_logits = logits[:, num_video_tokens:-1, :].contiguous()
                shifted_labels = input_ids[:, 1:].clone()
                shifted_labels[attention_mask[:, 1:] == 0] = -100  # ignore padding

                loss = F.cross_entropy(
                    shifted_logits.view(-1, shifted_logits.size(-1)),
                    shifted_labels.reshape(-1),
                    ignore_index=-100,
                )
                val_loss += loss.item()
                num_batches += 1
                pbar.update(1)

    avg_val_loss = val_loss / max(num_batches, 1)
    print(f'epoch no: {epoch + 1} ||eval_loss : {avg_val_loss}')   #batch avg loss in every epoch
    return avg_val_loss


**GENERATION LOOP**

In [None]:
# Inference mode: switches off dropout in the projection MLP and the LoRA layers.
model.eval()
with torch.no_grad():
  for batch in test_dataloader:
    if batch is None:
      # Nothing usable in this batch.
      continue
    data,gif_embeds,question,answer = batch
    # The test collator hands the clips over as a list; merge them into one
    # (batch * frames, channels, height, width) tensor on the model's device.
    if isinstance(gif_embeds, (list, tuple)):
      gif_embeds = torch.cat([clip.reshape(-1, *clip.shape[-3:]) for clip in gif_embeds], dim=0)
    gif_embeds = gif_embeds.to(device)
    input_ids = data['input_ids'].to(device)
    mask = data['attention_mask'].to(device)
    llama_embeddings , mask = model.generate(gif_embeds,input_ids,mask)
    output = model.peft_model.generate(
                            inputs_embeds=llama_embeddings.half(),
                            attention_mask=mask.half(),
                            max_new_tokens=16,
                            num_beams=5,
                            early_stopping=True,
                            num_return_sequences=1,
                            no_repeat_ngram_size=2,
                            pad_token_id=tokenizer.pad_token_id
                        )
    for i in range(output.shape[0]):
      generated_text = tokenizer.decode(output[i], skip_special_tokens=True)
      print(f"question : {question[i]}")
      print(f"predicted_answer : {generated_text}")
      print(f"actual_answer : {answer[i]}")