<a href="https://colab.research.google.com/github/jyanivaddi/ERA_V1/blob/master/Capstone/MultiModal_phi_with_audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mount Google Drive**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


**Install dependencies**

In [2]:
!pip install -U --quiet numpy transformers datasets tqdm matplotlib wandb torchmetrics torchinfo pytorch-lightning peft bitsandbytes einops pillow

**Imports**

In [4]:
!git clone "https://github.com/jyanivaddi/ERA_V1.git"
!git -C ERA_V1 pull
!git pull

fatal: destination path 'ERA_V1' already exists and is not an empty directory.
Already up to date.
fatal: not a git repository (or any of the parent directories): .git


In [5]:
import torchmetrics
import wandb
import io
import requests
import os
import sys
import gc
import torch
import pickle
import json
import torchinfo
import torch.nn as nn
import numpy as np
import pytorch_lightning as pl
import torch.multiprocessing as mp
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from pathlib import Path
from typing import Union, List
from torch.cuda.amp import autocast
from matplotlib import pyplot as plt
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from transformers import AutoProcessor, CLIPVisionModel
from pytorch_lightning.callbacks import Callback
from peft import LoraConfig


In [6]:
sys.path.append("ERA_V1/Capstone")
from llava_instruct_dataset import LlavaFinetuneDataset, LlavaCollator, split_data_to_train_and_val, get_image_embeddings
from model_finetune import LitMultiModalPhiFineTune, SimpleLinearBlock, model_summary

**set parameters**

In [7]:
#json_path = './data/llava_instruct_150k.json'
#batch_size = 20
device = 'cuda' if torch.cuda.is_available() else 'cpu'
projection_layer_in_channels = 768
projection_layer_out_channels = 2560
#seq_len = 72
#num_image_tokens = 49
#max_ques_length = seq_len - (1+num_image_tokens+1) # 1 for image start, 1 for comment
stage1_projection_checkpoints = 'phi2_projection_checkpoints/ckpt_60001.pt'
projection_layer_finetuning_checkpoint_path = '/content/gdrive/MyDrive/ERA_Capstone/phi2_finetune_checkpoints_run2_low_lr/projection_layer_finetuning/projection_layer_ckpt_finetuning_global_step_4001.pt'
finetuned_phi_checkpoint_path = '/content/gdrive/MyDrive/ERA_Capstone/phi2_finetune_checkpoints_run2_low_lr/phi_model_finetuning/adapter_layer_ckpt_finetuning_global_step_4001'

# Define configurations for QLORA finetuning
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, # Load the model in 4 bits
    bnb_4bit_quant_type="nf4", # 4 bit quant type
    bnb_4bit_use_double_quant=True, # double quant saves more bits
    bnb_4bit_compute_dtype=torch.float16, # use bfloat16
    )

# Define the models and tokenizers
clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
clip_preprocessor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Define multimodal model
multimodal_phi_model = LitMultiModalPhiFineTune(projection_layer_in_channels,
                                                projection_layer_out_channels,
                                                quantization_config)


multimodal_phi_model.projection_layer.load_state_dict(torch.load(projection_layer_finetuning_checkpoint_path))
multimodal_phi_model.llm_model.from_pretrained(multimodal_phi_model.llm_model, finetuned_phi_checkpoint_path)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


trainable params: 11,796,480 || all params: 2,791,480,320 || trainable%: 0.4225886858482312
Number of Training Parameters
********************************
Projection Layer:1970176
Phi Model:11796480
********************************


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): PhiForCausalLM(
          (model): PhiModel(
            (embed_tokens): Embedding(51200, 2560)
            (embed_dropout): Dropout(p=0.0, inplace=False)
            (layers): ModuleList(
              (0-31): 32 x PhiDecoderLayer(
                (self_attn): PhiAttention(
                  (q_proj): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=2560, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=2560, bias=False)
                    )
                    

**Define all the helper methods**

In [37]:
def generate_phi_responses(multimodal_phi_model, batch, batch_size=1):
    image_embeddings = batch['image_embeddings']
    proj_outs = multimodal_phi_model.projection_layer(image_embeddings).to(device)
    print(f"proj_outs: {proj_outs.shape}, device: {proj_outs.device}")
    #device = proj_outs.device
    print("device")

    # define comment and im start tokens
    comment_token = torch.tensor(multimodal_phi_model.COMMENT_TOKEN_ID).repeat(batch_size, 1).to(device)
    comment = multimodal_phi_model.llm_model.model.model.embed_tokens(comment_token).to(device) #
    print(f"comment: {comment.shape}, device: {comment.device}")


    im_start_token = torch.tensor(multimodal_phi_model.IMAGE_TOKEN_ID).repeat(batch_size, 1).to(device)
    im_start = multimodal_phi_model.llm_model.model.model.embed_tokens(im_start_token).to(device) #
    print(f"im_start: {im_start.shape}, device: {im_start.device}")


    question_tokens = batch['ques_tokenized']
    question_embeddings = multimodal_phi_model.llm_model.model.model.embed_tokens(question_tokens).to(device)
    print(f"ques_embed: {question_embeddings.shape}, device: {question_embeddings.device}")


    # prepare input embeddings
    inputs_embeds = torch.cat([im_start, # <IM> [B x 1 x 2560]
                                proj_outs, # [B x 49 x 2560]
                                comment, # [B x 1 x 2560]
                                question_embeddings, # [B x 1 x 2560]
                                ], dim=1) # total dim: (B, 64, 2560)
    with torch.no_grad():
        with autocast(True):
            pred_logits = multimodal_phi_model.llm_model.generate(inputs_embeds = inputs_embeds, max_new_tokens=64)
            generated_text = multimodal_phi_model.tokenizer.batch_decode(pred_logits, skip_special_tokens=True, clean_up_tokenization_spaces=True, verbose=False)[0]
    return generated_text


In [29]:
def get_image_embeddings(image, model, preprocessor, device=None):
    """
    This method computes the clip embeddings for a given image, after preprocessing it according to the model
    """
    processed_image = preprocessor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**processed_image)
    return outputs.last_hidden_state.squeeze()[1:,:].unsqueeze(0)

def tokenize_sentence(sentence, tokenizer):
    tokenizer_output = tokenizer(sentence, return_tensors="pt", return_attention_mask=False)
    tokenized_sentence = tokenizer_output['input_ids']
    return tokenized_sentence

def generate_embeddings_from_inputs(image, text, clip_model, clip_preprocessor, tokenizer):
    image_embeddings = get_image_embeddings(image, clip_model, clip_preprocessor)
    tokenized_sentence = tokenize_sentence(text, tokenizer)
    return {'image_embeddings': image_embeddings, 'ques_tokenized': tokenized_sentence}


**Let's try out the code!**

In [43]:
image_url = r'/content/gdrive/MyDrive/temp.jpg'
image = Image.open(image_url)

#image_url = r'http://images.cocodataset.org/train2017/000000010005.jpg'
#image = Image.open(requests.get(image_url, stream=True).raw)
question = "Describe this image"
batch = generate_embeddings_from_inputs(image, question, clip_model, clip_preprocessor, phi_tokenizer)
print(batch['image_embeddings'].shape)
print(batch['ques_tokenized'].shape)


torch.Size([1, 49, 768])
torch.Size([1, 4])


In [44]:
generate_phi_responses(multimodal_phi_model, batch, batch_size=1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


proj_outs: torch.Size([1, 49, 2560]), device: cuda:0
device
comment: torch.Size([1, 1, 2560]), device: cuda:0
im_start: torch.Size([1, 1, 2560]), device: cuda:0
ques_embed: torch.Size([1, 4, 2560]), device: cuda:0


' two cats on on on a on a bed bed. on a bed. on a bed. on a bed. on a bed. on a bed. on a bed. on a bed. on a bed. on a bed. on a bed. on a bed. on a bed. on a bed. on'

In [None]:
multimodal_phi_model