<a href="https://colab.research.google.com/github/jyanivaddi/ERA_V1/blob/master/Capstone/MultiModal_phi_with_audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Mount Google Drive**

In [1]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


**Install dependencies**

In [2]:
!pip install -U --quiet transformers datasets tqdm matplotlib wandb torchmetrics torchinfo pytorch-lightning peft bitsandbytes einops pillow gradio
!pip install 'git+https://github.com/m-bain/whisperx.git' --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone


**Imports**

In [3]:
!git clone "https://github.com/jyanivaddi/ERA_V1.git"
!git -C ERA_V1 pull
!git pull

fatal: destination path 'ERA_V1' already exists and is not an empty directory.
Already up to date.
fatal: not a git repository (or any of the parent directories): .git


In [4]:
import torchmetrics
import wandb
import io
import requests
import os
import sys
import gc
import torch
import pickle
import whisperx
import json
import torchinfo
import torch.nn as nn
import numpy as np
import pytorch_lightning as pl
import torch.multiprocessing as mp
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from pathlib import Path
from typing import Union, List
from torch.cuda.amp import autocast
from matplotlib import pyplot as plt
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from transformers import AutoProcessor, CLIPVisionModel
from pytorch_lightning.callbacks import Callback
from peft import LoraConfig


  torchaudio.set_audio_backend("soundfile")


In [5]:
sys.path.append("ERA_V1/Capstone")
from llava_instruct_dataset import LlavaFinetuneDataset, LlavaCollator, split_data_to_train_and_val, get_image_embeddings
from model_finetune import LitMultiModalPhiFineTune, SimpleLinearBlock, model_summary

**set parameters**

In [None]:
#json_path = './data/llava_instruct_150k.json'
#batch_size = 20
device = 'cuda' if torch.cuda.is_available() else 'cpu'
projection_layer_in_channels = 768
projection_layer_out_channels = 2560
#seq_len = 72
#num_image_tokens = 49
#max_ques_length = seq_len - (1+num_image_tokens+1) # 1 for image start, 1 for comment
stage1_projection_checkpoints = 'phi2_projection_checkpoints/ckpt_60001.pt'
projection_layer_finetuning_checkpoint_path = '/content/gdrive/MyDrive/ERA_Capstone/phi2_finetune_checkpoints_run2_low_lr/projection_layer_finetuning/projection_layer_ckpt_finetuning_global_step_4001.pt'
#projection_layer_finetuning_checkpoint_path = '/content/gdrive/MyDrive/ERA_Capstone/phi2_projection_checkpoints/ckpt_60001.pt'
finetuned_phi_checkpoint_path = '/content/gdrive/MyDrive/ERA_Capstone/phi2_finetune_checkpoints_run2_low_lr/phi_model_finetuning/adapter_layer_ckpt_finetuning_global_step_4001'

# Define configurations for QLORA finetuning
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, # Load the model in 4 bits
    bnb_4bit_quant_type="nf4", # 4 bit quant type
    bnb_4bit_use_double_quant=True, # double quant saves more bits
    bnb_4bit_compute_dtype=torch.float16, # use bfloat16
    )

# Define the models and tokenizers
clip_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
clip_preprocessor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Define multimodal model
multimodal_phi_model = LitMultiModalPhiFineTune(projection_layer_in_channels,
                                                projection_layer_out_channels,
                                                quantization_config)


multimodal_phi_model.projection_layer.load_state_dict(torch.load(projection_layer_finetuning_checkpoint_path))
multimodal_phi_model.llm_model.from_pretrained(multimodal_phi_model.llm_model, finetuned_phi_checkpoint_path)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


trainable params: 11,796,480 || all params: 2,791,480,320 || trainable%: 0.4225886858482312
Number of Training Parameters
********************************
Projection Layer:1970176
Phi Model:11796480
********************************


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): PhiForCausalLM(
          (model): PhiModel(
            (embed_tokens): Embedding(51200, 2560)
            (embed_dropout): Dropout(p=0.0, inplace=False)
            (layers): ModuleList(
              (0-31): 32 x PhiDecoderLayer(
                (self_attn): PhiAttention(
                  (q_proj): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=2560, out_features=2560, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=2560, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=2560, bias=False)
                    )
                    

**Define all the helper methods**

In [None]:
def prepare_inputs(multimodal_phi_model, proj_output=None, question_embeddings=None, batch_size=1):

    # define comment and im start tokens
    comment_token = torch.tensor(multimodal_phi_model.COMMENT_TOKEN_ID).repeat(batch_size, 1).to(device)
    comment = multimodal_phi_model.llm_model.model.model.embed_tokens(comment_token).to(device) #

    im_start_token = torch.tensor(multimodal_phi_model.IMAGE_TOKEN_ID).repeat(batch_size, 1).to(device)
    im_start = multimodal_phi_model.llm_model.model.model.embed_tokens(im_start_token).to(device) #

    if proj_output is None and question_embeddings is None:
        raise Exception("you need to provide an image, text, or audio input")
    if question_embeddings is None:
        # prepare input embeddings
        print(im_start.shape)
        print(proj_output.shape)
        print(comment.shape)
        inputs_embeds = torch.cat([im_start, # <IM> [B x 1 x 2560]
                                   proj_output, # [B x 49 x 2560]
                                   comment, # [B x 1 x 2560]
                                   ], dim=1) # total dim: (B, 64, 2560)
    else:
        # prepare input embeddings
        inputs_embeds = torch.cat([im_start, # <IM> [B x 1 x 2560]
                                   proj_output, # [B x 49 x 2560]
                                   comment, # [B x 1 x 2560]
                                   question_embeddings,
                                   ], dim=1) # total dim: (B, 64, 2560)
    return inputs_embeds


def generate_phi_responses(multimodal_phi_model, batch, batch_size=1):
    question_embeddings = None
    proj_output = None
    if 'ques_tokenized' in batch:
        question_tokens = batch['ques_tokenized']
        question_embeddings = multimodal_phi_model.llm_model.model.model.embed_tokens(question_tokens).to(device)

    if 'image_embeddings' in batch:
        image_embeddings = batch['image_embeddings']
        proj_output = multimodal_phi_model.projection_layer(image_embeddings).to(device)

    inputs_embeds = prepare_inputs(multimodal_phi_model, proj_output, question_embeddings)

    with torch.no_grad():
        with autocast(True):
            pred_logits = multimodal_phi_model.llm_model.generate(inputs_embeds = inputs_embeds, max_new_tokens=20)
            generated_text = multimodal_phi_model.tokenizer.batch_decode(pred_logits, skip_special_tokens=True, clean_up_tokenization_spaces=True, verbose=False)[0]
    return generated_text


In [None]:
def get_image_embeddings(image, model, preprocessor, device=None):
    """
    This method computes the clip embeddings for a given image, after preprocessing it according to the model
    """
    processed_image = preprocessor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**processed_image)
    return outputs.last_hidden_state.squeeze()[1:,:].unsqueeze(0)

def tokenize_sentence(sentence, tokenizer):
    tokenizer_output = tokenizer(sentence, return_tensors="pt", return_attention_mask=False)
    tokenized_sentence = tokenizer_output['input_ids']
    return tokenized_sentence

def generate_embeddings_from_inputs(image, text, clip_model, clip_preprocessor, tokenizer):
    image_embeddings = get_image_embeddings(image, clip_model, clip_preprocessor)
    tokenized_sentence = tokenize_sentence(text, tokenizer)
    return {'image_embeddings': image_embeddings, 'ques_tokenized': tokenized_sentence}


**Let's try out the code!**

In [None]:
image_url = r'/content/gdrive/MyDrive/temp.jpg'
image = Image.open(image_url)

#image_url = r'http://images.cocodataset.org/train2017/000000010005.jpg'
#image = Image.open(requests.get(image_url, stream=True).raw)
question = "what is the color of the cat in this image?"
batch = generate_embeddings_from_inputs(image, question, clip_model, clip_preprocessor, phi_tokenizer)
#question = tokenize_sentence("who is the greatest cricketer of all time?", phi_tokenizer)
#batch = {'ques_tokenized': question}
#print(batch['image_embeddings'].shape)
#print(batch['ques_tokenized'].shape)
#batch = {'image_embeddings': batch['image_embeddings']}

In [None]:
generate_phi_responses(multimodal_phi_model, batch, batch_size=1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' white. white. white. white. white. white. white. white. white. white.'

In [6]:
import whisperx
import gc

device = "cuda"
audio_file = "/content/gdrive/MyDrive/test_audio.mp4"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("large-v2", device, compute_type=compute_type)

# save model to local path (optional)
# model_dir = "/path/"
# model = whisperx.load_model("large-v2", device, compute_type=compute_type, download_root=model_dir)

audio = whisperx.load_audio(audio_file)
print(audio)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment


No language specified, language will be first be detected for each audio file (increases inference time).


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.0+cu121. Bad things might happen unless you revert torch to 1.x.
[-0.00012207 -0.00015259 -0.00015259 ...  0.          0.
  0.        ]
Detected language: en (0.85) in first 30s of audio...
[{'text': " Hello. How are you? I'm okay. I will be. I said she could stay with us till she feels better. Of course she can. No, this won't be for long. Well, you can stay as long as you want, my love. I really missed you. Great to see you, love.", 'start': 0.145, 'end': 20.776}]


In [7]:
def run_inference_on_model(audio_file):
    print(audio_file)
    audio = whisperx.load_audio(audio_file)

    print(audio)
    if True:
        # whisper
        audio_model = whisperx.load_model("large-v2", device, compute_type='float16')
        compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

        # 1. Transcribe with original whisper (batched)
        #audio = whisperx.load_audio(audio_file)
        #result = audio_model.transcribe(audio[1], batch_size=1)
        #print(result["segments"]['text'])
        audio_result = audio_model.transcribe(audio)
        audio_text = ''
        for seg in audio_result['segments']:
            audio_text += seg['text']
        audio_text = audio_text.strip()
        return audio_text # before alignment
    return None

**Integrate Gradio**

In [None]:
import gradio as gr
import whisperx
device = 'cuda' if torch.cuda.is_available() else 'cpu'


demo = gr.Interface(run_inference_on_model,
                    inputs = [gr.Audio(sources=["microphone"], type='filepath')],
                    outputs = [gr.Textbox(label='AI response', scale=2)],
                    title = "none",
                    description = "mpme"
                   )
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://d3fdacc8d5f7917327.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


/tmp/gradio/35d476c4447f1744452ce8004ed5fee2129101b9/audio.wav
[0.         0.         0.         ... 0.00125122 0.00192261 0.00213623]
No language specified, language will be first be detected for each audio file (increases inference time).


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.2.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.1.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.0+cu121. Bad things might happen unless you revert torch to 1.x.
Detected language: en (0.89) in first 30s of audio...
