In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from pathlib import Path
import os
import torch
from IPython.display import Audio

# Root storage directory, read from the DSDIR environment variable
# (a missing variable raises KeyError here).
DSDIR = Path(os.environ["DSDIR"])
# Checkpoint identifier handed to the Whisper `from_pretrained` calls below.
WHISPER_PATH = "openai/whisper-small.en"
# Directory under DSDIR from which the LLM and its tokenizer are loaded.
LLM_PATH = DSDIR / "HuggingFace_Models/microsoft/phi-2"
# Dataset directory; the per-split wav / pt / csv paths are derived from it.
DATA_PATH = DSDIR / "meld_fidle"

def freeze_model(model):
    """Disable gradient tracking on every parameter of ``model``.

    The module is modified in place and the same object is returned so the
    call can be used inline in an assignment.
    """
    # `requires_grad_` flips the flag on all parameters and returns the module.
    return model.requires_grad_(False)

# Speech model: load, switch to inference mode, then freeze all weights.
whisper_model = freeze_model(
    WhisperForConditionalGeneration.from_pretrained(WHISPER_PATH).eval()
)
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_PATH)

# Language model: bfloat16 weights, moved to the GPU, in inference mode, frozen.
llm_model = freeze_model(
    AutoModelForCausalLM.from_pretrained(
        LLM_PATH,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,  # Allow using code that was not written by HuggingFace
        attn_implementation="flash_attention_2",  # Optimize the model with Flash Attention
    )
    .to("cuda")
    .eval()
)
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Observe the data and check the whisper model

In [3]:
# Dataset split to inspect; it selects the three paths built below.
mode = "dev"
WAV_DATA = DATA_PATH / f"{mode}_wav"  # directory of .wav files for the split
PT_DATA = DATA_PATH / f"{mode}_pt"  # directory of .pt files for the split
CSV_DF = DATA_PATH / f"{mode}_sent_emo.csv"  # annotation table for the split

In [4]:
# Display the module tree of the loaded Whisper model.
whisper_model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [5]:
# Load the annotation table of the selected split and display it.
df = pd.read_csv(CSV_DF)
df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,"Oh my God, hes lost it. Hes totally lost it.",Phoebe,sadness,negative,0,0,4,7,"00:20:57,256","00:21:00,049"
1,2,What?,Monica,surprise,negative,0,1,4,7,"00:21:01,927","00:21:03,261"
2,3,"Or! Or, we could go to the bank, close our acc...",Ross,neutral,neutral,1,0,4,4,"00:12:24,660","00:12:30,915"
3,4,Youre a genius!,Chandler,joy,positive,1,1,4,4,"00:12:32,334","00:12:33,960"
4,5,"Aww, man, now we wont be bank buddies!",Joey,sadness,negative,1,2,4,4,"00:12:34,211","00:12:37,505"
...,...,...,...,...,...,...,...,...,...,...,...
1104,1174,No.,Monica,sadness,negative,113,9,6,2,"00:19:28,792","00:19:29,876"
1105,1175,What? Oh my God! Im gonna miss you so much!,Rachel,sadness,negative,113,10,6,2,"00:19:33,213","00:19:35,965"
1106,1176,Im gonna miss you!,Monica,sadness,negative,113,11,6,2,"00:19:36,175","00:19:37,967"
1107,1177,I mean its the end of an era!,Rachel,sadness,negative,113,12,6,2,"00:19:39,094","00:19:40,928"


In [6]:
# Row label of the utterance used as the running example in the cells below.
idx = 53
df.loc[idx]

Sr No.                   54
Utterance             Good.
Speaker                Ross
Emotion               anger
Sentiment          negative
Dialogue_ID               5
Utterance_ID             11
Season                    2
Episode                   7
StartTime       0:19:31,378
EndTime         0:19:32,440
Name: 53, dtype: object

In [7]:
# The .pt and .wav files of an utterance share the stem
# `dia<Dialogue_ID>_utt<Utterance_ID>`; build it once from the selected row.
utt_row = df.loc[idx]
utt_stem = f"dia{utt_row['Dialogue_ID']}_utt{utt_row['Utterance_ID']}"
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
inp = torch.load(PT_DATA / f"{utt_stem}.pt")
# Audio receives the file name as a plain string, as in the original call.
Audio(str(WAV_DATA / f"{utt_stem}.wav"), embed=True)

In [8]:
# Sanity check: transcribe the example utterance with the full Whisper model.
# NOTE(review): assumes `inp` is sampled at 16 kHz — confirm against how the .pt files were produced.
input_features = whisper_processor(inp, return_tensors="pt", sampling_rate=16000).input_features
generated_ids = whisper_model.generate(input_features)
# Decode the generated ids back to text, dropping special tokens.
transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription

[' Good!']

In [9]:
# Design the model

In [10]:
# Take the encoder of the Whisper model and run it on the GPU.
# NOTE: `Module.to` moves a module in place, so the encoder referenced by
# `whisper_model` is moved to the GPU too. Re-running the earlier transcription
# cell after this one would then mix CPU inputs with GPU encoder weights.
whisper_encoder = whisper_model.model.encoder.to("cuda")
out_enc = whisper_encoder(input_features.to("cuda"))
out_enc["last_hidden_state"]

tensor([[[-0.6875,  0.6428,  0.0718,  ...,  0.0831,  0.7077, -0.6348],
         [ 0.6532,  0.6566, -0.0738,  ..., -0.3608,  0.2177, -1.0051],
         [ 2.6466,  0.7558,  0.3444,  ..., -0.1396, -0.4181,  0.2586],
         ...,
         [-0.0480, -0.0395, -0.0336,  ..., -0.0049,  0.0069, -0.0107],
         [-1.5615, -0.1322,  0.2551,  ...,  0.3530, -1.0932,  0.8876],
         [-0.5215, -0.1084,  0.3092,  ...,  0.4454, -0.8959,  0.4412]]],
       device='cuda:0')

In [22]:
class Projector(torch.nn.Module):
    """Turn audio-encoder hidden states into a fixed number of LLM-sized embeddings.

    The input goes through one Transformer encoder layer, the first
    ``nb_feat_tokens`` positions are kept, and each kept vector is linearly
    projected from ``encoder_hidden_dim`` to ``llm_hidden_dim``.

    Args:
        encoder_hidden_dim: Size of the incoming feature vectors.
        llm_hidden_dim: Size of the produced embeddings.
        nhead: Number of attention heads of the Transformer encoder layer.
        nb_feat_tokens: Number of leading positions kept as audio tokens.
    """

    def __init__(
        self, encoder_hidden_dim=768, llm_hidden_dim=2560, nhead=8, nb_feat_tokens=5
    ):
        super().__init__()

        self.encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=encoder_hidden_dim, nhead=nhead, activation="gelu", batch_first=True
        )
        self.projection_layer = torch.nn.Linear(encoder_hidden_dim, llm_hidden_dim)

        self.nb_feat_tokens = nb_feat_tokens
        self.encoder_hidden_dim = encoder_hidden_dim
        self.llm_hidden_dim = llm_hidden_dim

    def forward(self, x):
        """Project ``x`` of shape (batch, seq, encoder_hidden_dim).

        Returns a tensor of shape (batch, min(seq, nb_feat_tokens), llm_hidden_dim).
        """
        x = self.encoder_layer(x)
        # We select a fixed number of vectors that will represent the audio features
        x = x[:, : self.nb_feat_tokens, :]
        # Linear acts on the last dimension, so no flattening is needed. The
        # slice above is non-contiguous for batch sizes > 1 and `.view` on it
        # would raise a RuntimeError.
        return self.projection_layer(x)

In [23]:
# Instantiate the projector with its defaults, on the GPU and in bfloat16.
projector = Projector().to(device="cuda", dtype=torch.bfloat16)

In [25]:
# Cast the encoder output to bfloat16 (the projector's dtype) before projecting it.
feat_extracted = out_enc["last_hidden_state"].to(torch.bfloat16)
feat_project = projector(feat_extracted)
feat_project.shape

torch.Size([1, 5, 2560])

In [26]:
# Tokenize a sample prompt and look up its embeddings with the LLM's own
# embedding layer, so they can be combined with the projected audio features.
model_inp = llm_tokenizer("Hey there!", return_tensors="pt").to("cuda")
tokens_embed = llm_model.model.embed_tokens(model_inp["input_ids"])
tokens_embed.type()

'torch.cuda.BFloat16Tensor'

In [27]:
# Prepend the projected audio tokens to the text embeddings along the sequence axis.
llm_inp = torch.cat((feat_project, tokens_embed), dim=1)
llm_inp.type()

'torch.cuda.BFloat16Tensor'

In [34]:
# Attention mask returned by the tokenizer for the text prompt.
model_inp['attention_mask']

tensor([[1, 1, 1]], device='cuda:0')

In [None]:
class LlavaFriends(torch.nn.Module):
    """Container module holding the audio encoder and the language model.

    Only construction is implemented so far; no ``forward`` is defined yet.

    Args:
        whisper_encoder: Module used to encode the audio features.
        llm_model: Language model consuming the embeddings.
    """

    def __init__(
        self, whisper_encoder, llm_model,
    ):
        # The zero-argument form always refers to this class; naming another
        # class here makes instantiation raise a TypeError.
        super().__init__()
        # Keep the components; assigning modules registers them as submodules.
        self.whisper_encoder = whisper_encoder
        self.llm_model = llm_model

In [35]:
# `llm_inp` is the projected audio tokens followed by the text embeddings, so
# the mask must cover both parts. The tokenizer mask only spans the text
# tokens, hence ones are prepended for the audio positions.
audio_mask = model_inp["attention_mask"].new_ones(feat_project.shape[:2])
full_attention_mask = torch.cat([audio_mask, model_inp["attention_mask"]], dim=1)
out = llm_model(inputs_embeds=llm_inp, attention_mask=full_attention_mask)

In [36]:
# Shape of the logits returned by the LLM for the combined input.
out.logits.shape

torch.Size([1, 8, 51200])