In [1]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from pathlib import Path
import os
import torch
from IPython.display import Audio
from tqdm.notebook import tqdm
import random

# Root directory of the shared storage; read from the DSDIR environment variable
# (os.environ[...] raises KeyError if it is not set).
DSDIR = Path(os.environ["DSDIR"])
# Whisper checkpoint identifier passed to from_pretrained below.
WHISPER_PATH = "openai/whisper-small.en"
# Local copy of the phi-2 LLM and of the dataset, both located under DSDIR.
LLM_PATH = DSDIR / "HuggingFace_Models/microsoft/phi-2"
DATA_PATH = DSDIR / "meld_fidle"

# Disable the tokenizers library's own parallelism
# (presumably to avoid its fork warning once DataLoader workers are spawned — TODO confirm).
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

def freeze_model(model):
    """Turn off gradient tracking for every parameter of ``model``.

    The parameters are modified in place and the very same ``model`` object
    is returned so the call can be chained or re-assigned.
    """
    for weight in model.parameters():
        weight.requires_grad_(False)
    return model

# Load the Whisper speech model, switch it to eval mode and freeze all its parameters.
# The processor turns raw audio into the input features the model expects.
whisper_model = WhisperForConditionalGeneration.from_pretrained(WHISPER_PATH)
whisper_model.eval()
whisper_model = freeze_model(whisper_model)
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_PATH)

# Initialize the LLM and its tokenizer.
# The model is loaded in bfloat16 and only then moved to the GPU with .to("cuda").
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_PATH,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # Allow using code that was not written by HuggingFace
    attn_implementation="flash_attention_2"  # Optimize the model with Flash Attention
).to("cuda")
# The LLM is frozen too: eval mode and no gradient on any of its parameters.
llm_model.eval()
llm_model = freeze_model(llm_model)
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)
# The padding id is used by add_padding/create_mask and as ignore_index of the loss.
llm_tokenizer.pad_token_id = 50257  # Special token of phi

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Observe the data and check the whisper model

In [3]:
# Locations of the training split: .wav files, pre-saved .pt audio tensors and the annotation CSV.
mode = "train"
WAV_DATA = DATA_PATH / f"{mode}_wav"
PT_DATA = DATA_PATH / f"{mode}_pt"
# NOTE(review): unlike the two paths above, CSV_DF is a bare file name, so it is resolved
# against the current working directory rather than DATA_PATH — confirm this is intended.
CSV_DF = f"{mode}_sent_emo.csv"

In [4]:
# Same layout for the test split. Note that this rebinds the global `mode` to "test".
mode = "test"
WAV_TEST_DATA = DATA_PATH / f"{mode}_wav"
PT_TEST_DATA = DATA_PATH / f"{mode}_pt"
# NOTE(review): bare file name, resolved against the current working directory — confirm.
CSV_TEST_DF = f"{mode}_sent_emo.csv"

In [5]:
whisper_model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [6]:
# Read the train and test annotation tables; the last expression displays the training frame.
df = pd.read_csv(CSV_DF)
df_test = pd.read_csv(CSV_TEST_DF)
df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,question,answer
0,1,also I was the point person on my companys tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731",What kind of person is Chandler in relation to...,"Neutral, as Chandler is simply stating a fact ..."
1,2,You mustve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442",What is the speaker's emotion?,The speaker's emotion is empathy.
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389",What is the speaker's emotion in this dialogue?,Neutral.
3,4,So lets talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572",Who is the speaker in this dialogue?,The Interviewer.
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917",What is the speaker's emotion in this dialogue?,"Unexpectedly, the speaker, Chandler, seems sur..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9983,10474,You or me?,Chandler,neutral,neutral,1038,13,2,3,"00:00:48,173","00:00:50,799",What is the emotional tone of the speaker?,Neutral.
9984,10475,"I got it. Uh, Joey, women don't have Adam's ap...",Ross,neutral,neutral,1038,14,2,3,"00:00:51,009","00:00:53,594",Who is the speaker?,The speaker is Ross.
9985,10476,"You guys are messing with me, right?",Joey,surprise,positive,1038,15,2,3,"00:01:00,518","00:01:03,520",Who is the speaker in this dialogue?,"Joey, who is surprised and said ""You guys are ..."
9986,10477,Yeah.,All,neutral,neutral,1038,16,2,3,"00:01:05,398","00:01:07,274",Who is the speaker in this dialogue?,"All, who said ""Yeah."" and is being neutral."


In [7]:
# Pick one training row (by index label) to inspect; `idx` is reused by the next cells.
idx = 53
df.loc[idx]

Sr No.                                 57
Utterance                         Me too!
Speaker                            Rachel
Emotion                               joy
Sentiment                        positive
Dialogue_ID                             4
Utterance_ID                           10
Season                                  3
Episode                                11
StartTime                     0:20:41,991
EndTime                       0:20:43,543
question        What is Rachel's emotion?
answer           Rachel's emotion is Joy.
Name: 53, dtype: object

In [8]:
# Load the pre-saved audio of the selected utterance; files are named dia<Dialogue_ID>_utt<Utterance_ID>.
# NOTE(review): torch.load relies on pickle — only load .pt files from a trusted source.
inp = torch.load(PT_DATA / f"dia{df.loc[idx]['Dialogue_ID']}_utt{df.loc[idx]['Utterance_ID']}.pt")
# Embed a player for the matching .wav file so the sample can be listened to in the notebook.
Audio(os.path.join(WAV_DATA, f"dia{df.loc[idx]['Dialogue_ID']}_utt{df.loc[idx]['Utterance_ID']}.wav"), embed=True)

In [9]:
# Sanity check of the full Whisper model: audio -> input features -> generated ids -> text.
# The processor is told the audio is sampled at 16 kHz.
input_features = whisper_processor(inp, return_tensors="pt", sampling_rate=16000).input_features
generated_ids = whisper_model.generate(input_features)
transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription

[' Ciao!']

In [10]:
# Design the model

In [10]:
# Only the encoder half of Whisper is used from here on; it is moved to the GPU.
# NOTE(review): Module.to() moves the encoder in place, so whisper_model's encoder lives on the
# GPU afterwards; re-running the CPU transcription cell above will likely hit a device mismatch.
whisper_encoder = whisper_model.model.encoder.to("cuda")
out_enc = whisper_encoder(input_features.to("cuda"))
# Shape of the encoder output for the single sample processed above.
out_enc["last_hidden_state"].shape

torch.Size([1, 1500, 768])

In [11]:
class Projector(torch.nn.Module):
    """Bridge between the audio encoder and the LLM embedding space.

    The encoder states go through one Transformer encoder layer, the first
    ``nb_feat_tokens`` positions are kept, and each kept vector is linearly
    mapped from ``encoder_hidden_dim`` to ``llm_hidden_dim``.
    """

    def __init__(
        self, encoder_hidden_dim=768, llm_hidden_dim=2560, nhead=8, nb_feat_tokens=5
    ):
        super().__init__()

        # Sizes are kept because forward() needs them to slice and reshape.
        self.nb_feat_tokens = nb_feat_tokens
        self.encoder_hidden_dim = encoder_hidden_dim
        self.llm_hidden_dim = llm_hidden_dim

        # Sub-modules are created in this order: attention layer first, then the projection.
        self.encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=encoder_hidden_dim,
            nhead=nhead,
            activation="gelu",
            batch_first=True,
        )
        self.projection_layer = torch.nn.Linear(encoder_hidden_dim, llm_hidden_dim)

    def forward(self, x):
        """Return a ``(-1, nb_feat_tokens, llm_hidden_dim)`` tensor of audio embeddings."""
        mixed = self.encoder_layer(x)
        # Keep a fixed number of leading vectors to represent the audio features.
        kept = mixed[:, :self.nb_feat_tokens, :]
        flat = kept.reshape(-1, self.encoder_hidden_dim)
        projected = self.projection_layer(flat)
        return projected.reshape(-1, self.nb_feat_tokens, self.llm_hidden_dim)

# class Projector(torch.nn.Module):
#     def __init__(
#         self, encoder_hidden_dim=768, llm_hidden_dim=2560, nhead=8, nb_feat_tokens=5
#     ):
#         super(Projector, self).__init__()
#         self.proj_layer = torch.nn.Linear(encoder_hidden_dim, llm_hidden_dim)
#         self.gelu = torch.nn.GELU()

#         self.query = torch.nn.Parameter(torch.randn(nb_feat_tokens, llm_hidden_dim))
#         self.multihead_attn = torch.nn.MultiheadAttention(llm_hidden_dim, nhead, batch_first=True)
        
#         self.linear_layer = torch.nn.Linear(llm_hidden_dim, llm_hidden_dim)
        
#         self.nb_feat_tokens = nb_feat_tokens
#         self.encoder_hidden_dim = encoder_hidden_dim
#         self.llm_hidden_dim = llm_hidden_dim

#     def forward(self, x):
#         bs = x.shape[0]
        
#         x = self.proj_layer(x.reshape(-1, self.encoder_hidden_dim))
#         x = self.gelu(x)
#         x = x.reshape(bs, -1, self.llm_hidden_dim)

#         x = self.multihead_attn(torch.stack([self.query]*bs), x, x, need_weights=False)[0]

#         x = self.linear_layer(x.reshape(-1, self.llm_hidden_dim))
#         x = x.reshape(-1, self.nb_feat_tokens, self.llm_hidden_dim)
#         return x

In [12]:
# Trainable projector keeping 50 audio vectors per sample, placed on the GPU and cast to
# bfloat16 (the dtype the LLM was loaded with).
projector = Projector(nb_feat_tokens=50).to("cuda").to(torch.bfloat16)

In [13]:
# Smoke test: cast the encoder output of the sample to bfloat16, project it and check the shape.
feat_extracted = out_enc["last_hidden_state"].to(torch.bfloat16)
feat_project = projector(feat_extracted)
feat_project.shape

torch.Size([1, 50, 2560])

In [14]:
def add_padding(list_ids: list[torch.Tensor], pad_token_id=None) -> torch.Tensor:
    """Left-pad a list of 1-D id tensors and stack them into one batch tensor.

    Args:
        list_ids: 1-D tensors of token ids, possibly of different lengths.
        pad_token_id: Value used for padding. When ``None`` (default) the
            notebook-level ``llm_tokenizer.pad_token_id`` is used, which is the
            original behaviour.

    Returns:
        A ``(len(list_ids), max_len)`` tensor where shorter sequences are
        padded on the left.
    """
    if pad_token_id is None:
        pad_token_id = llm_tokenizer.pad_token_id

    # pad_sequence only pads on the right: reverse each sequence, pad, then
    # reverse the batch along the time axis so the padding ends up on the left.
    reversed_ids = [sample.flip(dims=(0,)) for sample in list_ids]
    padded_tensor = torch.nn.utils.rnn.pad_sequence(
        reversed_ids,
        batch_first=True,
        padding_value=pad_token_id,
    ).flip(dims=(1,))
    return padded_tensor


def create_mask(padded_tensor: torch.Tensor, pad_token_id=None) -> torch.Tensor:
    """Create an attention mask for HuggingFace models.

    Args:
        padded_tensor: Batch of token ids containing padding.
        pad_token_id: Id that marks padding. When ``None`` (default) the
            notebook-level ``llm_tokenizer.pad_token_id`` is used, which is the
            original behaviour.

    Returns:
        An int tensor of the same shape as ``padded_tensor`` holding 1 for
        real tokens and 0 for padding positions.
    """
    if pad_token_id is None:
        pad_token_id = llm_tokenizer.pad_token_id

    # Comparing against the scalar broadcasts; no need to materialise a full tensor of pad ids.
    decoder_mask = (padded_tensor != pad_token_id).to(dtype=torch.int)
    return decoder_mask

In [15]:
class FriendsDataset(Dataset):
    """Training dataset pairing an utterance's audio with a text to be generated.

    For every item one of three tasks is drawn at random (transcription,
    emotion, speaker) and the corresponding text is built from the row.

    Args:
        df: Table with at least the columns ``Utterance``, ``Emotion``,
            ``Speaker``, ``Dialogue_ID`` and ``Utterance_ID``.
        tokenizer: Callable returning a mapping with ``'input_ids'``; must
            expose ``eos_token_id``.
        processor: Callable turning the loaded audio into an object exposing
            ``input_features``.
        pt_dir: Directory holding the ``dia<Dialogue_ID>_utt<Utterance_ID>.pt``
            files. When ``None`` (default) the notebook-level ``PT_DATA`` is
            used, which is the original behaviour.
    """

    def __init__(self, df, tokenizer, processor, pt_dir=None):
        self.df = df
        self.tokenizer = tokenizer
        self.processor = processor
        self.pt_dir = pt_dir

    def __len__(self) -> int:
        """Return the number of element of the dataset"""
        return len(self.df)

    def __getitem__(self, idx) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(input_ids, target_ids, audio_features)`` for the row at position ``idx``.

        ``target_ids`` is ``input_ids`` shifted left by one with the EOS id appended.
        """
        # Samplers hand out positions in [0, len(self)), so index positionally: .loc would
        # fail (or pick the wrong row) as soon as the frame has a non-default index.
        df_elem = self.df.iloc[idx]

        task = random.choice(["speach", "emotion", "speaker"])
        if task == "speach":
            text_inp = f"Transcription: {df_elem['Utterance']}"
        elif task == "emotion":
            text_inp = f"Emotion: {df_elem['Emotion']}."
        elif task == "speaker":
            text_inp = f"Speaker: {df_elem['Speaker']}."

        tokens_ids_inp = self.tokenizer(text_inp, add_special_tokens=False)['input_ids']
        tokens_ids_target = tokens_ids_inp[1:] + [self.tokenizer.eos_token_id]

        # The global is only looked up when no directory was given explicitly.
        pt_dir = Path(self.pt_dir) if self.pt_dir is not None else PT_DATA
        # NOTE(review): torch.load relies on pickle — only load .pt files from a trusted source.
        audio_torch = torch.load(pt_dir / f"dia{df_elem['Dialogue_ID']}_utt{df_elem['Utterance_ID']}.pt")
        audio_torch = self.processor(audio_torch, return_tensors="pt", sampling_rate=16000).input_features

        return (
            torch.tensor(tokens_ids_inp, dtype=torch.int64),
            torch.tensor(tokens_ids_target, dtype=torch.int64),
            audio_torch,
        )

In [16]:
def collate_fn(batch):
    """Assemble a training batch from ``(input_ids, target_ids, audio)`` samples.

    Returns the left-padded inputs, the left-padded targets, the attention
    mask derived from the padded inputs, and the audio features concatenated
    along their first dimension.
    """
    inputs_per_sample = []
    targets_per_sample = []
    audio_per_sample = []
    for sample in batch:
        inputs_per_sample.append(sample[0])
        targets_per_sample.append(sample[1])
        audio_per_sample.append(sample[2])

    batch_audio_torch = torch.cat(audio_per_sample)

    batch_inp = add_padding(inputs_per_sample)
    batch_target = add_padding(targets_per_sample)

    # The mask is computed from the inputs only.
    return batch_inp, batch_target, create_mask(batch_inp), batch_audio_torch


In [17]:
# Training data pipeline: shuffled batches of 8 samples, assembled by collate_fn,
# loaded by a single worker process that prefetches one batch ahead.
dataset = FriendsDataset(df=df, tokenizer=llm_tokenizer, processor=whisper_processor)
dataloader = DataLoader(
    dataset,
    batch_size=8,
    num_workers=1,
    prefetch_factor=1,
    shuffle=True,
    collate_fn=collate_fn
)

In [18]:
class LlavaFriends(torch.nn.Module):
    """Audio-conditioned language model: audio encoder -> projector -> causal LLM.

    The audio features are encoded, projected into the LLM embedding space and
    spliced into the (left-padded) text embeddings right before the first real
    text token of every sample.

    Args:
        whisper_encoder: Module returning a mapping with ``"last_hidden_state"``.
        llm_model: Causal LM exposing ``model.embed_tokens``, ``__call__`` with
            ``inputs_embeds``/``attention_mask`` and ``generate``.
        projector: Module mapping encoder states to LLM-sized embeddings.
    """

    def __init__(
        self, whisper_encoder, llm_model, projector
    ):
        super(LlavaFriends, self).__init__()
        self.whisper_encoder = whisper_encoder
        self.llm_model = llm_model
        self.projector = projector

    def add_audio_feat(self, batch_embed_inp, batch_audio_torch, decoder_mask):
        """Insert the audio embeddings between the padding and the text of each sample.

        Returns the extended attention mask and the extended embeddings, both
        longer by the number of audio tokens along the sequence axis.
        """
        nb_audio_tokens = batch_audio_torch.shape[1]
        # Position of the first 1 in each mask row, i.e. where the left padding ends.
        idx_max = torch.argmax(decoder_mask, dim=1)

        # Audio positions are always attended to. Built on the mask's own device and with its
        # dtype, so the method is not tied to "cuda" and the mask dtype is not promoted.
        audio_mask = torch.ones(
            nb_audio_tokens, dtype=decoder_mask.dtype, device=decoder_mask.device
        )

        decoder_mask = torch.stack([
            torch.cat([vec[:idx_m], audio_mask, vec[idx_m:]])
            for vec, idx_m in zip(decoder_mask, idx_max)
        ])
        batch_embed_inp = torch.stack([
            torch.cat([vec[:idx_m], audio_vec, vec[idx_m:]])
            for vec, idx_m, audio_vec in zip(batch_embed_inp, idx_max, batch_audio_torch)
        ])

        return decoder_mask, batch_embed_inp

    def _prepare_llm_inputs(self, batch_inp, decoder_mask, batch_audio_torch):
        """Shared preprocessing of forward/generate: embed text, encode + project audio, merge."""
        batch_embed_inp = self.llm_model.model.embed_tokens(batch_inp)
        audio_states = self.whisper_encoder(batch_audio_torch)["last_hidden_state"]

        audio_embeds = self.projector(audio_states.to(torch.bfloat16))

        decoder_mask, batch_embed_inp = self.add_audio_feat(batch_embed_inp, audio_embeds, decoder_mask)
        return decoder_mask, batch_embed_inp, audio_embeds.shape[1]

    def forward(
        self, batch_inp, decoder_mask, batch_audio_torch
    ):
        """Return the LLM logits with the first ``nb_audio_tokens`` sequence positions dropped,
        so the sequence length matches the one of ``batch_inp``."""
        decoder_mask, batch_embed_inp, nb_audio_tokens = self._prepare_llm_inputs(
            batch_inp, decoder_mask, batch_audio_torch
        )

        out = self.llm_model(inputs_embeds=batch_embed_inp, attention_mask=decoder_mask)
        return out.logits[:, nb_audio_tokens:, :]

    @torch.no_grad()  # Inference only: do not build an autograd graph through the projector
    def generate(
        self, batch_inp, decoder_mask, batch_audio_torch
    ):
        """Greedily generate up to 100 new tokens conditioned on the audio and the text prompt."""
        decoder_mask, batch_embed_inp, _ = self._prepare_llm_inputs(
            batch_inp, decoder_mask, batch_audio_torch
        )

        # Use the LLM held by this module, not whatever global `llm_model` happens to exist.
        out = self.llm_model.generate(
            inputs_embeds=batch_embed_inp,
            attention_mask=decoder_mask,
            max_new_tokens=100,
            do_sample=False,
        )
        return out

# Assemble the full model from the GPU Whisper encoder, the frozen LLM and the projector.
llava_model = LlavaFriends(whisper_encoder, llm_model, projector)

In [19]:
def prepare_for_loss(logits, labels):
    """Flatten batch and sequence axes so CrossEntropyLoss sees one row per token.

    ``logits`` must be 3-D ``(batch, seq, vocab)``; it becomes ``(batch * seq, vocab)``
    and ``labels`` becomes ``(batch * seq,)``.
    """
    n_samples, n_positions, n_classes = logits.shape
    n_rows = n_samples * n_positions
    return logits.reshape(n_rows, n_classes), labels.reshape(n_rows)

In [20]:
# Initialize Optimizer and Criterion
# We choose the CrossEntropyLoss and Adam because they're the most used
# Target positions equal to the padding id are ignored by the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=llm_tokenizer.pad_token_id)
# NOTE(review): llava_model.parameters() also yields the frozen Whisper and LLM parameters;
# they never receive a gradient, so presumably only the projector is updated — confirm.
optimizer = torch.optim.Adam(llava_model.parameters(), lr=1e-4)

In [21]:
# One pass over the training data, with the running loss shown next to the progress bar.
loop = tqdm(dataloader)
for batch_inp, batch_target, decoder_mask, batch_audio_torch in loop:
    batch_inp = batch_inp.to("cuda")
    batch_target = batch_target.to("cuda")
    decoder_mask = decoder_mask.to("cuda")
    batch_audio_torch = batch_audio_torch.to("cuda")

    # backward() adds to the existing .grad buffers: clear them first, otherwise every
    # step would be taken with the sum of the gradients of all previous batches.
    optimizer.zero_grad()

    logits = llava_model(batch_inp, decoder_mask, batch_audio_torch)
    logits, labels = prepare_for_loss(logits, batch_target)
    loss = criterion(logits, labels)

    loss.backward()
    optimizer.step()

    # print next to progress bar
    loop.set_postfix(loss=loss.item())

  0%|          | 0/1249 [00:00<?, ?it/s]

In [48]:
# Peak GPU memory allocated so far, converted from bytes to GiB.
torch.cuda.max_memory_allocated(device="cuda")/(1024**3)

13.062968254089355

In [22]:
class FriendsTestDataset(Dataset):
    """Evaluation dataset: audio plus a bare task prompt, with the reference annotations.

    For every item one of three prompts is drawn at random (``"Transcription:"``,
    ``"Emotion:"`` or ``"Speaker:"``); the model is expected to complete it.

    Args:
        df: Table with at least the columns ``Speaker``, ``Emotion``,
            ``Utterance``, ``Dialogue_ID`` and ``Utterance_ID``.
        tokenizer: Callable returning a mapping with ``'input_ids'``.
        processor: Callable turning the loaded audio into an object exposing
            ``input_features``.
        pt_dir: Directory holding the ``dia<Dialogue_ID>_utt<Utterance_ID>.pt``
            files. When ``None`` (default) the notebook-level ``PT_TEST_DATA``
            is used, which is the original behaviour.
    """

    def __init__(self, df, tokenizer, processor, pt_dir=None):
        self.df = df
        self.tokenizer = tokenizer
        self.processor = processor
        self.pt_dir = pt_dir

    def __len__(self) -> int:
        """Return the number of element of the dataset"""
        return len(self.df)

    def __getitem__(self, idx) -> tuple[torch.Tensor, torch.Tensor, str, str, str, str]:
        """Return ``(prompt_ids, audio_features, prompt, speaker, emotion, utterance)``
        for the row at position ``idx``."""
        # Samplers hand out positions in [0, len(self)), so index positionally: .loc would
        # fail (or pick the wrong row) as soon as the frame has a non-default index.
        df_elem = self.df.iloc[idx]
        speaker = df_elem['Speaker']
        emotion = df_elem['Emotion']
        speach = df_elem['Utterance']

        task = random.choice(["speach", "emotion", "speaker"])
        if task == "speach":
            text_inp = "Transcription:"
        elif task == "emotion":
            text_inp = "Emotion:"
        elif task == "speaker":
            text_inp = "Speaker:"
        tokens_ids_inp = self.tokenizer(text_inp, add_special_tokens=False)['input_ids']

        # The global is only looked up when no directory was given explicitly.
        pt_dir = Path(self.pt_dir) if self.pt_dir is not None else PT_TEST_DATA
        # NOTE(review): torch.load relies on pickle — only load .pt files from a trusted source.
        audio_torch = torch.load(pt_dir / f"dia{df_elem['Dialogue_ID']}_utt{df_elem['Utterance_ID']}.pt")
        audio_torch = self.processor(audio_torch, return_tensors="pt", sampling_rate=16000).input_features

        return torch.tensor(tokens_ids_inp, dtype=torch.int64), audio_torch, text_inp, speaker, emotion, speach

In [23]:
def collate_fn_test(batch):
    """Assemble an evaluation batch from FriendsTestDataset samples.

    Returns the left-padded prompt ids, their attention mask, the audio
    features concatenated along their first dimension, and four parallel
    lists: prompts, speakers, emotions and reference utterances.
    """
    def column(position):
        # Gather the field stored at `position` in every sample of the batch.
        return [sample[position] for sample in batch]

    prompt_ids = column(0)
    batch_audio_torch = torch.cat(column(1))
    questions, speakers, emotions, speachs = column(2), column(3), column(4), column(5)

    batch_inp = add_padding(prompt_ids)
    decoder_mask = create_mask(batch_inp)

    return batch_inp, decoder_mask, batch_audio_torch, questions, speakers, emotions, speachs

In [24]:
# Evaluation data pipeline: batches of 4 samples assembled by collate_fn_test, loaded by
# 4 worker processes prefetching 2 batches each. shuffle=True means every batch drawn
# below is a random sample of the test split.
test_dataset = FriendsTestDataset(df=df_test, tokenizer=llm_tokenizer, processor=whisper_processor)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=4,
    num_workers=4,
    prefetch_factor=2,
    shuffle=True,
    collate_fn=collate_fn_test
)

In [25]:
# Keep one iterator around so the next cell can be re-run to draw a new batch each time.
iter_test_data = iter(test_dataloader)

In [26]:
# Qualitative check: draw one test batch, generate a completion for each prompt and print
# it next to the reference annotations.
batch_inp, decoder_mask, batch_audio_torch, questions, speakers, emotions, speachs = next(iter_test_data)
batch_inp = batch_inp.to("cuda")
decoder_mask = decoder_mask.to("cuda")
batch_audio_torch = batch_audio_torch.to("cuda")

out = llava_model.generate(batch_inp, decoder_mask, batch_audio_torch)
out_str = llm_tokenizer.batch_decode(out, skip_special_tokens=True)

# `questions` holds the task prompt given to the model ("Transcription:", "Emotion:" or
# "Speaker:"), not the `question` column of the CSV.
for gen_answ, question, speaker, emotion, speach in zip(out_str, questions, speakers, emotions, speachs):
    print(f"Speaker: {speaker}")
    print(f"Speach: {speach}")
    print(f"Emotion: {emotion}")
    print(f"Question: {question}")
    print(f"Generated answer: {gen_answ}")
    print("#" * 50)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Speaker: Rachel
Speach: You don't know anything.
Emotion: anger
Question: Emotion:
Generated answer:  neutral.
##################################################
Speaker: Paul
Speach: So Lizzie, are-are-are you planning on staying the night?
Emotion: neutral
Question: Emotion:
Generated answer:  neutral.
##################################################
Speaker: Monica
Speach: Relieved?
Emotion: surprise
Question: Speaker:
Generated answer:  Ross.
##################################################
Speaker: Chandler
Speach: But after I unpacked the boxes I wanted to do something nice for you, so, I-I-I cleaned the apartment.
Emotion: neutral
Question: Transcription:
Generated answer:  Oh, I'm so sorry.
##################################################
