# Setup

Run the following commands from terminal

```bash
# create a Jupyter kernel and install required packages
module purge
mamba create -n ml python=3.10
mamba activate ml
pip install numpy pandas torch transformers
mamba install -c conda-forge ipykernel
python -m ipykernel install --user --name ml --display-name "ml"
```

```bash
# clone ImageBind and install requirements
git clone https://github.com/facebookresearch/ImageBind.git
cd ImageBind
pip install .
```

# Download MELD Dataset

In [None]:
!wget -P /scratch1/$USER https://huggingface.co/datasets/declare-lab/MELD/resolve/main/MELD.Raw.tar.gz
!tar -xzf /scratch1/$USER/MELD.Raw.tar.gz -C /scratch1/$USER
!rm /scratch1/$USER/MELD.Raw.tar.gz

!find /scratch1/$USER/MELD.Raw/ -type f -name '*.tar.gz' -exec tar -xzf {} -C /scratch1/$USER/MELD.Raw/ \;
!find /scratch1/$USER/MELD.Raw/ -type f -name '*.tar.gz' -exec rm {} \;

--2025-03-05 17:39:48--  https://huggingface.co/datasets/declare-lab/MELD/resolve/main/MELD.Raw.tar.gz
Resolving huggingface.co (huggingface.co)... 18.164.174.55, 18.164.174.17, 18.164.174.23, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.55|:443... connected.
302 Foundest sent, awaiting response... 
Location: https://cdn-lfs.hf.co/repos/e5/f9/e5f9b3280b3cb63549fb6376cd84286e8bf7cade60967f984e192dda5701e74b/a56b4407d574195cbce470d86f9c9d72fcfea59b0e34502ecd4babee4a5c613e?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27MELD.Raw.tar.gz%3B+filename%3D%22MELD.Raw.tar.gz%22%3B&response-content-type=application%2Fgzip&Expires=1741228788&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MTIyODc4OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9lNS9mOS9lNWY5YjMyODBiM2NiNjM1NDlmYjYzNzZjZDg0Mjg2ZThiZjdjYWRlNjA5NjdmOTg0ZTE5MmRkYTU3MDFlNzRiL2E1NmI0NDA3ZDU3NDE5NWNiY2U0NzBkODZmOWM5ZDcyZmNmZWE1OWIwZTM0NTAyZWNkNGJhYmVlNGE1Y

# Prepare the Dataset

- Read each .csv file (train, dev, test) and for each utterance (uniquely identified by `Dialogue_ID d_id` and `Utterance_ID u_id`), collect the following information:
    - `Conversational_History`: Concenated `Utterance` for all `Utterance_ID` $\leq$ `u_id` and `Dialogue_ID` = `d_id`
    - `Video_Path`: MP4 filepath associated  for utterance `(d_id, u_id)`
    - `Label`: The response to the utterance `(d_id, u_id)`, which would be the utterance for `(d_id, u_id+1)`


In [1]:
import os
import pandas as pd

splits = ['train', 'dev', 'test']

def prepare_split(split):
    df = pd.read_csv(f'/scratch1/{os.environ.get("USER")}/MELD.Raw/{split}_sent_emo.csv')

    video_folders = {
        'train': 'train_splits',
        'dev': 'dev_splits_complete',
        'test': 'output_repeated_splits_test'
    }

    data = []
    dialogues_df = df.groupby('Dialogue_ID')
    
    for d_id, dialogue in dialogues_df:
        conv_hist = ""

        for _, row in dialogue.iterrows():
            u_id = row['Utterance_ID']

            # Only process up to second-to-last utterance in each dialogue (because last utterance does not have target/response)
            if u_id == dialogue['Utterance_ID'].max():
                break

            conv_hist += row['Utterance'] + " "

            try:
                next_utterance = dialogue[dialogue['Utterance_ID'] == u_id + 1].iloc[0]
            except IndexError:
                continue
                
            label = next_utterance['Utterance']
            
            video_path = f'/scratch1/{os.environ.get("USER")}/MELD.Raw/{video_folders[split]}/dia{d_id}_utt{u_id}.mp4'
            
            data.append({
                'Dialogue_ID': d_id,
                'Utterance_ID': u_id,
                'Conversational_History': conv_hist.strip(),
                'Video_Path': video_path,
                'Label': label
            })
    
    return data

train_data = prepare_split('train')
dev_data = prepare_split('dev')
test_data = prepare_split('test')

## Extract audio .wav from .mp4 files for each utterance

Run the following commands from the terminal:
```
./extract_audio.sh /scratch1/$USER/MELD.Raw/train_splits/ /scratch1/$USER/MELD.Raw/train_audio
./extract_audio.sh /scratch1/$USER/MELD.Raw/dev_splits_complete/ /scratch1/$USER/MELD.Raw/dev_audio
./extract_audio.sh /scratch1/$USER/MELD.Raw/output_repeated_splits_test/ /scratch1/$USER/MELD.Raw/test_audio
```

In [2]:
def add_audio_to_split(split, data):
    # filter out utterances that failed audio extraction
    data[:] = [
        u for u in data
        if os.path.isfile(f'/scratch1/{os.environ.get("USER")}/MELD.Raw/{split}_audio/dia{u["Dialogue_ID"]}_utt{u["Utterance_ID"]}.wav')
    ]

    # assign valid audio file paths to each utterance
    for u in data:
        d_id = u['Dialogue_ID']
        u_id = u['Utterance_ID']

        audio_path = f'/scratch1/{os.environ.get("USER")}/MELD.Raw/{split}_audio/dia{d_id}_utt{u_id}.wav'
        u['Audio_Path'] = audio_path

add_audio_to_split('train', train_data)
add_audio_to_split('dev', dev_data)
add_audio_to_split('test', test_data)

In [3]:
from torch.utils.data import Dataset, DataLoader

class MultimodalMELD(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        u = self.data[idx]

        return {
            'text': u['Conversational_History'],
            'audio': u['Audio_Path'],
            'video': u['Video_Path'],
            'label': u['Label']
        }

train_dataset = MultimodalMELD(train_data)
dev_dataset = MultimodalMELD(dev_data)
test_dataset = MultimodalMELD(test_data)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=2, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False)

  from .autonotebook import tqdm as notebook_tqdm


# Define the Model

**Forward Pass:**
- Use frozen ImageBind model to generate joint (multimodal) embedding
    - Add the embeddings for each modality to get unified embedding
- Use projector to project joint embedding into token embedding space
- Use decoder-only LLM model to generate predicted response to the utterance

**Fine Tuning:**
- Fine-tune projector and LLM with LoRA

In [5]:
from imagebind import data
import torch
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

#### RUN THIS CELL #####
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

inputs = {
    ModalityType.TEXT: data.load_and_transform_text([train_dataset[0]['text']], device),
    ModalityType.AUDIO: data.load_and_transform_audio_data([train_dataset[0]['audio']], device),
    ModalityType.VISION: data.load_and_transform_video_data([train_dataset[0]['video']], device)
}

with torch.no_grad():
    embeddings = model(inputs)

embeddings[ModalityType.TEXT]
embeddings[ModalityType.AUDIO]
embeddings[ModalityType.VISION]



tensor([[-0.0223,  0.0326,  0.0214,  ..., -0.0364, -0.0179,  0.0184]])

In [None]:
import torch
import torch.nn as nn
from transformers import GPT2LMHeadModel, AutoTokenizer
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

class EmpatheticMLLM(nn.Module):
    def __init__(self):
        super(EmpatheticMLLM, self).__init__()
        
        self.imagebind = imagebind_model.imagebind_huge(pretrained=True)
        self.imagebind.eval() # freeze imagebind model

        self.llm = GPT2LMHeadModel.from_pretrained('gpt2')
        self.tokenizer = AutoTokenizer.from_pretrained('gpt2')

        self.projector = nn.Linear(1024, self.llm.config.n_embd)

    def forward(self, x, trg=None):
        device = next(self.parameters()).device
        
        inputs = {
            ModalityType.TEXT: data.load_and_transform_text(x['text'], device),
            ModalityType.AUDIO: data.load_and_transform_audio_data(x['audio'], device),
            ModalityType.VISION: data.load_and_transform_video_data(x['video'], device)
        }
            
        with torch.no_grad():
            modality_embeddings = self.imagebind(inputs)

            # (bs, 1024) -- 1024 is d for ImageBind
            joint_embedding = modality_embeddings[ModalityType.TEXT] \
                + modality_embeddings[ModalityType.AUDIO] \
                + modality_embeddings[ModalityType.VISION]

        # (bs, 1, 768) -- 768 is d for GPT2
        inputs_embeds = self.projector(joint_embedding).unsqueeze(1)

        if trg is not None:
            # (bs, T, V)
            output_logits = []

            for t in range(trg.size(1)):
                t_embed = self.llm.transformer.wte.weight[t]

                inputs_embeds = torch.cat([inputs_embeds, trg_embeds], dim=1)

                output = self.llm(inputs_embeds=inputs_embeds)
                output_logits.append(output.logits)

        output_logits = torch.stack(output_logits, dim=1)
        return output_logits

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = EmpatheticMLLM()

In [7]:
batch = next(iter(train_dataloader))
x = {
    'text': batch['text'],
    'audio': batch['audio'],
    'video': batch['video']
}

outputs = model(x)

torch.Size([2, 1024])


In [8]:
projected_embedding.shape

NameError: name 'projected_embedding' is not defined

# Training Loop
        

In [None]:
optimizer = Adam(model.parameters(), lr=1e-5)
loss_fn = nn.CrossEntropyLoss()

epochs = 1
for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    for batch in train_dataloader:
        optimizer.zero_grad()

        x = {
            'text': batch['text'],
            'audio': batch['audio'],
            'video': batch['video']
        }

        logits = model(x)

        loss = loss_fn()

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")