In [7]:
import os
import pandas as pd

def prepare_split(split: str, dataset_dir: str = '/project/msoleyma_1026/EmpatheticResponseGeneration') -> list:
    """Collect the utterances of one MELD split that have both cropped-face video and audio.

    Reads ``{dataset_dir}/MELD.Raw/{split}_sent_emo.csv``, groups its rows by
    ``Dialogue_ID`` and, for every utterance except the last one of each
    dialogue, checks that the face-crop video and the extracted audio file
    exist on disk.

    Args:
        split: Split name used in the CSV file name and in the media
            sub-directories (called with ``'train'`` and ``'test'`` in this notebook).
        dataset_dir: Root directory of the dataset. Defaults to the project
            location on the cluster.

    Returns:
        A list of dicts with keys ``'Dialogue_ID'``, ``'Utterance_ID'``,
        ``'Video_Path'`` and ``'Audio_Path'``, one per retained utterance, in
        CSV order within each dialogue.

    Raises:
        FileNotFoundError: If the split's CSV file does not exist
            (propagated from ``pandas.read_csv``).
    """
    dialogues = pd.read_csv(f'{dataset_dir}/MELD.Raw/{split}_sent_emo.csv').groupby('Dialogue_ID')

    data = []
    for d_id, dialogue in dialogues:
        utterance_ids = dialogue['Utterance_ID']
        # The last utterance of a dialogue is excluded because it does not
        # have a next speaker. Computed once per dialogue, not once per row.
        last_u_id = utterance_ids.max()

        for u_id in utterance_ids.tolist():
            # Skip (rather than stop at) the last utterance so the result does
            # not depend on the rows being sorted by Utterance_ID.
            if u_id == last_u_id:
                continue

            video_path = f'{dataset_dir}/FaceCrop/{split}/dia{d_id}_utt{u_id}/pyavi/highest_faces_encoded.mp4'
            audio_path = f'{dataset_dir}/Audio/{split}/dia{d_id}_utt{u_id}.wav'

            # Only include utterances with successful face cropping and audio extraction.
            if os.path.isfile(video_path) and os.path.isfile(audio_path):
                data.append({
                    'Dialogue_ID': d_id,
                    'Utterance_ID': u_id,
                    'Video_Path': video_path,
                    'Audio_Path': audio_path
                })

    return data

# Build the per-utterance path records for both MELD splits (train first, then test).
train_data, test_data = map(prepare_split, ('train', 'test'))

In [None]:
import torch
import os

from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
from tqdm import tqdm

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained ImageBind "huge" checkpoint, switch it to evaluation
# mode and move it to the selected device.
model = imagebind_model.imagebind_huge(pretrained=True)
model = model.eval().to(device)

# Embed every prepared utterance with ImageBind and store one joint
# audio+vision embedding per utterance under ImagebindEmbeds/<split>/.
split_datasets = {'train': train_data, 'test': test_data}

for split, dataset in split_datasets.items():
    save_dir = f"/project/msoleyma_1026/EmpatheticResponseGeneration/ImagebindEmbeds/{split}"
    os.makedirs(save_dir, exist_ok=True)

    for x in tqdm(dataset, desc=split):
        d_id = x['Dialogue_ID']
        u_id = x['Utterance_ID']
        save_path = os.path.join(save_dir, f"dia{d_id}_utt{u_id}.pt")

        # The full pass is slow (the recorded run estimated about two hours),
        # so skip utterances that were already embedded; this lets an
        # interrupted run resume. Delete the output directory to force a
        # complete recomputation.
        if os.path.isfile(save_path):
            continue

        multimodal_inputs = {
            ModalityType.AUDIO: data.load_and_transform_audio_data([x['Audio_Path']], device),
            ModalityType.VISION: data.load_and_transform_video_data([x['Video_Path']], device)
        }

        # Generate the joint embedding by adding the modality-specific embeddings.
        with torch.no_grad():
            modality_embeddings = model(multimodal_inputs)
            joint_embedding = modality_embeddings[ModalityType.AUDIO] + modality_embeddings[ModalityType.VISION]

        # Write to a temporary file and rename it into place, so an
        # interruption during saving never leaves a truncated .pt file that
        # the skip check above would later mistake for a finished embedding.
        tmp_path = f"{save_path}.tmp"
        torch.save(joint_embedding.cpu(), tmp_path)
        os.replace(tmp_path, save_path)

  1%|          | 103/8572 [01:32<1:56:18,  1.21it/s]