In [None]:
import pandas as pd
import re 
import os

# 1. Access the corresponding .txt, .wav, and .avi files for each EDA label
# Extract the conversation filename and speaker information from the dataset
eda_df = pd.read_csv("eda_iemocap_no_utts_dataset.csv")
eda_df = eda_df[["speaker", "utt_id", "EDA"]]

# The raw "speaker" field is parsed with a single regex:
#   group 1 -> conversation filename (starts with "Ses", follows the leading b')
#   group 2 -> session number (the digits right after "Ses")
#   group 3 -> speaker letter, M or F
# Compiled once here instead of on every row.
speaker_pattern = re.compile(r"b'(Ses(\d+)[MF]_.+\d+.*)_([MF])")
filename_ids = []
speaker_M_F = []
session_numbers = []
# Only the "speaker" column is needed, so iterate over that Series directly
# rather than materialising every row with iterrows().
for row_label, raw_speaker in eda_df["speaker"].items():
    match = speaker_pattern.search(str(raw_speaker))
    if match is None:
        # Report the offending value instead of failing later with an opaque
        # AttributeError on None.
        raise ValueError(
            f"Row {row_label}: cannot parse speaker field {raw_speaker!r}"
        )
    filename_ids.append(match.group(1))
    session_numbers.append(int(match.group(2)))
    speaker_M_F.append(match.group(3))

# Replace the raw field with the parsed columns and pin the dtypes of the
# columns that are used as merge keys further down in this cell.
eda_df = (
    eda_df.drop(columns=["speaker"])
    .assign(
        filename=filename_ids,
        session_number=session_numbers,
        speaker=speaker_M_F,
    )
    .astype({"utt_id": int, "filename": str, "session_number": int, "speaker": str})
)
# Access transcript files based on filename
# Each usable transcript line has the form "<speaker info>:<utterance>". The speaker
# info must contain a speaker letter (F or M) immediately followed by the utterance
# number, one whitespace character, and "[start-end]" written as two decimals.
line_pattern = re.compile(r"(F|M)(\d+)\s\[(\d+\.\d+)-(\d+\.\d+)\]")
utt_records = []
root_dir = "IEMOCAP_full_release/"
for session in range(1, 6):
    directory = os.path.join(root_dir, f"Session{session}/dialog/transcriptions/")
    # The context manager closes the directory iterator as soon as we are done with it.
    with os.scandir(directory) as entries:
        for entry in entries:
            if not (entry.is_file() and entry.name.endswith(".txt")):
                continue
            # Transcript name without the ".txt" extension.
            filename = os.path.splitext(entry.name)[0]
            with open(entry.path, "r") as file:
                for order, line in enumerate(file):
                    # Split on the FIRST colon only, so an utterance that itself
                    # contains a colon is kept whole. Lines without any colon carry
                    # no utterance and are skipped instead of raising IndexError.
                    speaker_info, colon, utterance = line.partition(":")
                    if not colon:
                        continue
                    match = line_pattern.search(speaker_info)
                    if match is None:
                        continue
                    speaker_f_m, utt_id, start, end = match.groups()
                    utt_records.append({
                        "utt_id": int(utt_id),
                        "filename": str(filename),
                        "start": float(start),
                        "end": float(end),
                        "speaker": str(speaker_f_m),
                        "utterance": utterance.strip(),
                        "session_number": int(session),
                        # Line position inside the transcript file (0-based).
                        "original_order": order,
                    })
utt_df = pd.DataFrame(utt_records)
# Combine the EDA and utterances together
final_df = pd.merge(eda_df, utt_df, on=["utt_id", "session_number", "filename", "speaker"])
final_df
# TODO: Double check if there is really a perfect merge match
# (e.g. compare len(final_df) with len(eda_df), or inspect an outer merge with indicator=True).

Unnamed: 0,utt_id,EDA,filename,session_number,speaker,start,end,utterance,original_order
0,0,sd,Ses01M_impro07,1,M,2.6812,7.9800,Check this out. You know how I've told you I'...,0
1,0,b,Ses01M_impro07,1,F,7.6300,8.5700,Yeah.,1
2,1,sd,Ses01M_impro07,1,M,8.2200,14.7500,"Well, this is totally random, I got this full ...",2
3,1,qy,Ses01M_impro07,1,F,13.9500,21.1200,[LAUGHTER]. For softball? That's unbelievable....,3
4,2,qy,Ses01M_impro07,1,M,15.5400,20.6700,For softball. They're going to pay me to go t...,4
...,...,...,...,...,...,...,...,...,...
10034,24,qy,Ses05F_script01_3,5,F,404.1923,406.6600,Do you still feel like that?,63
10035,39,sd,Ses05F_script01_3,5,M,407.9600,410.7823,"I I want you now, Annie.",64
10036,25,qy,Ses05F_script01_3,5,F,410.4317,427.7079,Because you can't feel like that anymore Chris...,65
10037,40,sd,Ses05F_script01_3,5,M,426.7965,431.8242,"Oh Annie. Annie, I am going to make you a for...",66


In [76]:
from collections import Counter
import matplotlib.pyplot as plt
# Tally how many merged utterances carry each EDA tag. The Counter repr shown as the
# cell output lists tags from most to least frequent.
# (Two earlier expressions whose results were computed and then discarded were removed.)
labels = Counter(final_df["EDA"])
labels
# TODO: plot the per-tag counts as a bar chart, e.g.
#   plt.bar(list(labels.keys()), list(labels.values()))
# The previous sketch used plt.hist on the counts, which bins the frequencies
# instead of drawing one bar per tag.

Counter({'sd': 4513,
         'sv': 2023,
         'qy': 1047,
         'qw': 732,
         'xx': 490,
         'b': 298,
         'fc': 149,
         'ad': 142,
         'ba': 125,
         'aa': 99,
         'nn': 78,
         'ny': 75,
         'br': 51,
         '^q': 37,
         'qh': 26,
         'na': 25,
         'bh': 16,
         'qy^d': 14,
         'h': 14,
         'qo': 14,
         'ft': 13,
         'fp': 12,
         'bf': 12,
         'fa': 7,
         '%': 6,
         '^2': 5,
         'bk': 5,
         '^h': 3,
         'fo_o_fw_"_by_bc': 2,
         'x': 2,
         'ng': 1,
         'ar': 1,
         'no': 1,
         'b^m': 1})

In [None]:
# 2. Create a dataset, following the agreement-estimation project, for each modality or each utterance
from transformers import AutoTokenizer
import torch 
from collections import Counter

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


# Build the label -> integer id mapping from the EDA tags that actually occur in the
# merged data. The previous hard-coded mapping ('CONCESSION', 'FACTS', ...) contained
# none of these tags, so every lookup in DADataset.__getitem__ raised KeyError.
# Sorting makes the ids stable across kernel restarts; plain set iteration order of
# strings is not guaranteed to be.
labels = set(final_df["EDA"])
labels_to_num_mapping = {label: idx for idx, label in enumerate(sorted(labels))}

class DADataset(torch.utils.data.Dataset):
    """Torch dataset pairing per-example encodings with integer-encoded labels.

    Args:
        encodings: Mapping from feature name to an indexable collection holding
            one entry per example (presumably the output of a tokenizer call --
            TODO confirm at the call site).
        labels: Iterable of label keys, one per example, in the same order as
            the encodings.
        label_map: Optional mapping from label key to integer id. When omitted,
            the module-level ``labels_to_num_mapping`` is looked up at access time.
    """

    def __init__(self, encodings, labels, label_map=None):
        self.encodings = encodings
        # Materialise as a list so that ``self.labels[idx]`` is always positional.
        # A pandas Series with a non-default index (e.g. after a train/test split)
        # would otherwise be indexed by label and return the wrong example.
        self.labels = list(labels)
        self.label_map = label_map

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including its ``labels`` id."""
        # Resolve the fallback lazily so a mapping rebuilt after construction is honoured.
        label_map = labels_to_num_mapping if self.label_map is None else self.label_map
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(label_map[self.labels[idx]])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# 3. Run some training on 80% of the scripted data on the lab machine
# 4. Determine the training/test splits (think of emotion distribution, scenarios, speaker)