In [13]:
import torch

In [2]:
# The five target classes, numbered 0..4 in the order they are listed.
EMOTION_IDS = {
    name: idx
    for idx, name in enumerate(
        ('anger/disgust', 'fear/surprise', 'happiness', 'neutral', 'sadness')
    )
}

# Short emotion codes (as found in the 'Emotion' column) folded into the
# five class names used as keys of EMOTION_IDS.
IEMOCAP_EMOTION_MAP = dict(
    hap='happiness',
    sad='sadness',
    ang='anger/disgust',
    fru='anger/disgust',
    exc='happiness',
    fea='fear/surprise',
    sur='fear/surprise',
    dis='anger/disgust',
    neu='neutral',
)

In [4]:
import pandas as pd

def load_session(session_number):
    """Read one session CSV and attach its integer class label.

    Reads ``data/Session<session_number>.csv`` with the first column as the
    index, then adds a ``Label`` column obtained by mapping each value of the
    ``Emotion`` column through IEMOCAP_EMOTION_MAP and EMOTION_IDS.

    Raises:
        KeyError: if the ``Emotion`` column holds a code that is missing from
            IEMOCAP_EMOTION_MAP (the message lists the offending codes).
    """
    df = pd.read_csv("data/Session{}.csv".format(session_number), index_col=0)

    # Fail loudly, naming the codes, instead of surfacing a bare KeyError
    # from the middle of a row-wise lambda.
    unknown_codes = set(df['Emotion'].unique()).difference(IEMOCAP_EMOTION_MAP)
    if unknown_codes:
        raise KeyError(
            "Session {}: unmapped emotion codes {!r}".format(session_number, sorted(map(str, unknown_codes)))
        )

    # Collapse the two-step lookup (code -> class name -> class id) into one table.
    label_by_code = {code: EMOTION_IDS[name] for code, name in IEMOCAP_EMOTION_MAP.items()}
    df['Label'] = df['Emotion'].map(label_by_code)
    return df


# Load the dataset into one pandas dataframe per session.
df_session_1, df_session_2, df_session_3, df_session_4, df_session_5 = (
    load_session(session_number) for session_number in range(1, 6)
)

# Report the number of sentences.
# Only sessions 1-4 are counted here; session 5 is left out of this total.
n_training_sentences = sum(
    df.shape[0] for df in (df_session_1, df_session_2, df_session_3, df_session_4)
)
print('Number of training sentences: {:,}\n'.format(n_training_sentences))

# Display 10 random rows from the data.
df_session_1.sample(10)

Number of training sentences: 5,879



Unnamed: 0_level_0,Session_Number,Mocap_Source,Dialogue_Type,Dialogue_Number,Utterance_Number,StartTime,EndTime,Utterance,Speaker,Emotion,Emotion_Label,Label
Sr.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
844,1,F,improvisation,05,35,205.63,211.94,Or you can make a phone call. But I think a l...,M,neu,3,3
257,1,M,improvisation,01,32,151.85,157.36,"'Cause you're obviously not talking to me, you...",M,ang,0,0
1097,1,F,improvisation,02,8,110.5653,113.2,I know.,M,fru,0,0
20,1,M,improvisation,07,11,60.81,66.77,"Yeah, that you probably. You should see one of...",F,exc,2,2
967,1,F,script,01_3,9,106.5092,114.9575,"I don't really have any imagination, that's al...",M,hap,2,2
634,1,F,script,02_1,30,209.63,211.115,For real?,M,sur,1,1
542,1,M,script,01_2,13,138.57,142.3975,"Okay, calm yourself. Alright.",M,fru,0,0
1076,1,M,script,02_2,50,497.8775,500.04,It's not champagne.,M,neu,3,3
766,1,F,improvisation,04,25,157.1625,159.7,Why? What choice?,F,fru,0,0
837,1,F,improvisation,05,31,178.59,180.87,Oh. My name is Sean. Hi.,M,neu,3,3


In [6]:
from transformers import BertTokenizer

# Announce the load, then build the tokenizer from the named pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

Loading BERT tokenizer...


In [7]:
# One frame per session, in session order. The fold loop below holds out the
# frame at each position as the test split and trains on the rest, so every
# session must appear exactly once (the last slot previously repeated
# session 1, which dropped session 5 and put session 1 in both train and test).
df_sessions = [df_session_1, df_session_2, df_session_3, df_session_4, df_session_5]

In [14]:
# Imported once here rather than on every loop iteration.
# NOTE: random_split is not used in this cell; it stays imported so that any
# later cell relying on the name keeps working.
from torch.utils.data import TensorDataset, random_split

# Every utterance is padded or truncated to this many tokens.
# The recorded output reports utterances of up to 109 tokens, so longer ones
# are cut at this length.
MAX_SEQ_LEN = 64


def longest_encoding(sentences):
    """Return the token count of the longest sentence, special tokens included.

    Returns 0 when `sentences` is empty.
    """
    return max(
        (len(tokenizer.encode(sent, add_special_tokens=True)) for sent in sentences),
        default=0,
    )


def encode_sentences(sentences, max_length=MAX_SEQ_LEN):
    """Tokenize sentences into fixed-length PyTorch tensors.

    Args:
        sentences: iterable of utterance strings.
        max_length: length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)``, one row per sentence.
    """
    encoded = tokenizer(
        list(sentences),
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
        max_length=max_length,         # Pad & truncate all sentences.
        padding='max_length',          # Replaces the deprecated pad_to_max_length=True.
        truncation=True,
        return_attention_mask=True,    # Construct attn. masks.
        return_tensors='pt',           # Return pytorch tensors.
    )
    return encoded['input_ids'], encoded['attention_mask']


# Leave-one-session-out: each session in turn is the test split, and the
# remaining sessions are concatenated into the training split.
for test_session_idx in range(len(df_sessions)):
    df_test = df_sessions[test_session_idx]
    # NOTE(review): the validation split is the very same session as the test
    # split, so any model selection done on "val" is done on test data.
    # Left as-is because the intended split policy is not visible here.
    df_val = df_sessions[test_session_idx]
    df_train = pd.concat([sess for idx, sess in enumerate(df_sessions) if idx != test_session_idx])

    train_sentences = df_train.Utterance.values
    test_sentences = df_test.Utterance.values
    val_sentences = df_val.Utterance.values

    # Show the first training sentence raw, split into tokens, and as token ids.
    first_sentence_tokens = tokenizer.tokenize(train_sentences[0])
    print(' Original: ', train_sentences[0])
    print('Tokenized: ', first_sentence_tokens)
    print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_sentence_tokens))

    # Untruncated lengths, reported so the effect of MAX_SEQ_LEN can be judged.
    train_max_len = longest_encoding(train_sentences)
    print('Max train sentence length: ', train_max_len)
    test_max_len = longest_encoding(test_sentences)
    print('Max test sentence length: ', test_max_len)
    val_max_len = longest_encoding(val_sentences)
    print('Max val sentence length: ', val_max_len)

    train_input_ids, train_attention_masks = encode_sentences(train_sentences)
    train_labels = torch.tensor(df_train.Label.values)

    test_input_ids, test_attention_masks = encode_sentences(test_sentences)
    test_labels = torch.tensor(df_test.Label.values)

    val_input_ids, val_attention_masks = encode_sentences(val_sentences)
    val_labels = torch.tensor(df_val.Label.values)

    train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
    test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
    val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

    # One file per split per fold, suffixed with the held-out session's position.
    torch.save(train_dataset, 'train_dataset_{}.pt'.format(test_session_idx))
    torch.save(test_dataset, 'test_dataset_{}.pt'.format(test_session_idx))
    torch.save(val_dataset, 'val_dataset_{}.pt'.format(test_session_idx))

 Original:  Why did he invite her here?
Tokenized:  ['why', 'did', 'he', 'invite', 'her', 'here', '?']
Token IDs:  [2339, 2106, 2002, 13260, 2014, 2182, 1029]
Max train sentence length:  109
Max test sentence length:  108
Max val sentence length:  108
 Original:  Check this out.  You know how I've told you I've been really into like softball recently?
Tokenized:  ['check', 'this', 'out', '.', 'you', 'know', 'how', 'i', "'", 've', 'told', 'you', 'i', "'", 've', 'been', 'really', 'into', 'like', 'softball', 'recently', '?']
Token IDs:  [4638, 2023, 2041, 1012, 2017, 2113, 2129, 1045, 1005, 2310, 2409, 2017, 1045, 1005, 2310, 2042, 2428, 2046, 2066, 12585, 3728, 1029]
Max train sentence length:  109
Max test sentence length:  94
Max val sentence length:  94
 Original:  Check this out.  You know how I've told you I've been really into like softball recently?
Tokenized:  ['check', 'this', 'out', '.', 'you', 'know', 'how', 'i', "'", 've', 'told', 'you', 'i', "'", 've', 'been', 'really', 'int

In [16]:
# `df_train` is assigned inside the fold loop above, so (assuming the cells are
# run in order) this displays the training frame of the last fold only.
df_train

Unnamed: 0_level_0,Session_Number,Mocap_Source,Dialogue_Type,Dialogue_Number,Utterance_Number,StartTime,EndTime,Utterance,Speaker,Emotion,Emotion_Label,Label
Sr.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,M,improvisation,07,0,2.6812,7.9800,Check this out. You know how I've told you I'...,M,exc,2,2
2,1,M,improvisation,07,0,7.6300,8.5700,Yeah.,F,neu,3,3
3,1,M,improvisation,07,1,8.2200,14.7500,"Well, this is totally random, I got this full ...",M,exc,2,2
4,1,M,improvisation,07,1,13.9500,21.1200,[LAUGHTER]. For softball? That's unbelievable....,F,exc,2,2
5,1,M,improvisation,07,2,15.5400,20.6700,For softball. They're going to pay me to go t...,M,exc,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1534,4,M,improvisation,05,41,303.5200,307.1400,"I'm sorry, really. But--",F,fru,0,0
1535,4,M,improvisation,05,43,306.6100,309.2200,I don't -- I'm not- I'm not convinced.,M,fru,0,0
1536,4,M,improvisation,05,42,308.6700,310.4800,That I'm sorry?,F,fru,0,0
1537,4,M,improvisation,05,44,309.7800,313.2200,"Yeah. If you were really sorry, you'd give me...",M,ang,0,0


In [11]:
# Every element of `a` except the one at position 2.
# NOTE(review): `a` is not defined in any cell visible here — confirm where it
# comes from, or this cell fails on a fresh kernel.
[item for pos, item in enumerate(a) if pos != 2]

[1, 2, 4, 5]