In [21]:
import torch

# Choose the compute device once: CUDA when PyTorch can see a GPU,
# otherwise fall back to the CPU. `device` is the only name this cell defines.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")

    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [22]:
# Integer class id for each of the five coarse emotion classes.
# The id is simply the position of the class name in this tuple.
EMOTION_IDS = {
    emotion_name: class_id
    for class_id, emotion_name in enumerate(
        ('anger/disgust', 'fear/surprise', 'happiness', 'neutral', 'sadness')
    )
}

# Maps each of the seven MELD emotion labels onto one of the coarse classes
# above (surprise+fear and anger+disgust are merged).
MELD_EMOTION_MAP = dict(
    joy='happiness',
    sadness='sadness',
    surprise='fear/surprise',
    fear='fear/surprise',
    anger='anger/disgust',
    disgust='anger/disgust',
    neutral='neutral',
)

In [24]:
import pandas as pd

# Read the train / test / dev CSV splits; the first CSV column becomes the index.
df_train = pd.read_csv("data/train_sent_emo.csv", index_col=0)
df_test = pd.read_csv("data/test_sent_emo.csv", index_col=0)
df_val = pd.read_csv("data/dev_sent_emo.csv", index_col=0)


def emotion_to_label(emotion):
    """Translate a raw 'Emotion' value into its integer class id.

    Looks the value up in MELD_EMOTION_MAP, then in EMOTION_IDS; an emotion
    missing from MELD_EMOTION_MAP raises KeyError.
    """
    return EMOTION_IDS[MELD_EMOTION_MAP[emotion]]


# Add the integer 'Label' column to every split.
for split_df in (df_train, df_test, df_val):
    split_df['Label'] = split_df['Emotion'].apply(emotion_to_label)

# Report the size of the training split.
print('Number of training sentences: {:,}\n'.format(len(df_train)))

# Show 10 randomly chosen training rows (unseeded, so the rows differ per run).
df_train.sample(10)

Number of training sentences: 9,989



Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,Label
344,"Y'know, last night was embarrassing for you too.",Ross,neutral,neutral,34,5,5,23,"00:14:43,132","00:14:46,259",3
3708,Im an alien. Im an alien.,Chandler,neutral,neutral,370,2,3,10,"00:22:29,598","00:22:31,766",3
4015,"No, no, no... why, because it might get weird ...",Ross,neutral,neutral,410,11,1,24,"00:19:41,430","00:19:43,347",3
3566,Went down to the docks. Bet ya didn't know you...,Monica,neutral,neutral,356,2,3,3,"00:01:19,496","00:01:24,917",3
9819,I mean doesn't she have any y'know other strip...,Rachel,anger,negative,977,4,3,12,"00:21:10,269","00:21:13,813",0
3301,Then keep running.,Phoebe,neutral,neutral,329,10,6,6,"00:13:38,567","00:13:39,692",3
3729,Oh. They don't.,Roger,surprise,positive,372,7,1,13,"00:19:29,376","00:19:31,752",1
996,"Th-th-that's all it is, a third nipple.",Ross,neutral,neutral,100,0,3,23,"00:05:30,246","00:05:32,122",3
199,"Man, I didn't think we were gonna make it!",Joey,surprise,positive,18,9,5,21,"00:05:39,047","00:05:41,548",1
8147,Thats it?! I gave up my,Chandler,joy,positive,819,17,7,4,"00:07:02,672","00:07:05,757",2


In [25]:
# Pull the utterance text and the integer labels out of each split
# as plain arrays (one pair of arrays per split).
train_sentences, train_labels = df_train['Utterance'].values, df_train['Label'].values
test_sentences, test_labels = df_test['Utterance'].values, df_test['Label'].values
val_sentences, val_labels = df_val['Utterance'].values, df_val['Label'].values

In [26]:
from transformers import BertTokenizer

# Announce, then fetch the pretrained tokenizer for the
# 'bert-base-uncased' checkpoint.
print("Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Loading BERT tokenizer...


In [27]:
# Walk the first training utterance through the tokenizer step by step.
first_utterance = train_sentences[0]

# 1) The raw text.
print(' Original: ', first_utterance)

# 2) The tokens it is split into.
# NOTE(review): the recorded output shows "companys" becoming '[UNK]', which
# suggests the raw text contains a non-standard apostrophe character --
# consider normalising it before tokenising; confirm against the CSV.
first_tokens = tokenizer.tokenize(first_utterance)
print('Tokenized: ', first_tokens)

# 3) The vocabulary ids of those tokens.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

 Original:  also I was the point person on my companys transition from the KL-5 to GR-6 system.
Tokenized:  ['also', 'i', 'was', 'the', 'point', 'person', 'on', 'my', '[UNK]', 'transition', 'from', 'the', 'k', '##l', '-', '5', 'to', 'gr', '-', '6', 'system', '.']
Token IDs:  [2036, 1045, 2001, 1996, 2391, 2711, 2006, 2026, 100, 6653, 2013, 1996, 1047, 2140, 1011, 1019, 2000, 24665, 1011, 1020, 2291, 1012]


In [28]:
def _max_encoded_len(sentences):
    """Return the largest encoded length (special tokens included) in `sentences`.

    Returns 0 when `sentences` is empty.
    """
    return max(
        (len(tokenizer.encode(text, add_special_tokens=True)) for text in sentences),
        default=0,
    )


# Longest encoded utterance per split, used to choose a padding length.
train_max_len = _max_encoded_len(train_sentences)
print('Max train sentence length: ', train_max_len)

test_max_len = _max_encoded_len(test_sentences)
print('Max test sentence length: ', test_max_len)

val_max_len = _max_encoded_len(val_sentences)
print('Max val sentence length: ', val_max_len)

Max train sentence length:  95
Max test sentence length:  61
Max val sentence length:  49


In [29]:
# Every utterance is padded / truncated to this many tokens.
# NOTE: the longest encoded training utterance reported earlier in this
# notebook is 95 tokens, so some training utterances are truncated at 64.
MAX_SEQ_LEN = 64


def encode_split(sentences, labels, max_length=MAX_SEQ_LEN):
    """Tokenize one data split into fixed-length tensors.

    Args:
        sentences: iterable of utterance strings.
        labels: integer class ids aligned with `sentences` (array-like, or an
            existing tensor if this cell is re-run).
        max_length: number of tokens each sequence is padded / truncated to.

    Returns:
        (input_ids, attention_mask, labels) as PyTorch tensors; the first two
        have shape (len(sentences), max_length).
    """
    encoded = tokenizer(
        list(sentences),
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=max_length,
        padding='max_length',          # Replaces the deprecated pad_to_max_length=True.
        truncation=True,
        return_attention_mask=True,    # 1 for real tokens, 0 for padding.
        return_tensors='pt',           # Return PyTorch tensors.
    )

    # The caller rebinds its label variable to the returned tensor, so on a
    # second run of this cell `labels` is already a tensor; copy it instead of
    # passing it back through torch.tensor().
    if torch.is_tensor(labels):
        label_tensor = labels.clone()
    else:
        label_tensor = torch.tensor(labels)

    return encoded['input_ids'], encoded['attention_mask'], label_tensor


train_input_ids, train_attention_masks, train_labels = encode_split(train_sentences, train_labels)
test_input_ids, test_attention_masks, test_labels = encode_split(test_sentences, test_labels)
val_input_ids, val_attention_masks, val_labels = encode_split(val_sentences, val_labels)

In [30]:
from torch.utils.data import TensorDataset, random_split

# Bundle (input ids, attention masks, labels) into one TensorDataset per split.
train_dataset, test_dataset, val_dataset = (
    TensorDataset(ids, masks, labels)
    for ids, masks, labels in (
        (train_input_ids, train_attention_masks, train_labels),
        (test_input_ids, test_attention_masks, test_labels),
        (val_input_ids, val_attention_masks, val_labels),
    )
)

In [35]:
# Persist each split's TensorDataset to a .pt file in the working directory.
# NOTE(review): this serialises the whole dataset object rather than plain
# tensors -- confirm that whatever loads these files expects that.
for dataset, filename in (
    (train_dataset, 'train_dataset.pt'),
    (test_dataset, 'test_dataset.pt'),
    (val_dataset, 'val_dataset.pt'),
):
    torch.save(dataset, filename)

In [33]:
# Sanity check: number of examples in the validation dataset.
len(val_dataset)

1109