In [10]:
import torch

In [4]:
import pandas as pd

# Read the three dataset splits; the first CSV column is used as the index.
df_train, df_test, df_val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("train.csv", "test.csv", "val.csv")
)

# Report how many rows the training split holds.
print('Number of training sentences: {:,}\n'.format(len(df_train)))

# Preview 10 randomly drawn training rows (no seed, so the draw varies per run).
df_train.sample(10)

Number of training sentences: 87,170



Unnamed: 0,emotion,act,text,topic,label
86419,no_emotion,question,What do you think of the acting of the two mai...,relationship,3
45619,no_emotion,question,What was that ?,ordinary_life,3
72424,no_emotion,question,Isn't my baggage enough of a deposit ?,tourism,3
35739,no_emotion,inform,"After that , we'll let you decide if you still...",ordinary_life,3
66114,sadness,inform,"Sorry , sir , we are having a sale now .",ordinary_life,4
48712,no_emotion,inform,"Good afternoon , Mr . Dome ' s office .",work,3
78570,no_emotion,question,"Good afternoon , ABC Incorporated . How many I...",work,3
21959,no_emotion,inform,"In that case , formal suit with a nice tie wil...",relationship,3
82333,no_emotion,question,Have you received any scholarships ?,work,3
12071,anger,directive,Don't brother me !,school_life,0


In [5]:
# Pull the `text` and `label` columns of every split out as `.values` arrays.
train_sentences, train_labels = df_train["text"].values, df_train["label"].values
test_sentences, test_labels = df_test["text"].values, df_test["label"].values
val_sentences, val_labels = df_val["text"].values, df_val["label"].values

In [6]:
from transformers import BertTokenizer

# Announce the load, then fetch the pretrained 'bert-base-uncased' tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

Loading BERT tokenizer...


In [7]:
# Walk the first training sentence through the tokenizer step by step.
first_sentence = train_sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

# Raw text as stored in the dataset.
print(' Original: ', first_sentence)

# The tokens the tokenizer splits it into.
print('Tokenized: ', first_tokens)

# The vocabulary ids those tokens map to.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

 Original:  Say , Jim , how about going for a few beers after dinner ?
Tokenized:  ['say', ',', 'jim', ',', 'how', 'about', 'going', 'for', 'a', 'few', 'beers', 'after', 'dinner', '?']
Token IDs:  [2360, 1010, 3958, 1010, 2129, 2055, 2183, 2005, 1037, 2261, 18007, 2044, 4596, 1029]


In [8]:
def _max_encoded_len(tok, sentences):
    """Return the length of the longest encoded sentence in `sentences`.

    Each sentence is encoded with `tok.encode(sent, add_special_tokens=True)`,
    so the special tokens the tokenizer adds are included in the count.
    Returns 0 when `sentences` is empty.
    """
    return max(
        (len(tok.encode(sent, add_special_tokens=True)) for sent in sentences),
        default=0,
    )


# One shared helper measures all three splits, so they cannot drift apart.
train_max_len = _max_encoded_len(tokenizer, train_sentences)
print('Max train sentence length: ', train_max_len)

test_max_len = _max_encoded_len(tokenizer, test_sentences)
print('Max test sentence length: ', test_max_len)

val_max_len = _max_encoded_len(tokenizer, val_sentences)
print('Max val sentence length: ', val_max_len)

Max train sentence length:  296
Max test sentence length:  220
Max val sentence length:  178


In [11]:
# Fixed length that every sentence is padded or truncated to.
# NOTE(review): the recorded output of the max-length cell in this notebook
# reports sentences of up to 296 tokens, so anything longer than 64 tokens is
# truncated here. Confirm that this loss is acceptable before training.
MAX_SEQ_LEN = 64


def encode_sentences(sentences, tok, max_length=MAX_SEQ_LEN):
    """Tokenize `sentences` into fixed-length id and attention-mask tensors.

    Args:
        sentences: iterable of strings to encode.
        tok: tokenizer exposing `encode_plus` (the BERT tokenizer in this notebook).
        max_length: length every sequence is padded or truncated to.

    Returns:
        A pair `(input_ids, attention_masks)`: the per-sentence tensors
        returned by the tokenizer, concatenated along dim 0.
    """
    input_ids, attention_masks = [], []
    for sent in sentences:
        encoded = tok.encode_plus(
            sent,                          # Sentence to encode.
            add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
            max_length=max_length,         # Target length for pad & truncate.
            padding='max_length',          # Replaces the deprecated pad_to_max_length=True.
            truncation=True,
            return_attention_mask=True,    # Construct attention masks.
            return_tensors='pt',           # Return PyTorch tensors.
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


# Encode each split with the same settings and convert its labels to a tensor.
train_input_ids, train_attention_masks = encode_sentences(train_sentences, tokenizer)
train_labels = torch.tensor(train_labels)

test_input_ids, test_attention_masks = encode_sentences(test_sentences, tokenizer)
test_labels = torch.tensor(test_labels)

val_input_ids, val_attention_masks = encode_sentences(val_sentences, tokenizer)
val_labels = torch.tensor(val_labels)

In [12]:
from torch.utils.data import TensorDataset, random_split

# Bundle each split's token ids, attention masks and labels into one dataset.
train_dataset = TensorDataset(
    train_input_ids,
    train_attention_masks,
    train_labels,
)
test_dataset = TensorDataset(
    test_input_ids,
    test_attention_masks,
    test_labels,
)
val_dataset = TensorDataset(
    val_input_ids,
    val_attention_masks,
    val_labels,
)

In [13]:
# Serialize every split's dataset to its own file in the working directory.
for _dataset, _path in (
    (train_dataset, 'train_dataset.pt'),
    (test_dataset, 'test_dataset.pt'),
    (val_dataset, 'val_dataset.pt'),
):
    torch.save(_dataset, _path)