In [1]:
from util import get_num_lines, get_vocab, embed_sequence, get_word2idx_idx2word, get_embedding_matrix
from util import TextDatasetWithGloveElmoSuffix as TextDataset
from util import evaluate
from model import RNNSequenceClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

import csv
import h5py
# import matplotlib
# matplotlib.use('Agg')  # to avoid the error: _tkinter.TclError: no display name and no $DISPLAY environment variable
# matplotlib.use('tkagg') # to display the graph on remote server
import matplotlib.pyplot as plt

print("PyTorch version:")
print(torch.__version__)
print("GPU Detected:")
print(torch.cuda.is_available())
using_GPU = True

PyTorch version:
0.4.1.post2
GPU Detected:
True


In [27]:


"""
1. Data pre-processing
"""
'''
1.1 
get rev_id --> label as a dictionary:
    rev_id: string to indicate the id of the comment
    label: int 1 or 0
'''
id2label = {}
with open('../aggression_dataset/aggression_annotations.tsv') as f:
    lines = csv.reader(f, delimiter='\t')
    next(lines, None)  # skip the headers
    for line in lines:
        rev_id = line[0]
        # worker_id = line[1]
        aggression_label = int(float(line[2]))
        # aggression_score = float(line[3])
        id2label[rev_id] = aggression_label                 
'''
1.2 
get raw dataset as three list with given train/dev/test split:
  Each element is a triple:
    comment: the text needed to be classified (removed NEWLINE_TOKEN)
    rev_id: string
    label: int 1 or 0
'''


raw_train = []
raw_dev = []
raw_test = []
with open('../aggression_dataset/aggression_annotated_comments.tsv') as f:
    lines = csv.reader(f, delimiter='\t')
    next(lines, None)  # skip the headers
    for line in lines:
        rev_id = line[0]
        # lowercse 
        comment = line[1].replace('NEWLINE_TOKEN', '').lower()
        split = line[6]
        sen_len = len(comment.split())
        if  sen_len != 0 and sen_len < 10000:  # filter for length
            if split == 'train':
                raw_train.append([comment, rev_id, id2label[rev_id]])
            elif split == 'dev':
                raw_dev.append([comment,rev_id,  id2label[rev_id]])
            else:
                raw_test.append([comment, rev_id, id2label[rev_id]])     
            
# datset split without limit on sen_len: train, dev, test:  69526=12875+56648       23160       23178
# with limit on sen_len <= 50: 44227=8710+35517    14663=939+11724      14493=2828+11665
# with limit on sen_len <= 100: 57664=10868+46796          19186          19154

In [3]:
"""
2. Data preparation
"""
'''
2. 1
get vocabulary and glove embeddings in raw dataset 
'''
# vocab is a set of words
vocab = get_vocab(raw_train + raw_dev + raw_test)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings a nn.Embeddings
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
# elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
# elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')

vocab size:  152445


100%|██████████| 2196017/2196017 [00:36<00:00, 60412.21it/s]


Number of pre-trained word vectors loaded:  43858
Embeddings mean:  -0.007001449353992939
Embeddings stdev:  0.3965314030647278


In [28]:
# dataset study

# raw_train_pos = []  # 8710
# raw_train_neg = []  # 35517
# for example in raw_train:
#     if example[2] == 1:
#         raw_train_pos.append(example)
#     else:
#         raw_train_neg.append(example)
# raw_train_balanced = raw_train_pos + raw_train_neg[:len(raw_train_pos)]
# print(len(raw_train), len(raw_dev), len(raw_test))
# print(len(raw_train_pos), len(raw_train_neg))

In [4]:
'''
2. 2
embed the datasets
'''


elmos_train = None
elmos_dev = None
embedded_train = [[embed_sequence(example[0], word2idx, glove_embeddings, elmos_train), example[2]]
                      for example in raw_train]
embedded_dev = [[embed_sequence(example[0], word2idx, glove_embeddings, elmos_dev), example[2]]
                    for example in raw_dev]

In [5]:
'''
2. 3
set up Dataloader for batching
'''
# Separate the input (embedded_sequence) and labels in the indexed train sets.
train_dataset = TextDataset([example[0] for example in embedded_train],
                                [example[1] for example in embedded_train])
dev_dataset = TextDataset([example[0] for example in embedded_dev],
                              [example[1] for example in embedded_dev])

# Data-related hyperparameters
batch_size = 256
# Set up a DataLoader for the training, validation, and test dataset
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,
                                  collate_fn=TextDataset.collate_fn)
dev_dataloader = DataLoader(dataset=dev_dataset, batch_size=batch_size,
                                collate_fn=TextDataset.collate_fn)

In [11]:
# embedded_train = [[embed_sequence(example[0], word2idx, glove_embeddings, elmos_train), example[2]]
#                       for example in raw_train[:10000]]
# train_dataset = TextDataset([example[0] for example in embedded_train],
#                                 [example[1] for example in embedded_train])
# train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True,
#                                   collate_fn=TextDataset.collate_fn)

# embedded_train_small = [[embed_sequence(example[0], word2idx, glove_embeddings, elmos_train), example[2]]
#                       for example in raw_train[:1000]]
# train_dataset_small = TextDataset([example[0] for example in embedded_train_small],
#                                 [example[1] for example in embedded_train_small])
# train_dataloader_small = DataLoader(dataset=train_dataset_small, batch_size=batch_size, shuffle=True,
#                                   collate_fn=TextDataset.collate_fn)

# embedded_dev_small = [[embed_sequence(example[0], word2idx, glove_embeddings, elmos_dev), example[2]]
#                       for example in raw_dev[:500]]
# dev_dataset_small = TextDataset([example[0] for example in embedded_dev],
#                                 [example[1] for example in embedded_dev])
# dev_dataloader_small = DataLoader(dataset=dev_dataset, batch_size=batch_size, shuffle=True,
#                                   collate_fn=TextDataset.collate_fn)

using_GPU = False
"""
3. Model training
"""
'''
3. 1 
set up model, loss criterion, optimizer
'''
# Instantiate the model
# embedding_dim = glove + elmo + suffix indicator
# dropout1: dropout on input to RNN
# dropout2: dropout in RNN; would not be used if num_layers=1
# dropout3: dropout on hidden state of RNN to linear layer
# Separate the input (embedded_sequence) and labels in the indexed train sets.

# change embedding_dim=300+1024+50 if with elmo + suffix, 512 or 300 for hidden_size
rnn_clf = RNNSequenceClassifier(num_classes=2, embedding_dim=300, hidden_size=100, num_layers=1, bidir=True,
                                dropout1=0, dropout2=0.2, dropout3=0)
# Move the model to the GPU if available
if using_GPU:
    rnn_clf = rnn_clf.cuda()
# Set up criterion for calculating loss
nll_criterion = nn.NLLLoss()
# Set up an optimizer for updating the parameters of the rnn_clf
# rnn_clf_optimizer = optim.SGD(rnn_clf.parameters(), lr=0.01, momentum=0.9)
rnn_clf_optimizer = optim.Adam(rnn_clf.parameters(), lr=0.0001)
# Number of epochs (passes through the dataset) to train the model for.
num_epochs = 100

'''
3. 2
train model
'''
# without checkpoint, print during training
all_train_loss = [] 
# with checkpoints, print and plot at the end
training_loss = []
val_loss = []
training_f1 = []
val_f1 = []
# A counter for the number of gradient updates
num_iter = 0
for epoch in range(num_epochs):
    print("Starting epoch {}".format(epoch + 1))
    for (example_text, example_lengths, labels) in train_dataloader:
        example_text = Variable(example_text)
        example_lengths = Variable(example_lengths)
        labels = Variable(labels)
        if using_GPU:
            example_text = example_text.cuda()
            example_lengths = example_lengths.cuda()
            labels = labels.cuda()
        # predicted shape: (batch_size, 2)
        predicted = rnn_clf(example_text, example_lengths)
        batch_loss = nll_criterion(predicted, labels)
        rnn_clf_optimizer.zero_grad()
        batch_loss.backward()
        rnn_clf_optimizer.step()
        # keep record
        num_iter += 1
        all_train_loss.append(batch_loss.item())
        print('total loss: ', batch_loss.item())
        # Calculate validation and training set loss and accuracy every 200 gradient updates
        if num_iter % 200 == 0:
            avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(dev_dataloader_small, rnn_clf,
                                                                                   nll_criterion, using_GPU)
            val_loss.append(avg_eval_loss)
            val_f1.append(f1)
            print(
                "Iteration {}. Validation Loss {}. Validation Accuracy {}. Validation Precision {}. Validation Recall {}. Validation F1 {}. Validation class-wise F1 {}.".format(
                    num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
            # filename = '../models/LSTMSuffixElmoAtt_???_all_iter_' + str(num_iter) + '.pt'
            # torch.save(rnn_clf, filename)
#             avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(train_dataloader_small, rnn_clf,
#                                                                                    nll_criterion, using_GPU)
#             training_loss.append(avg_eval_loss)
#             training_f1.append(f1)
#             print(
#                 "Iteration {}. Training Loss {}. Training Accuracy {}. Training Precision {}. Training Recall {}. Training F1 {}. Training class-wise F1 {}.".format(
#                     num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
print("Training done!")

# cannot display the graph in terminal on remote server
"""
3.3
plot the training process: MET F1 and losses for validation and training dataset
"""
plt.figure(0)
plt.title('F1 for VUA dataset')
plt.xlabel('iteration (unit:200)')
plt.ylabel('F1')
plt.plot(val_f1,'g')
plt.plot(training_f1, 'b')
plt.legend(['Validation F1', 'Training F1'], loc='upper right')
plt.show()


plt.figure(1)
plt.title('Loss for VUA dataset')
plt.xlabel('iteration (unit:200)')
plt.ylabel('Loss')
plt.plot(val_loss,'g')
plt.plot(training_loss, 'b')
plt.legend(['Validation loss', 'Training loss'], loc='upper right')
plt.show()


Starting epoch 1
total loss:  0.7033807635307312
total loss:  0.7006012201309204
total loss:  0.698373556137085
total loss:  0.6956207752227783
total loss:  0.6940983533859253
total loss:  0.6907171607017517
total loss:  0.6878277063369751
total loss:  0.6867819428443909
total loss:  0.6844305396080017
total loss:  0.6806299090385437
total loss:  0.6832157373428345
total loss:  0.6766575574874878
total loss:  0.6726333498954773
total loss:  0.6747531890869141
total loss:  0.6716988682746887
total loss:  0.6648218631744385
total loss:  0.6641291379928589
total loss:  0.6611498594284058
total loss:  0.6553524136543274
total loss:  0.6594277024269104
total loss:  0.6529259085655212
total loss:  0.6552569270133972
total loss:  0.6448972225189209
total loss:  0.6467337012290955
total loss:  0.6431085467338562
total loss:  0.6455671787261963
total loss:  0.6418192386627197
total loss:  0.6384009718894958
total loss:  0.6326025724411011
total loss:  0.635673999786377
total loss:  0.6362977623

total loss:  0.43095171451568604
total loss:  0.42974433302879333
total loss:  0.4526609480381012
total loss:  0.4641832113265991
total loss:  0.505105197429657
total loss:  0.5218487977981567
total loss:  0.525973379611969
total loss:  0.4770985543727875
total loss:  0.469107449054718
total loss:  0.45030879974365234
total loss:  0.4349500834941864
total loss:  0.5540388822555542
total loss:  0.4777229428291321
total loss:  0.3999686539173126
total loss:  0.43161818385124207
total loss:  0.487554669380188
total loss:  0.4215661585330963
total loss:  0.40403446555137634
total loss:  0.48537275195121765
total loss:  0.4491118788719177
total loss:  0.4936070740222931
total loss:  0.4464767575263977
total loss:  0.42852583527565
total loss:  0.48870646953582764
total loss:  0.4428735673427582
total loss:  0.4640365540981293
total loss:  0.4114541709423065
total loss:  0.47916361689567566
total loss:  0.39641085267066956
total loss:  0.4368857741355896
total loss:  0.48714709281921387
tota

total loss:  0.36421188712120056
total loss:  0.3467249274253845
total loss:  0.34895768761634827
total loss:  0.31050053238868713
Starting epoch 13
total loss:  0.47825345396995544
total loss:  0.3853483498096466
total loss:  0.3416697084903717
total loss:  0.36173513531684875
total loss:  0.4382287263870239
total loss:  0.3412337303161621
total loss:  0.42932453751564026
total loss:  0.3676883280277252
total loss:  0.3714163899421692
total loss:  0.36065933108329773
total loss:  0.33759644627571106
total loss:  0.35438454151153564
total loss:  0.37266990542411804
total loss:  0.38675832748413086
total loss:  0.39574408531188965
total loss:  0.48632025718688965
total loss:  0.3944985270500183
total loss:  0.43802931904792786
total loss:  0.3245357275009155
total loss:  0.33483439683914185
total loss:  0.36064690351486206
total loss:  0.38524603843688965
total loss:  0.36329057812690735
total loss:  0.3918398916721344
total loss:  0.341764360666275
total loss:  0.42477256059646606
tota

total loss:  0.3364924490451813
total loss:  0.34360817074775696
total loss:  0.2919561564922333
total loss:  0.33342689275741577
total loss:  0.3926818072795868
total loss:  0.3645480275154114
total loss:  0.3275393545627594
total loss:  0.3383967876434326
total loss:  0.39076852798461914
Starting epoch 19
total loss:  0.3273644745349884
total loss:  0.34806230664253235
total loss:  0.3353683352470398
total loss:  0.3497403562068939
total loss:  0.38346871733665466
total loss:  0.4113236963748932
total loss:  0.4232126772403717
total loss:  0.3207990825176239
total loss:  0.3746936023235321
total loss:  0.34528598189353943
total loss:  0.298734188079834
total loss:  0.3482123911380768
total loss:  0.402088463306427
total loss:  0.3942941427230835
total loss:  0.32977211475372314
total loss:  0.3484899401664734
total loss:  0.34188854694366455
total loss:  0.3226921558380127
total loss:  0.2907339334487915
total loss:  0.33833256363868713
total loss:  0.3646685779094696
total loss:  0.

total loss:  0.34678947925567627
total loss:  0.352632075548172
total loss:  0.312369704246521
total loss:  0.3254830539226532
total loss:  0.29758307337760925
total loss:  0.337289959192276
total loss:  0.28563976287841797
total loss:  0.3784399628639221
total loss:  0.28337982296943665
total loss:  0.30815795063972473
total loss:  0.3138113021850586
total loss:  0.24873630702495575
total loss:  0.36038917303085327
total loss:  0.24870306253433228
Starting epoch 25
total loss:  0.3264745771884918
total loss:  0.30586329102516174
total loss:  0.30149713158607483
total loss:  0.3607213497161865
total loss:  0.3510645031929016
total loss:  0.37391453981399536
total loss:  0.28370392322540283
total loss:  0.33049294352531433
total loss:  0.33412429690361023
total loss:  0.36165496706962585
total loss:  0.25702473521232605
total loss:  0.3361806273460388
total loss:  0.3173214793205261
total loss:  0.3657014071941376
total loss:  0.26762670278549194
total loss:  0.29500266909599304
total l

total loss:  0.27039840817451477
total loss:  0.3783615827560425
total loss:  0.27377039194107056
total loss:  0.31124576926231384
total loss:  0.3325096368789673
total loss:  0.27841004729270935
total loss:  0.34164518117904663
total loss:  0.2667475938796997
total loss:  0.297016441822052
total loss:  0.2506733536720276
total loss:  0.2858319878578186
total loss:  0.29213061928749084
total loss:  0.34355905652046204
total loss:  0.29343724250793457
total loss:  0.33977144956588745
total loss:  0.27718207240104675
total loss:  0.35578614473342896
total loss:  0.39988765120506287
total loss:  0.360710084438324
[[408.  47.]
 [ 20.  25.]]
Iteration 1200. Validation Loss 0.3522036671638489. Validation Accuracy 86. Validation Precision 55.55555555555556. Validation Recall 34.72222222222222. Validation F1 42.73504273504274. Validation class-wise F1 67.57363688281015.
[[832.  76.]
 [ 18.  74.]]
Iteration 1200. Training Loss 0.24644902348518372. Training Accuracy 90. Training Precision 80.434

KeyboardInterrupt: 

In [12]:
embedded_dev = [[embed_sequence(example[0], word2idx, glove_embeddings, elmos_dev), example[2]]
                      for example in raw_dev]
dev_dataset = TextDataset([example[0] for example in embedded_dev],
                                [example[1] for example in embedded_dev])
dev_dataloader = DataLoader(dataset=dev_dataset, batch_size=batch_size, shuffle=True,
                                  collate_fn=TextDataset.collate_fn)
avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(dev_dataloader, rnn_clf,
                                                                       nll_criterion, using_GPU)
val_loss.append(avg_eval_loss)
val_f1.append(f1)
print(
    "Iteration {}. Validation Loss {}. Validation Accuracy {}. Validation Precision {}. Validation Recall {}. Validation F1 {}. Validation class-wise F1 {}.".format(
        num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))

[[11052.  1599.]
 [  672.  1340.]]
Iteration 1215. Validation Loss 0.39598435163497925. Validation Accuracy 84. Validation Precision 66.60039761431412. Validation Recall 45.593739367131676. Validation F1 54.1304786911735. Validation class-wise F1 72.40677780712521.
