# This is the first project, 'Sentiment Analysis of IMDB', of GeekTime's NLP Camp. I will record some possible modifications and improvement ideas here.

In [1]:
# import and check the data set

In [2]:
# Tokenize text with NLTK's word_tokenize, backed by the 'punkt' package.
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # makes sure the punkt data is available locally
tokenizer = word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dwang\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


When we want to combine word/sentence embeddings with character-level embeddings, torchtext is not a good choice.

In [6]:
# The data come from the IMDB dataset, which consists of movie reviews.
# torchtext is used to build the datasets and to do some preprocessing.
# Every random number generator involved is seeded so the results can be repeated.

#!pip install torchtext
import random

import torch
from torchtext import data, datasets

SEED = 1234
random.seed(SEED)  # Python's RNG state is handed to the train/validation split below
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
TEXT = data.Field(tokenize = tokenizer, include_lengths = True) # TEXT defines the format of the text data
LABEL = data.LabelField(dtype = torch.float) # LABEL defines the format of the label data
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# if we later try other local dataset we can use this
# train, val, test = data.TabularDataset.splits(
#         path='./data/', train='train.tsv',
#         validation='val.tsv', test='test.tsv', format='tsv',
#         fields=[('Text', TEXT), ('Label', LABEL)])

# Pass an explicit RNG state: without it the 80/20 split was shuffled with an
# unseeded generator and therefore differed from run to run.
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.getstate())

In [7]:
# Report how many examples ended up in each split.
for description, split in (
    ("train data size", train_data),
    ("valid data size", valid_data),
    ("test data size", test_data),
):
    print(description, len(split))

train data size 20000
valid data size 5000
test data size 25000


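As we can see here, IMDB splits the data equally between train and test (25,000 reviews each); the 25,000 training reviews are then further divided into 20,000 for training and 5,000 for validation.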

I use a large word embedding, GloVe 6B/300d, which is a good choice. But since the code runs on my laptop without a GPU, I add a hyperparameter MAX_VOCAB_SIZE, which influences the performance greatly.

In [8]:
# Next, build the vocabularies; both are built from the training split only.
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.300d",
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

We prefer a small batch_size at first and increase it later to stabilize the performance.
NOTES:
When our memory is not enough even for batch_size = 1 or 2, we can do the following:
1. We can use half precision, e.g. 16-bit floats.
2. We can use distributed computation (consider DeepSpeed).
3. We can do gradient accumulation. Because of the accumulated error, please do not accumulate over too many steps.

In [9]:
# Training works on mini-batches, so each split is wrapped in an iterator.
BATCH_SIZE = 32
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device,
)

Normally, we first freeze the embedding parameters at their pretrained values. After training for several epochs, we can unfreeze them and train the embedding and LSTM parameters together.

The sequence length (seqLength) is an important parameter. Please do not use the maximum text length as the seqLength. If it is too large,
convergence will be too slow, and we need more padding, which is harmful to the final precision.

Parameter initialization: random initialization with a normal distribution is a good choice.

In [10]:
# the next step is to define the neural network structure
import torch.nn as nn
class RNN(nn.Module):
    """LSTM classifier: embedding -> (bi)LSTM -> dropout -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of outputs of the final linear layer.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the final hidden state and,
            when n_layers > 1, between the stacked LSTM layers.
        pad_idx: embedding index used for padding.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                bidirectional, dropout, pad_idx):
        super().__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        # nn.LSTM applies dropout only between stacked layers; passing a non-zero
        # value with a single layer has no effect and just emits a warning.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional,
                          dropout=dropout if n_layers > 1 else 0.0)
        # The classifier input is one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of `output_dim` logits per sequence in the batch.

        Args:
            text: padded token indices; batch_first is not set anywhere, so the
                layout is (seq_len, batch).
            text_lengths: true length of every sequence, in decreasing order
                (pack_padded_sequence is called with its default enforce_sorted).
        """
        embedded = self.embedding(text)

        # pack_padded_sequence requires the lengths on the CPU when given as a tensor.
        lengths = text_lengths.cpu() if torch.is_tensor(text_lengths) else text_lengths
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths)

        # Only the final hidden state is used; per-step outputs are not needed.
        _, (hidden, _) = self.rnn(packed_embedded)

        if self.bidirectional:
            # Last layer: forward direction is hidden[-2], backward is hidden[-1].
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            final_hidden = hidden[-1,:,:]

        return self.fc(self.dropout(final_hidden))
        

In [11]:
# Hyper-parameters of the network.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 64
OUTPUT_DIM = 1
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

# Keyword arguments make the mapping onto RNN.__init__ explicit.
model = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)



In [12]:
# torchtext provides the pretrained word vectors; copy them into the embedding layer.
# The rows for the unknown and padding tokens are then overwritten with zeros.
pretrained_embeddings = TEXT.vocab.vectors
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(pretrained_embeddings)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
for special_idx in (UNK_IDX, PAD_IDX):
    embedding_weights[special_idx] = torch.zeros(EMBEDDING_DIM)

Warm-up:
We can use a small learning rate at first to ensure a good initialization and then increase it for the main training process. When approaching convergence, we need to decrease the learning rate to approach the local optimum.

It is recommended to use multiple loss terms.

In [13]:
# Before starting the training process, set up device placement, the loss and the optimizer.
import torch.optim as optim

# Move the model to its device first: the torch.optim documentation recommends
# doing this before constructing an optimizer for the model's parameters.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr = 1e-3)

In [14]:
def binary_accuracy(preds, y):
    """Return the fraction of predictions that agree with the labels `y`.

    `preds` are raw logits: they are passed through a sigmoid and rounded to
    0 or 1 before being compared with `y`.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

In [21]:
def train(model, iterator, optimizer, criterion):
    """Train `model` for one pass over `iterator`.

    Each batch must expose `batch.text` as a (text, text_lengths) pair and
    `batch.label` as the targets. The loss and accuracy of every batch are printed.

    Returns:
        (mean batch loss, mean batch accuracy) over the whole pass.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text, text_lengths = batch.text
        # The model returns one column per output; drop it to match the label shape.
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        print("epoch_loss", loss.item(), "epoch_acc",acc.item() )
    # Fixed: the loss was previously divided by len(iteratore), an undefined name
    # that raised NameError at the end of every epoch.
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [22]:
def evaluate(model, iterator, criterion):
    """Evaluate `model` on every batch of `iterator` without updating it.

    Each batch must expose `batch.text` as a (text, text_lengths) pair and
    `batch.label` as the targets.

    Returns:
        (mean batch loss, mean batch accuracy) over the whole pass.
    """
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze(1)
            # Fixed: this previously called the undefined name `criterionion`.
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    # Fixed: the loss was previously divided by len(iteratore), an undefined name.
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

It is not recommended to save only the best validation model, because:
1. the near-best models are useful for later hand-tuning tasks;
2. we also need to save the learning rate, the optimizer state and other info.

In [None]:
# Train for N_EPOCHS epochs, validating after each one and keeping the weights
# with the lowest validation loss in 'model.pt'.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # NOTE: validating the model only at the end of each epoch is also a serious offence.
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # NOTE: this is a serious offence. It is a VERY bad idea to save only the best
    # model: when you want to babysit the training process, it is usually a bad
    # idea to start from the best model.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # NOTE: the optimizer state should also be saved (as well as the scheduler, if available).
        torch.save(model.state_dict(), 'model.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

epoch_loss 0.6766699552536011 epoch_acc 0.4375
epoch_loss 0.654873251914978 epoch_acc 0.53125
epoch_loss 0.6881387233734131 epoch_acc 0.53125
epoch_loss 0.6945408582687378 epoch_acc 0.5625


In [None]:
# Restore the weights saved during training and score them on the test split.
# map_location keeps the load working when 'model.pt' was written from a
# different device than the one in use now (e.g. saved on GPU, loaded on CPU).
model.load_state_dict(torch.load('model.pt', map_location=device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')