Most of the code in this notebook is adapted from https://github.com/bentrevett/pytorch-sentiment-analysis

In [4]:
# Mount Google Drive at /content/drive (Colab only); later cells read and write
# files under the relative path ./drive/My Drive/...
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Pin the PyTorch build (CUDA 11.1 wheels) together with torchtext 0.9.0;
# later cells import from the `torchtext.legacy` namespace.
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
!pip install torchtext==0.9.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

import os
from torchtext.legacy import data, datasets
import random
import json

In [7]:
# Tokenizing whole dataset once and save it!
# The spaCy tokenisation is slow, so the tokenised examples are written to
# Google Drive as JSON-lines files and reloaded by the next cell.
IMDB_DIR = './drive/My Drive/public/data/imdb'


def _dump_jsonl(examples, path):
    """Write one JSON object per line to `path`, overwriting any existing file."""
    with open(path, 'w') as f:
        for example in examples:
            json.dump(example, f)
            f.write('\n')


TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField()

print("Downloading and tokenizing")
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

print("Make dataset")
# vars() turns each torchtext Example into a plain dict that json can serialise.
train_examples = [vars(t) for t in train_data]
test_examples = [vars(t) for t in test_data]

print("Storing")
# makedirs (unlike mkdir) also creates missing parent folders and does not
# fail when the directory already exists.
os.makedirs(IMDB_DIR, exist_ok=True)

_dump_jsonl(train_examples, os.path.join(IMDB_DIR, 'sentiment_train.json'))
_dump_jsonl(test_examples, os.path.join(IMDB_DIR, 'sentiment_test.json'))

print("Done")



Downloading and tokenizing
downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:08<00:00, 9.92MB/s]


Make dataset
Storing
Done


In [5]:
# Reload the pre-tokenised JSON-lines files, carve a validation set out of the
# training data, and build the vocabularies from the training portion only.
SEED = 1234
MAX_VOCAB_SIZE = 25_000

TEXT = data.Field()
LABEL = data.LabelField()

# JSON key -> (attribute name on the Example, Field used to process it)
fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}

full_train_data, test_data = data.TabularDataset.splits(
    path = './drive/My Drive/public/data/imdb',
    train = 'sentiment_train.json',
    test = 'sentiment_test.json',
    format = 'json',
    fields = fields
)

print("Splitting for train and validation")
# random.seed() returns None, so passing its result as `random_state` only
# worked through its side effect on the global RNG. Seed explicitly and hand
# the resulting state to split() so the intent is visible.
random.seed(SEED)
train_data, valid_data = full_train_data.split(random_state = random.getstate())

print("Generating vocabulary sets")
# unk_init draws from torch's RNG; seed it so the vectors are reproducible.
torch.manual_seed(SEED)
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

print("Done")

Splitting for train and validation
Generating vocabulary sets


.vector_cache/glove.6B.zip: 862MB [02:41, 5.34MB/s]                           
100%|█████████▉| 399999/400000 [00:13<00:00, 29304.48it/s]


Done


In [6]:
# Report the size of every split.
for _caption, _subset in (
    ("Number of training data : ", train_data),
    ("Number of validation data : ", valid_data),
    ("Number of test data : ", test_data),
):
    print(_caption, len(_subset))

# Report the size of both vocabularies.
for _caption, _field in (
    ("Number of unique tokens of data vocab. : ", TEXT),
    ("Number of unique tokens of label vocab. : ", LABEL),
):
    print(_caption, len(_field.vocab))

# Show one raw training example as a dict of its fields.
print("One of the training example")
first_example = train_data.examples[0]
print(vars(first_example))

# Peek at the first vocabulary entries and the label -> index mapping.
print("Vocabulary set")
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)

Number of training data :  17500
Number of validation data :  7500
Number of test data :  25000
Number of unique tokens of data vocab. :  25002
Number of unique tokens of label vocab. :  2
One of the training example
{'text': ['We', 'expected', 'something', 'great', 'when', 'we', 'went', 'to', 'see', 'this', 'bomb', '.', 'It', 'is', 'basically', 'a', 'Broadway', 'play', 'put', 'on', 'film', '.', 'The', 'music', 'is', 'plain', 'terrible', '.', 'There', 'is', "n't", 'one', 'memorable', 'song', 'in', 'the', 'movie', '--', 'heard', 'any', 'hits', 'from', 'this', 'movie', '?', 'You', 'wo', "n't", 'because', 'there', 'are', "n't", 'any', '.', 'Some', 'of', 'the', 'musical', 'numbers', 'go', 'on', 'so', 'long', 'that', 'I', 'got', 'up', 'to', 'go', 'to', 'the', 'restroom', 'and', 'get', 'some', 'pop', 'corn', 'and', 'it', 'was', 'still', 'going', 'when', 'I', 'got', 'back', '!', 'If', 'they', 'were', 'good', 'songs', 'well', '--', 'but', 'they', 'suck', '.', 'The', 'pace', 'is', 'slow', ',', 

In [15]:
# --- Model architecture ---
INPUT_DIM = len(TEXT.vocab)   # vocabulary size, i.e. rows of the embedding table
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 2                # number of sentiment classes
N_LAYERS = 1
BIDIRECTIONAL = True
RNN_TYPE = 'lstm'             # the RNN class handles 'rnn' and 'lstm'
DROPOUT = 0.5
PRETRAIN_EMBEDDING = 0        # 1 -> copy TEXT.vocab.vectors into the embedding layer

# --- Optimisation ---
BATCH_SIZE = 64
OPTIMIZER = 'Adam'            # 'RMSprop', 'Adam', 'Adadelta'; anything else falls back to SGD
LR = 0.001
MAX_EPOCH = 5

In [16]:
class RNN(nn.Module):
    """Recurrent sentence classifier: embedding -> RNN/LSTM -> linear layer.

    Args:
        input_dim: vocabulary size (number of rows of the embedding table).
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the recurrent layer, per direction.
        output_dim: number of output classes.
        dropout: dropout probability applied to the embeddings and to the
            final hidden state.
        rnn_type: ``'rnn'`` for ``nn.RNN`` or ``'lstm'`` for ``nn.LSTM``.
        bidirectional: whether the recurrent layer runs in both directions.
        n_layers: number of stacked recurrent layers.

    Raises:
        ValueError: if ``rnn_type`` is not ``'rnn'`` or ``'lstm'``.

    NOTE: the recurrent layer is fed the raw (padded) batch; sequences are not
    packed, so padding positions also contribute to the final hidden state.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim,
                 dropout, rnn_type, bidirectional, n_layers):
        super().__init__()
        # Fail at construction time; previously an unknown type left
        # `self.rnn` undefined and only crashed inside forward().
        if rnn_type not in ('rnn', 'lstm'):
            raise ValueError(
                "rnn_type must be 'rnn' or 'lstm', got {!r}".format(rnn_type))

        self.rnn_type = rnn_type
        # Normalise to bool so the Linear input size always agrees with the
        # number of directions the recurrent layer actually uses.
        self.bidir = bool(bidirectional)
        num_directions = 2 if self.bidir else 1

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        rnn_cls = nn.LSTM if rnn_type == 'lstm' else nn.RNN
        self.rnn = rnn_cls(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=self.bidir)

        self.fc = nn.Linear(num_directions * hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

        self.criterion = nn.CrossEntropyLoss()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, text, label=None):
        """Classify a batch of token-index sequences.

        Args:
            text: LongTensor of shape (Timeseq, Batch).
            label: optional LongTensor of shape (Batch,) with class indices.

        Returns:
            ``(loss, probs)``: ``loss`` is the cross-entropy against ``label``
            (the int ``0`` when ``label`` is None) and ``probs`` is a
            (Batch, output_dim) tensor of softmax probabilities.
        """
        # embedded (Timeseq, Batch, emb_dim)
        embedded = self.dropout(self.embedding(text))

        # nn.LSTM returns (h_n, c_n) as its state; nn.RNN returns only h_n.
        if self.rnn_type == 'lstm':
            _, (hidden, _) = self.rnn(embedded)
        else:
            _, hidden = self.rnn(embedded)

        # The last two slices of h_n are the forward and backward states of
        # the top layer; without bidirectionality only the last one is used.
        if self.bidir:
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]
        # hidden (Batch, hidden_dim*direction)
        hidden = self.dropout(hidden)

        # logit (Batch, output_dim)
        logit = self.fc(hidden)

        loss = self.criterion(logit, label) if label is not None else 0
        # probs (Batch, output_dim)
        probs = self.softmax(logit)

        return loss, probs

In [17]:
def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch and print the mean batch loss and accuracy.

    Args:
        model: module called as ``model(text, label)`` and returning
            ``(loss, probs)``.
        device: device the batch tensors are moved to.
        train_loader: iterable of batches exposing ``.text`` and ``.label``;
            it must also expose ``.dataset`` so accuracy can be normalised by
            the number of examples.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used only in the printed message.
    """
    model.train()

    correct = 0
    n_batches = 0
    train_loss = 0.0
    for batch in train_loader:
        n_batches += 1
        text = batch.text.to(device)
        label = batch.label.to(device)

        optimizer.zero_grad()
        loss, probs = model(text, label)
        loss.backward()
        optimizer.step()

        pred = torch.argmax(probs, dim=1)
        correct += (pred == label).float().sum().item()
        train_loss += loss.item()

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    acc = correct / max(len(train_loader.dataset), 1)
    train_loss /= max(n_batches, 1)
    print('Epoch {} Train: Loss: {:.6f} \tAcc.: {:.6f}'.format(
                epoch, train_loss, acc))
    
            
def test(model, device, test_loader, epoch):
    model.eval()
    test_loss = 0
    correct = 0
    den = 0
    with torch.no_grad():
        for batch in test_loader:
            den += 1
            text = batch.text.to(device)
            label = batch.label.to(device)
            
            loss, probs = model(text, label)
            pred = torch.argmax(probs, dim=1)

            test_loss += loss.item()
            
            correct += (pred == label).float().sum()

    test_loss /= den
    acc = correct / len(test_loader.dataset)

    print('Epoch {} Test : Loss: {:.6f} \tAcc.: {:.6f}\n'.format(
        epoch, test_loss, acc))
    return acc

In [18]:
### Training
# Check the device
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")

# Datasets
train_loader = data.BucketIterator(train_data, batch_size = BATCH_SIZE)
valid_loader = data.BucketIterator(valid_data, batch_size = BATCH_SIZE)
test_loader = data.BucketIterator(test_data, batch_size = BATCH_SIZE)

# build my model
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, RNN_TYPE, BIDIRECTIONAL, N_LAYERS)
if PRETRAIN_EMBEDDING == 1:
    # Start from the pretrained vectors attached to the vocabulary.
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)
model.to(device)

# build the optimizer (any unrecognised name falls back to plain SGD)
if OPTIMIZER == 'RMSprop':
    opt = optim.RMSprop(model.parameters(), lr=LR)
elif OPTIMIZER == 'Adam':
    opt = optim.Adam(model.parameters(), lr=LR)
elif OPTIMIZER == 'Adadelta':
    opt = optim.Adadelta(model.parameters(), lr=LR)
else:
    opt = optim.SGD(model.parameters(), lr=LR)

# Checkpoint location; create the folder up front so the first torch.save
# does not fail when the results directory does not exist yet.
save_dir = 'drive/My Drive/public/results/sentiment_analysis_model_best.pth'
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

# Training..
print("Training Start!")
best_acc = 0
for epoch in range(MAX_EPOCH):
    train(model, device, train_loader, opt, epoch)
    valid_acc = test(model, device, valid_loader, epoch)

    # Keep only the checkpoint with the best validation accuracy so far.
    if best_acc < valid_acc:
        print("We found the best model!\n")
        best_acc = valid_acc
        # torch.save overwrites an existing file, so no explicit removal is needed.
        torch.save(model, save_dir)

print("Training is done!!")

Training Start!
Epoch 0 Train: Loss: 0.684666 	Acc.: 0.548457
Epoch 0 Test : Loss: 0.660621 	Acc.: 0.606133

We found the best model!

Epoch 1 Train: Loss: 0.651532 	Acc.: 0.618057
Epoch 1 Test : Loss: 0.603748 	Acc.: 0.682000

We found the best model!

Epoch 2 Train: Loss: 0.639997 	Acc.: 0.634743
Epoch 2 Test : Loss: 0.660230 	Acc.: 0.680133

Epoch 3 Train: Loss: 0.610668 	Acc.: 0.674571
Epoch 3 Test : Loss: 0.571602 	Acc.: 0.728533

We found the best model!

Epoch 4 Train: Loss: 0.550464 	Acc.: 0.726971
Epoch 4 Test : Loss: 0.491066 	Acc.: 0.780000

We found the best model!

Training is done!!


In [22]:
import spacy
# Fetch the small English spaCy pipeline and load it; predict_sentiment()
# below uses `nlp.tokenizer` to split raw sentences into tokens.
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    """Return the class probabilities the model assigns to a raw sentence.

    The sentence is tokenised with the module-level spaCy pipeline `nlp`,
    mapped to indices through `TEXT.vocab.stoi`, and fed to the model as a
    single-example batch on the module-level `device`.

    Args:
        model: module called as ``model(text, None)`` and returning
            ``(loss, probs)``.
        sentence: raw input string.

    Returns:
        Tensor of shape (1, n_classes) with softmax probabilities.

    Raises:
        ValueError: if the sentence contains no tokens.
    """
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        # A zero-length sequence cannot be fed through the recurrent layer.
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    # (Timeseq,) -> (Timeseq, 1): the model expects time-major batches.
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        _, probs = model(tensor, None)

    return probs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.3.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [23]:
# Score a single hand-written review with the trained model.
probs = predict_sentiment(model, "This film is Wonderful")
# Column order follows LABEL.vocab.stoi; presumably [neg, pos] -- confirm
# against the mapping printed by the dataset-summary cell.
print(probs) #[neg, pos]

tensor([[0.3132, 0.6868]], device='cuda:0', grad_fn=<SoftmaxBackward>)
