In [1]:
# Enable IPython's autoreload extension in mode 2, so edited project modules are
# re-imported automatically before each cell executes (no kernel restart needed).
%load_ext autoreload
%autoreload 2

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensioai/dl/blob/master/sentiment_analysis_bidirectional/sentiment_analysis_bidirectional.ipynb)

## Transformers

In [None]:
import spacy
# NOTE(review): `nlp` is not referenced by any later cell in this notebook; all tokenization
# below goes through BertTokenizer. This looks like a leftover from an earlier notebook —
# confirm before removing. The 'en' shortcut name presumably relies on an older spaCy with a
# linked English model — verify against the installed spaCy version.
nlp = spacy.load('en')

Attention mechanisms allow a model to focus only on the appropriate words at each time step. They revolutionized NLP, enabling significant improvements in the state of the art. This technique is applied in the [*Transformer*](https://arxiv.org/abs/1706.03762) architecture, where recurrent layers are replaced by attention layers, achieving better performance. Transformers are growing in popularity, and new versions appear constantly. One recent interesting model is [BERT](https://arxiv.org/abs/1810.04805). A popular library to work with these models is [transformers](https://github.com/huggingface/transformers).

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-2.11.0-py3-none-any.whl (674 kB)
Collecting filelock
  Using cached filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.91-cp37-cp37m-win_amd64.whl (1.2 MB)
Collecting tokenizers==0.7.0
  Downloading tokenizers-0.7.0-cp37-cp37m-win_amd64.whl (1.1 MB)
Collecting sacremoses
  Downloading sacremoses-0.0.43.tar.gz (883 kB)
Collecting packaging
  Downloading packaging-20.4-py2.py3-none-any.whl (37 kB)
Collecting regex!=2019.12.17
  Downloading regex-2020.6.8-cp37-cp37m-win_amd64.whl (268 kB)
Collecting click
  Using cached click-7.1.2-py2.py3-none-any.whl (82 kB)
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py): started
  Building wheel for sacremoses (setup.py): finished with status 'done'
  Created wheel for sacremoses: filename=sacremoses-0.0.43-py3-none-any.whl size=893262 sha256=51b03cf3e71313416cb23c35388a2c41a2f8c0bd369308ef3bd2e34ef7d6fa34
  

In [2]:
from transformers import BertTokenizer

# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
# Tokenize a mixed-case sentence; the printed result shows the text comes back
# lowercased and the question mark becomes its own token.
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')

print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [4]:
# Convert each token string into its integer id in the tokenizer's vocabulary.
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


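We need a custom tokenization function that also truncates each sentence to the maximum number of tokens BERT accepts, leaving room for the two special tokens added at the start and end of every sequence.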

In [6]:
# Maximum input size for 'bert-base-uncased', read from the tokenizer's
# per-checkpoint size table.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

def tokenize_and_cut(sentence):
    """Tokenize `sentence` with the BERT tokenizer and truncate the token list.

    At most ``max_input_length - 2`` tokens are kept, which leaves two positions
    free for the start/end special tokens added to every sequence.

    Args:
        sentence: raw text to tokenize.

    Returns:
        The (possibly truncated) list of tokens produced by ``tokenizer.tokenize``.
    """
    token_budget = max_input_length - 2
    return tokenizer.tokenize(sentence)[:token_budget]

In [7]:
import torch
import torchtext

# Text field: sentences are tokenized with `tokenize_and_cut` and the resulting tokens are
# converted to BERT vocabulary ids during preprocessing, so torchtext does not build its own
# vocabulary (`use_vocab = False`). Because examples hold integer ids rather than strings,
# the init/eos/pad/unk special tokens are given as the tokenizer's ids as well.
TEXT = torchtext.data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = tokenizer.cls_token_id,
                  eos_token = tokenizer.sep_token_id,
                  pad_token = tokenizer.pad_token_id,
                  unk_token = tokenizer.unk_token_id)

# Label field stored as float tensors (the training cell uses BCEWithLogitsLoss).
LABEL = torchtext.data.LabelField(dtype = torch.float)

In [None]:
import random

# Fixed seed so the train/validation split is identical on every Restart & Run All.
SEED = 1234

# Load the IMDB train/test splits, processed with the TEXT and LABEL fields defined above.
train_data, test_data = torchtext.datasets.IMDB.splits(TEXT, LABEL)

# Hold out part of the training set for validation (torchtext's default split ratio).
# `split` shuffles using a Python `random` state; seed it and pass the state explicitly
# so the partition is reproducible instead of depending on whatever ran before.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

In [None]:
# Build the label vocabulary from the training split only.
LABEL.build_vocab(train_data)

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# One BucketIterator per split, all with batches of 64 placed directly on `device`.
# Only the training iterator asks for sorting within each batch.
# NOTE(review): the 'val' and 'test' iterators are created with the iterator's default
# train/shuffle settings — presumably fine for aggregate metrics; confirm.
dataloader = {
    'train': torchtext.data.BucketIterator(train_data, batch_size=64, sort_within_batch=True, device=device),
    'val': torchtext.data.BucketIterator(valid_data, batch_size=64, device=device),
    'test': torchtext.data.BucketIterator(test_data, batch_size=64, device=device)
}

In [None]:
from transformers import BertModel

# Load the pretrained 'bert-base-uncased' model; it is used as the embedding layer below.
bert = BertModel.from_pretrained('bert-base-uncased')

Our BERT-based model for sentiment analysis will use BERT as the embedding layer. Then, the outputs will be passed to a bidirectional GRU as we did in the previous examples. Also, we will NOT train the weights from BERT. This is called freezing the network, and will speed up calculations.

In [None]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """Sentiment model: BERT as a fixed feature extractor, followed by a GRU and a linear head.

    Args:
        bert: module called as ``bert(text)`` whose first output is the per-token
            representation, and whose ``config.to_dict()`` exposes ``'hidden_size'``.
        hidden_dim: hidden size of the GRU.
        output_dim: number of output units of the final linear layer.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability applied to the final hidden state and, when
            there are at least two GRU layers, between the GRU layers.
    """

    def __init__(self, bert, hidden_dim=256, output_dim=1, n_layers=2, bidirectional=True, dropout=0.2):
        super().__init__()
        self.bert = bert

        # The GRU consumes BERT's per-token vectors, so its input size is BERT's hidden size.
        bert_hidden_size = bert.config.to_dict()['hidden_size']

        # Inter-layer dropout only makes sense with stacked layers; use 0 for a single layer.
        rnn_dropout = dropout if n_layers >= 2 else 0
        self.rnn = nn.GRU(
            bert_hidden_size,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=rnn_dropout,
        )

        # A bidirectional GRU yields one final state per direction; they are concatenated in forward().
        num_directions = 2 if bidirectional else 1
        self.out = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Compute logits for a batch of token-id sequences.

        Args:
            text: tensor of token ids passed straight to BERT
                (assumed to be (batch, seq_len) — confirm against the iterator).

        Returns:
            The linear layer's output with dimension 1 squeezed.
        """
        # BERT is evaluated without gradient tracking, so no gradients flow into it.
        with torch.no_grad():
            embedded = self.bert(text)[0]

        # Only the final hidden states are needed, not the per-step outputs.
        hidden = self.rnn(embedded)[1]

        if self.rnn.bidirectional:
            # Last two entries: final states of the top layer's two directions.
            last_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            last_state = hidden[-1]

        logits = self.out(self.dropout(last_state))
        return logits.squeeze(1)

In [None]:
net = BERTGRUSentiment(bert)

# Freeze BERT: turn off gradient tracking for every parameter registered under the
# `bert` submodule, so only the remaining layers are left trainable.
for param_name, weights in net.named_parameters():
    if not param_name.startswith('bert'):
        continue
    weights.requires_grad = False

In [None]:
from src import WordModel

# Wrap the network in the project's training helper.
model = WordModel(net)

# Adam over the network's parameters, with binary cross-entropy on raw logits
# (the network's forward pass does not apply a sigmoid).
# NOTE(review): `Metric` is not imported or defined in any cell shown in this notebook, so
# this call raises NameError on a fresh kernel — confirm where it lives (possibly `src`)
# and import it.
model.compile(optimizer = torch.optim.Adam(model.net.parameters()),
              loss = torch.nn.BCEWithLogitsLoss(),
              metrics=[Metric()])

# NOTE(review): this first call passes the raw torchtext datasets, while the call below
# passes the BucketIterators from `dataloader`. Confirm which input type `WordModel.fit`
# expects; this call looks like a leftover, and its result is overwritten right after.
hist = model.fit(train_data, validation_data=valid_data, epochs=1)

# Train for 5 epochs on the batched training iterator, validating on the 'val' iterator.
hist = model.fit(dataloader['train'], dataloader['val'], epochs=5)

In [None]:
# NOTE(review): this passes the raw torchtext dataset, whereas the final `fit` call uses the
# BucketIterators in `dataloader`, and `dataloader['test']` is otherwise unused. Confirm
# whether `evaluate` should receive `dataloader['test']` instead.
model.evaluate(test_data)

Now we can use the model to get predictions

In [None]:
sentence = "the film was good"
# Truncate the *list of tokens* (not the characters of each token) so that the sequence,
# plus the two special tokens added below, fits BERT's maximum input length. This reuses
# the same helper that prepares the training data.
tokenized = tokenize_and_cut(sentence)
# Surround the token ids with the start/end special-token ids, as the TEXT field does
# for the training examples.
indexed = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokenized) + [tokenizer.sep_token_id]
# Wrap in an outer list so the model receives a batch containing this single sentence.
tensor = torch.tensor([indexed]).to(device)
model.net.eval()  # switch off dropout for inference
with torch.no_grad():  # no gradients are needed to make a prediction
    # The network outputs a logit; the sigmoid maps it to a value in (0, 1).
    prediction = torch.sigmoid(model.net(tensor))
prediction