In [1]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM
import preprocessor as p
import pickle
import re

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt

# Load the pre-trained tokenizer (vocabulary) for the 'bert-base-uncased' model.
# It is used below both to split text into tokens and to map tokens to vocabulary ids.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [2]:
# Example sentence to embed (swap in the commented alternative to try a longer one)
text = "Here is the sentence I want embeddings for."
#text = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."

# Wrap the sentence in the [CLS] / [SEP] special tokens before tokenizing
marked_text = " ".join(("[CLS]", text, "[SEP]"))
print(marked_text)

# Split the marked sentence into tokens with the pre-trained tokenizer
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)



[CLS] Here is the sentence I want embeddings for. [SEP]
['[CLS]', 'here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.', '[SEP]']


Next, we need to call the tokenizer to match the tokens against their indices in the tokenizer vocabulary.

In [3]:
# Map every token to its id in the tokenizer vocabulary
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Show each token next to its vocabulary id
for token, token_id in zip(tokenized_text, indexed_tokens):
    print((token, token_id))

('[CLS]', 101)
('here', 2182)
('is', 2003)
('the', 1996)
('sentence', 6251)
('i', 1045)
('want', 2215)
('em', 7861)
('##bed', 8270)
('##ding', 4667)
('##s', 2015)
('for', 2005)
('.', 1012)
('[SEP]', 102)


In [4]:
# One segment id per token; every token of this single sentence gets id 1
segments_ids = [1 for _ in tokenized_text]
print(segments_ids)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Next we need to convert our data to torch tensors and call the BERT model. The BERT PyTorch interface requires that the data be in torch tensors rather than Python lists, so we convert the lists here - this does not change the shape or the data.

model.eval() puts our model in evaluation mode as opposed to training mode. In this case, evaluation mode turns off dropout regularization which is used in training.

Calling from_pretrained will fetch the model from the internet. When we load the bert-base-uncased, we see the definition of the model printed in the logging. The model is a deep neural network with 12 layers! Explaining the layers and their functions is outside the scope of this post, and you can skip over this output for now.

In [5]:
# Fetch the pre-trained BERT weights
model = BertModel.from_pretrained('bert-base-uncased')

# Turn the id lists into PyTorch tensors; wrapping each list in an outer list
# gives the tensor a leading dimension of size 1 (a batch with one sentence)
tokens_tensor, segments_tensors = (
    torch.tensor([ids]) for ids in (indexed_tokens, segments_ids)
)

# Switch to "evaluation" mode, which turns off the dropout used in training.
# Kept as the cell's last expression so the notebook displays the model definition.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=

In [6]:
# Run a forward pass without tracking gradients; the first returned value holds
# the hidden states of every encoder layer
with torch.no_grad():
    encoded_layers, _ = model(tokens_tensor, segments_tensors)

# Indices used to drill into the nested output: [layer][batch][token][hidden unit]
layer_i, batch_i, token_i = 0, 0, 0

# Report the size of each nesting level, outermost first
print("Number of layers:", len(encoded_layers))
print("Number of batches:", len(encoded_layers[layer_i]))
print("Number of tokens:", len(encoded_layers[layer_i][batch_i]))
print("Number of hidden units:", len(encoded_layers[layer_i][batch_i][token_i]))

Number of layers: 12
Number of batches: 1
Number of tokens: 14
Number of hidden units: 768


# Output
The full set of hidden states for this model, stored in the object encoded_layers, is a little dizzying. This object has four dimensions, in the following order:

The layer number (12 layers)
The batch number (1 sentence)
The word / token number (14 tokens in our sentence)
The hidden unit / feature number (768 features)
That’s 12 × 1 × 14 × 768 = 129,024 unique values just to represent our one sentence!

The second dimension, the batch size, is used when submitting multiple sentences to the model at once; here, though, we just have one example sentence.

In [12]:
# Regroup the model output per token instead of per layer.
# `encoded_layers` is indexed [layer][batch][token]; the result below is indexed
# [token][layer], and each entry is that token's hidden-state vector.
token_embeddings = []

for token_i in range(len(tokenized_text)):
    # Collect this token's vector from every layer, first layer to last
    per_layer_vectors = [layer[batch_i][token_i] for layer in encoded_layers]
    token_embeddings.append(per_layer_vectors)

# Sanity check the dimensions:
print("Number of tokens in sequence:", len(token_embeddings))
print("Number of layers per token:", len(token_embeddings[0]))

Number of tokens in sequence: 14
Number of layers per token: 12


# Word Vectors
To give you some examples, let’s create word vectors using a concatenation and summation of the last four layers:

In [13]:
# Concatenate each token's last four layer vectors, last layer first -> [number_of_tokens, 3072]
concatenated_last_4_layers = [
    torch.cat(tuple(token_layers[:-5:-1]), 0) for token_layers in token_embeddings
]

# Element-wise sum of each token's last four layer vectors -> [number_of_tokens, 768]
summed_last_4_layers = [
    torch.stack(token_layers[-4:]).sum(dim=0) for token_layers in token_embeddings
]

In [17]:
# Length of the first token's summed vector (one value per hidden unit)
summed_last_4_layers[0].shape[0]

768

# Sentence Vectors
To get a single vector for our entire sentence we have multiple application-dependent strategies, but a simple approach is to average the second-to-last hidden layer of each token, producing a single 768-length vector.

In [18]:
# Sentence embedding: average the second-to-last hidden layer over the token
# dimension (dim 1), giving one 768-length vector per sentence in the batch.
# Index -2 is used because index 11 selects the *last* of the 12 layers, not
# the second-to-last one this section describes.
sentence_embedding = torch.mean(encoded_layers[-2], 1)

## loading the model

In [45]:
# Load pre-trained model (weights). This repeats the load performed earlier in
# the notebook, so `model` is (re)bound to a fresh 'bert-base-uncased' instance
# before the feature-extraction function below uses it.
model = BertModel.from_pretrained('bert-base-uncased')

# Put the model in "evaluation" mode, which turns off the dropout regularization
# used in training. As the cell's last expression, the returned model is also
# displayed by the notebook.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=

In [11]:
import numpy as np
# Configure the tweet preprocessor `p` with the URL, NUMBER, RESERVED and MENTION options.
# NOTE(review): presumably these are the element types that p.clean() strips from
# a tweet -- confirm against the tweet-preprocessor documentation.
p.set_options(p.OPT.URL, p.OPT.NUMBER, p.OPT.RESERVED, p.OPT.MENTION)

In [29]:
def create_BERT_train(path, tweet_field=2, label_field=1):
    """Build BERT features and one-hot labels from a tab-separated tweet file.

    The first line of the file is treated as a header and skipped; blank lines
    are ignored. Every other line is split on tabs. The tweet text is
    lower-cased, cleaned with ``p.clean``, stripped of non-ASCII characters and
    of literal backslash-u hex escape sequences, wrapped in ``[CLS]`` /
    ``[SEP]`` and run through the module-level ``tokenizer`` and ``model``.

    Args:
        path: Path to the TSV file.
        tweet_field: Zero-based column index of the tweet text.
        label_field: Zero-based column index of the sentiment label
            (``'negative'``, ``'neutral'`` or ``'positive'``).

    Returns:
        A ``(features, labels)`` tuple. ``features`` is a list with one entry
        per tweet; each entry is a list holding the hidden-state vectors of the
        first token (``[CLS]``) from the last four encoder layers, in layer
        order. ``labels`` is ``np.array`` of the one-hot label lists; a label
        that is not in the mapping is stored as ``None``.

    Raises:
        IndexError: If a non-blank line has fewer columns than
            ``tweet_field`` / ``label_field`` require.
    """
    label_d = {
        'negative' : [1., 0., 0.],
        'neutral' : [0., 1., 0.],
        'positive' : [0., 0., 1.]
    }

    # Each forward pass holds a single sentence, and its first token is [CLS].
    batch_i = 0
    cls_token_i = 0

    tweet_bertx, tweet_berty = [], []

    with open(path, 'r') as f:
        f.readline()  # skip the header row
        for l in f:
            # Drop the line terminator so it does not end up inside the last
            # column (which would make a label in that column unmatchable).
            l = l.rstrip('\n')
            if not l:
                continue

            record = l.split('\t')
            tweet = record[tweet_field]
            label = label_d.get(record[label_field])

            clean_text = p.clean(tweet.lower())
            # Remove non-ASCII characters first, then literal \u escape sequences
            clean_text = re.sub(r'[^\x00-\x7f]',r'', clean_text)
            clean_text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', clean_text)

            marked_text = "[CLS] " + clean_text + " [SEP]"
            tokenized_text = tokenizer.tokenize(marked_text)
            indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
            # NOTE(review): segment id 1 is kept to preserve existing outputs;
            # single-sentence BERT inputs conventionally use 0 -- confirm.
            segments_ids = [1] * len(tokenized_text)

            # Convert inputs to PyTorch tensors (leading dimension = batch of one)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])

            # Predict hidden states features for each layer
            with torch.no_grad():
                encoded_layers, _ = model(tokens_tensor, segments_tensors)

            # Only the [CLS] vectors of the last four layers are kept, so read
            # them directly instead of building a full tokens x layers table.
            cls_last_4_layers = [
                layer[batch_i][cls_token_i] for layer in encoded_layers[-4:]
            ]
            tweet_bertx.append(cls_last_4_layers)
            tweet_berty.append(label)

            # Progress report every 2000 processed tweets
            if len(tweet_bertx) % 2000 == 0:
                print(len(tweet_bertx))

    return tweet_bertx, np.array(tweet_berty)
    

In [None]:
# Extract BERT features (x) and one-hot labels (y) for the de-duplicated training tweets
x, y = create_BERT_train(path='./tweet-train-no_dup.tsv')

In [None]:
# Persist the extracted features and labels. The `with` blocks make sure each
# file handle is flushed and closed, even if pickling raises.
with open('train_bertx1', 'wb') as features_file:
    pickle.dump(obj=x, file=features_file)

with open('train_berty1', 'wb') as labels_file:
    pickle.dump(obj=y, file=labels_file)

In [28]:
# Size of one stored vector: second tweet, fourth (last-layer) [CLS] vector
x[1][3].shape[0]

768