In [2]:
import os
import os.path as p
import pickle

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel

from definitions import *
from dataset_helper_functions import *

In [3]:
# Default IS_MASTER to False when nothing has defined it yet. The flag gates
# the data-loading step that follows.
# (Presumably a driver notebook sets it before running this one -- confirm.)
try:
    IS_MASTER
except NameError:
    # Only an undefined name should trigger the default; a bare `except`
    # would also swallow KeyboardInterrupt and unrelated errors.
    IS_MASTER = False

if not IS_MASTER:
    # Standalone run: build the split-name -> DataFrame mapping here.
    dev_path = p.join(PROC_DATA_DIR_PATH, 'dev')

    # Each split maps to [raw TSV path, pickled-frame path].
    data_paths = {
        'dev': [
            p.join(dev_path, 'dev.tsv'),
            p.join(dev_path, 'dev_spacy.pkl'),
        ],
        'test': [
            p.join(POLIT_DATA_DIR_PATH, 'test', 'test_combined.tsv'),
            p.join(PROC_DATA_DIR_PATH, 'test', 'test_spacy.pkl'),
        ],
        'train': [
            p.join(POLIT_DATA_DIR_PATH, 'train', 'train_combined.tsv'),
            p.join(PROC_DATA_DIR_PATH, 'train', 'train_spacy.pkl'),
        ],
        # The 'val' split is currently disabled:
        # 'val': [
        #     p.join(POLIT_DATA_DIR_PATH, 'val', 'val_combined.tsv'),
        #     p.join(PROC_DATA_DIR_PATH, 'val', 'val_spacy.pkl')
        # ],
    }

    data = {}
    for dtype, dpaths in data_paths.items():
        tsv_path, pickle_path = dpaths

        # The pickled frame wins whenever it is present on disk.
        if p.exists(pickle_path):
            data[dtype] = pd.read_pickle(pickle_path)
            continue

        # Only the dev split can be generated on demand when its TSV is
        # missing (sample_development_set presumably writes it -- confirm).
        if dtype == 'dev' and not p.exists(tsv_path):
            sample_development_set()

        data[dtype] = pd.read_csv(tsv_path, sep='\t', index_col=False)

Load the tokenizer and model. For now we use `bert-base-uncased`. **TODO: check other options.**

In [4]:
# One checkpoint name shared by the tokenizer and the encoder, so the
# vocabulary always matches the weights.
PRETRAINED_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

# Pre-trained weights; output_hidden_states=True makes the forward pass
# return the hidden states of every layer, not only the last one.
model = BertModel.from_pretrained(PRETRAINED_NAME, output_hidden_states=True)

# Inference only: switch the module to evaluation mode (dropout off).
# Left as the last expression so the module summary is displayed.
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [5]:
# Directory that receives the pickled embedding files.
embeddings_path = p.join(PROC_DATA_DIR_PATH, 'embeddings')
# makedirs with exist_ok=True also creates missing parent directories and is
# a no-op when the directory is already there, so the cell is safe to re-run.
os.makedirs(embeddings_path, exist_ok=True)

Prepare data.

In [6]:
# bert-base-uncased has 512 position embeddings, so longer token sequences
# cannot be fed to the model and are truncated below.
MAX_SEQ_LEN = 512

# Maps each split name to a pair (tokens_tensors, segments_tensors), each a
# list holding one [1, n_tokens] tensor per row of the split's 'content' column.
tensors = {}
for dtype, df in data.items():
    # Add the special tokens.
    sentences = [f'[CLS] {s} [SEP]' for s in df['content'].values]

    # Split each sentence into tokens, capping the length at MAX_SEQ_LEN
    # while keeping the closing [SEP] marker.
    tokenized_sentences = []
    for s in sentences:
        tokens = tokenizer.tokenize(s)
        if len(tokens) > MAX_SEQ_LEN:
            tokens = tokens[:MAX_SEQ_LEN - 1] + ['[SEP]']
        tokenized_sentences.append(tokens)

    # Map the token strings to their vocabulary indices.
    indexed_tokens = [tokenizer.convert_tokens_to_ids(s) for s in tokenized_sentences]

    # Mark each of the len(s) tokens as belonging to sentence "1".
    # This is here only to conform to the expected input format.
    segments_ids = [[1] * len(s) for s in tokenized_sentences]

    # Convert inputs to PyTorch tensors; the extra list level adds the batch
    # dimension (batch size 1).
    tokens_tensors = [torch.tensor([it]) for it in indexed_tokens]
    segments_tensors = [torch.tensor([si]) for si in segments_ids]

    tensors[dtype] = (tokens_tensors, segments_tensors)

In [1]:
# Run every sentence through BERT and, for each split, pickle two lists of
# per-token embeddings (one inner list per sentence):
#   * last four: the last four layers concatenated (4 * 768 = 3,072 values);
#   * second last: the second-to-last layer only (768 values).
# Requires `model`, `tensors` and `embeddings_path` from the earlier cells.
with torch.no_grad():
    for dtype, (tt, st) in tensors.items():
        last_four_embeddings = []
        second_last_embeddings = []
        for sent_tensor, seg_tensor in zip(tt, st):
            # Because the model was loaded with `output_hidden_states=True`,
            # the third item of the output holds the hidden states of all
            # layers. See the documentation for more details:
            # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
            # NOTE(review): in recent transformers releases the second
            # positional parameter of BertModel.forward is `attention_mask`,
            # not `token_type_ids` -- confirm which one is intended here.
            hidden_states = model(sent_tensor, seg_tensor)[2]
            # Stack the per-layer tensors along a new leading dimension:
            # [layers, batch, tokens, hidden].
            token_embeddings = torch.stack(hidden_states, dim=0)
            # Remove dimension 1, the "batches" (batch size is 1).
            token_embeddings = torch.squeeze(token_embeddings, dim=1)
            # Swap dimensions 0 and 1 so the tensor is indexed by token first:
            # [tokens, layers, hidden].
            token_embeddings = token_embeddings.permute(1, 0, 2)

            # Concatenate the vectors from the last four layers of each token.
            last_four = [
                torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
                for token in token_embeddings
            ]
            # clone() gives each vector its own storage, so the stored slice
            # does not keep the whole stacked hidden-state tensor alive.
            second_last = [token[-2].clone() for token in token_embeddings]

            last_four_embeddings.append(last_four)
            second_last_embeddings.append(second_last)

        with open(p.join(embeddings_path, f'{dtype}_word_emb_last_four.pkl'), 'wb') as fh:
            pickle.dump(last_four_embeddings, fh)

        with open(p.join(embeddings_path, f'{dtype}_word_emb_second_last.pkl'), 'wb') as fh:
            pickle.dump(second_last_embeddings, fh)

NameError: name 'torch' is not defined

In [27]:
# Debugging aid (disabled): uncomment to inspect the (tokens_tensors,
# segments_tensors) tuple prepared for the 'train' split.
# print(tensors['train'])

15
<class 'tuple'>


Format model output.

Model output:
```text
    Number of layers: 13   (initial embeddings + 12 BERT layers)
    Number of batches: 1
    Number of tokens: 22
    Number of hidden units: 768
```

Concatenate the last 4 layers to create word embeddings.