# Get Data

In [1]:
from tqdm import tqdm

# PyTorch
import torch
# Pick the compute device once for the whole notebook: the GPU when PyTorch
# can see a CUDA device, the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

cpu


In [2]:
from datasets import load_dataset, logging

# Keep the datasets library quiet: only error-level messages are reported.
logging.set_verbosity_error()

# The split string slices the training split down to its first 5000 rows.
ds = load_dataset(path='bookcorpus', split='train[:5000]')
ds

Dataset({
    features: ['text'],
    num_rows: 5000
})

## Indexing the Model State Array with PyTorch

hidden_states is a tuple of tensors/arrays. 

There is one tensor/array for each layer of the network (the input embedding layer plus every encoder block):
hidden_states[i] == hidden states at i<sup>th</sup> layer of network.

Each of these tensors/arrays has the following shape:
hidden_states[i].shape == [num_examples, sequence_length, embedding_size]  

To get the 13 different embeddings for a single token, we loop over the layers of hidden_states. For each layer we index an example sentence `isent`, a token within that sentence `itok`, and all 768 scalar values of that token's embedding vector using the colon indexer, i.e. `hidden_states[ilayer][isent, itok, :]`.

For this example, we have:
 - 1 sentence
 - 7 is the sequence length
 - 13 layers, which is one input embedding $x_i$ + 12 encoder blocks
 - 768 is the embedding size for BERT embeddings
 
The result is a tensor of shape [1 x 7 x 13 x 768]

In [7]:
from transformers import logging, AutoModel, AutoTokenizer

# NOTE: this rebinds the name `logging` (imported from `datasets` in an earlier
# cell) to the transformers logging module.
logging.set_verbosity_error()

MODEL_NAME = 'bert-base-uncased'

# Load the pretrained encoder and its matching tokenizer from the same checkpoint.
# Later cells request `return_offsets_mapping=True`, so the fast tokenizer is asked for.
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

In [4]:
# Longest tokenized sentence in the corpus slice, counted in input ids
# (the count includes whatever special tokens the tokenizer adds).
max(len(tokenizer(text)['input_ids']) for text in ds['text'])

66

## BERT lookup embeddings

In [227]:
# Snapshot BERT's static (context-free) word-embedding lookup table and save it.
# `.detach()` drops the autograd link, `.cpu()` makes the saved file loadable on
# machines without a GPU, and `.clone()` gives a copy that does not alias the
# model's live weights. This replaces the legacy `torch.Tensor(...)` constructor.
bert_embeds = model.embeddings.word_embeddings.weight.detach().cpu().clone()
torch.save(bert_embeds, '../data/bert_lookup_embeddings.pt')

## Process data

In [18]:
# Tokenize the first sentence only and decode it back to text, to eyeball
# what the tokenizer adds around the raw sentence.
inputs = tokenizer(ds['text'][0], return_tensors='pt', return_offsets_mapping=True)
list(map(tokenizer.decode, inputs['input_ids']))

['[CLS] usually, he would be tearing around the living room, playing with his toys. [SEP]']

In [8]:
# Build `embeds`: one list per sentence, holding one
# (token_id, char_start, layer_embeddings) triple per token.
# `layer_embeddings` is that token's hidden state at every layer, stacked
# into a single [n_layers, hidden_size] tensor.
embeds = []
batch_idx = 0  # one sample at a time, no batching.

# Run the forward passes on the device selected at the top of the notebook
# (a no-op when DEVICE is the CPU).
model.to(DEVICE)

for sample in tqdm(ds['text']):
    inputs = tokenizer(sample, return_offsets_mapping=True, return_tensors='pt')

    # offset_mapping is not a model input, so pop it before calling the model;
    # only the starting character index of each token is kept.
    tok_char_start_inds = [st for st, _ in inputs.pop('offset_mapping')[batch_idx]]

    # Grab the token ids before the encoding is moved to DEVICE so the stored
    # ids always live on the CPU.
    token_ids = list(inputs.input_ids[batch_idx])

    with torch.no_grad():
        hidden_states = model(**inputs.to(DEVICE),
                              output_hidden_states=True)['hidden_states']

    # hidden_states is a tuple with one [batch, seq_len, hidden] tensor per layer;
    # stacking on dim=2 yields [batch, seq_len, n_layers, hidden]. Selecting the
    # single batch entry leaves one [n_layers, hidden] slab per token. Results are
    # moved back to the CPU so the saved file does not depend on the device used.
    per_token_layers = torch.stack(hidden_states, dim=2)[batch_idx].cpu()

    embeds.append(list(zip(
        token_ids,
        tok_char_start_inds,
        list(per_token_layers),
    )))

100%|██████████| 5000/5000 [01:22<00:00, 60.65it/s]


In [9]:
# Sanity check: the loop above appends one entry per sentence in ds['text'].
len(embeds)

5000

In [10]:
# Persist the per-token embeddings to disk; a later cell reloads this same file
# with torch.load instead of re-running the model.
torch.save(embeds, '../data/bookcorpus_embeddings_0_5000.pt')

## Linguistic Annotations

In [3]:
import spacy
# Load the spaCy pipeline named 'en_core_web_trf' (it must already be installed).
# NOTE(review): `nlp` is not passed to anything in the visible cells; presumably
# it is used further down or inside spacyfuncs — confirm.
nlp = spacy.load('en_core_web_trf')
# Reload the embeddings written by the torch.save cell above.
# torch.load unpickles the file, so only load files you produced or trust.
embeds = torch.load('../data/bookcorpus_embeddings_0_5000.pt')

In [4]:
from spacyfuncs import get_docs

# get_docs is a project-local helper (spacyfuncs). It is given the raw sentences
# and a .spacy file path; the output below prints "Creating new Docbin file", so
# it presumably parses the texts and caches them in a DocBin at that path.
# TODO confirm against spacyfuncs, including the meaning of id_text_tuples.
docs = get_docs(ds['text'],
                '../data/bookcorpus_0_5000.spacy',
                id_text_tuples=False,
               )

Creating new Docbin file


  0%|          | 0/5000 [00:00<?, ?it/s]



  0%|          | 0/5000 [00:00<?, ?it/s]