In [1]:
!pip install transformers

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [89]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
from transformers import AutoTokenizer

import json
import os
import copy

from typing import Dict, List, Tuple, Set, Optional
from functools import partial

from pytorch_transformers import *
import numpy as np

In [101]:
# Example input: a list of already-split words (here a single word that the
# tokenizer breaks into several word pieces).
sentence_example = ["12.14AM"]

In [91]:
# Name of the pretrained checkpoint used for both the tokenizer and the model.
embedding_type = "bert-base-cased"

# Only lower-case the input for "uncased" checkpoints.
do_lower_case = "uncased" in embedding_type

# NOTE: `add_special_tokens=False` was previously passed here, but the encoded
# output still contained [CLS]/[SEP] and the embedding code downstream relies
# on them being present, so the misleading argument has been dropped.
tokenizer = AutoTokenizer.from_pretrained(embedding_type, do_lower_case=do_lower_case)

# output_hidden_states=True makes the forward pass return the per-layer
# hidden states in addition to the final one.
bert_embeddings = AutoModel.from_pretrained(embedding_type, output_hidden_states=True)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [103]:
# Word pieces produced for the pre-split example words.
tokens = tokenizer.tokenize(sentence_example, is_split_into_words=True)

# Same words encoded as a one-row batch of PyTorch tensors, padded/truncated
# to a fixed length of 8 ids.
bert_tokens = tokenizer(
    sentence_example,
    is_split_into_words=True,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=8,
)

# Show the word pieces, the encoded batch, and the ids decoded back to text.
print(tokens, bert_tokens, tokenizer.decode(bert_tokens['input_ids'][0]), sep="\n")

['12', '.', '14', '##AM']
{'input_ids': tensor([[  101,  1367,   119,  1489, 10964,   102,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0]])}
[CLS] 12. 14AM [SEP] [PAD] [PAD]


In [88]:
# Sub-token budget for the encoded sequence, [CLS] and [SEP] included. The
# final list of word embeddings is padded/cropped to this many rows as well.
MAX_LENGTH = 10

# Count how many word pieces each input word expands to.
sub_tokens_map = {}
total_subtokens = 0
for text in sentence_example:
    subtokens_length = len(tokenizer.tokenize(text))
    sub_tokens_map[text] = subtokens_length
    total_subtokens += subtokens_length

bert_tokens = tokenizer(sentence_example, return_tensors="pt", is_split_into_words=True,
                        padding="max_length", truncation=True, max_length=MAX_LENGTH)

# Take the number of real (non-padding) positions from the attention mask
# instead of from `total_subtokens`: the two differ when truncation kicks in,
# which previously produced a negative padding length and out-of-range indexing.
num_real_tokens = int(bert_tokens['attention_mask'][0].sum())
padding_length = len(bert_tokens['input_ids'][0]) - num_real_tokens

with torch.no_grad():
    bert_output = bert_embeddings(**bert_tokens)

# Element-wise sum of the selected hidden states for the single batch row.
# NOTE(review): with output_hidden_states=True, hidden_states[0] is the
# embedding-layer output and the encoder layers follow, so [0:12] includes the
# embedding output and leaves out the final encoder layer. Kept as-is because
# it was changed deliberately; confirm this is the intended layer selection.
sum_all_layers = sum(bert_output.hidden_states[0:12])[0]

# Drop [CLS] (first position), [SEP] (last real position) and any padding
# after it (assumes right-side padding, as the original slicing did).
sum_all_layers = sum_all_layers[1:num_real_tokens - 1]
hidden_size = sum_all_layers.shape[-1]

# Average the sub-token vectors of each word into a single word vector.
index = 0
embeddings = []
for text in sentence_example:
    n_subtokens = sub_tokens_map[text]
    word_pieces = sum_all_layers[index:index + n_subtokens]
    if len(word_pieces) == 0:
        # The word produced no sub-tokens, or all of them were truncated away:
        # use a zero vector instead of dividing by zero / indexing past the end.
        embeddings.append(sum_all_layers.new_zeros(hidden_size))
    else:
        # mean() builds a new tensor, so `sum_all_layers` is no longer mutated
        # in place the way the previous `+=` / `/=` on a row view did. A word
        # that was only partially truncated is averaged over its kept pieces.
        embeddings.append(word_pieces.mean(dim=0))
    index += n_subtokens

# Bring the result to exactly MAX_LENGTH rows: zero-pad short inputs and crop
# the (all-zero, truncated) tail of inputs with more words than that.
while len(embeddings) < MAX_LENGTH:
    embeddings.append(sum_all_layers.new_zeros(hidden_size))
embeddings = embeddings[:MAX_LENGTH]

stacked_embeddings = torch.stack(embeddings)
print(stacked_embeddings.shape, len(stacked_embeddings))

torch.Size([10, 768]) 10
