In [2]:
from transformers import BertTokenizer, BertModel, BertForMaskedLM
import logging
import torch
import matplotlib.pyplot as plt
%matplotlib inline
#logging.basicConfig(level=logging.INFO)

In [3]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-german-cased', output_hidden_states = True)
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')

model.eval()
print()




In [4]:
def createToken(text):
    # Add the special tokens.
    marked_text = "[CLS] " + text + " [SEP]"

    # Split the sentence into tokens.
    tokenized_text = tokenizer.tokenize(marked_text)

    # Map the token strings to their vocabulary indeces.
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Display the words with their indeces.
    #i = 0
    #for tup in zip(tokenized_text, indexed_tokens):
    #    print('{}: {:<12} {:>6,}'.format(i, tup[0], tup[1]))
    #    i += 1
    
    # Mark each of the 22 tokens as belonging to sentence "1".
    segments_ids = [1] * len(tokenized_text)
    # Convert inputs to PyTorch tensors
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokens_tensor, segments_tensors

In [88]:
def createSentenceEmbedding(text):
    tokens_tensor, segments_tensors = createToken(text)
    token_embeddings, hidden_states = runBert(tokens_tensor, segments_tensors)

    token_vecs_cat = []
    for token in token_embeddings:
        cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
        token_vecs_cat.append(cat_vec)

    token_vecs = hidden_states[-2][0]
    sentence_embedding = torch.mean(token_vecs, dim=0)
    return sentence_embedding


In [89]:
def createTokens(texts):
    tokens = []
    segments = []
    for text in texts:
        # Add the special tokens.
        marked_text = "[CLS] " + text + " [SEP]"

        # Split the sentence into tokens.
        tokenized_text = tokenizer.tokenize(marked_text)

        # Map the token strings to their vocabulary indeces.
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        
        # Mark each of the 22 tokens as belonging to sentence "1".
        segments_ids = [1] * len(tokenized_text)

        tokens.append(indexed_tokens)
        segments.append(segments_ids)

    # Convert inputs to PyTorch tensors
    lenght = len(max(tokens, key=len))
    
    for i in range (len(tokens)):
        if len(tokens[i]) != lenght:
            print(tokens[i])
            while len(tokens[i]) != lenght:
                tokens[i].append(0)
                segments[i].append(1)

    tokens_tensor = torch.LongTensor([tok for tok in tokens])
    segments_tensors = torch.LongTensor([seg for seg in segments])

    return tokens_tensor, segments_tensors

In [270]:
def createMulipbleSentenceEmbedding(text):
    tokens_tensor, segments_tensors = createTokens(text)
    token_embeddings, hidden_states = runBerts(tokens_tensor, segments_tensors)
    return token_embeddings
    #token_vecs_cat = []
    #for token in token_embeddings:
    #   cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
    #   token_vecs_cat.append(cat_vec)

    #token_vecs = hidden_states[-2][0]
    #sentence_embedding = torch.mean(token_vecs, dim=0)
    #return sentence_embedding

In [295]:
def runBert(tokens_tensor, segments_tensors):
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
        print('Hidden:  ', len(hidden_states))
        token_embeddings = torch.stack(hidden_states, dim=0)
        print('         Num_hidden, batch, num_in, features')
        print('Stack:   ', token_embeddings.shape)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        print('Squeeze: ', token_embeddings.shape)
        token_embeddings = token_embeddings.permute(1,0,2)
        print('Permute: ', token_embeddings.shape)
        print(token_embeddings.shape)
        return token_embeddings, hidden_states

def runBerts(tokens_tensor, segments_tensors):
    with torch.no_grad():
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
        #print('Hidden:  ', len(hidden_states))

        token_embeddings = torch.stack(hidden_states, dim=0)
        #print('          Num_hidden, batch, num_in, features')
        #print('Stack:   ', token_embeddings.shape)

        # remove axis of size one
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        #print('Squeeze: ', token_embeddings.shape)

        #token_embeddings = token_embeddings.permute(1,2,0,3)
        #print('Permute: ', token_embeddings.shape)
        return token_embeddings, hidden_states

In [296]:
x = createSentenceEmbedding('Hi')
y = createSentenceEmbedding('Tschüss')
xy = createMulipbleSentenceEmbedding(['Hi', 'Tschüss'])


stack = []
for i in range (len(xy)):
    stack.append(xy[i][1])  
a = torch.stack(stack)
a = torch.transpose(a, 0, 1)
a.shape

Hidden:   13
         Num_hidden, batch, num_in, features
Stack:    torch.Size([13, 1, 3, 768])
Squeeze:  torch.Size([13, 3, 768])
Permute:  torch.Size([3, 13, 768])
torch.Size([3, 13, 768])
Hidden:   13
         Num_hidden, batch, num_in, features
Stack:    torch.Size([13, 1, 4, 768])
Squeeze:  torch.Size([13, 4, 768])
Permute:  torch.Size([4, 13, 768])
torch.Size([4, 13, 768])
[3, 26524, 4]


torch.Size([4, 13, 768])

In [219]:
# 4 = hidden, 1 = batch, 3 = num_in, 10 = features
a = torch.arange(120).reshape(4,1,3,10)
a = torch.squeeze(a, dim=1)
a = torch.transpose(a, 0, 1)
print(a.shape)
print(a)

torch.Size([3, 4, 10])
tensor([[[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9],
         [ 30,  31,  32,  33,  34,  35,  36,  37,  38,  39],
         [ 60,  61,  62,  63,  64,  65,  66,  67,  68,  69],
         [ 90,  91,  92,  93,  94,  95,  96,  97,  98,  99]],

        [[ 10,  11,  12,  13,  14,  15,  16,  17,  18,  19],
         [ 40,  41,  42,  43,  44,  45,  46,  47,  48,  49],
         [ 70,  71,  72,  73,  74,  75,  76,  77,  78,  79],
         [100, 101, 102, 103, 104, 105, 106, 107, 108, 109]],

        [[ 20,  21,  22,  23,  24,  25,  26,  27,  28,  29],
         [ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59],
         [ 80,  81,  82,  83,  84,  85,  86,  87,  88,  89],
         [110, 111, 112, 113, 114, 115, 116, 117, 118, 119]]])


In [269]:
a = torch.arange(240).reshape(4,2,3,10)
print(a)
print('+-------------------------------------+')

a = torch.stack([a[0][0], a[1][0], a[2][0], a[3][0]])
a = torch.transpose(a, 0, 1)
print(a)
a.shape

tensor([[[[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9],
          [ 10,  11,  12,  13,  14,  15,  16,  17,  18,  19],
          [ 20,  21,  22,  23,  24,  25,  26,  27,  28,  29]],

         [[ 30,  31,  32,  33,  34,  35,  36,  37,  38,  39],
          [ 40,  41,  42,  43,  44,  45,  46,  47,  48,  49],
          [ 50,  51,  52,  53,  54,  55,  56,  57,  58,  59]]],


        [[[ 60,  61,  62,  63,  64,  65,  66,  67,  68,  69],
          [ 70,  71,  72,  73,  74,  75,  76,  77,  78,  79],
          [ 80,  81,  82,  83,  84,  85,  86,  87,  88,  89]],

         [[ 90,  91,  92,  93,  94,  95,  96,  97,  98,  99],
          [100, 101, 102, 103, 104, 105, 106, 107, 108, 109],
          [110, 111, 112, 113, 114, 115, 116, 117, 118, 119]]],


        [[[120, 121, 122, 123, 124, 125, 126, 127, 128, 129],
          [130, 131, 132, 133, 134, 135, 136, 137, 138, 139],
          [140, 141, 142, 143, 144, 145, 146, 147, 148, 149]],

         [[150, 151, 152, 153, 154, 155, 156, 157, 158, 

torch.Size([3, 4, 10])