In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [2]:
CACHE_DIR = '/scratch/gpfs/hgazula/.cache/'
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl',
                                          add_prefix_space=True,
                                          cache_dir=CACHE_DIR)
tokenizer.pad_token = tokenizer.eos_token

lm_model = GPT2LMHeadModel.from_pretrained("gpt2-xl",
                                           output_hidden_states=True,
                                           cache_dir=CACHE_DIR)

In [3]:
lm_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

### All sentences Batched

In [184]:
sentences = ['Hello',
 'Hello world',
 'Hello world there',
 'Hello world there you',
 'Hello world there you are',
 'world there you are high'
 ]
# input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
input_ids = tokenizer.batch_encode_plus(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
# print(transformer_hidden_states[-1].shape)
# print(transformer_hidden_states[-1])

{'input_ids': tensor([[18435, 50256, 50256, 50256, 50256],
        [18435,   995, 50256, 50256, 50256],
        [18435,   995,   612, 50256, 50256],
        [18435,   995,   612,   345, 50256],
        [18435,   995,   612,   345,   389],
        [  995,   612,   345,   389,  1029]]), 'attention_mask': tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}


In [187]:
for a, b in zip(input_ids['input_ids'], input_ids['attention_mask']):
    print(a, b)

tensor([18435, 50256, 50256, 50256, 50256]) tensor([1, 0, 0, 0, 0])
tensor([18435,   995, 50256, 50256, 50256]) tensor([1, 1, 0, 0, 0])
tensor([18435,   995,   612, 50256, 50256]) tensor([1, 1, 1, 0, 0])
tensor([18435,   995,   612,   345, 50256]) tensor([1, 1, 1, 1, 0])
tensor([18435,   995,   612,   345,   389]) tensor([1, 1, 1, 1, 1])
tensor([ 995,  612,  345,  389, 1029]) tensor([1, 1, 1, 1, 1])


In [112]:
embeddings = transformer_hidden_states[-1]
attn_mask = input_ids['attention_mask']
attn_mask = attn_mask.unsqueeze(-1).expand(embeddings.shape)
attn_mask

tensor([[[1, 1, 1,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]],

        [[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [

In [113]:
embeddings.shape, input_ids['attention_mask'].shape

(torch.Size([6, 5, 1600]), torch.Size([6, 5]))

In [132]:
embeddings * attn_mask

tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000]],

        [[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000]],

        [[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.4922,  0.5799,  0.3748,  ..., -1.2212,  0.4659, -0.0181],
         [ 0.0000,  0.0000,  0.0000,  ..., -0.0000,  0.0000, -0.0000],
  

In [129]:
torch.sum(embeddings * attn_mask, axis=1)

tensor([[  1.0588,   0.1301,   0.2188,  ...,  -4.4375,   0.7642,   0.2327],
        [  0.8371,   0.4952,   0.8497,  ...,  -5.8343,   0.9230,   0.4018],
        [  1.3293,   1.0751,   1.2245,  ...,  -7.0555,   1.3889,   0.3837],
        [  2.4972,   0.7955,   0.9381,  ...,  -8.5987,   2.1432,  -0.4440],
        [  2.7568,   1.1913,   1.6058,  ..., -10.3531,   3.0460,   0.1953],
        [ -3.2951,  -0.4990,   1.1670,  ...,  -9.8724,   2.1661,  -0.0779]],
       grad_fn=<SumBackward1>)

In [123]:
selected = embeddings.gather(0, attn_mask)

In [122]:
selected

tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [ 0.2053,  0.3465,  0.5865,  ..., -2.4376,  0.0458, -0.3412],
         [ 0.1965,  0.2960,  0.6076,  ..., -2.5010,  0.0800, -0.3510],
         [ 0.1956,  0.2837,  0.6160,  ..., -2.5184,  0.0915, -0.3556],
         [ 0.1941,  0.2676,  0.6321,  ..., -2.5408,  0.1097, -0.3617]],

        [[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.1965,  0.2960,  0.6076,  ..., -2.5010,  0.0800, -0.3510],
         [ 0.1956,  0.2837,  0.6160,  ..., -2.5184,  0.0915, -0.3556],
         [ 0.1941,  0.2676,  0.6321,  ..., -2.5408,  0.1097, -0.3617]],

        [[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.6614,  0.3744,  0.5635,  ..., -2.5970,  0.0987, -0.0964],
         [ 0.1956,  0.2837,  0.6160,  ..., -2.5184,  0.0915, -0.3556],
  

In [52]:
sentences = [('Hello'),
             ('Hello', 'world'),
             ('Hello', 'world', 'there'),
             ('Hello', 'world', 'there', 'you'),
             ('Hello', 'world', 'there', 'you', 'are'),
            ('world', 'there', 'you', 'are', 'flying')]

In [66]:
tokenizer(sentences, padding=False)

ValueError: too many values to unpack (expected 2)

In [183]:
input_dict = tokenizer.batch_encode_plus(sentences, is_split_into_words=True, padding=True, return_tensors='pt')
input_dict

{'input_ids': tensor([[18435, 50256, 50256, 50256, 50256],
        [18435,   995, 50256, 50256, 50256],
        [18435,   995,   612, 50256, 50256],
        [18435,   995,   612,   345, 50256],
        [18435,   995,   612,   345,   389],
        [  995,   612,   345,   389,  1029]]), 'attention_mask': tensor([[1, 0, 0, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}

In [65]:
import torch
torch.gather(input_dict['input_ids'], 1, input_dict['attention_mask'])

tensor([[50256, 18435, 18435, 18435, 18435],
        [  995,   995, 18435, 18435, 18435],
        [  995,   995,   995, 18435, 18435],
        [  995,   995,   995,   995, 18435],
        [  995,   995,   995,   995,   995],
        [  612,   612,   612,   612,   612]])

In [61]:
import numpy as np
import torch

myvec =  transformer_hidden_states[-1]
myvec2 = myvec/torch.norm(myvec, p=2)

In [62]:
myvec2

tensor([[[-0.0266,  0.0140,  0.0345,  ..., -0.1557,  0.0254, -0.0005]]],
       grad_fn=<DivBackward0>)

In [53]:
# tok_to_str = tokenizer.batch_decode(input_ids['input_ids'],
#                                     skip_special_tokens=True)

### One sentence at a time

In [27]:
sentences = ['Hello']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[18435]]), 'attention_mask': tensor([[1]])}
torch.Size([1, 1, 1600])
tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327]]],
       grad_fn=<ViewBackward>)


In [28]:
sentences = ['Hello world']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[18435,   995]]), 'attention_mask': tensor([[1, 1]])}
torch.Size([1, 2, 1600])
tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691]]],
       grad_fn=<ViewBackward>)


In [29]:
sentences = ['Hello world there']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[18435,   995,   612]]), 'attention_mask': tensor([[1, 1, 1]])}
torch.Size([1, 3, 1600])
tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.4922,  0.5799,  0.3748,  ..., -1.2212,  0.4659, -0.0181]]],
       grad_fn=<ViewBackward>)


In [195]:
sentences = ['Hello world there you']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[18435,   995,   612,   345]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
torch.Size([1, 4, 1600])
tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.4922,  0.5799,  0.3748,  ..., -1.2212,  0.4659, -0.0181],
         [ 1.1680, -0.2796, -0.2864,  ..., -1.5432,  0.7543, -0.8277]]],
       grad_fn=<ViewBackward>)


In [31]:
sentences = ['Hello world there you are']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[18435,   995,   612,   345,   389]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
torch.Size([1, 5, 1600])
tensor([[[ 1.0588,  0.1301,  0.2188,  ..., -4.4375,  0.7642,  0.2327],
         [-0.2217,  0.3650,  0.6309,  ..., -1.3968,  0.1588,  0.1691],
         [ 0.4922,  0.5799,  0.3748,  ..., -1.2212,  0.4659, -0.0181],
         [ 1.1680, -0.2796, -0.2864,  ..., -1.5432,  0.7543, -0.8277],
         [ 0.2596,  0.3958,  0.6677,  ..., -1.7543,  0.9028,  0.6392]]],
       grad_fn=<ViewBackward>)


In [32]:
sentences = ['world there you are high']
input_ids = tokenizer(sentences, padding=True, return_tensors='pt')
print(input_ids)
lm_outputs = lm_model(**input_ids)
transformer_hidden_states = lm_outputs[-1]
print(transformer_hidden_states[-1].shape)
print(transformer_hidden_states[-1])

{'input_ids': tensor([[ 995,  612,  345,  389, 1029]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
torch.Size([1, 5, 1600])
tensor([[[-0.4205,  0.6139,  1.0727,  ..., -4.7529,  1.2246, -0.4381],
         [-0.1484, -0.2284,  0.3114,  ..., -1.4831,  0.4668, -0.4868],
         [-0.0156, -0.6524,  0.0106,  ..., -1.2291,  0.0511, -0.2832],
         [-1.7830,  0.0059,  0.2923,  ..., -1.4245, -0.0341,  0.6478],
         [-0.9276, -0.2381, -0.5200,  ..., -0.9828,  0.4576,  0.4824]]],
       grad_fn=<ViewBackward>)


In [36]:
sentences1 = [tuple([18435]),
        tuple([18435,   995]),
        tuple([18435,   995,   612]),
        tuple([18435,   995,   612,   34]),
        tuple([18435,   995,   612,   345,   389]),
        tuple([  995,   612,   345,   389,  1029])]
print(sentences1)

[(18435,), (18435, 995), (18435, 995, 612), (18435, 995, 612, 34), (18435, 995, 612, 345, 389), (995, 612, 345, 389, 1029)]


In [38]:
tokenizer(sentences1, padding=True, return_tensors='pt')

AssertionError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [173]:
import torch.utils.data as data
dl = data.DataLoader(input_ids, batch_size=1)

In [175]:
for i in dl:
    i

KeyError: 'Indexing with integers (to access backend Encoding for a given batch index) is not available when using Python based tokenizers'

In [188]:
model_input = [('ĠAct',), ('ĠAct', 'ĠOne'), ('ĠAct', 'ĠOne', ',')]

In [207]:
tokenizer.batch_encode_plus(model_input, is_split_into_words=True)

{'input_ids': [[34754, 254, 6398], [34754, 254, 6398, 34754, 254, 3198], [34754, 254, 6398, 34754, 254, 3198, 837]], 'attention_mask': [[1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}

In [202]:
for input_seq in model_input:
    print(input_seq)
    print(tokenizer.encode(input_seq, is_split_into_words=True, return_tensors='pt'))
    lm_outputs = lm_model(tokenizer.encode(input_seq))
#     transformer_hidden_states = lm_outputs[-1]
#     # print(transformer_hidden_states[-1].shape)
#     # print(transformer_hidden_states[-1])

('ĠAct',)
tensor([[34754,   254,  6398]])


AttributeError: 'list' object has no attribute 'size'

In [4]:
model_input = [(314, 2540, 616, 47887, 3451, 287, 14033, 287, 262, 32486, 810, 314, 284, 3902, 355, 257, 1327, 12, 2127, 3902, 9095, 329, 262, 7431, 11)]

In [6]:
import torch
embed = lm_model(torch.tensor(model_input))

In [10]:
embed[-1][-1]

tensor([[[-0.2605, -0.5374,  0.8852,  ..., -4.1158,  0.4843,  0.3175],
         [ 0.3586, -0.7330,  0.2462,  ..., -1.2818,  0.9802, -0.1627],
         [-0.1013, -1.0927,  0.3263,  ..., -0.8650,  0.6419,  0.5125],
         ...,
         [-0.3880, -0.3558,  0.4894,  ..., -1.4640,  0.8181, -1.2703],
         [ 0.4893, -0.1838, -0.3610,  ..., -0.9057,  0.2003, -0.5803],
         [-0.2717, -0.1455,  0.2611,  ..., -1.6548,  1.1802, -0.4666]]],
       grad_fn=<ViewBackward>)