In [1]:
from colors import ColorsCorpusReader
import os
import torch

from transformers import pipeline
import torch.nn.functional as F
from transformers import (
    BertTokenizer, BertModel,
    XLNetTokenizer, XLNetModel,
    RobertaTokenizer, RobertaModel,
    ElectraTokenizer, ElectraModel,    
)

import model_utils as mu

In [2]:
# Relative path of the filtered colors corpus CSV.
COLORS_SRC_FILENAME = os.path.join("data", "colors", "filteredCorpus.csv")

In [3]:
# Reader over the colors corpus file, with color normalization switched on.
# NOTE(review): the original cell carried a "#2" note next to word_count,
# presumably an alternative setting that was tried — confirm before relying on it.
corpus = ColorsCorpusReader(COLORS_SRC_FILENAME, word_count=None, normalize_colors=True)

In [4]:
# Materialize everything the reader yields so it can be traversed several times.
examples = [*corpus.read()]

In [5]:
# Total number of examples read from the corpus.
len(examples)

46994

In [6]:
# Partition the corpus by the `condition` attribute of each example.
close_examples = list(filter(lambda ex: ex.condition == "close", examples))
split_examples = list(filter(lambda ex: ex.condition == "split", examples))
far_examples = list(filter(lambda ex: ex.condition == "far", examples))

In [7]:
# Report how many examples fall under each condition, one line per condition.
for label, subset in (
    ("close", close_examples),
    ("split", split_examples),
    ("far", far_examples),
):
    print(f"{label}: {len(subset)}")

close: 15519
split: 15693
far: 15782


In [8]:
# Pull the `colors` and `contents` attributes of every example into two
# parallel tuples.
dev_rawcols, dev_texts = zip(
    *((example.colors, example.contents) for example in examples)
)

In [54]:
# Small hand-written utterances used to smoke-test the tokenizers and the
# embedding-extraction helpers before running them on the full corpus.
test_colours = [
    "brown. not the yellow one or classic brown one, the weirder one",
    "brown. not the yellow one or classic brown one",
    "some other brown. that one",
]

#### Bert model embeddings extraction

In [10]:
# Load the tokenizer first, then the model, from the same 'bert-base-cased' checkpoint.
bert_tokenizer, bert_model = (
    loader.from_pretrained('bert-base-cased')
    for loader in (BertTokenizer, BertModel)
)

In [30]:
# Encode one utterance only. `test_colours` is a list of sentences, and handing
# the whole list to `encode` makes the tokenizer treat each sentence as a single
# pre-split token instead of tokenizing its text.
bert_ids = bert_tokenizer.encode(test_colours[0], add_special_tokens=True)
# Add a leading batch dimension: shape (1, sequence_length).
input_ids = torch.tensor(bert_ids).unsqueeze(0)
input_ids

tensor([[ 101, 3058,  119, 1136, 1103, 3431, 1141, 1137, 5263, 3058, 1141,  117,
         1103, 6994, 1200, 1141,  102]])

In [12]:
# Map the ids in the first row of `input_ids` back to token strings so the
# tokenizer's segmentation (special tokens, sub-word pieces) can be inspected.
btokens = bert_tokenizer.convert_ids_to_tokens(input_ids[0])
btokens

['[CLS]',
 'brown',
 '.',
 'not',
 'the',
 'yellow',
 'one',
 'or',
 'classic',
 'brown',
 'one',
 ',',
 'the',
 'weird',
 '##er',
 'one',
 '[SEP]']

Quick test

In [46]:
# Smoke test: run the input-embedding helper on the small hand-written sample.
# The displayed result is a pair: a token -> value mapping and a vocabulary list.
mu.extract_input_embeddings(test_colours, bert_model, bert_tokenizer)

({'brown': tensor(-0.0584, grad_fn=<SelectBackward>),
  'not': tensor(-0.0163, grad_fn=<SelectBackward>),
  'the': tensor(0.0056, grad_fn=<SelectBackward>),
  'yellow': tensor(-0.0260, grad_fn=<SelectBackward>),
  'one': tensor(0.0207, grad_fn=<SelectBackward>),
  'or': tensor(-0.0833, grad_fn=<SelectBackward>),
  'classic': tensor(-0.0158, grad_fn=<SelectBackward>),
  'weird': tensor(0.0265, grad_fn=<SelectBackward>),
  'er': tensor(-0.0054, grad_fn=<SelectBackward>)},
 ['brown', 'not', 'the', 'yellow', 'one', 'or', 'classic', 'weird', 'er'])

In [47]:
# Smoke test: run the contextual-embedding helper on the small sample.
# The displayed result pairs a list of ["token_position", value] entries with a vocabulary list.
mu.extract_contextual_embeddings(test_colours, bert_model, bert_tokenizer)

([['brown_0', tensor(-1.5282, grad_fn=<SelectBackward>)],
  ['not_1', tensor(-0.2150, grad_fn=<SelectBackward>)],
  ['the_2', tensor(0.1125, grad_fn=<SelectBackward>)],
  ['yellow_3', tensor(-0.2057, grad_fn=<SelectBackward>)],
  ['one_4', tensor(0.6377, grad_fn=<SelectBackward>)],
  ['or_5', tensor(-1.7041, grad_fn=<SelectBackward>)],
  ['classic_6', tensor(-0.0015, grad_fn=<SelectBackward>)],
  ['brown_7', tensor(1.2130, grad_fn=<SelectBackward>)],
  ['one_8', tensor(0.6641, grad_fn=<SelectBackward>)],
  ['the_9', tensor(0.6327, grad_fn=<SelectBackward>)],
  ['weird_10', tensor(0.7618, grad_fn=<SelectBackward>)],
  ['er_11', tensor(0.6339, grad_fn=<SelectBackward>)]],
 ['brown', 'not', 'the', 'yellow', 'one', 'or', 'classic', 'weird', 'er'])

Extract input embeddings

In [13]:
%time \
bert_embeddings, bert_vocab = mu.extract_input_embeddings(dev_texts, bert_model, bert_tokenizer)

CPU times: user 10.5 s, sys: 55.6 ms, total: 10.6 s
Wall time: 10.6 s


Extract contextual embeddings (pre-trained embedding + position)

In [14]:
%time \
bert_contextual_embeddings, bert_contextual_vocab = mu.extract_contextual_embeddings(dev_texts, bert_model, bert_tokenizer)

CPU times: user 41min 4s, sys: 1min 47s, total: 42min 51s
Wall time: 42min 30s


#### XLNet model embeddings extraction

In [15]:
# Load the tokenizer first, then the model, from the same 'xlnet-base-cased' checkpoint.
xlnet_tokenizer, xlnet_model = (
    loader.from_pretrained('xlnet-base-cased')
    for loader in (XLNetTokenizer, XLNetModel)
)

Quick test

In [16]:
# Encode one utterance only. `test_colours` is a list of sentences, and handing
# the whole list to `encode` makes the tokenizer treat each sentence as a single
# pre-split token instead of tokenizing its text.
xlnet_ids = xlnet_tokenizer.encode(test_colours[0], add_special_tokens=True)
# Add a leading batch dimension: shape (1, sequence_length).
input_ids = torch.tensor(xlnet_ids).unsqueeze(0)
input_ids

tensor([[3442,    9,   50,   18, 3493,   65,   49, 3523, 3442,   65,   19,   18,
         8189,  118,   65,    4,    3]])

In [17]:
# Map the ids in the first row of `input_ids` back to XLNet token strings to
# inspect the segmentation and where the special tokens are placed.
xtest = xlnet_tokenizer.convert_ids_to_tokens(input_ids[0])
xtest

['▁brown',
 '.',
 '▁not',
 '▁the',
 '▁yellow',
 '▁one',
 '▁or',
 '▁classic',
 '▁brown',
 '▁one',
 ',',
 '▁the',
 '▁weird',
 'er',
 '▁one',
 '<sep>',
 '<cls>']

In [48]:
# Smoke test: run the input-embedding helper on the small sample with XLNet.
mu.extract_input_embeddings(test_colours, xlnet_model, xlnet_tokenizer)

({'▁brown': tensor(0.0211, grad_fn=<SelectBackward>),
  '▁not': tensor(-0.0018, grad_fn=<SelectBackward>),
  '▁the': tensor(0.0811, grad_fn=<SelectBackward>),
  '▁yellow': tensor(-0.1191, grad_fn=<SelectBackward>),
  '▁one': tensor(0.0029, grad_fn=<SelectBackward>),
  '▁or': tensor(-0.0841, grad_fn=<SelectBackward>),
  '▁classic': tensor(-0.0960, grad_fn=<SelectBackward>),
  '▁weird': tensor(0.0480, grad_fn=<SelectBackward>),
  'er': tensor(-0.0640, grad_fn=<SelectBackward>)},
 ['▁brown',
  '▁not',
  '▁the',
  '▁yellow',
  '▁one',
  '▁or',
  '▁classic',
  '▁weird',
  'er'])

In [49]:
# Smoke test: run the contextual-embedding helper on the small sample with XLNet.
mu.extract_contextual_embeddings(test_colours, xlnet_model, xlnet_tokenizer)

([['▁brown_0', tensor(0.0211, grad_fn=<SelectBackward>)],
  ['▁not_1', tensor(-0.0018, grad_fn=<SelectBackward>)],
  ['▁the_2', tensor(0.0811, grad_fn=<SelectBackward>)],
  ['▁yellow_3', tensor(-0.1191, grad_fn=<SelectBackward>)],
  ['▁one_4', tensor(0.0029, grad_fn=<SelectBackward>)],
  ['▁or_5', tensor(-0.0841, grad_fn=<SelectBackward>)],
  ['▁classic_6', tensor(-0.0960, grad_fn=<SelectBackward>)],
  ['▁brown_7', tensor(-0.1039, grad_fn=<SelectBackward>)],
  ['▁one_8', tensor(-0.0244, grad_fn=<SelectBackward>)],
  ['▁the_9', tensor(0.0836, grad_fn=<SelectBackward>)],
  ['▁weird_10', tensor(0.0480, grad_fn=<SelectBackward>)],
  ['er_11', tensor(-0.0640, grad_fn=<SelectBackward>)],
  ['▁one_12', tensor(-0.0657, grad_fn=<SelectBackward>)]],
 ['▁brown',
  '▁not',
  '▁the',
  '▁yellow',
  '▁one',
  '▁or',
  '▁classic',
  '▁weird',
  'er'])

Extract input embeddings

In [18]:
%time \
xlnet_embeddings, xlnet_vocab = mu.extract_input_embeddings(dev_texts, xlnet_model, xlnet_tokenizer)

CPU times: user 9.78 s, sys: 82.6 ms, total: 9.87 s
Wall time: 9.88 s


Extract contextual embeddings (pre-trained embedding + position)

In [19]:
%time \
xlnet_contextual_embeddings, xlnet_contextual_vocab = mu.extract_contextual_embeddings(dev_texts, xlnet_model, xlnet_tokenizer)

CPU times: user 48min 5s, sys: 2min, total: 50min 5s
Wall time: 49min 44s


#### RoBERTa model embeddings extraction

In [20]:
# Load the tokenizer first, then the model, from the same 'roberta-base' checkpoint.
roberta_tokenizer, roberta_model = (
    loader.from_pretrained('roberta-base')
    for loader in (RobertaTokenizer, RobertaModel)
)

In [21]:
# Encode one utterance only. `test_colours` is a list of sentences, and handing
# the whole list to `encode` makes the tokenizer treat each sentence as a single
# pre-split token instead of tokenizing its text.
roberta_ids = roberta_tokenizer.encode(test_colours[0], add_special_tokens=True)
# Add a leading batch dimension: shape (1, sequence_length).
input_ids = torch.tensor(roberta_ids).unsqueeze(0)
input_ids

tensor([[    0, 31876,     4,    45,     5,  5718,    65,    50,  4187,  6219,
            65,     6,     5,    52,   853,  3624,    65,     2]])

In [22]:
# Map the ids in the first row of `input_ids` back to RoBERTa token strings to
# inspect the segmentation and where the special tokens are placed.
rtest = roberta_tokenizer.convert_ids_to_tokens(input_ids[0])
rtest

['<s>',
 'brown',
 '.',
 'Ġnot',
 'Ġthe',
 'Ġyellow',
 'Ġone',
 'Ġor',
 'Ġclassic',
 'Ġbrown',
 'Ġone',
 ',',
 'Ġthe',
 'Ġwe',
 'ir',
 'der',
 'Ġone',
 '</s>']

Quick test

In [50]:
# Smoke test: run the input-embedding helper on the small sample with RoBERTa.
mu.extract_input_embeddings(test_colours, roberta_model, roberta_tokenizer)

({'brown': tensor(0.2496, grad_fn=<SelectBackward>),
  'not': tensor(-0.1255, grad_fn=<SelectBackward>),
  'the': tensor(0.1127, grad_fn=<SelectBackward>),
  'yellow': tensor(0.0967, grad_fn=<SelectBackward>),
  'one': tensor(-0.0260, grad_fn=<SelectBackward>),
  'or': tensor(0.1552, grad_fn=<SelectBackward>),
  'classic': tensor(-0.2581, grad_fn=<SelectBackward>),
  'we': tensor(-0.0199, grad_fn=<SelectBackward>),
  'ir': tensor(-0.0872, grad_fn=<SelectBackward>),
  'der': tensor(0.0023, grad_fn=<SelectBackward>)},
 ['brown', 'not', 'the', 'yellow', 'one', 'or', 'classic', 'we', 'ir', 'der'])

In [51]:
# Smoke test: run the contextual-embedding helper on the small sample with RoBERTa.
mu.extract_contextual_embeddings(test_colours, roberta_model, roberta_tokenizer)

([['brown_0', tensor(0.4642, grad_fn=<SelectBackward>)],
  ['not_1', tensor(-0.2442, grad_fn=<SelectBackward>)],
  ['the_2', tensor(0.4069, grad_fn=<SelectBackward>)],
  ['yellow_3', tensor(0.2805, grad_fn=<SelectBackward>)],
  ['one_4', tensor(0.2763, grad_fn=<SelectBackward>)],
  ['or_5', tensor(0.4153, grad_fn=<SelectBackward>)],
  ['classic_6', tensor(-0.3730, grad_fn=<SelectBackward>)],
  ['brown_7', tensor(0.0910, grad_fn=<SelectBackward>)],
  ['one_8', tensor(-0.5839, grad_fn=<SelectBackward>)],
  ['the_9', tensor(0.0514, grad_fn=<SelectBackward>)],
  ['we_10', tensor(-0.0807, grad_fn=<SelectBackward>)],
  ['ir_11', tensor(-0.1418, grad_fn=<SelectBackward>)],
  ['der_12', tensor(-0.0013, grad_fn=<SelectBackward>)]],
 ['brown', 'not', 'the', 'yellow', 'one', 'or', 'classic', 'we', 'ir', 'der'])

Extract input embeddings

In [23]:
%time \
roberta_embeddings, roberta_vocab = mu.extract_input_embeddings(dev_texts, roberta_model, roberta_tokenizer)

CPU times: user 10.7 s, sys: 207 ms, total: 10.9 s
Wall time: 11 s


Extract contextual embeddings (pre-trained embedding + position)

In [24]:
%time \
xlnet_contextual_embeddings, xlnet_contextual_vocab = mu.extract_contextual_embeddings(dev_texts, roberta_model, roberta_tokenizer)

CPU times: user 45min 42s, sys: 2min 25s, total: 48min 7s
Wall time: 47min 32s


#### ELECTRA model embeddings extraction

In [25]:
# Load the tokenizer first, then the model, from the same
# 'google/electra-small-discriminator' checkpoint.
electra_tokenizer, electra_model = (
    loader.from_pretrained('google/electra-small-discriminator')
    for loader in (ElectraTokenizer, ElectraModel)
)

In [26]:
# Encode one utterance only. `test_colours` is a list of sentences, and handing
# the whole list to `encode` makes the tokenizer treat each sentence as a single
# pre-split token instead of tokenizing its text.
electra_ids = electra_tokenizer.encode(test_colours[0], add_special_tokens=True)
# Add a leading batch dimension: shape (1, sequence_length).
input_ids = torch.tensor(electra_ids).unsqueeze(0)
input_ids

tensor([[ 101, 2829, 1012, 2025, 1996, 3756, 2028, 2030, 4438, 2829, 2028, 1010,
         1996, 6881, 2121, 2028,  102]])

In [27]:
# Map the ids in the first row of `input_ids` back to ELECTRA token strings.
# A dedicated name is used so the RoBERTa tokens held in `rtest` are not overwritten.
etest = electra_tokenizer.convert_ids_to_tokens(input_ids[0])
etest

['[CLS]',
 'brown',
 '.',
 'not',
 'the',
 'yellow',
 'one',
 'or',
 'classic',
 'brown',
 'one',
 ',',
 'the',
 'weird',
 '##er',
 'one',
 '[SEP]']

Quick test

In [52]:
# Smoke test: run the input-embedding helper on the small sample with ELECTRA.
mu.extract_input_embeddings(test_colours, electra_model, electra_tokenizer)

({'brown': tensor(-0.0039, grad_fn=<SelectBackward>),
  'not': tensor(-0.0407, grad_fn=<SelectBackward>),
  'the': tensor(-0.1072, grad_fn=<SelectBackward>),
  'yellow': tensor(0.0862, grad_fn=<SelectBackward>),
  'one': tensor(-0.0352, grad_fn=<SelectBackward>),
  'or': tensor(-0.0047, grad_fn=<SelectBackward>),
  'classic': tensor(-0.0245, grad_fn=<SelectBackward>),
  'weird': tensor(0.0663, grad_fn=<SelectBackward>),
  'er': tensor(0.1498, grad_fn=<SelectBackward>)},
 ['brown', 'not', 'the', 'yellow', 'one', 'or', 'classic', 'weird', 'er'])

In [53]:
# Smoke test: run the contextual-embedding helper on the small sample with ELECTRA.
mu.extract_contextual_embeddings(test_colours, electra_model, electra_tokenizer)

([['brown_0', tensor(-0.0532, grad_fn=<SelectBackward>)],
  ['not_1', tensor(-0.2128, grad_fn=<SelectBackward>)],
  ['the_2', tensor(-0.0007, grad_fn=<SelectBackward>)],
  ['yellow_3', tensor(0.6196, grad_fn=<SelectBackward>)],
  ['one_4', tensor(-0.3050, grad_fn=<SelectBackward>)],
  ['or_5', tensor(0.3348, grad_fn=<SelectBackward>)],
  ['classic_6', tensor(-0.7161, grad_fn=<SelectBackward>)],
  ['brown_7', tensor(0.3253, grad_fn=<SelectBackward>)],
  ['one_8', tensor(-0.2708, grad_fn=<SelectBackward>)],
  ['the_9', tensor(-0.7404, grad_fn=<SelectBackward>)],
  ['weird_10', tensor(0.5968, grad_fn=<SelectBackward>)],
  ['er_11', tensor(-0.4330, grad_fn=<SelectBackward>)]],
 ['brown', 'not', 'the', 'yellow', 'one', 'or', 'classic', 'weird', 'er'])

Extract input embeddings

In [28]:
%time \
roberta_embeddings, roberta_vocab = mu.extract_input_embeddings(dev_texts, electra_model, electra_tokenizer)

CPU times: user 12.7 s, sys: 97.3 ms, total: 12.8 s
Wall time: 12.9 s


Extract contextual embeddings (pre-trained embedding + position)

In [29]:
%time \
xlnet_contextual_embeddings, xlnet_contextual_vocab = mu.extract_contextual_embeddings(dev_texts, electra_model, electra_tokenizer)

CPU times: user 12min 43s, sys: 1min 1s, total: 13min 45s
Wall time: 13min 7s
