In [1]:
from colors import ColorsCorpusReader
import os
import torch

from transformers import pipeline
import torch.nn.functional as F
from transformers import (
    BertTokenizer, BertModel,
    XLNetTokenizer, XLNetModel,
    RobertaTokenizer, RobertaModel,
    ElectraTokenizer, ElectraModel,    
)

import model_utils as mu

In [2]:
# Relative path to the filtered colours corpus CSV.
COLORS_SRC_FILENAME = os.path.join("data", "colors", "filteredCorpus.csv")

In [3]:
# Reader over the filtered colours corpus.
# NOTE(review): the earlier inline "#2" beside word_count suggests 2 was an
# alternative setting that was tried — confirm before relying on it.
corpus = ColorsCorpusReader(COLORS_SRC_FILENAME, word_count=None, normalize_colors=True)

In [4]:
# Materialise every example once so the list can be traversed repeatedly below.
examples = [*corpus.read()]

In [5]:
# Number of examples read from the corpus.
len(examples)

46994

In [6]:
# Partition the examples by their `condition` attribute ("close" / "split" / "far").
close_examples = list(filter(lambda ex: ex.condition == "close", examples))
split_examples = list(filter(lambda ex: ex.condition == "split", examples))
far_examples = list(filter(lambda ex: ex.condition == "far", examples))

In [7]:
# Report how many examples fall into each condition, one per line.
print(
    f"close: {len(close_examples)}",
    f"split: {len(split_examples)}",
    f"far: {len(far_examples)}",
    sep="\n",
)

close: 15519
split: 15693
far: 15782


In [8]:
# Unzip the examples into two parallel tuples: the colours and the text contents.
dev_rawcols, dev_texts = zip(*((ex.colors, ex.contents) for ex in examples))

In [9]:
# Three hand-written colour descriptions used to smoke-test the helpers below.
test_colours = [
    "brown. not the yellow one or classic brown one, the weirder one",
    "brown. not the yellow one or classic brown one",
    "some other brown. that one",
]

#### BERT model embeddings extraction

In [10]:
# A single checkpoint name is shared by the tokenizer and the model.
BERT_CHECKPOINT = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [11]:
# Encode the first sample description (with special tokens) as a batch of one.
input_ids = torch.tensor(
    [bert_tokenizer.encode(test_colours[0], add_special_tokens=True)]
)
input_ids

tensor([[ 101, 3058,  119, 1136, 1103, 3431, 1141, 1137, 5263, 3058, 1141,  117,
         1103, 6994, 1200, 1141,  102]])

In [12]:
# Map the encoded ids back to token strings to inspect how BERT split the description.
btokens = bert_tokenizer.convert_ids_to_tokens(input_ids[0])
btokens

['[CLS]',
 'brown',
 '.',
 'not',
 'the',
 'yellow',
 'one',
 'or',
 'classic',
 'brown',
 'one',
 ',',
 'the',
 'weird',
 '##er',
 'one',
 '[SEP]']

Quick test

In [13]:
# Smoke-test the input-embedding helper on the sample descriptions; `e` is a scratch variable.
e = mu.extract_input_embeddings(test_colours, bert_model, bert_tokenizer)

In [14]:
# Smoke-test the contextual-embedding helper on the sample descriptions; `ce` is a scratch variable.
ce = mu.extract_contextual_embeddings(test_colours, bert_model, bert_tokenizer)

Extract input embeddings

In [15]:
# Timed run of the BERT input-embedding extraction over every corpus text.
%time \
bert_embeddings, bert_vocab = mu.extract_input_embeddings(dev_texts, bert_model, bert_tokenizer)

CPU times: user 9.01 s, sys: 44.4 ms, total: 9.06 s
Wall time: 9.07 s


Extract contextual embeddings (pre-trained embedding + position)

In [None]:
# Timed run of the BERT contextual-embedding extraction over every corpus text.
%time \
bert_contextual_embeddings, bert_contextual_vocab = mu.extract_contextual_embeddings(dev_texts, bert_model, bert_tokenizer)

#### XLNet model embeddings extraction

In [16]:
# A single checkpoint name is shared by the tokenizer and the model.
XLNET_CHECKPOINT = 'xlnet-base-cased'
xlnet_tokenizer = XLNetTokenizer.from_pretrained(XLNET_CHECKPOINT)
xlnet_model = XLNetModel.from_pretrained(XLNET_CHECKPOINT)

Quick test

In [17]:
# Encode the first sample description (with special tokens) as a batch of one.
input_ids = torch.tensor(
    [xlnet_tokenizer.encode(test_colours[0], add_special_tokens=True)]
)
input_ids

tensor([[3442,    9,   50,   18, 3493,   65,   49, 3523, 3442,   65,   19,   18,
         8189,  118,   65,    4,    3]])

In [18]:
# Map the encoded ids back to token strings to inspect how XLNet split the description.
xtest = xlnet_tokenizer.convert_ids_to_tokens(input_ids[0])
xtest

['▁brown',
 '.',
 '▁not',
 '▁the',
 '▁yellow',
 '▁one',
 '▁or',
 '▁classic',
 '▁brown',
 '▁one',
 ',',
 '▁the',
 '▁weird',
 'er',
 '▁one',
 '<sep>',
 '<cls>']

In [19]:
# Smoke-test the input-embedding helper with XLNet on the sample descriptions; `e` is a scratch variable.
e = mu.extract_input_embeddings(test_colours, xlnet_model, xlnet_tokenizer)

In [20]:
# Smoke-test the contextual-embedding helper with XLNet on the sample descriptions; `ce` is a scratch variable.
ce = mu.extract_contextual_embeddings(test_colours, xlnet_model, xlnet_tokenizer)

Extract input embeddings

In [21]:
# Timed run of the XLNet input-embedding extraction over every corpus text.
%time \
xlnet_embeddings, xlnet_vocab = mu.extract_input_embeddings(dev_texts, xlnet_model, xlnet_tokenizer)

CPU times: user 8.36 s, sys: 51.2 ms, total: 8.41 s
Wall time: 8.42 s


Extract contextual embeddings (pre-trained embedding + position)

In [None]:
# Timed run of the XLNet contextual-embedding extraction over every corpus text.
%time \
xlnet_contextual_embeddings, xlnet_contextual_vocab = mu.extract_contextual_embeddings(dev_texts, xlnet_model, xlnet_tokenizer)

#### RoBERTa model embeddings extraction

In [22]:
# A single checkpoint name is shared by the tokenizer and the model.
ROBERTA_CHECKPOINT = 'roberta-base'
roberta_tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = RobertaModel.from_pretrained(ROBERTA_CHECKPOINT)

In [23]:
# Encode the first sample description (with special tokens) as a batch of one.
input_ids = torch.tensor(
    [roberta_tokenizer.encode(test_colours[0], add_special_tokens=True)]
)
input_ids

tensor([[    0, 31876,     4,    45,     5,  5718,    65,    50,  4187,  6219,
            65,     6,     5,    52,   853,  3624,    65,     2]])

In [24]:
# Map the encoded ids back to token strings to inspect how RoBERTa split the description.
rtest = roberta_tokenizer.convert_ids_to_tokens(input_ids[0])
rtest

['<s>',
 'brown',
 '.',
 'Ġnot',
 'Ġthe',
 'Ġyellow',
 'Ġone',
 'Ġor',
 'Ġclassic',
 'Ġbrown',
 'Ġone',
 ',',
 'Ġthe',
 'Ġwe',
 'ir',
 'der',
 'Ġone',
 '</s>']

Quick test

In [25]:
# Smoke-test the input-embedding helper with RoBERTa on the sample descriptions; `e` is a scratch variable.
e = mu.extract_input_embeddings(test_colours, roberta_model, roberta_tokenizer)

In [26]:
# Smoke-test the contextual-embedding helper with RoBERTa on the sample descriptions.
# Stored in `ce` (as in the other model sections) so it does not overwrite the
# input-embedding smoke-test result held in `e`.
ce = mu.extract_contextual_embeddings(test_colours, roberta_model, roberta_tokenizer)

Extract input embeddings

In [27]:
# Timed run of the RoBERTa input-embedding extraction over every corpus text.
%time \
roberta_embeddings, roberta_vocab = mu.extract_input_embeddings(dev_texts, roberta_model, roberta_tokenizer)

CPU times: user 9.17 s, sys: 179 ms, total: 9.35 s
Wall time: 9.39 s


Extract contextual embeddings (pre-trained embedding + position)

In [None]:
%time \
xlnet_contextual_embeddings, xlnet_contextual_vocab = mu.extract_contextual_embeddings(dev_texts, roberta_model, roberta_tokenizer)

#### ELECTRA model embeddings extraction

In [28]:
# A single checkpoint name is shared by the tokenizer and the model.
ELECTRA_CHECKPOINT = 'google/electra-small-discriminator'
electra_tokenizer = ElectraTokenizer.from_pretrained(ELECTRA_CHECKPOINT)
electra_model = ElectraModel.from_pretrained(ELECTRA_CHECKPOINT)

In [29]:
# Encode the first sample description (with special tokens) as a batch of one.
input_ids = torch.tensor(
    [electra_tokenizer.encode(test_colours[0], add_special_tokens=True)]
)
input_ids

tensor([[ 101, 2829, 1012, 2025, 1996, 3756, 2028, 2030, 4438, 2829, 2028, 1010,
         1996, 6881, 2121, 2028,  102]])

In [30]:
# Map the encoded ids back to token strings to inspect how ELECTRA split the description.
# Uses its own name (`etest`) so the RoBERTa tokens kept in `rtest` are not overwritten.
etest = electra_tokenizer.convert_ids_to_tokens(input_ids[0])
etest

['[CLS]',
 'brown',
 '.',
 'not',
 'the',
 'yellow',
 'one',
 'or',
 'classic',
 'brown',
 'one',
 ',',
 'the',
 'weird',
 '##er',
 'one',
 '[SEP]']

Quick test

In [31]:
# Smoke-test the input-embedding helper with ELECTRA on the sample descriptions; `e` is a scratch variable.
e = mu.extract_input_embeddings(test_colours, electra_model, electra_tokenizer)

In [32]:
# Smoke-test the contextual-embedding helper with ELECTRA on the sample descriptions; `ce` is a scratch variable.
ce = mu.extract_contextual_embeddings(test_colours, electra_model, electra_tokenizer)

Extract input embeddings

In [33]:
%time \
roberta_embeddings, roberta_vocab = mu.extract_input_embeddings(dev_texts, electra_model, electra_tokenizer)

CPU times: user 10.7 s, sys: 56.6 ms, total: 10.7 s
Wall time: 10.8 s


Extract contextual embeddings (pre-trained embedding + position)

In [None]:
%time \
xlnet_contextual_embeddings, xlnet_contextual_vocab = mu.extract_contextual_embeddings(dev_texts, electra_model, electra_tokenizer)

#### Generate token sequences by converting raw colour descriptions into model-specific tokens

In [34]:
# Tokenize the first sample description with the ELECTRA tokenizer via the project helper.
# NOTE(review): the recorded output is wrapped in '<s>' / '</s>' and has no punctuation or
# '##' prefixes, whereas the raw ELECTRA tokens shown earlier use '[CLS]' / '[SEP]' —
# presumably the helper normalizes tokens; confirm the output is not stale.
mu.tokenize_colour_description(test_colours[0], tokenizer=electra_tokenizer)

['<s>',
 'brown',
 'not',
 'the',
 'yellow',
 'one',
 'or',
 'classic',
 'brown',
 'one',
 'the',
 'weird',
 'er',
 'one',
 '</s>']