In [1]:
from minicons import cwe, scorer, utils

import torch
import csv

In [2]:
model = cwe.CWE('bert-base-uncased')

In [3]:
sentences = [("the game is afoot now!", "afoot"), ("i was at the bank of the river ganga!", "bank")]
sentences = [("the game is afoot now!", 3), ("i was at the bank of the river ganga!", 4)]

In [4]:
model.extract_representation(sentences, 12)

tensor([[ 0.5399, -0.2461, -0.0968,  ..., -0.4670, -0.5312, -0.0549],
        [-0.8258, -0.4308,  0.2744,  ..., -0.5987, -0.6984,  0.2087]],
       grad_fn=<MeanBackward1>)

In [5]:
model.encode_text(["Hi my name is kanishka", "I am a disco dancer"], 12)

(tensor([[  101,  7632,  2026,  2171,  2003, 22827,  4509,  2912,   102],
         [  101,  1045,  2572,  1037, 12532,  8033,   102,     0,     0]]),
 tensor([[[ 0.0532,  0.2268, -0.3352,  ..., -0.3359,  0.2717,  0.4579],
          [ 0.5793,  0.5765,  0.4213,  ..., -0.0170,  0.6549, -0.2564],
          [-0.2209, -0.1115, -0.2526,  ..., -0.1154, -0.1818,  0.0431],
          ...,
          [ 0.1203, -0.6170, -0.4196,  ..., -0.4597,  0.2969,  0.3223],
          [-0.7284, -0.6379, -0.4574,  ..., -0.0088,  0.1195, -0.1934],
          [ 0.7997, -0.0431, -0.4385,  ...,  0.0473, -0.6968, -0.2386]],
 
         [[-0.0695,  0.3759,  0.1666,  ..., -0.3582,  0.3794,  0.5100],
          [ 0.4299,  0.2177, -0.2908,  ..., -0.0695,  0.5857,  0.2417],
          [ 0.4344,  0.6076, -0.1464,  ..., -0.4406,  0.5414,  0.2579],
          ...,
          [ 0.6442,  0.2516,  0.1550,  ...,  0.0042, -0.7955, -0.0657],
          [ 0.0751, -0.1285,  0.3310,  ...,  0.1002,  0.2263, -0.1037],
          [-0.0557, -0.11

In [12]:
[model.tokenizer.encode_plus(f' {s.split()[i]}', add_special_tokens = False)['input_ids'] for s, i in sentences]

[[21358, 17206], [21358, 17206]]

In [20]:
list(list(list(zip(*sentences))[0]))

['the game is afoot !', 'the game is afoot !']

In [2]:
"the game is afoot!".split("afoot")

['the game is ', '!']

In [3]:
utils.find_index("the game is afoot!", "afoot", "regular")

3

In [3]:
model.tokenize("The cat ate the rat")

AttributeError: 'CWE' object has no attribute 'tokenize'

In [2]:
base_sentence = "He caught the pass and scored another touchdown. There was nothing he enjoyed more than a good game of"
words = ["football", "baseball", "monopoly"]

In [5]:
models = ["bert-base-uncased", "bert-large-uncased", "roberta-base", "gpt2", "openai-gpt", "xlnet-base-cased", "roberta-large"]

with open("/home/kmisra/contextual_n400.csv", 'w') as f:
    writer = csv.writer(f)
    for model in models:
        transformer = cwe.CWE(model, all_layers = True)
        for l in range(0, transformer.layers + 1):
            for word in words:
                sentence = base_sentence + " " + word
                w, c = transformer.context_cosine(sentence, word, layer = l)
                contextual_similarity = round(torch.tensor(list(dict(zip(w,c)).values())).mean().item(), 4)
                writer.writerow([model, l, word, contextual_similarity])

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [5]:
dict(zip(w,c))

{'[CLS]': 0.9372750520706177,
 'he': 0.9298704862594604,
 'caught': 0.9448925852775574,
 'the': 0.9309799671173096,
 'pass': 0.9259966611862183,
 'and': 0.9368696808815002,
 'scored': 0.906683623790741,
 'another': 0.9423209428787231,
 'touchdown': 0.9199809432029724,
 '.': 0.9390969276428223,
 'there': 0.9411791563034058,
 'was': 0.9454566240310669,
 'nothing': 0.9330489635467529,
 'enjoyed': 0.9498648047447205,
 'more': 0.9263346791267395,
 'than': 0.9219998717308044,
 'a': 0.9281640648841858,
 'good': 0.9335828423500061,
 'game': 0.9325376749038696,
 'of': 0.9373526573181152,
 '[SEP]': 0.932860255241394}

In [4]:
mlm_model = scorer.MaskedLMScorer('bert-base-uncased', 'cpu')
clm_model = scorer.IncrementalLMScorer('distilgpt2', 'cpu')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using pad_token, but it is not set yet.


In [3]:
def scores(s, model = "clm"):
    if model == "clm":
        result = clm_model.logprobs(clm_model.prepare_text(s))
    else:
        result = mlm_model.logprobs(mlm_model.prepare_text(s))
    logprobs, tokens = list(zip(*result))
    surprisals = list(map(lambda x: x.sum().tolist(), logprobs))
    return surprisals

In [6]:
clm_model.score(["Football is a sport.", "A football is a sport.", "Football is a bird.", "A football is a bird."])

[-25.771879196166992,
 -33.48912048339844,
 -32.07358360290527,
 -37.89309310913086]

In [5]:
mlm_model.num_hidden_layers

AttributeError: 'MaskedLMScorer' object has no attribute 'num_hidden_layers'