In [3]:
import json
import os
import pickle

import numpy
import pandas as pd
import torch
from torch.nn import Softmax
from transformers import BertForMaskedLM, BertTokenizer

import demonstration_routine as helper


In [12]:
#Initialize pre trained components of BERT model
# One checkpoint name shared by model and tokenizer so the two cannot drift apart.
BERT_CHECKPOINT = "bert-base-uncased"
model = BertForMaskedLM.from_pretrained(BERT_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# Softmax over dim 0: conf_pred applies it to the logits selected for a single
# token position (outputs.logits[0][m_ind]).
softmax = Softmax(dim=0)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
#Load calibration set of non-conformity scores
def load_cal_alphas(filename):
    """Load a calibration set of non-conformity scores from disk.

    Reads ``Data/<filename>.txt`` (relative to the current working directory)
    and parses its contents as JSON.

    Args:
        filename: Base name of the file, without the ``Data/`` directory or the
            ``.txt`` extension.

    Returns:
        Whatever ``json.load`` produces for the file contents.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = f"Data/{filename}.txt"
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)

In [8]:
#Gets index of masked word from tokenized tensor.
def find_mask_ind(tokenized_sentence, mask_id=103):
    """Return the position of the first mask token in a tokenized sentence.

    Args:
        tokenized_sentence: Object exposing ``input_ids``; only the first row
            (``input_ids[0]``) is inspected, and it must support ``tolist()``.
        mask_id: Token id to search for. Defaults to 103, the id this notebook
            uses for the masked word with the ``bert-base-uncased`` tokenizer.

    Returns:
        Zero-based index of the first occurrence of ``mask_id``.

    Raises:
        ValueError: If ``mask_id`` does not occur in the sentence.
    """
    token_ids = tokenized_sentence.input_ids[0].tolist()
    try:
        return token_ids.index(mask_id)
    except ValueError:
        raise ValueError(
            f"mask token id {mask_id} not found in the tokenized sentence"
        ) from None

In [9]:
#Input clean sentences.  
#Words should already be masked.
def conf_pred(sentence, model, conf, calib, m_ind = -1):
    """Return the conformal prediction set of tokens for one masked position.

    Each vocabulary token gets a non-conformity score ``1 - p``, where ``p`` is
    its softmax probability at position ``m_ind``. A token is kept when its
    score is at most the ``conf`` quantile of the calibration scores.

    Uses the module-level ``tokenizer`` and ``softmax`` objects.

    Args:
        sentence: Text passed to ``tokenizer``; the word to predict should
            already be replaced by the mask token.
        model: Called as ``model(**encoded)``; its result must expose
            ``logits`` indexable as ``logits[0][m_ind]``.
        conf: Quantile level handed to ``numpy.quantile``.
        calib: Calibration non-conformity scores.
        m_ind: Token position to predict. The default ``-1`` means "locate the
            first mask token with ``find_mask_ind``".

    Returns:
        List of token strings in the prediction set, in ascending token-id
        order.
    """
    threshold = numpy.quantile(calib, conf)

    encoded = tokenizer(sentence, return_tensors='pt')

    #Get index of masked word
    if m_ind == -1:
        m_ind = find_mask_ind(encoded)

    # Pure inference: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = model(**encoded)
        probs = softmax(outputs.logits[0][m_ind])

    # Going through Python floats keeps the scores in float64, as before.
    non_confs = 1 - numpy.array(probs.tolist())

    region_inds = numpy.nonzero(non_confs <= threshold)[0].tolist()

    return tokenizer.convert_ids_to_tokens(region_inds)

In [11]:
#Load calibration set of non-conformity scores
# NOTE(review): this calls the helper module's loader, not the load_cal_alphas
# defined in this notebook — confirm both read the same file format.
alphas = helper.load_cal_alphas("soft_alphas_0_0")

#Initialize test sentences from Michelle Obama TED talk
test_sent_1 = "...to go with him to a community [MASK]. But when we met, Barack was a community organizer."
# Fixed a missing space after the mask token ("[MASK]one" -> "[MASK] one").
test_sent_2 = "And he urged the people in that meeting in that community to devote themselves to closing the gap between those two ideas, to work together to try to make the world as it is and the world as it should [MASK] one and the same."
# Contains two [MASK] tokens; find_mask_ind only locates the first occurrence.
test_sent_3 = "And they opened many new doors for millions of female doctors and nurses and artists and authors all of whom have [MASK] [MASK]. And by getting a good education you too can control your own destiny."


In [13]:
# Quantile level applied to the calibration scores inside conf_pred.
confidence = 0.75
conf_pred(sentence=test_sent_1, model=model, conf=confidence, calib=alphas)

['college', 'center', 'event', 'conference', 'meeting', 'dinner', 'gathering']