# Utility Functions

This notebook contains utility functions which I found useful for my research.

In [2]:
import torch
import torch.nn as nn

## ML

### Accuracy in PyTorch

In [6]:
def get_accuracy(y_true, y_prob):
    y_pred = y_prob.argmax(1)
    correct = (y_pred == y_true).type(torch.float).sum().item()
    return correct / len(y_true)

In [34]:
from sklearn.metrics import accuracy_score

y_prob = torch.randn(3, 2)
y_prob = nn.Softmax(dim=-1)(y_prob)
y_true=torch.empty(3, dtype=torch.long).random_(2)
my_acc = get_accuracy(y_true, y_prob)

y_pred = y_prob.argmax(1)
exp_acc = accuracy_score(y_true, y_pred)
assert my_acc == exp_acc

In [4]:
print(ip)

tensor([[0.0887, 0.1730, 0.2687, 0.0590, 0.4106],
        [0.2118, 0.2614, 0.0332, 0.1918, 0.3017],
        [0.0838, 0.1923, 0.1487, 0.3859, 0.1893]], grad_fn=<SoftmaxBackward>)


## NLP

### BERT Token Embeddings

In [4]:
def get_token_embeddings(string, tokenizer, embedding_model, merge_strategy = "average"):
    """ Retrieves token embeddings by accumulating wordpiece embeddings based on merge strategy.
    Identify wordpiece to tokens by checking if their character span is in subset of the original token char span. """
    def _merge_embeddings(wp_e, stack):
        if merge_strategy == "average": 
            t_e = torch.mean(wp_e[stack], 0)
        elif merge_strategy == "first":
            t_e = wp_e[stack[0]]
        else:
            raise NotImplementedError
        return t_e


    inputs = tokenizer(string, truncation=True, return_tensors="pt", add_special_tokens = False)
    wp_e = get_wordpiece_embeddings(inputs, embedding_model)

    ws_tokenizer = nltk.tokenize.WhitespaceTokenizer()
    token_spans = ws_tokenizer.span_tokenize(string)

    # merging wordpiece embeddings
    result = []
    stack = []  # initialise stack with idx

    t_span = next(token_spans)
    for i in range(len(wp_e)):
        wp_span = inputs.token_to_chars(i)
        if is_subspan(wp_span, t_span): stack.append(i)
        else: 
            t_e = _merge_embeddings(wp_e, stack)
            result.append(t_e)
            t_span = next(token_spans) # if error is thrown, sth is wrong as every wp should be a subspan of some token

            assert is_subspan(wp_span, t_span)
            stack = [i]  # initialise stack with current idx
    
    # clear remaining accumulated tensors
    t_e = _merge_embeddings(wp_e, stack)
    result.append(t_e)

    result = torch.stack(result, dim = 0)
    assert len(result) == len(string.split()), f"{len(result)} != {len(string.split())}"
    return result

def is_subspan(subspan: Tuple[int], span: Tuple[int]) -> bool:
    assert len(subspan) == 2
    assert len(span) == 2
    return subspan[0] >= span[0] and subspan[1] <= span[1]

def get_model_device(model):
    return next(model.parameters()).device

def get_wordpiece_embeddings(inputs: Dict, model) -> Tensor:
    """ Based on https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#2-input-formatting """
    with torch.no_grad():
        inputs = {k: v.to(get_model_device(model)) for k, v in inputs.items()}
        outputs = model(**inputs)
        hidden_states = outputs[2]  # 13x1xtx768

    # stack list of tensors 
    wordpiece_embeddings = torch.stack(hidden_states[-4:], dim = 0)  # 4x1xtx768

    # remove batch dimension
    wordpiece_embeddings = torch.squeeze(wordpiece_embeddings, dim = 1)  # 4xtx768

    # order by wordpiece tokens
    wordpiece_embeddings = wordpiece_embeddings.permute(1, 0, 2)  # tx4x768

    # concat last four hidden layers as in original paper
    s = wordpiece_embeddings.size()
    wordpiece_embeddings = wordpiece_embeddings.reshape(s[0], s[1] * s[2])  # tx3072
    
    return wordpiece_embeddings

NameError: name 'Tuple' is not defined

In [5]:
def test_get_wordpiece_embeddings():
    from transformers import AutoModel
    from transformers import AutoTokenizer
    """ Tests for contextual embeddings. Follows https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#2-input-formatting. """

    from scipy.spatial.distance import cosine

    # model_name = 'bert-base-uncased'
    model_name = 'bert-base-multilingual-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name, output_hidden_states = True)
    model.eval()
    
    # english
    en = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
    inputs = tokenizer(en, return_tensors="pt", truncation=True, add_special_tokens=False)
    we_en = get_wordpiece_embeddings(inputs, model)

    tokens = tokenizer.tokenize(en, truncation=True, add_special_tokens=False)
    assert len(we_en) == len(tokens)
    ids = [i for i, x in enumerate(tokens) if x == "bank"]

    same_bank = 1 - cosine(we_en[ids[0]], we_en[ids[1]])
    diff_bank = 1 - cosine(we_en[ids[0]], we_en[ids[2]])
    print(f"diff_bank vs same_bank for en: {diff_bank} vs {same_bank}")
    assert same_bank > diff_bank

def test_get_token_embeddings():
    from transformers import AutoModel
    from transformers import AutoTokenizer
    """ Tests for contextual embeddings. Follows https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#2-input-formatting. """

    from scipy.spatial.distance import cosine

    # model_name = 'bert-base-uncased'
    model_name = 'bert-base-multilingual-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name, output_hidden_states = True)
    model.eval()
    
    # english
    en = "After stealing money from the bank vault, the bank robber was seen fishing on the Mississippi river bank."
    t_en = get_token_embeddings(en, tokenizer, model, merge_strategy="first")

    ids = [i for i, x in enumerate(en.split()) if "bank" in x]

    same_bank = 1 - cosine(t_en[ids[0]], t_en[ids[1]])  # cosine similarity
    diff_bank = 1 - cosine(t_en[ids[0]], t_en[ids[2]])
    print(f"diff_bank vs same_bank for en: {diff_bank} vs {same_bank}")
    assert same_bank > diff_bank

    inputs = tokenizer(en, return_tensors="pt", truncation=True, add_special_tokens=False)
    we_en = get_wordpiece_embeddings(inputs, model)

    tokens = tokenizer.tokenize(en, truncation=True, add_special_tokens=False)
    assert len(we_en) == len(tokens)
    we_ids = [i for i, x in enumerate(tokens) if x == "bank"]
    # token and wordpiece embeddings should be same since "bank" is not split further.
    assert torch.equal(t_en[ids[0]], we_en[we_ids[0]])
    assert torch.equal(t_en[ids[1]], we_en[we_ids[1]])
    assert torch.equal(t_en[ids[2]], we_en[we_ids[2]])

In [6]:
test_get_wordpiece_embeddings()
test_get_token_embeddings()

ModuleNotFoundError: No module named 'transformers'