In [13]:
import json
import math
import os
import urllib.request
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from scipy.spatial.distance import cosine
from transformers import (
    PreTrainedTokenizerFast,
    TFBertForMaskedLM,
)

In [2]:
# Colab-only setup: mount Google Drive and put the notebooks folder on
# sys.path so the project-local `utils` module below can be imported.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab; `utils` must already be importable.
    pass
else:
    import sys

    colab_notebooks_dir = '/content/gdrive/My Drive/Colab Notebooks'

    # A failing mount is allowed to propagate: silently swallowing it (or a
    # KeyboardInterrupt during the auth prompt) only defers the failure to a
    # confusing import error a few lines further down.
    drive.mount('/content/gdrive/')

    # Re-running this cell must not keep growing sys.path.
    if colab_notebooks_dir not in sys.path:
        sys.path.append(colab_notebooks_dir)

from utils import (
    get_token_embedding
)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [6]:
# Google Drive folder that holds the trained tokenizers and BERT checkpoints.
DATA_DIR = '/content/gdrive/My Drive/Colab Notebooks/w266_final_proj'

# Directory names of the two trained tokenizers (relative to DATA_DIR).
TOKENIZER1_PATH = 'birthyear.1990_2009.lowercase_tokenizer'
TOKENIZER2_PATH = 'birthyear.1950_1969.lowercase_tokenizer'

# Directory names of the two trained models (relative to DATA_DIR).
# Plain string literals: there is nothing to interpolate, so no f-prefix.
MODEL1_PATH = 'birthyear.1990_2009.lowercase_64batch_size_1000steps'
MODEL2_PATH = 'birthyear.1950_1969.lowercase_64batch_size_1000steps'

# Paths to load the trained tokenizers from.
full_tokenizer1_path = os.path.join(DATA_DIR, TOKENIZER1_PATH)
full_tokenizer2_path = os.path.join(DATA_DIR, TOKENIZER2_PATH)

# Paths to load the trained BERT models from.
full_model1_path = os.path.join(DATA_DIR, MODEL1_PATH)
full_model2_path = os.path.join(DATA_DIR, MODEL2_PATH)

## Load Model #1

In [7]:
tokenizer = PreTrainedTokenizerFast.from_pretrained(full_tokenizer1_path)

# Request the loading report as well as the model, so that a checkpoint whose
# weight names do not line up with TFBertForMaskedLM is flagged here instead
# of only showing up in the library's log output.
bert_model, model1_loading_info = TFBertForMaskedLM.from_pretrained(
    full_model1_path,
    output_loading_info=True,
)

model1_missing_keys = model1_loading_info.get('missing_keys', [])
model1_unexpected_keys = model1_loading_info.get('unexpected_keys', [])
if model1_missing_keys or model1_unexpected_keys:
    # Weights that were not restored keep their fresh initialisation, so
    # anything computed from them (e.g. nearest neighbours of embeddings)
    # does not reflect the trained model.
    warnings.warn(
        f'Checkpoint {full_model1_path!r} did not map cleanly onto '
        f'TFBertForMaskedLM: {len(model1_missing_keys)} missing and '
        f'{len(model1_unexpected_keys)} unexpected weights. Layers without '
        'restored weights are randomly initialised; verify the checkpoint '
        'format before interpreting the embeddings.'
    )

# Input embedding sub-layer of the BERT encoder.
embedding_layer = bert_model.bert.embeddings

Some layers from the model checkpoint at /content/gdrive/My Drive/Colab Notebooks/w266_final_proj/birthyear.1990_2009.lowercase_64batch_size_1000steps were not used when initializing TFBertForMaskedLM: ['encoder/layer_._8/output/LayerNorm/beta:0', 'encoder/layer_._0/attention/output/LayerNorm/gamma:0', 'encoder/layer_._6/attention/self/key/bias:0', 'encoder/layer_._7/attention/self/key/kernel:0', 'encoder/layer_._0/attention/output/dense/kernel:0', 'encoder/layer_._5/attention/self/query/bias:0', 'encoder/layer_._2/intermediate/dense/kernel:0', 'encoder/layer_._9/output/LayerNorm/gamma:0', 'embeddings/word_embeddings/weight:0', 'embeddings/LayerNorm/beta:0', 'encoder/layer_._5/attention/output/dense/bias:0', 'encoder/layer_._7/attention/self/value/kernel:0', 'encoder/layer_._3/output/LayerNorm/gamma:0', 'predictions/transform/dense/bias:0', 'encoder/layer_._5/attention/output/LayerNorm/gamma:0', 'encoder/layer_._6/attention/output/dense/bias:0', 'encoder/layer_._4/attention/self/query/

In [22]:
vocab = tokenizer.get_vocab()
vocab_ids = list(vocab.values())

# Embed the whole vocabulary in a single batched call instead of one eager
# layer call per token. Each id becomes its own length-1 sequence, i.e. the
# same input the per-token version fed, just stacked along the batch axis.
# NOTE(review): assumes the layer maps (batch, seq) ids to (batch, seq, hidden)
# independently per row -- confirm against the installed transformers version.
vocab_embeddings_batched = embedding_layer(tf.constant(vocab_ids)[:, tf.newaxis])

# Split back into one tensor per token, in `vocab_ids` order, each keeping its
# leading batch axis of size 1 so the list has the same layout as before.
embeddings = tf.split(
    vocab_embeddings_batched,
    num_or_size_splits=len(vocab_ids),
    axis=0,
)

In [83]:
def get_k_nearest_neighbors(
    token,
    embeddings,
    tokenizer,
    embedding_layer,
    k=10):
    """Return the ``k`` vocabulary tokens closest to ``token`` by cosine distance.

    Args:
        token: Query token. It is embedded via ``get_token_embedding`` and is
            itself never returned as a neighbour.
        embeddings: Per-token embeddings where ``embeddings[i]`` belongs to the
            i-th key of ``tokenizer.get_vocab()``. Entries may carry extra
            singleton axes (e.g. a leading batch axis); they are flattened
            before the distance is computed.
        tokenizer: Object providing ``get_vocab()`` and ``all_special_tokens``.
        embedding_layer: Passed through unchanged to ``get_token_embedding``.
        k: Maximum number of neighbours to return; ``None`` returns all.

    Returns:
        A ``(tokens, distances)`` pair of NumPy arrays sorted by ascending
        cosine distance. Special tokens and ``token`` itself are excluded, so
        fewer than ``k`` entries are returned when the vocabulary does not
        contain ``k`` other tokens.

    Raises:
        ValueError: If ``k`` is negative or ``embeddings`` does not have one
            entry per vocabulary token.
    """
    if k is not None and k < 0:
        raise ValueError(f'k must be non-negative, got {k}')

    vocab_tokens = list(tokenizer.get_vocab().keys())
    embeddings = list(embeddings)
    if len(embeddings) != len(vocab_tokens):
        # A length mismatch means positions no longer line up with tokens and
        # every returned neighbour would be mislabelled.
        raise ValueError(
            f'expected one embedding per vocabulary token '
            f'({len(vocab_tokens)}), got {len(embeddings)}'
        )

    query = np.ravel(np.asarray(
        get_token_embedding(
            tokenizer,
            embedding_layer,
            token
        )
    ))

    # Leave special tokens and the query out of the candidate set entirely,
    # rather than giving them a sentinel distance that can still be returned.
    excluded = set(tokenizer.all_special_tokens) | {token}
    candidate_idx = [
        idx for idx, vocab_token in enumerate(vocab_tokens)
        if vocab_token not in excluded
    ]

    dists = np.array(
        [cosine(query, np.ravel(np.asarray(embeddings[idx]))) for idx in candidate_idx],
        dtype=float,
    )

    # Stable sort keeps vocabulary order among equally distant tokens.
    nearest = dists.argsort(kind='stable')[:k]
    nearest_vocab_idx = np.array(candidate_idx, dtype=int)[nearest]

    return np.array(vocab_tokens, dtype=str)[nearest_vocab_idx], dists[nearest]

In [84]:
# Nearest neighbours of "dog" in the first model's embedding space, using the
# function's default number of neighbours.
top_k, top_k_dists = get_k_nearest_neighbors(
    token='dog',
    embeddings=embeddings,
    tokenizer=tokenizer,
    embedding_layer=embedding_layer,
)
top_k

array(['gha', 'dragging', 'hospital', 'vital', 'bowler', 'obit',
       'prosecut', 'vlog', 'sig', 'ann'], dtype='<U34')