In [1]:
import pandas as pd
import numpy as np
import urllib.request, json, os, math
import tensorflow as tf

from transformers import (
    TFBertForMaskedLM,
    PreTrainedTokenizerFast,
)

from scipy.spatial.distance import cosine

In [2]:
# Colab-only setup: mount Google Drive and put the shared notebook folder on
# the import path (presumably where `utils.py` lives -- confirm). Outside
# Colab the `google.colab` import fails and this cell does nothing.
try:
    from google.colab import drive
    import sys
except ImportError:
    # Not running on Colab: `utils` must already be importable locally.
    pass
else:
    try:
        drive.mount('/content/gdrive/')
        colab_notebooks_dir = '/content/gdrive/My Drive/Colab Notebooks'
        # Guard against duplicate entries when the cell is re-run.
        if colab_notebooks_dir not in sys.path:
            sys.path.append(colab_notebooks_dir)
    except Exception as err:
        # Stay best-effort like the original, but report the failure instead
        # of hiding it; KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"Warning: Google Drive setup failed: {err!r}")

from utils import (
    get_token_embedding,
    get_k_nearest_neighbors,
)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [3]:
# Root folder on the mounted Google Drive that holds the saved artifacts.
PROJECT_DIR = '/content/gdrive/My Drive/Colab Notebooks/w266_final_proj'

# Artifact directory names, relative to PROJECT_DIR. Going by the names,
# "1" is the 1990-2009 birth-year cohort and "2" is the 1950-1969 cohort.
TOKENIZER1_PATH = 'birthyear.1990_2009.lowercase_tokenizer'
TOKENIZER2_PATH = 'birthyear.1950_1969.lowercase_tokenizer'

# Plain string literals: there is nothing to interpolate, so no f-prefix.
MODEL1_PATH = 'birthyear.1990_2009.lowercase_64batch_size_1000steps'
MODEL2_PATH = 'birthyear.1950_1969.lowercase_64batch_size_1000steps'

# paths to load the trained tokenizers from
full_tokenizer1_path = os.path.join(PROJECT_DIR, TOKENIZER1_PATH)
full_tokenizer2_path = os.path.join(PROJECT_DIR, TOKENIZER2_PATH)

# paths to load the trained BERT models from
full_model1_path = os.path.join(PROJECT_DIR, MODEL1_PATH)
full_model2_path = os.path.join(PROJECT_DIR, MODEL2_PATH)

## Load Models

In [4]:
# Load one tokenizer and one masked-LM checkpoint per cohort, then keep a
# handle on each model's input-embedding layer for the lookups further down.
# NOTE(review): the saved load log for model 1 lists checkpoint layers that
# "were not used when initializing TFBertForMaskedLM" -- confirm the trained
# weights were actually restored before trusting the neighbour results.
tokenizer1, tokenizer2 = (
    PreTrainedTokenizerFast.from_pretrained(tokenizer_dir)
    for tokenizer_dir in (full_tokenizer1_path, full_tokenizer2_path)
)

bert_model1, bert_model2 = (
    TFBertForMaskedLM.from_pretrained(model_dir)
    for model_dir in (full_model1_path, full_model2_path)
)

embedding_layer1, embedding_layer2 = (
    loaded.bert.embeddings for loaded in (bert_model1, bert_model2)
)

Some layers from the model checkpoint at /content/gdrive/My Drive/Colab Notebooks/w266_final_proj/birthyear.1990_2009.lowercase_64batch_size_1000steps were not used when initializing TFBertForMaskedLM: ['encoder/layer_._1/attention/self/value/bias:0', 'encoder/layer_._3/attention/output/LayerNorm/gamma:0', 'encoder/layer_._10/attention/self/value/kernel:0', 'encoder/layer_._9/attention/output/LayerNorm/gamma:0', 'encoder/layer_._6/attention/self/query/kernel:0', 'encoder/layer_._0/attention/output/dense/bias:0', 'encoder/layer_._9/attention/output/dense/kernel:0', 'encoder/layer_._2/output/dense/bias:0', 'encoder/layer_._6/attention/output/LayerNorm/beta:0', 'encoder/layer_._0/attention/output/dense/kernel:0', 'encoder/layer_._2/attention/self/value/kernel:0', 'encoder/layer_._4/attention/self/query/kernel:0', 'encoder/layer_._2/attention/output/dense/kernel:0', 'predictions/transform/dense/bias:0', 'encoder/layer_._1/output/dense/kernel:0', 'encoder/layer_._1/output/LayerNorm/gamma:0'

## Retrieving the Vocab and Token Embeddings from Each Tokenizer

In [5]:
def _embed_vocab(tokenizer, embedding_layer):
    """Look up the embedding of every vocabulary entry of ``tokenizer``.

    Returns a ``(vocab, vocab_ids, embeddings)`` triple: the mapping returned
    by ``tokenizer.get_vocab()``, its values as a list, and one embedding per
    id in that same order.
    """
    vocab = tokenizer.get_vocab()
    ids = list(vocab.values())
    vectors = []
    for token_id in ids:
        # One layer call per id on a 1x1 batch; element [0][0] of the output
        # is kept (presumably the single token's vector -- confirm shape).
        vectors.append(embedding_layer(tf.constant([[token_id]]))[0][0])
    return vocab, ids, vectors


vocab1, vocab_ids1, embeddings1 = _embed_vocab(tokenizer1, embedding_layer1)
vocab2, vocab_ids2, embeddings2 = _embed_vocab(tokenizer2, embedding_layer2)

## Comparing Nearest Neighbors for Tokens from the Original Paper

In [18]:
# For each probe token, fetch its k nearest neighbours under each cohort's
# tokenizer/embedding layer and report how many neighbours the two share.
tokens = ["dem", "dam", "rep", "assist", "pr", "fr", "joint", "mega", "flow", "icymi"]
k = 10
for token in tokens:
    # The second return value of the helper is not needed here.
    top_k1, _ = get_k_nearest_neighbors(
        token, embeddings1, tokenizer1, embedding_layer1, k=k
    )
    top_k2, _ = get_k_nearest_neighbors(
        token, embeddings2, tokenizer2, embedding_layer2, k=k
    )

    print(f"Token: {token}")
    print(f"Top {k} neighbors from tokenizer 1:")
    print(top_k1)
    print(f"Top {k} neighbors from tokenizer 2:")
    print(top_k2)
    # Neighbours present in both cohorts' top-k lists.
    intersection = set(top_k1) & set(top_k2)
    print(f"Intersection: {intersection} ({len(intersection)} / {k})")
    print("========================================")

Token: dem
Top 10 neighbors from tokenizer 1:
['##u' 'slap' 'scuba' 'gardening' 'pros' 'julie' '##ssa' 'opening'
 'sleeve' '👬']
Top 10 neighbors from tokenizer 2:
['##ᄑ' 'wrongs' 'muscles' 'elitist' 'tens' '##м' 'organising'
 'commentator' 'citiz' 'begins']
Intersection: set() (0 / 10)
Token: dam
Top 10 neighbors from tokenizer 1:
['##”' 'kry' 'amazes' 'obesity' 'physical' 'bamb' 'gast' 'dissertation'
 'autobiography' '📑']
Top 10 neighbors from tokenizer 2:
['after' '##ٹ' 'sandy' '##𗀟' 'betrayal' 'length' 'indep' 'apprentice'
 '##fb' 'pathfinder']
Intersection: set() (0 / 10)
Token: rep
Top 10 neighbors from tokenizer 1:
['wisco' 'dube' 'thinking' 'farmer' 'cudi' 'shamed' 'wilf' 'skydiving'
 'ming' 'mcgreg']
Top 10 neighbors from tokenizer 2:
['playful' '##orting' 'bax' '##do' 'curios' '##style' '##maid' 'ocean'
 'supermarket' 'millen']
Intersection: set() (0 / 10)
Token: assist
Top 10 neighbors from tokenizer 1:
['##iverse' '##💫' 'gwen' 'treating' 'chopped' 'strugg' '##otch'
 'relentl

In [12]:
# Single-token probe of the neighbour lookup with cohort 1's artifacts; the
# call's return value is the cell output.
# NOTE(review): the traceback saved under this cell comes from execution
# count 12, earlier than the comparison loop's run (count 18) -- it looks
# stale; re-run after Restart & Run All to refresh or remove this cell.
get_k_nearest_neighbors("dem", embeddings1, tokenizer1, embedding_layer1, k=10)

ValueError: Input vector should be 1-D.