In [None]:
import pandas as pd
import numpy as np
import urllib.request, json, os, math
import tensorflow as tf

from transformers import (
    TFBertForMaskedLM,
    PreTrainedTokenizerFast,
)

from scipy.spatial.distance import cosine

In [None]:
# When running on Google Colab, mount Google Drive and put the shared
# "Colab Notebooks" folder on sys.path (presumably so the `utils` import that
# follows can be resolved from there).
try:
    from google.colab import drive
    import sys

    drive.mount('/content/gdrive/')
    _notebooks_dir = '/content/gdrive/My Drive/Colab Notebooks'
    # Re-running this cell must not keep growing sys.path with duplicates.
    if _notebooks_dir not in sys.path:
        sys.path.append(_notebooks_dir)
except ImportError:
    # google.colab is unavailable outside Colab; skip the mount in that case.
    # Any other failure (e.g. the mount itself failing) is deliberately left to
    # propagate instead of being hidden behind a later, confusing import error.
    pass

from utils import (
    get_token_embedding,
    get_k_nearest_neighbors,
)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Root folder on the mounted Drive that the tokenizers and models are loaded from.
PROJECT_DIR = '/content/gdrive/My Drive/Colab Notebooks/w266_final_proj'

# Names of the saved tokenizers, relative to PROJECT_DIR.
TOKENIZER1_PATH = 'birthyear.1990_2009.lowercase_tokenizer'
TOKENIZER2_PATH = 'birthyear.1950_1969.lowercase_tokenizer'

# Names of the saved BERT models, relative to PROJECT_DIR.
# (Plain strings: there is nothing to interpolate, so no f-prefix is needed.)
MODEL1_PATH = 'birthyear.1990_2009.lowercase_64batch_size_20000steps'
MODEL2_PATH = 'birthyear.1950_1969.lowercase_64batch_size_20000steps'

# paths to load the trained tokenizers from
full_tokenizer1_path = os.path.join(PROJECT_DIR, TOKENIZER1_PATH)
full_tokenizer2_path = os.path.join(PROJECT_DIR, TOKENIZER2_PATH)

# paths to load the trained BERT models from
full_model1_path = os.path.join(PROJECT_DIR, MODEL1_PATH)
full_model2_path = os.path.join(PROJECT_DIR, MODEL2_PATH)

## Load Models

In [None]:
# Load the two trained tokenizers first, then the two masked-LM BERT models,
# each from its full path under PROJECT_DIR.
tokenizer1, tokenizer2 = (
    PreTrainedTokenizerFast.from_pretrained(tokenizer_path)
    for tokenizer_path in (full_tokenizer1_path, full_tokenizer2_path)
)

bert_model1, bert_model2 = (
    TFBertForMaskedLM.from_pretrained(model_path)
    for model_path in (full_model1_path, full_model2_path)
)

Some layers from the model checkpoint at /content/gdrive/My Drive/Colab Notebooks/w266_final_proj/birthyear.1990_2009.lowercase_64batch_size_20000steps were not used when initializing TFBertForMaskedLM: ['encoder/layer_._2/intermediate/dense/bias:0', 'encoder/layer_._3/attention/output/dense/bias:0', 'encoder/layer_._5/attention/self/query/kernel:0', 'encoder/layer_._2/output/LayerNorm/gamma:0', 'encoder/layer_._6/intermediate/dense/bias:0', 'encoder/layer_._10/attention/self/value/bias:0', 'predictions/transform/LayerNorm/beta:0', 'embeddings/token_type_embeddings/embeddings:0', 'encoder/layer_._7/output/dense/kernel:0', 'encoder/layer_._9/attention/output/LayerNorm/gamma:0', 'encoder/layer_._2/attention/self/key/bias:0', 'encoder/layer_._5/output/LayerNorm/beta:0', 'encoder/layer_._9/intermediate/dense/kernel:0', 'encoder/layer_._8/attention/self/query/bias:0', 'encoder/layer_._3/output/LayerNorm/beta:0', 'encoder/layer_._5/intermediate/dense/bias:0', 'encoder/layer_._9/attention/sel

In [None]:
import pickle

# Reload the model-1 embeddings cached in PROJECT_DIR (the pickle.dump cell
# further down writes this file), so they need not be recomputed.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files this project wrote itself.
with open(os.path.join(PROJECT_DIR, 'embeddings1.pickle'), 'rb') as handle:
    # The context manager closes the file; the bare open(...) used before never did.
    e1 = pickle.load(handle)


In [None]:
# Single-token check of the nearest-neighbour lookup for model 1.
# This cell sits above the cells that define `token`, `k` and `embeddings1`, so
# reading those names here raises NameError on a fresh top-to-bottom run.
# It therefore uses its own sample values and the cached embeddings `e1`
# loaded from embeddings1.pickle in the cell above.
# NOTE(review): assumes the cached pickle matches the current model 1 — confirm.
sample_token = "dem"
sample_k = 10
top_k1, _ = get_k_nearest_neighbors(
    sample_token,
    bert_model1,
    e1,
    tokenizer1,
    k = sample_k
)

## Retrieving Vocab and Token Embeddings from Each Tokenizer

In [None]:
def _embed_vocabulary(model, tokenizer):
    """Return `(vocab, embeddings)` for every token in `tokenizer`'s vocab.

    `embeddings` holds one `get_token_embedding` result per vocab key, in the
    iteration order of the vocab mapping.
    """
    vocab = tokenizer.get_vocab()
    vectors = [get_token_embedding(entry, model, tokenizer) for entry in vocab]
    return vocab, vectors


vocab1, embeddings1 = _embed_vocabulary(bert_model1, tokenizer1)
print(f"Retrieved embeddings for {MODEL1_PATH}")

vocab2, embeddings2 = _embed_vocabulary(bert_model2, tokenizer2)
print(f"Retrieved embeddings for {MODEL2_PATH}")

Retrieved embeddings for birthyear.1990_2009.lowercase_64batch_size_20000steps
Retrieved embeddings for birthyear.1950_1969.lowercase_64batch_size_20000steps


In [None]:
import pickle

# Write both embedding lists into PROJECT_DIR, one pickle file per model,
# using the highest pickle protocol available.
for cache_name, cache_payload in (
    ('embeddings1.pickle', embeddings1),
    ('embeddings2.pickle', embeddings2),
):
    with open(os.path.join(PROJECT_DIR, cache_name), 'wb') as handle:
        pickle.dump(cache_payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Comparing to Tokens from Original Paper

In [None]:
# Tokens to probe (this section compares against tokens from the original paper).
tokens = ["dem", "dam", "rep", "assist", "pr", "fr", "joint", "mega", "flow", "icymi"]
k = 10
for token in tokens:
    # k nearest neighbours of the same token under each model/tokenizer pair.
    top_k1, _ = get_k_nearest_neighbors(token, bert_model1, embeddings1, tokenizer1, k=k)
    top_k2, _ = get_k_nearest_neighbors(token, bert_model2, embeddings2, tokenizer2, k=k)

    print(
        f"Token: {token}",
        f"Top {k} neighbors from tokenizer 1:",
        top_k1,
        f"Top {k} neighbors from tokenizer 2:",
        top_k2,
        sep="\n",
    )

    # Neighbours that both models agree on.
    intersection = set(top_k1) & set(top_k2)
    print(
        f"Intersection: {intersection} ({len(intersection)} / {k})",
        "========================================",
        sep="\n",
    )

Token: dem
Top 10 neighbors from tokenizer 1:
['concentrate' 'blueberry' 'releg' 'ipo' 'bankruptcy' 'tko' 'worms'
 'interact' 'discom' 'prolly']
Top 10 neighbors from tokenizer 2:
['logging' 'ense' 'deplorable' 'slo' 'gulf' 'disastrous' 'doj' 'marvelous'
 'slaves' 'intelligence']
Intersection: set() (0 / 10)
Token: dam
Top 10 neighbors from tokenizer 1:
['gorg' '📔' 'attendants' 'bearded' 'punishment' 'ballroom' 'entitlement'
 'blasting' 'contempor' 'prod']
Top 10 neighbors from tokenizer 2:
['applauded' 'tina' 'piggy' 'wreat' '🧒' 'preced' 'article' 'gomez'
 'restore' '泰']
Intersection: set() (0 / 10)
Token: rep
Top 10 neighbors from tokenizer 1:
['flattered' 'amendment' 'kicker' '##💭' 'swedish' 'performing'
 'disappointment' 'captaincy' 'bullied' 'steeler']
Top 10 neighbors from tokenizer 2:
['wild' 'infinitely' 'illusion' 'merge' 'deemed' 'residence' 'azerbai'
 'journos' 'juicy' 'orthodox']
Intersection: set() (0 / 10)
Token: assist
Top 10 neighbors from tokenizer 1:
['frankly' 'bord'

## Validating on a Pre-Trained BERT Model

In [None]:
# Testing on a Pre-trained BERT Model: run the same nearest-neighbour lookup
# against the public "bert-base-uncased" checkpoint.
from transformers import TFBertModel, BertTokenizer
import torch

model = TFBertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

vocab = tokenizer.get_vocab()
# Use `get_token_embedding`, the helper imported from utils and used for the
# trained models above; `get_embedding` is not imported or defined in the
# visible cells and would raise NameError.
# NOTE(review): get_k_nearest_neighbors is given framework='tf' below; if
# get_token_embedding accepts a matching argument, pass it too — confirm in utils.
embeddings = [get_token_embedding(token, model, tokenizer) for token in vocab.keys()]

top_k, _ = get_k_nearest_neighbors(
    "Australia",
    model,
    embeddings,
    tokenizer,
    k = 10,
    framework = 'tf'
)
# Bare last expression so the notebook displays the neighbours.
top_k

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT