In [1]:
import transformers

In [3]:
tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')
model = transformers.AutoModel.from_pretrained('bert-base-uncased')

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print('model on device:', device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model on device: cuda


In [5]:
tokenizer([['Hi', 'friend']])

{'input_ids': [[101, 7632, 102, 2767, 102]], 'token_type_ids': [[0, 0, 0, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1]]}

In [6]:
from nltk.corpus import wordnet as wn

In [7]:
limit = None

synonym_pairs = []
antonym_pairs = []
for ss in wn.all_synsets():
    word_name = ss.lemmas()[0].name()
    for lemma in ss.lemmas():
        if lemma.name() != word_name:
            synonym_pairs.append((word_name, lemma.name()))
        if lemma.antonyms():
            for ant_lemma in lemma.antonyms():
                antonym_pairs.append((word_name, ant_lemma.name()))
    if (limit is not None) and (len(synonym_pairs) > limit) and (len(antonym_pairs) > limit): break

print('\n\nSynonyms:', synonym_pairs[:10] + ['...'])
print('\n\nAntonyms:', antonym_pairs[:10] + ['...'])



Synonyms: [('abaxial', 'dorsal'), ('adaxial', 'ventral'), ('abducent', 'abducting'), ('adducent', 'adductive'), ('adducent', 'adducting'), ('emergent', 'emerging'), ('cut', 'shortened'), ('full-length', 'uncut'), ('implicit', 'unquestioning'), ('relative', 'comparative'), '...']


Antonyms: [('able', 'unable'), ('unable', 'able'), ('abaxial', 'adaxial'), ('adaxial', 'abaxial'), ('acroscopic', 'basiscopic'), ('basiscopic', 'acroscopic'), ('abducent', 'adducent'), ('adducent', 'abducent'), ('nascent', 'dying'), ('dying', 'nascent'), '...']


In [8]:
len(synonym_pairs), len(antonym_pairs)

(89319, 7979)

In [41]:
from typing import List, Tuple

import tqdm
import torch

batch_size = 256

def get_bert_similarities(word_pairs: List[Tuple[str]]) -> torch.Tensor:
    word1 = [p[0] for p in word_pairs]
    word2 = [p[1] for p in word_pairs]
    
    word1_tokenized = tokenizer(word1, padding=True, truncation=True, return_tensors='pt')
    word1_tokenized = {k: v.to(device) for k,v in word1_tokenized.items()}
    word1_reps = model(**word1_tokenized).last_hidden_state[:, 0, :]
    
    word2_tokenized = tokenizer(word2, padding=True, truncation=True, return_tensors='pt')
    word2_tokenized = {k: v.to(device) for k,v in word2_tokenized.items()}
    word2_reps = model(**word2_tokenized).last_hidden_state[:, 0, :]
    
    return torch.nn.CosineSimilarity(dim=1)(word1_reps, word2_reps)

synonym_pairs_with_sim = []
for i in tqdm.trange(0, len(synonym_pairs), batch_size, desc='processing synonyms'):
    syn_pairs = [synonym_pairs[j] for j in range(i, min(len(synonym_pairs), i+batch_size))]
    sims = get_bert_similarities(syn_pairs)
    for word_pair, sim in zip(syn_pairs, sims):
        synonym_pairs_with_sim.append((word_pair[0], word_pair[1], sim.item()))
    # if i >= batch_size * 10: break

antonym_pairs_with_sim = []
for i in tqdm.trange(0, len(antonym_pairs), batch_size, desc='processing antonyms'):
    ant_pairs = [antonym_pairs[j] for j in range(i, min(len(antonym_pairs), i+batch_size))]
    sims = get_bert_similarities(ant_pairs)
    for word_pair, sim in zip(ant_pairs, sims):
        antonym_pairs_with_sim.append((word_pair[0], word_pair[1], sim.item()))
    # if i >= batch_size * 10: break

processing synonyms: 100%|██████████| 349/349 [00:36<00:00,  9.63it/s]
processing antonyms: 100%|██████████| 32/32 [00:02<00:00, 12.44it/s]


In [43]:
import pandas as pd

a_df = pd.DataFrame(antonym_pairs_with_sim, columns=['word1', 'word2', 'sim'])
a_df['relationship'] = 'antonym'
a_df.head()

Unnamed: 0,word1,word2,sim,relationship
0,able,unable,0.98155,antonym
1,unable,able,0.98155,antonym
2,abaxial,adaxial,0.978398,antonym
3,adaxial,abaxial,0.978398,antonym
4,acroscopic,basiscopic,0.880585,antonym


In [44]:
s_df = pd.DataFrame(synonym_pairs_with_sim, columns=['word1', 'word2', 'sim'])
s_df['relationship'] = 'synonym'
s_df.head()

Unnamed: 0,word1,word2,sim,relationship
0,abaxial,dorsal,0.865212,synonym
1,adaxial,ventral,0.887948,synonym
2,abducent,abducting,0.732276,synonym
3,adducent,adductive,0.933363,synonym
4,adducent,adducting,0.775546,synonym


In [45]:
df = pd.concat((a_df, s_df)).reset_index()
print(len(a_df), len(s_df), len(df))
df.head()

7979 89319 97298


Unnamed: 0,index,word1,word2,sim,relationship
0,0,able,unable,0.98155,antonym
1,1,unable,able,0.98155,antonym
2,2,abaxial,adaxial,0.978398,antonym
3,3,adaxial,abaxial,0.978398,antonym
4,4,acroscopic,basiscopic,0.880585,antonym


In [46]:
import tqdm
tqdm.tqdm.pandas() # gives us progress_map() for pandas

In [47]:
from wordfreq import word_frequency

df['word1_freq'] = df['word1'].progress_map(lambda w: word_frequency(w, 'en'))
df['word2_freq'] = df['word2'].progress_map(lambda w: word_frequency(w, 'en'))

100%|██████████| 97298/97298 [00:00<00:00, 151772.48it/s]
100%|██████████| 97298/97298 [00:00<00:00, 111144.44it/s]


In [48]:
df['freq'] = (df['word1_freq'] + df['word2_freq'] / 2)
df['freq*sim'] = df['freq']*df['sim']
df['freq/sim'] = df['freq'] / df['sim']

In [49]:
df.head()

Unnamed: 0,index,word1,word2,sim,relationship,word1_freq,word2_freq,freq,freq*sim,freq/sim
0,0,able,unable,0.98155,antonym,0.000269,4.07e-05,0.00028935,0.0002840116,0.0002947887
1,1,unable,able,0.98155,antonym,4.07e-05,0.000269,0.0001752,0.0001719676,0.0001784931
2,2,abaxial,adaxial,0.978398,antonym,3.47e-08,2.69e-08,4.815e-08,4.710987e-08,4.92131e-08
3,3,adaxial,abaxial,0.978398,antonym,2.69e-08,3.47e-08,4.425e-08,4.329412e-08,4.522699e-08
4,4,acroscopic,basiscopic,0.880585,antonym,0.0,0.0,0.0,0.0,0.0


In [None]:
import seaborn as sns

In [None]:
sns.histplot(data=df, x="sim", hue="relationship") #multiple="stack"

## What are the most similar and dissimilar antonyms?

In [None]:
f['relationship'] == 'antonym'].sort_values(by='sim', ascending=False).head(n=10)

In [None]:
 df[df['relationship'] == 'antonym'].sort_values(by='sim', ascending=True).head(n=10)

## What are the most similar and dissimilar synonyms?

In [None]:
df[df['relationship'] == 'synonym'].sort_values(by='sim', ascending=False).head(n=10)

In [None]:
df[df['relationship'] == 'synonym'].sort_values(by='sim', ascending=True).head(n=30)

## What are the most important misunderstood word pairs?
(Measuring *importance* as just the frequency of the word in English.)

In [50]:
## Are there any highly misunderstood synonym or antonym pairs in BERT?
## (Instead of looking at the averages/totals like in the previous graphs, look at the tails. And include word frequency data.)
## Also we should use the full (not limited) WordNet synset list for this.

#### 1. Get frequency of each word in english. 
        # > Make three new columns: word1_freq, word2_freq, and mean_freq
#### 2. Make a derived column: sim/freq or freq/sim
        # > We're interested in SYNONYMS with high frequency and low similarity -- high freq/sim.
        # > and ANTONYMS with high frequency and high similarity -- high freq*sim maybe?

In [54]:
pd.options.display.max_rows = 999
df[(df['relationship'] == 'synonym') & (df['sim'] < 1.0)].sort_values(by='freq/sim', ascending=False).head(400)

Unnamed: 0,index,word1,word2,sim,relationship,word1_freq,word2_freq,freq,freq*sim,freq/sim
44050,36071,A,group_A,0.891953,synonym,0.0229,0.0,0.0229,0.020426,0.025674
44049,36070,A,type_A,0.920186,synonym,0.0229,0.0,0.0229,0.021072,0.024886
21753,13774,in,inwards,0.796561,synonym,0.0186,7.76e-07,0.0186,0.014816,0.023351
21754,13775,in,inward,0.958435,synonym,0.0186,2.34e-06,0.018601,0.017828,0.019408
84288,76309,deoxyadenosine_monophosphate,A,0.708728,synonym,0.0,0.0229,0.01145,0.008115,0.016156
83938,75959,adenine,A,0.882,synonym,2.19e-07,0.0229,0.01145,0.010099,0.012982
85112,77133,vitamin_A,A,0.911925,synonym,0.0,0.0229,0.01145,0.010442,0.012556
79806,71827,ampere,A,0.913863,synonym,1.62e-07,0.0229,0.01145,0.010464,0.012529
79872,71893,angstrom,A,0.926114,synonym,9.33e-08,0.0229,0.01145,0.010604,0.012364
79841,71862,inch,in,0.772949,synonym,3.09e-05,0.0186,0.009331,0.007212,0.012072


In [55]:
df[df['relationship'] == 'antonym'].sort_values(by='freq*sim', ascending=False).head(400)

Unnamed: 0,index,word1,word2,sim,relationship,word1_freq,word2_freq,freq,freq*sim,freq/sim
2580,2580,on,off,0.974484,antonym,0.00813,0.000851,0.008556,0.008337,0.00878
2578,2578,on,off,0.974484,antonym,0.00813,0.000851,0.008556,0.008337,0.00878
7936,7936,have,lack,0.964242,antonym,0.00513,8.32e-05,0.005172,0.004987,0.005363
2581,2581,off,on,0.974484,antonym,0.000851,0.00813,0.004916,0.004791,0.005045
2579,2579,off,on,0.974484,antonym,0.000851,0.00813,0.004916,0.004791,0.005045
3401,3401,all,no,0.90161,antonym,0.00331,0.00224,0.00443,0.003994,0.004913
3400,3400,all,some,0.935678,antonym,0.00331,0.00158,0.0041,0.003836,0.004382
3398,3398,no,all,0.90161,antonym,0.00224,0.00331,0.003895,0.003512,0.00432
3397,3397,some,all,0.935678,antonym,0.00158,0.00331,0.003235,0.003027,0.003457
3399,3399,no,some,0.913424,antonym,0.00224,0.00158,0.00303,0.002768,0.003317
