This is a primer on how some commonly used static embeddings perform on similarity tasks.

**Word embeddings** include:
- SpaCy embeddings
- Word2Vec
- GloVe
- FastText

**Emoji embeddings** include:
- emoji2vec
- emojional

Both can be used together with the `word2vec-google-news-300` word vectors.

**Basic similarity tasks** include:
- find top 10 most similar tokens (cosine similarity by default)
- find top 10 most similar emojis (cosine similarity by default)
- find top 10 most similar tokens (euclidean distance)
- find top 10 most similar emojis (euclidean distance)

In [1]:
# Install gensim, which provides KeyedVectors and the pretrained-model
# downloader (gensim.downloader) used in the cells below.
!pip install gensim



In [2]:
import spacy
from gensim.models import KeyedVectors
import numpy as np # for similarity calculation

## SpaCy

In [3]:
# Download the large English spaCy pipeline (en_core_web_lg).
# After installation the kernel/runtime may need a restart before the
# package can be loaded.
spacy.cli.download("en_core_web_lg")

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
# Load the spaCy model
nlp = spacy.load('en_core_web_lg')
# Handle to the pipeline's lemmatizer component.
# NOTE(review): not referenced by any other visible cell — confirm it is needed.
lemmatizer = nlp.get_pipe("lemmatizer")

In [5]:
# Cosine similarity between two single-word spaCy docs
similarity = nlp("happy").similarity(nlp("joyful"))
print("Similarity:", similarity)

Similarity: 0.5075500964207872


In [6]:
# Find the phrase most similar to a word
phrases = ["joyful time", "sad moment", "pleased atmosphere", "delightful experience", "content feeling"]

happy_doc = nlp("happy")

# Map every candidate phrase to its similarity with the reference doc.
similarity_scores = {
    candidate: happy_doc.similarity(nlp(candidate)) for candidate in phrases
}

# The best phrase is the dictionary key with the highest score.
most_similar_phrase = max(similarity_scores, key=similarity_scores.get)
print("Most similar phrase to 'happy':", most_similar_phrase)


Most similar phrase to 'happy': pleased atmosphere


In [61]:
# 1. Find top 10 most similar tokens (cosine similarity by default)
query = "run"
word = nlp(query)

# NOTE(review): this ranks only the lexemes the vocab object yields when
# iterated; whether that covers the model's whole vector table should be
# confirmed (see `nlp.vocab.vectors.most_similar` for a table-wide search).
scored = [
    (lexeme.text, word.similarity(lexeme))
    for lexeme in word.vocab
    # Skip lexemes without a vector (nothing meaningful to compare) and the
    # query itself, so the slice below always yields ten real neighbours.
    if lexeme.has_vector and lexeme.text != query
]

# Highest cosine similarity first; each score is computed once and reused
# for both sorting and printing.
similar_words = sorted(scored, key=lambda pair: pair[1], reverse=True)[:10]

for text, score in similar_words:
    print(text, score)

time 0.2951683405947487
Ought 0.2727500607475427
O’clock 0.2568140671468667
got 0.25313470998735677
there 0.2513292723653763
goin’ 0.2351591379197318
O'clock 0.23487492170225424
goin 0.22937445520425834
goin' 0.22069260268271804
have 0.2196272277277968


  similar_words = sorted(word.vocab, key=lambda w: word.similarity(w), reverse=True)[:11]


In [62]:
# 2. Find top 10 most similar tokens (euclidean distance)
query = "run"
query_vector = nlp(query).vector

# Euclidean distance from the query to every alphabetic lexeme that has a
# vector. The query is excluded here rather than by assuming it sorts to
# position 0 afterwards.
distances = [
    (lexeme.text, np.linalg.norm(query_vector - lexeme.vector))
    for lexeme in nlp.vocab
    if lexeme.has_vector and lexeme.is_alpha and lexeme.text != query
]

# Smallest distance first; keep the ten nearest.
top10_similar_words = sorted(distances, key=lambda pair: pair[1])[:10]
for text, dist in top10_similar_words:
    print(f"{text}: {dist}")


Ought: 68.85926055908203
goin: 70.31033325195312
Kans: 71.58335876464844
Adm: 72.10071563720703
Id: 72.11503601074219
Wis: 72.27690124511719
Doin: 72.83971405029297
Nev: 72.84615325927734
Coz: 72.89249420166016
Ariz: 72.98558807373047


## Word2Vec

In [14]:
# Access word2vec Google-news-300
# `api` is gensim's downloader module; the GloVe cell below reuses this name.
import gensim.downloader as api
w2v_model = api.load("word2vec-google-news-300")



In [15]:
# Cosine similarity between two tokens, computed only when both are in the
# Word2Vec vocabulary (otherwise nothing is printed).
if all(token in w2v_model.key_to_index for token in ("happy", "joyful")):
    similarity = w2v_model.similarity("happy", "joyful")
    print("Similarity:", similarity)

Similarity: 0.42381963


In [63]:
# 1. Find top 10 most similar tokens (cosine similarity by default)
# Nothing is printed when the query word is missing from the vocabulary.
if "run" in w2v_model.key_to_index:
    for neighbor, score in w2v_model.most_similar("run", topn=10):
        print(neighbor, score)

runs 0.6569936275482178
running 0.6062965989112854
drive 0.4834049642086029
ran 0.4764978289604187
scamper 0.46932122111320496
tworun_double 0.46402227878570557
go 0.4631645083427429
twoout 0.45749351382255554
walk 0.45697975158691406
Mark_Grudzielanek_singled 0.4565179646015167


In [64]:
# 2. Find top 10 most similar tokens (euclidean distance)
query = "run"

# Fail early with a clear message if the query has no embedding.
if query not in w2v_model.key_to_index:
    raise ValueError("The word is not in the model's vocabulary.")
query_vector = w2v_model[query]

# Euclidean distance from the query to every other token in the vocabulary.
# The query is excluded here rather than by assuming it sorts to position 0.
distances = [
    (token, np.linalg.norm(query_vector - w2v_model[token]))
    for token in w2v_model.key_to_index
    if token != query
]

# Smallest distance first; keep the ten nearest.
top10_similar_words = sorted(distances, key=lambda pair: pair[1])[:10]

for token, dist in top10_similar_words:
    print(f"{token}: {dist}")


Batterymate_Miguel_Olivo: 1.7986350059509277
Peter_Bourjos_tripled: 1.8141846656799316
Reliever_Macay_McBride: 1.8192209005355835
Alec_Lowrey: 1.824215292930603
Earnest_Rhone: 1.8242930173873901
Melisa_Koutz: 1.8245304822921753
Joey_Swatfager: 1.8273006677627563
Miguel_Cabrera_belted: 1.829398274421692
Nate_Rolison: 1.8309024572372437
Austin_Kearns_grounded: 1.8338943719863892


## GloVe

In [18]:
# Load GloVe model (this will download the model if not already cached locally)
# Relies on `api` (gensim.downloader) imported in the Word2Vec section.
glove_model = api.load("glove-wiki-gigaword-300")



In [19]:
# Cosine similarity between two tokens, computed only when both are in the
# GloVe vocabulary (otherwise nothing is printed).
if all(token in glove_model.key_to_index for token in ('happy', 'joyful')):
    similarity = glove_model.similarity('happy', 'joyful')
    print(f"Similarity between 'happy' and 'joyful': {similarity}")

Similarity between 'happy' and 'joyful': 0.4751318395137787


In [65]:
# 1. Find top 10 most similar tokens (cosine similarity by default)
# Nothing is printed when the query word is missing from the vocabulary.
if 'run' in glove_model.key_to_index:
    # Ten neighbours are requested, so the result is named accordingly.
    top10_similar_words = glove_model.most_similar('run', topn=10)
    for neighbor, score in top10_similar_words:
        print(f"{neighbor}: {score}")

runs: 0.7017062306404114
running: 0.6978930830955505
ran: 0.630337119102478
go: 0.5583360195159912
allowed: 0.551228940486908
going: 0.5329922437667847
went: 0.5286133289337158
out: 0.5273594260215759
start: 0.5214914083480835
off: 0.510507345199585


In [66]:
# 2. Find top 10 most similar tokens (euclidean distance)
query = 'run'

# Fail early with a clear message if the query has no embedding.
if query not in glove_model.key_to_index:
    raise ValueError("The word is not in the GloVe model's vocabulary.")
query_vector = glove_model[query]

# Euclidean distance from the query to every other token in the vocabulary.
# The query is excluded here rather than by assuming it sorts to position 0.
distances = [
    (token, np.linalg.norm(query_vector - glove_model[token]))
    for token in glove_model.key_to_index
    if token != query
]

# Smallest distance first; keep the ten nearest.
top10_similar_words = sorted(distances, key=lambda pair: pair[1])[:10]

for token, dist in top10_similar_words:
    print(f"{token}: {dist}")


running: 4.71186637878418
runs: 4.880781173706055
ran: 5.099777698516846
allowed: 5.4944844245910645
go: 5.5153961181640625
went: 5.581228256225586
going: 5.697341442108154
well: 5.726036071777344
out: 5.763474941253662
only: 5.772265434265137


## FastText

In [22]:
# Load the FastText vectors by name through gensim's downloader.
# The import repeats the one already made in the Word2Vec section.
import gensim.downloader as api
ft_model = api.load('fasttext-wiki-news-subwords-300')



In [23]:
# Cosine similarity between the FastText vectors of two tokens
token_a, token_b = 'happy', 'joyful'
similarity = ft_model.similarity(token_a, token_b)
print("Similarity between 'happy' and 'joyful':", similarity)


Similarity between 'happy' and 'joyful': 0.71287423


In [67]:
# 1. Find top 10 most similar tokens (cosine similarity by default)
for neighbor, score in ft_model.most_similar('run', topn=10):
    print(f"{neighbor}: {score}")

running: 0.8197714686393738
ran: 0.7697737216949463
runnning: 0.7573951482772827
runs: 0.7466383576393127
trun: 0.7098832130432129
runned: 0.7096849083900452
retrun: 0.6958275437355042
runnng: 0.6936571002006531
runnign: 0.6931751370429993
srun: 0.679145872592926


In [68]:
# 2. Find top 10 most similar tokens (euclidean distance)
query = 'run'
query_vector = ft_model[query]

# Euclidean distance from the query to every other token in the vocabulary.
# The query is excluded here rather than by assuming it sorts to position 0.
word_distances = [
    (token, np.linalg.norm(query_vector - ft_model[token]))
    for token in ft_model.index_to_key
    if token != query
]

# Smallest distance first; keep the ten nearest.
sorted_distances = sorted(word_distances, key=lambda pair: pair[1])[:10]
for token, dist in sorted_distances:
    print(f"{token}: {dist}")

running: 0.8109013438224792
ran: 0.8868503570556641
runs: 0.8995624780654907
runned: 0.919409990310669
trun: 0.92112797498703
runnning: 0.9247350096702576
retrun: 0.9549537301063538
runnign: 0.9586350321769714
run.: 0.9668093323707581
runing: 0.9748907089233398


## Find top 10 emojis:
- cosine similarity + word2vec + emoji2vec
- cosine similarity + word2vec + emojional
- euclidean distance + word2vec + emoji2vec
- euclidean distance + word2vec + emojional

In [27]:
# Load the emoji2vec emoji embeddings from a binary word2vec-format file.
# The path is relative, so emoji2vec.bin must be in the working directory.
e2v = KeyedVectors.load_word2vec_format("emoji2vec.bin", binary=True)

In [28]:
# Load the emojional emoji embeddings from a binary word2vec-format file.
# The path is relative, so emojional.bin must be in the working directory.
el2v = KeyedVectors.load_word2vec_format("emojional.bin", binary=True)

In [46]:
# 1.1 cosine similarity + word2vec + emoji2vec
# Look the word up in Word2Vec, then rank the emoji2vec emojis by cosine
# similarity to that vector.
word_vector = w2v_model.get_vector("run")

# Ten emojis are requested, so the result is named accordingly.
top10_emojis = e2v.most_similar(word_vector, topn=10)
for emoji, similarity in top10_emojis:
    print(f"{emoji}: {similarity}")


🏃: 0.623569667339325
🎽: 0.4897686839103699
🏇: 0.3370935022830963
🏁: 0.3312745988368988
🚶: 0.33033713698387146
💸: 0.30279526114463806
🚷: 0.2999289035797119
💯: 0.29605159163475037
💭: 0.29440468549728394
🎳: 0.2800602614879608


In [47]:
# 1.2 cosine similarity + word2vec + emojional
# Look the word up in Word2Vec, then rank the emojional emojis by cosine
# similarity to that vector.
word_vector = w2v_model.get_vector("run")

# Ten emojis are requested, so the result is named accordingly.
top10_emojis = el2v.most_similar(word_vector, topn=10)
for emoji, similarity in top10_emojis:
    print(f"{emoji}: {similarity}")

🏃‍♀️: 0.08441895246505737
🏃‍♂️: 0.0673506110906601
👞: 0.054681211709976196
⌛: 0.0460536852478981
👟: 0.03200191259384155
🏇: 0.03135138377547264
🚥: 0.030578602105379105
⛳: 0.028483860194683075
🪣: 0.027112886309623718
👨‍🦽: 0.015860840678215027


In [48]:
# 2.1 euclidean distance + word2vec + emoji2vec
query = "run"

# Guard on the same word that is looked up below, so a missing word raises
# the descriptive ValueError rather than a bare KeyError.
if query not in w2v_model.key_to_index:
    raise ValueError("The word is not in the Word2Vec model's vocabulary.")
query_vector = w2v_model[query]

# Euclidean distance from the word vector to every emoji vector.
# assumes the emoji vectors have the same dimensionality as the Word2Vec
# vectors — TODO confirm
emoji_distances = [
    (emoji, np.linalg.norm(query_vector - e2v[emoji]))
    for emoji in e2v.key_to_index
]

# Smallest distance first; keep the ten nearest emojis.
top10_emojis = sorted(emoji_distances, key=lambda pair: pair[1])[:10]

for emoji, dist in top10_emojis:
    print(f"{emoji}: {dist}")

🏃: 1.5988041162490845
🎽: 1.7596166133880615
🏇: 1.9267877340316772
🏁: 1.932873010635376
🚶: 1.933851718902588
💸: 1.9623844623565674
🚷: 1.9653302431106567
💯: 1.9693076610565186
💭: 1.9709948301315308
🎳: 1.9856284856796265


In [49]:
# 2.2 euclidean distance + word2vec + emojional
query = "run"

# Guard on the same word that is looked up below, so a missing word raises
# the descriptive ValueError rather than a bare KeyError.
if query not in w2v_model.key_to_index:
    raise ValueError("The word is not in the Word2Vec model's vocabulary.")
query_vector = w2v_model[query]

# Euclidean distance from the word vector to every emoji vector.
# assumes the emoji vectors have the same dimensionality as the Word2Vec
# vectors — TODO confirm
emoji_distances = [
    (emoji, np.linalg.norm(query_vector - el2v[emoji]))
    for emoji in el2v.key_to_index
]

# Smallest distance first; keep the ten nearest emojis.
top10_emojis = sorted(emoji_distances, key=lambda pair: pair[1])[:10]

for emoji, dist in top10_emojis:
    print(f"{emoji}: {dist}")

👱‍♂️: 3.6923322677612305
🔳: 4.122771263122559
👩‍🦱: 4.410493850708008
🤙: 4.442147254943848
🔘: 4.445608139038086
👨: 4.470482349395752
👱: 4.497052192687988
👨‍🦱: 4.6015167236328125
📱: 4.61079740524292
🏡: 4.612793445587158
