In [1]:
# our libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#
# Create a model, m, with the line   m = read_word2vec_model()
#

from gensim.models import KeyedVectors

def read_word2vec_model(filename = "word2vec_model.txt"):
    """ Load text-format word2vec vectors from a file and return the model.

        filename: path of the word2vec file to read (it is read with
                  binary=False); defaults to "word2vec_model.txt"

        Returns the loaded model object (we usually name it m or model),
        or None when the file does not exist. Progress messages and a
        short summary of the model are printed along the way.
    """
    print("Starting to load the model in ", filename, "...")
    try:
        loaded = KeyedVectors.load_word2vec_format(filename, binary=False)
    except FileNotFoundError:
        # no file means no model: warn, then hand back a placeholder
        print(f"  [WARNING]    The file {filename} was not found.     [WARNING]  ")
        return None
    print("Model loaded.\n")

    # a quick summary of what was just read
    print("The model built is", loaded, "\n")
    print("The vocabulary has", loaded.vectors.shape[0], "words")   # one row per word
    print("Each word is a vector of size", loaded.vector_size)
    print("\nTry m.get_vector('python') to see a the vector for 'python'!\n")

    # make sure the per-vector norms are computed up front
    # (we are not going to train the model any further in this notebook)
    loaded.fill_norms()
    return loaded


In [3]:
# 
# best to run this only once... or once in a while:
# it re-reads the whole model file, and the cells below all use the global m set here
# (m will be None if the file was not found)
#
m = read_word2vec_model()

Starting to load the model in  word2vec_model.txt ...
Model loaded.

The model built is KeyedVectors<vector_size=300, 43981 keys> 

The vocabulary has 43981 words
Each word is a vector of size 300

Try m.get_vector('python') to see a the vector for 'python'!



<br>

#### More word-embedding geometry:  Analogies


In [4]:
#
# Let's take a look at some additional "geometry" of word-meanings (cool!)
# Here we ask for the 10 entries most similar to 'python'; each result is a (word, score) pair.
#

m.most_similar(positive='python', topn=10)  # try adding negative='snake' (done in the next cell)

[('snake', 0.660629153251648),
 ('crocodile', 0.6591362953186035),
 ('alligator', 0.6421656012535095),
 ('boa', 0.5617719888687134),
 ('constrictor', 0.5378887057304382),
 ('constrictors', 0.5356365442276001),
 ('snakes', 0.5345131754875183),
 ('anaconda', 0.5207394361495972),
 ('rabbit', 0.5074971318244934),
 ('tortoise', 0.5046288967132568)]

In [5]:
#
# With this most_similar method, we can "subtract" vectors, too:
# 'python' is passed as positive and 'snake' as negative
# (in the recorded run the top result becomes 'Java', with much lower scores overall)
#

m.most_similar(positive='python', negative='snake', topn=10) 

[('Java', 0.22111035883426666),
 ('Dior', 0.2102828025817871),
 ('Notte', 0.207855224609375),
 ('os', 0.19944755733013153),
 ('frock', 0.19739560782909393),
 ('blouse', 0.19704443216323853),
 ('plaids', 0.19696445763111115),
 ('blazer', 0.1878664493560791),
 ('gown', 0.17895956337451935),
 ('Gala', 0.17834939062595367)]

In [6]:
#
# Here, see if you can determine the analogy that is being computed using word embeddings:
# (hint: 'king' and 'woman' are the positive words, 'man' is the negative one;
#  the recorded top result is 'queen')
# 

m.most_similar(positive=['king','woman'], negative=['man'], topn=10) 

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902430415153503),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236843228340149),
 ('queens', 0.5181134939193726),
 ('throne', 0.5005807280540466),
 ('royal', 0.493820458650589),
 ('ruler', 0.49092739820480347),
 ('princes', 0.481081485748291)]

In [9]:
# 
# This problem is about building and testing analogies...
# 
# This function has a hard-coded set of words, i.e., 'woman', 'king', and 'man'
# Your tasks:
#      + add inputs to the function 
#
def test_most_similar(m):
    """ example of most_similar """
    print("Testing most_similar on the king - man + woman example...")
    results = m.most_similar(positive=['woman', 'king'], negative=['man'], topn=10) # topn == # of results
    results = m.most_similar(positive=['France', 'Berlin'], negative=['Germany'], topn=10) # topn == # of results
    return results

# run the example; ending the cell with the bare name displays the returned list
hard_coded_results = test_most_similar(m)
hard_coded_results

Testing most_similar on the king - man + woman example...


[('Paris', 0.7672387361526489),
 ('French', 0.6049168109893799),
 ('Parisian', 0.5810437202453613),
 ('Brussels', 0.542099118232727),
 ('Rome', 0.5099510550498962),
 ('Strasbourg', 0.5049293637275696),
 ('Marseilles', 0.49816644191741943),
 ('Toulouse', 0.4843180179595947),
 ('Paix', 0.4830804169178009),
 ('Francois', 0.4801149368286133)]

In [None]:
# We want to do a few things here:
# (A) Write generate_analogy(word1, word2, word3, m) and try it on some examples of your own.
#     if word4 is the return value, then the idea is  word1 : word2 :: word3 : word4
#          Warning:  the ordering of the words in the most_similar call is DIFFERENT (this is key)
#
#     Also, include a check that all of the words are in the model, e.g., adapting this:
"""
        if word not in model:  # or, not in m
            print(f"Aargh - the model does not contain {word}")
            return 'python' # or a suitable alternative
"""

In [29]:
def generate_analogy(w1, w2, w3, m):
    """ Ask the model m to complete the analogy   w1 : w2 :: w3 : ?

        w1, w2, w3: the three known words of the analogy
        m:          the word-embedding model

        Returns the single best word according to m.most_similar.
        If one of the three words is missing from the model, a message
        naming the first missing word is printed and the placeholder
        string 'Instead, we will return CS35' is returned instead.
    """
    # find the first word (in argument order) that the model does not know
    for word in (w1, w2, w3):
        if word not in m:
            print(f"{word} was not in the model.")
            return 'Instead, we will return CS35'

    # note the ordering: w3 and w2 are the positive words, w1 is the negative one
    top_hit = m.most_similar(positive=[str(w3), str(w2)], negative=[str(w1)], topn=1)[0]
    return top_hit[0]    # each hit is a (word, score) pair
    
# Four analogies of our own, w1 : w2 :: w3 : ?   (the verdicts are our judgment of the answers)
a = generate_analogy('California', 'Sacramento', 'Korea', m)    # WORKS!
b = generate_analogy('sports', 'basketball', 'music', m)        # WORKS!
c = generate_analogy('apple','red', 'watermelon', m)            # DOESN'T WORK!
d = generate_analogy('jacket', 'cold', 'sandals', m)            # DOESN'T WORK!

# one answer per line, in the same order as above
print(a, b, c, d, sep="\n")

Seoul
jazz
blue
chilly


In [None]:
#
# (B) Write check_analogy(word1, word2, word3, word4, model) to return a "score" on how well word2vec_model
#     does at solving the analogy provided, i.e.,    word1 : word2 :: word3 : word4
#     + it should determine where word4 appears in the top 100 (use topn=100) most-similar words
#     + if it _doesn't_ appear in the top-100, it should give a score of 0
#     + if it _does_ appear, it should give a score between 1 and 100, but
#          it should be the distance from the _far_ end of the list. 
#     + Thus, a score of 100 means a perfect score. 
#     + A score of 1 means that word4 was the 100th in the list (index 99)
#     + Try it out:   check_analogy( "man", "king", "woman", "queen", m ) -> 100
#                     check_analogy( "woman", "man", "bicycle", "fish", m ) -> 0
#                     check_analogy( "woman", "man", "bicycle", "pedestrian", m ) -> 96

In [50]:
#
# our check_analogy function
#

def check_analogy(word1, word2, word3, word4, model, topn=100):
    """ Score how well model solves the analogy   word1 : word2 :: word3 : word4

        word1, word2, word3: the three "given" words of the analogy
        word4:               the answer we hope the model finds
        model:               the word-embedding model to query
        topn:                how many of the most-similar words to search (default 100)

        Returns
          + topn - (index of word4 in the results), so with the default topn
            100 means word4 was the top result and 1 means it was the 100th
          + 0 if word4 does not appear in the top topn results at all
          + the placeholder string 'Instead, we will return CS35' (after
            printing a message) if word1, word2, or word3 is not in the model

        Examples from the problem statement:
            check_analogy( "man", "king", "woman", "queen", m ) -> 100
            check_analogy( "woman", "man", "bicycle", "fish", m ) -> 0
    """
    # every lookup goes through the model that was passed in, not a global
    for word in (word1, word2, word3):
        if word not in model:
            print(f"{word} was not in the model.")
            return 'Instead, we will return CS35'

    # note the ordering: word3 and word2 are positive, word1 is negative
    results = model.most_similar(positive=[word3, word2], negative=[word1], topn=topn)

    # each result is a (word, score) pair; rank 0 is the best match
    for rank, hit in enumerate(results):
        if hit[0] == word4:
            return topn - rank
    return 0

# the three examples from the problem statement (expected scores: 100, 0, 96)
a1 = check_analogy( "man", "king", "woman", "queen", m )
a2 = check_analogy( "woman", "man", "bicycle", "fish", m )
a3 = check_analogy( "woman", "man", "bicycle", "pedestrian", m )

# one score per line, in the same order as above
print(a1, a2, a3, sep="\n")

100
0
96


In [124]:
#
# Results and commentary...
#
# Each call scores  word1 : word2 :: word3 : word4  with check_analogy;
# the "recorded" values are the scores printed under this cell in the saved run.
#

# First group: the two plausible analogies both score 100;
# the two with a mismatched word4 ('spaceship', 'sword') both score 0.
b1 = check_analogy('apple', 'red', 'blueberry', 'blue', m)   # recorded: 100
print(b1)

b2 = check_analogy('sun', 'summer', 'ice', 'winter', m)   # recorded: 100
print(b2)

b3 = check_analogy('potato', 'vegetable', 'tomato', 'spaceship', m)   # recorded: 0
print(b3)

b4 = check_analogy('caramel', 'sweet', 'medicine', 'sword', m)   # recorded: 0
print(b4)


# two blank lines to separate the two groups in the output
print()
print()

# Trial and Error...
# Second group: word4 choices that land at different depths of the top-100 list.

c1 = check_analogy('ears', 'hear', 'eyes', 'see', m)   # recorded: 100
print(c1)
c2 = check_analogy('sun', 'yellow', 'snow', 'white', m)   # recorded: 90
print(c2)
c3 = check_analogy('crackers', 'snack', 'pork', 'breakfast', m)   # recorded: 42
print(c3)
c4 = check_analogy('Japan', 'Tokyo', 'America', 'Texas', m)   # recorded: 25
print(c4)
c5 = check_analogy('coffee', 'drink', 'soda', 'pizza', m)   # recorded: 0
print(c5)

100
100
0
0


100
90
42
25
0
