# Store the embeddings into a dictionary through a function, where the key is the word and the value is the vector.

In [19]:
import numpy as np

glove_path = "./data/glove.6B.100d.txt"

def load_glove_embeddings(file_path):
    word_index = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.array(values[1:], dtype='float32')
            word_index[word] = coefs
    return word_index

word_index = load_glove_embeddings(glove_path)
print(f"Embeddings loaded: {len(word_index)}")

Embeddings loaded: 400000


## Vectors for the words: (Woman), (Man), (King). Define a new vector for expressions new_vector = woman_vector - man_vector + king_vector

In [20]:
woman_vector = word_index["woman"]
man_vector = word_index["man"]
king_vector = word_index["king"]

new_vector = woman_vector - man_vector + king_vector
print(f"New vector: {new_vector}")

New vector: [-0.10231996 -0.81294     0.10211001  0.985924    0.34218282  1.09095
 -0.48913    -0.05616698 -0.21029997 -1.02996    -0.86851     0.36786997
  0.01960999  0.59259    -0.231901   -1.016919   -0.012184   -1.17194
 -0.52329     0.60645    -0.98537004 -1.001028    0.48913902  0.630072
  0.58224     0.15908998  0.43684998 -1.25351     0.97054005 -0.06552899
  0.733763    0.44219002  1.2091839   0.19698    -0.15948     0.34364
 -0.46222997  0.33772     0.14792703 -0.24959499 -0.77093005  0.522717
 -0.12830001 -0.91881    -0.01755    -0.44041002 -0.52656496  0.33734798
  0.60639    -0.45067    -0.04158002  0.08408298  1.31456     0.67737997
 -0.24316001 -2.071      -0.60648996  0.19710997  0.63567     0.07819999
  0.49161002  0.08172001  0.708557    0.201938    0.5155501  -0.23025298
 -0.40473     0.39212003 -0.5093     -0.139153    0.21609999 -0.628671
  0.08894001  0.49167    -0.06637001  0.76095    -0.19442001  0.41131
 -1.04476    -0.14801991 -0.098355   -0.25115     0.80895

## store the vectors in an array and transform the vectors into an array and print the array, also transform the vectors into a matrix and print the matrix. store word indexes in all_words

In [21]:
all_words = list(word_index.keys())
embedding_weights = np.array(list(word_index.values()))
print(f"Embedding weights shape: {embedding_weights}")

Embedding weights shape: [[-0.038194 -0.24487   0.72812  ... -0.1459    0.8278    0.27062 ]
 [-0.10767   0.11053   0.59812  ... -0.83155   0.45293   0.082577]
 [-0.33979   0.20941   0.46348  ... -0.23394   0.47298  -0.028803]
 ...
 [ 0.36088  -0.16919  -0.32704  ...  0.27139  -0.29188   0.16109 ]
 [-0.10461  -0.5047   -0.49331  ...  0.42527  -0.5125   -0.17054 ]
 [ 0.28365  -0.6263   -0.44351  ...  0.43678  -0.82607  -0.15701 ]]


In [22]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_nearest_vectors(target_vector, top_n=10):

    similarities = cosine_similarity(embedding_weights, target_vector.reshape(1, -1)).reshape(-1)
    similar_indices = np.argsort(similarities)[-top_n-1:-1][::-1]

    similar_words = []
    for idx in similar_indices:
        similar_vector = embedding_weights[idx]
        similar_word = all_words[idx]
        similar_words.append(similar_word)

    return similar_words

## Find the nearest vectors for the (woman - man + king) vector.

In [23]:
nearest_vectors = find_nearest_vectors(new_vector)
for i in range(10):
    print(f"{i + 1}. {nearest_vectors[i]}")

1. queen
2. monarch
3. throne
4. daughter
5. prince
6. princess
7. mother
8. elizabeth
9. father
10. wife
