In [7]:
import pickle
import numpy as np

In [8]:
# Read the pickled vocabulary; the loop below treats it as a mapping of
# word -> sequence of definitions.
# NOTE: pickle.load can execute arbitrary code -- only open trusted files.
with open('vocabulary.pkl', 'rb') as vocab_file:
    vocabulary = pickle.load(vocab_file)

# Read the pickled embeddings; they are looked up by word below.
with open('embeddings.pkl', 'rb') as embed_file:
    embeddings = pickle.load(embed_file)


# Pair every definition with the embedding stored at the same position.
# zip() stops at the shorter input, so if a word has more definitions than
# embeddings (or vice versa) the unmatched extras are dropped silently.
combined = {
    word: list(zip(definitions, embeddings[word]))
    for word, definitions in vocabulary.items()
}

# Sanity check: list each word, its definitions and each embedding's shape.
for word, pairs in combined.items():
    print(word)
    for definition, embedding in pairs:
        print(definition, np.array(embedding).shape)
    print()
   

uphold
verb: to support or maintain something, such as a law, decision, or principle, even in the face of opposition. (1536,)
verb: to confirm or approve a judgment or decision. (1536,)
verb: to physically hold something up and prevent it from falling. (1536,)

discard
verb: to get rid of something because it is no longer useful or desirable (1536,)
verb: to throw away or dispose of something (1536,)
verb: to abandon or renounce something, such as a plan or idea (1536,)
noun: something that has been thrown away or disposed of (1536,)
noun: a card that is thrown out in a card game (1536,)

honor
noun: high respect or esteem (1536,)
noun: a privilege or an award given in recognition of achievement (1536,)
noun: adherence to a conventional standard of conduct (1536,)
verb: to regard or treat with admiration and respect (1536,)
verb: to fulfill an obligation or keep a promise (1536,)
verb: to confer distinction upon someone or something (1536,)
noun: a person's sense of self-worth or pride

**GHCP Prompt**
```text
convert combined into a dataframe with columns: word, definition, and embedding
```

In [10]:
import pandas as pd

# Flatten `combined` into one record per (word, definition) pair; the
# embedding is wrapped in a NumPy array so each cell holds a single vector.
data = [
    [word, definition, np.array(embedding)]
    for word, combinations in combined.items()
    for definition, embedding in combinations
]

# Assemble the records into a three-column frame and show it.
df_combined = pd.DataFrame(data, columns=['word', 'definition', 'embedding'])

print(df_combined)

        word                                         definition  \
0     uphold  verb: to support or maintain something, such a...   
1     uphold  verb: to confirm or approve a judgment or deci...   
2     uphold  verb: to physically hold something up and prev...   
3    discard  verb: to get rid of something because it is no...   
4    discard        verb: to throw away or dispose of something   
..       ...                                                ...   
121    throw  verb: To put something in a place carelessly o...   
122    throw  verb: To deliberately lose a game or match, e....   
123    throw  noun: An act of propelling something with the ...   
124    throw  noun: A thin blanket or cover casually placed ...   
125    throw  noun: A wrestling or martial art maneuver that...   

                                             embedding  
0    [0.035864878445863724, -0.0004398994497023523,...  
1    [0.016594935208559036, 0.002141183940693736, -...  
2    [0.0091772722080349

In [15]:
# Inspect the column dtypes. Each 'embedding' cell was stored as a NumPy array
# when the frame was built, so pandas reports that column as generic `object`.
df_combined.dtypes

word          object
definition    object
embedding     object
dtype: object

dtype('float64')

**GHCP Prompt**
```text
for each word find three closest words based on the embedding vectors.  Use only the closest embedding vectors in the three other words.
```

In [28]:

from sklearn.metrics.pairwise import cosine_similarity

# Build each word's embedding matrix (one row per definition) exactly once.
# Previously the "other" word's matrix was rebuilt inside the inner loop for
# every pair of words, which is loop-invariant work.
embedding_matrices = {
    word: np.array([embedding for _, embedding in combinations])
    for word, combinations in combined.items()
}

# word -> the three other words with the highest similarity score
closest_words = {}

# word -> {other_word: similarity score}; kept only for the debug print below
temp = {}
for word, word_embeddings in embedding_matrices.items():
    # Score every other word by the MEAN of the pairwise cosine similarities
    # between all definitions of `word` and all definitions of the other word.
    # NOTE(review): the prompt asks to use only the "closest" embedding
    # vectors; .mean() averages over every sense pair, whereas .max() would
    # score by the single closest pair -- confirm which one is intended.
    similarities = {
        other_word: cosine_similarity(word_embeddings, other_embeddings).mean()
        for other_word, other_embeddings in embedding_matrices.items()
        if other_word != word
    }

    temp[word] = similarities

    # Highest score first; sorted() is stable, so ties keep insertion order.
    closest_words[word] = sorted(similarities, key=similarities.get, reverse=True)[:3]

# Debug: dump the full similarity table for every word.
for k, v in temp.items():
    print(k, v)

# Print the closest words for each word and collect, per word, the
# alphabetically sorted group made of the word plus its three neighbours.
group_of_four = []
for word, closest in closest_words.items():
    print(f"{word}: {closest}")
    group_of_four.append(sorted([word] + closest))

print("\n\n")

# Display the groups in sorted order so identical groups end up adjacent.
group_of_four = sorted(group_of_four)
for g in group_of_four:
    print(g)

uphold {'discard': 0.23827443830156642, 'honor': 0.33415401436492126, 'energy': 0.23115039255303463, 'state': 0.2608244771769216, 'play': 0.2602716316692143, 'justice': 0.3287582922355521, 'labor': 0.23064116371962154, 'pass': 0.3058632359689009, 'fulfill': 0.39371110563312994, 'draw': 0.262397332793364, 'keep': 0.36346797973052025, 'blanket': 0.26742986474784725, 'sham': 0.21329573089934212, 'sheet': 0.23690585356959634, 'throw': 0.25992655485687277}
discard {'uphold': 0.23827443830156642, 'honor': 0.24063965418527838, 'energy': 0.1869847639721876, 'state': 0.19763275062105404, 'play': 0.24109463841142212, 'justice': 0.17126755868833637, 'labor': 0.19223066048044324, 'pass': 0.27673142340070483, 'fulfill': 0.2665064411139145, 'draw': 0.27205725371522077, 'keep': 0.2217366192899525, 'blanket': 0.23107965555671878, 'sham': 0.2679477312063574, 'sheet': 0.2459441075692162, 'throw': 0.34724661911675925}
honor {'uphold': 0.33415401436492126, 'discard': 0.24063965418527838, 'energy': 0.24460