In [1]:
import pickle
import numpy as np

In [2]:
# load vocabulary
with open('vocabulary.pkl', 'rb') as f:
    vocabulary = pickle.load(f)

# load embeddings
with open('embeddings.pkl', 'rb') as f:
    embeddings = pickle.load(f)


# combine vocabulary and embeddings
combined = {}
for word,definitions in vocabulary.items():
    # zip the definitions with the embeddings
    def_embed = zip(definitions,embeddings[word])

    # add to combined
    combined[word] = list(def_embed)

for word, combination in combined.items():
    print(word)
    for definition, embedding in combination:
        print(definition, np.array(embedding).shape)
    print()
   

uphold
verb: to support or maintain something, often referring to a decision, law, or principle. (1536,)
verb: to raise or lift up, physically holding something in place. (1536,)

discard
verb: to throw away or get rid of something because it is no longer useful or desirable (1536,)
verb: to abandon a card from one's hand in a card game (1536,)
noun: something that is thrown away or rejected (1536,)
noun: a card that is intentionally gotten rid of in certain card games (1536,)

honor
noun: respect or esteem shown to someone, especially for their integrity or achievements (1536,)
noun: a sense of ethical conduct or integrity (1536,)
noun: a privilege or distinctive recognition, often signified with an award or title (1536,)
noun: adherence to what is right or to a conventional standard of conduct within a community (1536,)
verb: to regard or treat with respect and admiration (1536,)
verb: to fulfill an obligation or keep a promise (1536,)
verb: to accept or pay in recognition of a finan

**GHCP Prompt**
```text
convert combined into a dataframe with colums: word, definition and embedding
```

In [3]:
import pandas as pd

# Prepare data for DataFrame
data = []
for word, combinations in combined.items():
    for definition, embedding in combinations:
        data.append([word, definition, np.array(embedding)])

# Create DataFrame
df_combined = pd.DataFrame(data, columns=['word', 'definition', 'embedding'])

print(df_combined)

        word                                         definition  \
0     uphold  verb: to support or maintain something, often ...   
1     uphold  verb: to raise or lift up, physically holding ...   
2    discard  verb: to throw away or get rid of something be...   
3    discard  verb: to abandon a card from one's hand in a c...   
4    discard    noun: something that is thrown away or rejected   
..       ...                                                ...   
112    throw  verb: to project or put forth an idea or conce...   
113    throw  verb: to lose a game or contest intentionally,...   
114    throw  noun: the act of propelling something with for...   
115    throw  noun: a fabric covering used for decoration or...   
116    throw  noun: a distance something is thrown, e.g., 'H...   

                                             embedding  
0    [0.032581623643636703, 0.006792697124183178, 0...  
1    [0.004148778505623341, -0.04162462800741196, -...  
2    [0.0123115861788392

In [4]:
df_combined.dtypes

word          object
definition    object
embedding     object
dtype: object

**GHCP Prompt**
```text
for each word find three closest words based on the embedding vectors.  Use only the closest embedding vectors in the three other words.
```

In [5]:

from sklearn.metrics.pairwise import cosine_similarity

# Create a dictionary to store the closest words for each word
closest_words = {}

# Iterate through each word in the combined dictionary
temp = {}
for word, combinations in combined.items():
    # Get the embedding vectors for the current word
    word_embeddings = np.array([embedding for _, embedding in combinations])
    
    # Calculate the cosine similarity between the current word's embeddings and all other words' embeddings
    similarities = {}
    for other_word, other_combinations in combined.items():
        if word != other_word:
            other_embeddings = np.array([embedding for _, embedding in other_combinations])
            similarity = cosine_similarity(word_embeddings, other_embeddings).mean()
            similarities[other_word] = similarity

    temp[word] = similarities
    
    # Find the three closest words based on the highest cosine similarity
    closest_words[word] = sorted(similarities, key=similarities.get, reverse=True)[:3]

for k,v in temp.items():
    print(k, v)

# Print the closest words for each word
group_of_four =[]
for word, closest in closest_words.items():
    print(f"{word}: {closest}")

    # create a group of 4 words
    group_of_four.append(sorted([word]+closest))

print("\n\n")

# display the group of four words
group_of_four = sorted(group_of_four)
for g in group_of_four:
    print(g)

uphold {'discard': 0.1891107263014488, 'honor': 0.34364705211922203, 'energy': 0.27429418734307776, 'state': 0.2791291220451757, 'play': 0.26885632789504005, 'justice': 0.33952024661960534, 'labor': 0.27243515326514806, 'pass': 0.2768536754621598, 'fulfill': 0.36138984518850265, 'draw': 0.3064765627962617, 'keep': 0.3946245796171386, 'blanket': 0.25578008206367653, 'sham': 0.1940616239797734, 'sheet': 0.2527518123911778, 'throw': 0.2795594275867012}
discard {'uphold': 0.18911072630144882, 'honor': 0.2283455451310721, 'energy': 0.15514149761029988, 'state': 0.18037195497699654, 'play': 0.23244936997138035, 'justice': 0.1723671717514345, 'labor': 0.20833316801354532, 'pass': 0.2804888086334442, 'fulfill': 0.22299191627994194, 'draw': 0.2712076084168971, 'keep': 0.21215835347869894, 'blanket': 0.1970368156062508, 'sham': 0.262559246490517, 'sheet': 0.22841726925686148, 'throw': 0.3121839635752311}
honor {'uphold': 0.343647052119222, 'discard': 0.22834554513107205, 'energy': 0.238400850722