In [27]:
import pickle
import numpy as np

# import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# Read the two pickled inputs.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
with open('vocabulary.pkl', 'rb') as vocab_file:
    vocabulary = pickle.load(vocab_file)

with open('embeddings.pkl', 'rb') as embed_file:
    embeddings = pickle.load(embed_file)

# Pair each definition of a word with the embedding stored at the same position
# for that word: word -> [(definition, embedding), ...]
combined = {
    word: list(zip(definitions, embeddings[word]))
    for word, definitions in vocabulary.items()
}

# Dump every word with its definitions and the shape of each embedding
for word, combination in combined.items():
    print(word)
    for definition, embedding in combination:
        print(definition, np.array(embedding).shape)
    print()
   

uphold
verb: to support or maintain something, often referring to a decision, law, or principle. (1536,)
verb: to raise or lift up, physically holding something in place. (1536,)

discard
verb: to throw away or get rid of something because it is no longer useful or desirable (1536,)
verb: to abandon a card from one's hand in a card game (1536,)
noun: something that is thrown away or rejected (1536,)
noun: a card that is intentionally gotten rid of in certain card games (1536,)

honor
noun: respect or esteem shown to someone, especially for their integrity or achievements (1536,)
noun: a sense of ethical conduct or integrity (1536,)
noun: a privilege or distinctive recognition, often signified with an award or title (1536,)
noun: adherence to what is right or to a conventional standard of conduct within a community (1536,)
verb: to regard or treat with respect and admiration (1536,)
verb: to fulfill an obligation or keep a promise (1536,)
verb: to accept or pay in recognition of a finan

**GHCP Prompt**
```text
convert combined into a dataframe with columns: word, definition and embedding
```

In [29]:
import pandas as pd

# Flatten word -> [(definition, embedding), ...] into one record per definition
data = [
    [word, definition, np.array(embedding)]
    for word, combinations in combined.items()
    for definition, embedding in combinations
]

# One row per (word, definition) pair, with the embedding kept as an array
df_combined = pd.DataFrame(data, columns=['word', 'definition', 'embedding'])

print(df_combined)

        word                                         definition  \
0     uphold  verb: to support or maintain something, often ...   
1     uphold  verb: to raise or lift up, physically holding ...   
2    discard  verb: to throw away or get rid of something be...   
3    discard  verb: to abandon a card from one's hand in a c...   
4    discard    noun: something that is thrown away or rejected   
..       ...                                                ...   
112    throw  verb: to project or put forth an idea or conce...   
113    throw  verb: to lose a game or contest intentionally,...   
114    throw  noun: the act of propelling something with for...   
115    throw  noun: a fabric covering used for decoration or...   
116    throw  noun: a distance something is thrown, e.g., 'H...   

                                             embedding  
0    [0.032581623643636703, 0.006792697124183178, 0...  
1    [0.004148778505623341, -0.04162462800741196, -...  
2    [0.0123115861788392

In [30]:
# Show the dtype of each column of the flattened frame.
df_combined.dtypes

word          object
definition    object
embedding     object
dtype: object

In [31]:
# Spot-check one entry: print each definition stored under "sham" together
# with the shape of its embedding.
for sentence, embedding in combined["sham"]:
    print(sentence, np.array(embedding).shape)

noun: A thing that is not what it is purported to be, for example, 'The entire meeting was a sham and a waste of time.' (1536,)
noun: A decorative pillow covering, for example, 'I changed the bedspread and added a matching sham.' (1536,)
adjective: Bogus or false, for example, 'He was accused of running a sham company to defraud investors.' (1536,)
verb: To falsely present something as the truth, for example, 'They were sham the data to fit their desired outcome.' (1536,)


In [32]:
# Preview the first rows of the combined frame.
df_combined.head()

Unnamed: 0,word,definition,embedding
0,uphold,"verb: to support or maintain something, often ...","[0.032581623643636703, 0.006792697124183178, 0..."
1,uphold,"verb: to raise or lift up, physically holding ...","[0.004148778505623341, -0.04162462800741196, -..."
2,discard,verb: to throw away or get rid of something be...,"[0.012311586178839207, -0.0013197718653827906,..."
3,discard,verb: to abandon a card from one's hand in a c...,"[0.0015365865547209978, -0.009047421626746655,..."
4,discard,noun: something that is thrown away or rejected,"[0.01473868265748024, 0.013226685114204884, -0..."


In [None]:
def get_candidate_words(df):
    """Return the four-word group whose definitions are most similar.

    Every row (one definition of one word) is used as an anchor. For each
    anchor, the rows are walked from most to least cosine-similar and the
    first three rows belonging to three distinct words other than the
    anchor's word are collected. The anchor whose three neighbours have the
    highest mean similarity wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 holds the word and column 1 its definition (both are read
        by position); the column named ``'embedding'`` holds one vector per
        row.

    Returns
    -------
    list
        The winning anchor word plus its three neighbour words, sorted.
        An empty list is returned when the frame is empty or when no anchor
        has three distinct neighbour words.

    Notes
    -----
    Progress and diagnostics are printed to stdout.
    """
    # Nothing to compare; also avoids handing an empty sample list to
    # cosine_similarity, which is expected to reject it.
    if df.shape[0] == 0:
        return []

    # create cosine similarity matrix for all pairs of the vectors
    cosine_similarities = cosine_similarity(df['embedding'].tolist())
    print(cosine_similarities.shape)

    # for each row, column indices ordered by ascending cosine similarity
    sorted_cosine_similarities = np.argsort(cosine_similarities, axis=1)
    print(sorted_cosine_similarities.shape)

    # Pull the two positional columns out once instead of calling iloc in the loops
    words = df.iloc[:, 0].tolist()
    definitions = df.iloc[:, 1].tolist()

    # group of words that are most similar to each other
    max_similarity = -np.inf
    candidate_words = []
    for r in range(len(words)):
        # get the top 3 closest words that are not the same as the current word
        # and are not already connected
        connected_words = set()
        top3 = []

        # Walk every column from most to least similar. The row itself (and any
        # other definition of the same word) is skipped by the word comparison,
        # so there is no need to assume it sits at the last sorted position, and
        # the least similar column is no longer left out of the scan.
        for c in sorted_cosine_similarities[r, ::-1]:
            neighbour = words[c]
            if neighbour != words[r] and neighbour not in connected_words:
                connected_words.add(neighbour)
                top3.append(c)
            if len(connected_words) == 3:
                break

        # print current word and connected words
        print("\n", words[r], connected_words)

        # show definition and cosine similarity measure of the current word and connected words
        if len(connected_words) == 3:
            mean_similarity = cosine_similarities[r, top3].mean()
            print(words[r], definitions[r], mean_similarity)
            for c in top3:
                print("\t", words[c], definitions[c], cosine_similarities[r, c])

            # Use maximum average similarity to select the best group of words
            if mean_similarity > max_similarity:
                max_similarity = mean_similarity
                candidate_words = [mean_similarity] + [words[r]] + list(connected_words)

    print(candidate_words)

    # Drop the leading score and return only the words
    return sorted(candidate_words[1:])

        

In [34]:

# First pass: select the best-scoring group of words from the full frame.
candidate_words = get_candidate_words(df_combined)
print(candidate_words)

(117, 117)
(117, 117)

 uphold {'keep', 'honor', 'justice'}
uphold verb: to support or maintain something, often referring to a decision, law, or principle. 0.5218301776632716
	 justice noun: The use of authority to uphold what is deemed morally right or appropriate. 0.5627673412004944
	 keep verb: To maintain a certain state or condition, such as 'keep calm' or 'keep clean.' 0.5166643463056055
	 honor verb: to fulfill an obligation or keep a promise 0.48605884548371464

 uphold {'keep', 'draw', 'pass'}
uphold verb: to raise or lift up, physically holding something in place. 0.4552701822232936
	 draw verb: to move or lead someone into a particular position or state 0.48666061840036257
	 pass verb: To transfer something to another person. 0.4422450349496213
	 keep verb: To retain possession of something, as in 'keep the books' or 'keep the change.' 0.43690489331989685

 discard {'pass', 'play', 'throw'}
discard verb: to throw away or get rid of something because it is no longer useful o

In [35]:
# Remove from df_combined the rows whose word is in candidate_words.
# NOTE: this rebinds df_combined, so the unfiltered frame can only be
# recovered by re-running the cell that builds it.
df_combined = df_combined[~df_combined['word'].isin(candidate_words)]
df_combined.shape

(94, 3)

In [36]:
# List the distinct words still present in the filtered frame.
df_combined.word.unique()

array(['uphold', 'discard', 'honor', 'energy', 'state', 'play', 'justice',
       'labor', 'pass', 'fulfill', 'draw', 'keep'], dtype=object)

In [37]:
# Second pass: select the best-scoring group from the rows that remain.
candidate_words = get_candidate_words(df_combined)
print(candidate_words)

(94, 94)
(94, 94)

 uphold {'keep', 'honor', 'justice'}
uphold verb: to support or maintain something, often referring to a decision, law, or principle. 0.5218301776632716
	 justice noun: The use of authority to uphold what is deemed morally right or appropriate. 0.5627673412004944
	 keep verb: To maintain a certain state or condition, such as 'keep calm' or 'keep clean.' 0.5166643463056055
	 honor verb: to fulfill an obligation or keep a promise 0.48605884548371464

 uphold {'keep', 'draw', 'pass'}
uphold verb: to raise or lift up, physically holding something in place. 0.4552701822232936
	 draw verb: to move or lead someone into a particular position or state 0.48666061840036257
	 pass verb: To transfer something to another person. 0.4422450349496213
	 keep verb: To retain possession of something, as in 'keep the books' or 'keep the change.' 0.43690489331989685

 discard {'draw', 'pass', 'play'}
discard verb: to throw away or get rid of something because it is no longer useful or des

In [38]:
# Remove from df_combined the rows whose word is in candidate_words.
# NOTE: this rebinds df_combined again, shrinking the already-filtered frame.
df_combined = df_combined[~df_combined['word'].isin(candidate_words)]
df_combined.shape

(71, 3)

In [39]:
# Third pass: select the best-scoring group from the rows that remain.
candidate_words = get_candidate_words(df_combined)
print(candidate_words)

(71, 71)
(71, 71)

 discard {'draw', 'pass', 'play'}
discard verb: to throw away or get rid of something because it is no longer useful or desirable 0.3892868981839017
	 pass verb: To decline or refuse an offer or opportunity. 0.47535401596408766
	 play verb: To engage in activity for enjoyment and recreation rather than a serious or practical purpose. 0.3497680835012893
	 draw verb: to pull or drag something in a particular direction 0.34273859508632815

 discard {'draw', 'pass', 'play'}
discard verb: to abandon a card from one's hand in a card game 0.4998807106801777
	 pass verb: In card games, to opt not to make a bid or play a card. 0.6264811677375186
	 draw verb: to withdraw money from a bank account 0.4837503069392187
	 play verb: To act in a disingenuous or deceitful manner, often referred to as 'playing someone'. 0.38941065736379593

 discard {'draw', 'pass', 'justice'}
discard noun: something that is thrown away or rejected 0.38346730903403065
	 pass verb: To decline or refuse