In [1]:
import pandas as pd
import numpy as np

# import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Path of the pickled vocabulary frame (a relative path).
test_vocab = "word_list2.pkl"

# Load the vocabulary.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it comes from a trusted source.
df = pd.read_pickle(test_vocab)
# Show (rows, columns) as the cell output.
df.shape

(80, 3)

In [3]:
# Preview the start of the frame (at most 15 rows) to check the loaded columns.
df.head(15)

Unnamed: 0,word,definition,embedding
0,inspire,verb: To fill someone with the urge or ability...,"[0.056669682264328, -0.029521910473704338, -0...."
1,inspire,verb: To influence or arouse a particular feel...,"[0.0282593946903944, -0.08118896931409836, -0...."
2,inspire,verb: To stimulate someone to a higher level o...,"[0.03241196274757385, -0.03061419352889061, -0..."
3,inspire,verb: To breathe life or energy into something...,"[0.03345229849219322, 0.01821242645382881, -0...."
4,inspire,verb: To inhale; to draw air into the lungs.,"[0.01020112819969654, 0.0006023363675922155, -..."
5,madden,verb: to make someone very angry or irritated,"[0.0020106742158532143, -0.046447038650512695,..."
6,madden,verb: to drive someone insane or to cause some...,"[0.00396437244489789, -0.051915284246206284, -..."
7,madden,adjective (archaic): insane or mentally unstable,"[0.009489667601883411, -0.02472560666501522, -..."
8,madden,adjective: extremely annoying or frustrating,"[-0.007401373237371445, -0.0009346121805720031..."
9,madden,noun (archaic): a state of madness or frenzy,"[-0.013078952208161354, 0.004764189012348652, ..."


In [4]:
def get_candidate_words(df):
    """Select the group of four distinct words whose senses are most similar.

    Every row of ``df`` is tried as an anchor. For each anchor, the other rows
    are walked in order of decreasing cosine similarity and the first three
    rows that belong to three different words (none of them the anchor's own
    word) become its neighbours. The anchor whose three neighbours have the
    highest mean similarity to it wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Column 0 holds the word and column 1 its definition (both are read by
        position); column ``'embedding'`` holds one numeric vector per row.

    Returns
    -------
    tuple of (list, list)
        ``(words, definitions)``. ``words`` are the four selected words sorted
        alphabetically. ``definitions`` are the definitions of the selected
        rows with the anchor first, then its neighbours by decreasing
        similarity, so the two lists are NOT aligned index by index.
        Both lists are empty when ``df`` has no rows or when no anchor can
        find three other distinct words.

    Notes
    -----
    Prints the shape of the similarity matrix, the shape of the ranking
    matrix, and the winning ``[score, anchor, neighbour, neighbour, neighbour]``
    list (an empty list when no group exists).
    """
    neighbours_needed = 3

    # cosine_similarity cannot be computed on zero rows; there is nothing to group.
    if df.shape[0] == 0:
        return [], []

    # Read the two positional columns once instead of calling df.iloc per comparison.
    words = df.iloc[:, 0].tolist()
    definitions = df.iloc[:, 1].tolist()

    # create cosine similarity matrix for all pairs of the vectors
    cosine_similarities = cosine_similarity(df['embedding'].tolist())
    print(cosine_similarities.shape)

    # for each row, column indices ordered by ascending cosine similarity
    sorted_cosine_similarites = np.argsort(cosine_similarities, axis=1)
    print(sorted_cosine_similarites.shape)

    # -inf rather than -1 so that any real mean similarity can become the best.
    max_similarity = float("-inf")
    # Bound up front so the function still returns cleanly when no group is found.
    candidate_words = []
    candidate_connections = []

    for r in range(len(words)):
        connected_words = set()
        top3 = []

        # Walk every rank from most to least similar. The anchor row itself is
        # rejected by the same-word test, so no rank has to be skipped by
        # position (the least similar row may be needed to complete a group).
        for c in sorted_cosine_similarites[r, ::-1]:
            c = int(c)
            word = words[c]
            if word == words[r] or word in connected_words:
                continue
            connected_words.add(word)
            top3.append(c)
            if len(top3) == neighbours_needed:
                break

        # Not enough distinct other words reachable from this anchor.
        if len(top3) < neighbours_needed:
            continue

        # Use maximum average similarity to select the best group of words
        mean_similarity = float(cosine_similarities[r, top3].mean())
        if mean_similarity > max_similarity:
            max_similarity = mean_similarity
            # Neighbours are listed in similarity order (not set order) so the
            # printed list is deterministic and lines up with the definitions.
            candidate_words = [mean_similarity, words[r]] + [words[c] for c in top3]
            candidate_connections = [definitions[r]] + [definitions[c] for c in top3]

    print(candidate_words)

    # Drop the leading score; callers only need the words themselves.
    return sorted(candidate_words[1:]), candidate_connections

In [5]:
# Pick the best-scoring group of words from the full vocabulary in df.
candidate_words, candidate_conections = get_candidate_words(df)
# The selected words, sorted alphabetically.
print(candidate_words)
# One definition per selected row, in the order returned by get_candidate_words.
for definition in candidate_conections:
    print(definition)

(80, 80)
(80, 80)
[0.7877791018511658, 'provoke', 'nettle', 'incense', 'prompt']
['incense', 'nettle', 'prompt', 'provoke']
verb: to incite or stimulate someone to feel or react in a certain way, often anger or irritation.
verb: To incite or encourage someone to take action or respond.
verb: To provoke or upset someone, causing them to feel uncomfortable or angry.
verb: To arouse extreme anger or indignation in someone.


In [6]:
# Drop from df every row whose word is in candidate_words (the group just selected),
# so the next get_candidate_words call only sees the remaining words.
df = df[~df['word'].isin(candidate_words)]
df.shape

(63, 3)

In [7]:
# Number of distinct words still in df, followed by the words themselves.
len(df.word.unique()),df.word.unique()

(12,
 array(['inspire', 'madden', 'jellyfish', 'metroid', 'insult', 'candle',
        'halo', 'soap', 'generate', 'civilization', 'lotion', 'wasp'],
       dtype=object))

In [8]:
# Pick the best-scoring group among the words that remain in df.
candidate_words, candidate_conections = get_candidate_words(df)
# The selected words, sorted alphabetically.
print(candidate_words)
# One definition per selected row, in the order returned by get_candidate_words.
for definition in candidate_conections:
    print(definition)

(63, 63)
(63, 63)
[0.5591091786726455, 'madden', 'inspire', 'insult', 'wasp']
['inspire', 'insult', 'madden', 'wasp']
verb: to make someone very angry or irritated
noun: A figure of speech used to describe a person who is irritable or quick to anger.
verb: To influence or arouse a particular feeling or thought in someone.
verb: To offend someone by saying or doing something rude or insensitive, like telling a tasteless joke at someone's expense.


In [9]:
# Drop from df every row whose word is in candidate_words (the group just selected),
# so the next get_candidate_words call only sees the remaining words.
df = df[~df['word'].isin(candidate_words)]
df.shape

(42, 3)

In [10]:
# Number of distinct words still in df, followed by the words themselves.
len(df.word.unique()),df.word.unique()

(8,
 array(['jellyfish', 'metroid', 'candle', 'halo', 'soap', 'generate',
        'civilization', 'lotion'], dtype=object))

In [11]:
# Pick the best-scoring group among the words that remain in df.
candidate_words, candidate_conections = get_candidate_words(df)
# The selected words, sorted alphabetically.
print(candidate_words)
# One definition per selected row, in the order returned by get_candidate_words.
for definition in candidate_conections:
    print(definition)

(42, 42)
(42, 42)
[0.47254372562829544, 'lotion', 'candle', 'jellyfish', 'soap']
['candle', 'jellyfish', 'lotion', 'soap']
noun: A thick, smooth liquid preparation designed to be applied to the skin for medicinal or cosmetic purposes.
noun: A bar or liquid form of the cleaning agent used for personal hygiene.
noun: A type of dessert or gelatin-based food that is wobbly and translucent, often fruit-flavored.
noun: A cylinder or block of wax or tallow with a wick through its center, used for illumination or for scent.


In [12]:
# Drop from df every row whose word is in candidate_words (the group just selected);
# whatever remains is the set of words not yet assigned to a group.
df = df[~df['word'].isin(candidate_words)]
df.shape

(21, 3)

In [13]:
# Number of distinct words still in df, followed by the words themselves.
len(df.word.unique()),df.word.unique()

(4, array(['metroid', 'halo', 'generate', 'civilization'], dtype=object))