In [1]:
from dataclasses import dataclass, field
import hashlib

import pandas as pd
import numpy as np

# import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
@dataclass
class ConnectionGroup:
    """A candidate group of words together with the definition attached to each word.

    Entries are stored as ``(word, connection)`` tuples in ``candidate_pairs``.
    All accessors return their results ordered by word, so two groups holding
    the same words report them in the same order regardless of insertion order.

    Equality (``==``) compares only the *set of words* in each group; the metric,
    root word and definitions are ignored.
    """

    group_metric: float = field(default=0.0, metadata={"help": "Average cosine similarity of the group"})
    root_word: str = field(default="", metadata={"help": "Root word of the group"})
    candidate_pairs: list = field(default_factory=list, metadata={"help": "List of candidate word with definition"})

    def _pairs_by_word(self):
        """Return the stored pairs sorted by word (stable for equal words)."""
        return sorted(self.candidate_pairs, key=lambda pair: pair[0])

    def add_entry(self, word, connection):
        """Append one ``(word, connection)`` pair to the group."""
        self.candidate_pairs.append((word, connection))

    def get_candidate_words(self):
        """Return the group's words in sorted order."""
        return [pair[0] for pair in self._pairs_by_word()]

    def get_candidate_connections(self):
        """Return the connection texts, ordered by their word, without the leading tag.

        Everything up to and including the first ``:`` (e.g. ``"noun:"`` or
        ``"verb:"``) is dropped and the remainder is stripped of surrounding
        whitespace. Text that contains no colon is returned unchanged.
        """
        stripped_connections = []
        for pair in self._pairs_by_word():
            text = pair[1]
            _, colon, remainder = text.partition(':')
            stripped_connections.append(remainder.strip() if colon else text)
        return stripped_connections

    def get_candidate_words_checksum(self):
        """Return the MD5 hex digest of the sorted words concatenated together.

        NOTE(review): the words are joined without a separator, so two different
        word sets that concatenate to the same string share a checksum. The
        format is left as-is so previously produced digests stay comparable.
        """
        return hashlib.md5("".join(self.get_candidate_words()).encode()).hexdigest()

    def __repr__(self):
        # Compute the sorted views once and assemble the multi-line report.
        words = self.get_candidate_words()
        checksum = hashlib.md5("".join(words).encode()).hexdigest()
        lines = [
            f"Group metric: {self.group_metric}, root word: {self.root_word}, checksum: {checksum}",
            f"Candidate words: {words}",
        ]
        lines.extend(f"\t{connection}" for connection in self.get_candidate_connections())
        # Every line, including the last one, is newline-terminated.
        return "\n".join(lines) + "\n"

    # method to determine if the group is equal to another group
    def __eq__(self, other):
        """Two groups are equal when they contain the same set of words.

        Returns ``NotImplemented`` for objects that do not expose a callable
        ``get_candidate_words`` so that comparisons such as ``group == None``
        evaluate to ``False`` instead of raising ``AttributeError``.
        """
        # Duck-typed on purpose: any object offering get_candidate_words() is
        # comparable, exactly as before; everything else is "not comparable".
        other_words = getattr(other, "get_candidate_words", None)
        if not callable(other_words):
            return NotImplemented
        return set(self.get_candidate_words()) == set(other_words())

In [3]:
# File name of the pickled vocabulary table used for this run (resolved against the working directory).
test_vocab = "word_list5.pkl"

# Load the vocabulary.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle(test_vocab)
# Show (rows, columns) as the cell output.
df.shape

(121, 3)

In [4]:
# Preview the first rows (at most 15) of the vocabulary table.
df.head(15)

Unnamed: 0,word,definition,embedding
0,uphold,"verb: to maintain or support something, such a...","[0.05433855205774307, 0.028151938691735268, 0...."
1,uphold,verb: to lift or hold something in an elevated...,"[0.006949818227440119, -0.0720582902431488, -0..."
2,uphold,verb: to defend or stand up for a principle or...,"[0.0063194772228598595, -0.022249704226851463,..."
3,discard,verb: To throw something away because it is no...,"[0.023533722385764122, 0.005563678685575724, -..."
4,discard,"verb: To get rid of a card in a game, usually ...","[-6.281930836848915e-05, -0.023267202079296112..."
5,discard,noun: The act of throwing something away.,"[0.013182280585169792, 0.010861128568649292, -..."
6,discard,noun: A card that has been thrown away in a ca...,"[0.0007180095999501646, -0.007413978222757578,..."
7,honor,noun: Respect and esteem shown to someone or s...,"[0.05223957821726799, -0.04617014899849892, -0..."
8,honor,noun: A privilege or mark of distinction.,"[0.048169057816267014, -0.009565112181007862, ..."
9,honor,"noun: A code of integrity and ethics, often as...","[0.022865494713187218, -0.0090312659740448, 0...."


In [5]:
def get_candidate_words(df, n_neighbors=3):
    """Build candidate word groups from the rows of ``df`` using embedding similarity.

    For every row (the "root"), the other rows are visited from most to least
    cosine-similar and the first ``n_neighbors`` rows are kept whose word differs
    from the root's word and from each other. When enough neighbours are found, a
    ``ConnectionGroup`` holding the root plus those neighbours is created; its
    ``group_metric`` is the mean cosine similarity between the root row and the
    chosen neighbour rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide an ``'embedding'`` column. The word and its definition are
        read positionally from columns 0 and 1.
    n_neighbors : int, default 3
        Number of distinct other words to attach to each root row.

    Returns
    -------
    list of ConnectionGroup
        Groups sorted by ``group_metric`` in descending order. When several
        groups share the same word checksum, only the highest-scoring one is kept.
        An empty ``df`` yields an empty list.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is smaller than 1.
    """
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be at least 1")

    # Nothing to compare; the similarity computation needs at least one row.
    if df.shape[0] == 0:
        return []

    # create cosine similarity matrix for all pairs of the vectors
    cosine_similarities = cosine_similarity(df['embedding'].tolist())
    print(cosine_similarities.shape)

    # for each row, column positions ordered by ascending cosine similarity
    similarity_rank = np.argsort(cosine_similarities, axis=1)
    print(similarity_rank.shape)

    # Pull the two positional columns once instead of one .iloc lookup per cell.
    words = df.iloc[:, 0].tolist()
    definitions = df.iloc[:, 1].tolist()

    candidate_list = []
    for r, root_word in enumerate(words):
        connected_words = set()
        neighbors = []

        # Walk the complete ranking from most to least similar. The root row
        # itself (and any other row of the same word) is rejected by the word
        # comparison, so no rank position needs to be skipped blindly; the
        # least similar row stays eligible when it is needed to fill the group.
        for c in similarity_rank[r, ::-1]:
            word = words[c]
            if word == root_word or word in connected_words:
                continue
            connected_words.add(word)
            neighbors.append(c)
            if len(neighbors) == n_neighbors:
                break

        # Not enough distinct words available for this root.
        if len(neighbors) != n_neighbors:
            continue

        candidate_group = ConnectionGroup(
            group_metric=cosine_similarities[r, neighbors].mean(),
            root_word=root_word,
        )
        candidate_group.add_entry(root_word, definitions[r])
        for c in neighbors:
            candidate_group.add_entry(words[c], definitions[c])
        candidate_list.append(candidate_group)

    # Best groups first; the sort is stable, so ties keep row order.
    candidate_list.sort(key=lambda group: group.group_metric, reverse=True)

    # Keep only the first (highest-scoring) group for each distinct word checksum.
    found_groups = set()
    unique_candidate_list = []
    for candidate in candidate_list:
        checksum = candidate.get_candidate_words_checksum()
        if checksum not in found_groups:
            found_groups.add(checksum)
            unique_candidate_list.append(candidate)

    return unique_candidate_list

In [6]:
# Build the ranked, de-duplicated candidate groups from the current contents of df.
candidate_list = get_candidate_words(df)
# Number of distinct groups found.
print(len(candidate_list))
# Print up to 20 groups; the list comes back sorted by group metric, highest first.
for candidate in candidate_list[:20]:
    print(candidate)

(121, 121)
(121, 121)
86
Group metric: 0.7400479116733788, root word: blanket, checksum: 7f7f5db443336ea81c19f94b4a07d8d8
Candidate words: ['blanket', 'sham', 'sheet', 'throw']
	A large piece of fabric used to cover and keep warm, typically on a bed.
	A decorative pillow covering or case.
	A large rectangular piece of fabric used for covering a bed.
	a light blanket or cover used for warmth or decoration

Group metric: 0.6919912754064516, root word: keep, checksum: 02f440f2aa99d7c860db64d79479f106
Candidate words: ['fulfill', 'honor', 'keep', 'uphold']
	to carry out a task or duty as required or expected
	To fulfill or keep an agreement or commitment.
	to fulfill or adhere to a commitment or promise
	to maintain or support something, such as a decision or a law

Group metric: 0.6266846582091273, root word: throw, checksum: 6392bedcc900a7f57b24ac66cf6ae7f8
Candidate words: ['discard', 'pass', 'play', 'throw']
	The act of throwing something away.
	an act of passing a ball or puck to a te

In [7]:
# Remove from df the rows whose word belongs to the top-ranked candidate group.
# This rebinds `df`, so the unfiltered frame is no longer reachable under that name.
df = df[~df['word'].isin(candidate_list[0].get_candidate_words())]
# Show (rows, columns) of what is left.
df.shape

(94, 3)

In [8]:
# Count and list the distinct words still present in df.
len(df.word.unique()),df.word.unique()

(12,
 array(['uphold', 'discard', 'honor', 'energy', 'state', 'play', 'justice',
        'labor', 'pass', 'fulfill', 'draw', 'keep'], dtype=object))

In [9]:
# Recompute the candidate groups using only the rows that remain in df.
candidate_list = get_candidate_words(df)
# Number of distinct groups found.
print(len(candidate_list))
# Print up to 20 groups; the list comes back sorted by group metric, highest first.
for candidate in candidate_list[:20]:
    print(candidate)

(94, 94)
(94, 94)
56
Group metric: 0.6919912754064516, root word: keep, checksum: 02f440f2aa99d7c860db64d79479f106
Candidate words: ['fulfill', 'honor', 'keep', 'uphold']
	to carry out a task or duty as required or expected
	To fulfill or keep an agreement or commitment.
	to fulfill or adhere to a commitment or promise
	to maintain or support something, such as a decision or a law

Group metric: 0.5979796574323007, root word: uphold, checksum: 21d95ecc15046a786a7793d21781a54b
Candidate words: ['honor', 'justice', 'keep', 'uphold']
	To fulfill or keep an agreement or commitment.
	The administration of the law or authority in maintaining this
	the act of maintaining or caring for something
	to maintain or support something, such as a decision or a law

Group metric: 0.5724557117326886, root word: fulfill, checksum: 5df52bf83b22266cd2b9516cbf70763e
Candidate words: ['fulfill', 'honor', 'keep', 'pass']
	to bring to completion or reality; achieve or realize
	To fulfill or keep an agreement 

In [10]:
# Remove from df the rows whose word belongs to the top-ranked candidate group.
# This rebinds `df`, so the previous frame is no longer reachable under that name.
df = df[~df['word'].isin(candidate_list[0].get_candidate_words())]
# Show (rows, columns) of what is left.
df.shape

(71, 3)

In [11]:
# Count and list the distinct words still present in df.
len(df.word.unique()),df.word.unique()

(8,
 array(['discard', 'energy', 'state', 'play', 'justice', 'labor', 'pass',
        'draw'], dtype=object))

In [12]:
# Recompute the candidate groups using only the rows that remain in df.
candidate_list = get_candidate_words(df)
# Number of distinct groups found.
print(len(candidate_list))
# Print up to 20 groups; the list comes back sorted by group metric, highest first.
for candidate in candidate_list[:20]:
    print(candidate)

(71, 71)
(71, 71)
26
Group metric: 0.498632365050673, root word: play, checksum: 4e853246b0975f3e24d92f16cab37fd6
Candidate words: ['energy', 'labor', 'pass', 'play']
	A person's or organization's active involvement and dynamism.
	Physical or mental effort exerted to achieve a purpose, such as manual labor.
	an act of passing a ball or puck to a teammate
	The action or manner of engaging in a game or sport, such as 'His play on the field was impressive.'

Group metric: 0.4795463231072503, root word: pass, checksum: 94a1e9bffdbd88dd4af7c29ea2e1bf89
Candidate words: ['discard', 'draw', 'pass', 'play']
	The act of throwing something away.
	an act of selecting a winner or winners in a lottery or similar game of chance
	an act of passing a ball or puck to a teammate
	The action or manner of engaging in a game or sport, such as 'His play on the field was impressive.'

Group metric: 0.47832088055250094, root word: pass, checksum: 6168f7c8fb6732e21b9ba67b67c83c62
Candidate words: ['draw', 'ene

In [13]:
# Remove from df the rows whose word belongs to the top-ranked candidate group.
# This rebinds `df`, so the previous frame is no longer reachable under that name.
df = df[~df['word'].isin(candidate_list[0].get_candidate_words())]
# Show (rows, columns) of what is left.
df.shape

(29, 3)

In [14]:
# Count and list the distinct words still present in df.
len(df.word.unique()),df.word.unique()

(4, array(['discard', 'state', 'justice', 'draw'], dtype=object))