In [1]:
from dataclasses import dataclass, field

import pandas as pd
import numpy as np

# import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
@dataclass
class ConnectionGroup:
    """A scored group of candidate words, each paired with its connecting text.

    Attributes:
        group_metric: Score of the group; the field's ``help`` metadata describes
            it as the average cosine similarity of the group.
        candidate_pairs: ``(word, connection)`` tuples in insertion order.
    """

    group_metric: float = field(default=0.0, metadata={"help": "Average cosine similarity of the group"})
    candidate_pairs: list = field(default_factory=list, metadata={"help": "List of candidate word with definition"})

    def add_entry(self, word, connection):
        """Append one ``(word, connection)`` pair to the group."""
        self.candidate_pairs.append((word, connection))

    def _pairs_by_word(self):
        """Return the stored pairs ordered by their word (stable sort)."""
        return sorted(self.candidate_pairs, key=lambda pair: pair[0])

    def get_candidate_words(self):
        """Return the words of the group in ascending order."""
        return [pair[0] for pair in self._pairs_by_word()]

    def get_candidate_connections(self):
        """Return the connection texts, ordered by their word.

        Anything up to and including the first colon (a leading tag such as
        "noun:" or "verb:") is dropped and the remainder is whitespace-stripped;
        texts without a colon are returned unchanged.
        """
        connections = []
        for pair in self._pairs_by_word():
            text = pair[1]
            if ':' in text:
                # keep only what follows the first colon
                text = text.partition(':')[2].strip()
            connections.append(text)
        return connections

    def __repr__(self):
        """Render the metric and word list on one line, then one tab-indented line per connection."""
        header = f"Group metric: {self.group_metric}, Candidate words: {self.get_candidate_words()}\n"
        details = "".join(f"\t{connection}\n" for connection in self.get_candidate_connections())
        return header + details

In [86]:
# Pickle file holding the vocabulary table used by the rest of the notebook.
test_vocab = "word_list5.pkl"

# load the vocabulary
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
df = pd.read_pickle(test_vocab)
# Show (rows, columns) as a quick sanity check of the load.
df.shape

(121, 3)

In [87]:
# Preview the first rows (up to 15) of the loaded table.
df.head(15)

Unnamed: 0,word,definition,embedding
0,uphold,"verb: to maintain or support something, such a...","[0.05433855205774307, 0.028151938691735268, 0...."
1,uphold,verb: to lift or hold something in an elevated...,"[0.006949818227440119, -0.0720582902431488, -0..."
2,uphold,verb: to defend or stand up for a principle or...,"[0.0063194772228598595, -0.022249704226851463,..."
3,discard,verb: To throw something away because it is no...,"[0.023533722385764122, 0.005563678685575724, -..."
4,discard,"verb: To get rid of a card in a game, usually ...","[-6.281930836848915e-05, -0.023267202079296112..."
5,discard,noun: The act of throwing something away.,"[0.013182280585169792, 0.010861128568649292, -..."
6,discard,noun: A card that has been thrown away in a ca...,"[0.0007180095999501646, -0.007413978222757578,..."
7,honor,noun: Respect and esteem shown to someone or s...,"[0.05223957821726799, -0.04617014899849892, -0..."
8,honor,noun: A privilege or mark of distinction.,"[0.048169057816267014, -0.009565112181007862, ..."
9,honor,"noun: A code of integrity and ethics, often as...","[0.022865494713187218, -0.0090312659740448, 0...."


In [None]:
def get_candidate_words(df, num_neighbors=3):
    """Build one candidate ConnectionGroup per row from its most similar rows.

    For each row, the other rows are visited from highest to lowest cosine
    similarity of their embeddings. The first ``num_neighbors`` rows whose
    words differ from the row's own word and from each other are grouped with
    it. A row for which fewer than ``num_neighbors`` distinct other words exist
    produces no group. Different rows can yield groups with the same word set.

    Args:
        df: DataFrame whose first column holds the word, whose second column
            holds the definition text, and which has an ``embedding`` column
            of equal-length numeric vectors.
        num_neighbors: Number of distinct neighbouring words attached to each
            row (default 3, i.e. groups of four words).

    Returns:
        list of ConnectionGroup in row order. Each group's ``group_metric`` is
        the mean cosine similarity between the row and its selected neighbours;
        the row's own entry is added first, followed by the neighbours from
        most to least similar.

    Raises:
        ValueError: if ``num_neighbors`` is smaller than 1.
    """
    if num_neighbors < 1:
        raise ValueError("num_neighbors must be at least 1")

    candidate_list = []

    # cosine_similarity rejects an empty input, and there is nothing to group anyway.
    if len(df) == 0:
        return candidate_list

    # Pull the columns out once instead of calling df.iloc inside the nested loop.
    words = df.iloc[:, 0].tolist()
    definitions = df.iloc[:, 1].tolist()

    # create cosine similarity matrix for all pairs of the vectors
    cosine_similarities = cosine_similarity(df['embedding'].tolist())
    print(cosine_similarities.shape)

    # for each row, column indices ordered from least to most similar
    neighbor_order = np.argsort(cosine_similarities, axis=1)
    print(neighbor_order.shape)

    for r, anchor_word in enumerate(words):
        connected_words = set()
        neighbors = []

        # Walk the whole row from most to least similar. The row itself (and any
        # other sense of the same word) is excluded by the word comparison, so no
        # column has to be skipped blindly and the least-similar column is reachable.
        for c in neighbor_order[r, ::-1]:
            if len(neighbors) == num_neighbors:
                break
            neighbor_word = words[c]
            if neighbor_word == anchor_word or neighbor_word in connected_words:
                continue
            connected_words.add(neighbor_word)
            neighbors.append(c)

        # not enough distinct words to form a full group for this row
        if len(neighbors) != num_neighbors:
            continue

        candidate_group = ConnectionGroup()
        candidate_group.group_metric = cosine_similarities[r, neighbors].mean()
        candidate_group.add_entry(anchor_word, definitions[r])
        for c in neighbors:
            candidate_group.add_entry(words[c], definitions[c])

        candidate_list.append(candidate_group)

    return candidate_list

In [89]:
# Build the candidate groups and rank them from highest to lowest group metric,
# then print the 20 best-scoring ones.
candidate_list = sorted(
    get_candidate_words(df),
    key=lambda group: group.group_metric,
    reverse=True,
)
for candidate in candidate_list[:20]:
    print(candidate)

(121, 121)
(121, 121)
Group metric: 0.7400479116733788, Candidate words: ['blanket', 'sham', 'sheet', 'throw']
	A large piece of fabric used to cover and keep warm, typically on a bed.
	A decorative pillow covering or case.
	A large rectangular piece of fabric used for covering a bed.
	a light blanket or cover used for warmth or decoration

Group metric: 0.6919912754064516, Candidate words: ['fulfill', 'honor', 'keep', 'uphold']
	to carry out a task or duty as required or expected
	To fulfill or keep an agreement or commitment.
	to fulfill or adhere to a commitment or promise
	to maintain or support something, such as a decision or a law

Group metric: 0.686097124139617, Candidate words: ['blanket', 'sham', 'sheet', 'throw']
	A large piece of fabric used to cover and keep warm, typically on a bed.
	A decorative pillow covering or case.
	A large rectangular piece of fabric used for covering a bed.
	a light blanket or cover used for warmth or decoration

Group metric: 0.6794160664441097,

In [65]:
# Print the last ten entries of candidate_list.
trailing_candidates = candidate_list[-10:]
for candidate in trailing_candidates:
    print(candidate)

Group metric: 0.3708138053552257, Candidate words: ['honor', 'keep', 'pass', 'state']
	verb: To fulfill or keep an agreement or commitment.
	verb: to fulfill or adhere to a commitment or promise
	verb: to formally approve a law or proposal
	verb: To express something in words, e.g., 'Please state your name and address clearly.'

Group metric: 0.36934542148681154, Candidate words: ['blanket', 'honor', 'keep', 'pass']
	noun: A general term for any cover or layer that envelops or conceals something.
	noun: A privilege or mark of distinction.
	noun: a fortified tower or dungeon in a castle
	noun: a route through mountains or hills

Group metric: 0.3692878655002903, Candidate words: ['justice', 'keep', 'sheet', 'state']
	noun: The administration of the law or authority in maintaining this
	noun: an area or structure where something is stored or kept
	noun: A large expanse or continuous surface of something, like ice or water.
	noun: The territories or regions governed by a particular govern

In [90]:
# Drop from df every row whose word belongs to the first candidate group,
# then show the shape of what is left.
first_group_words = candidate_list[0].get_candidate_words()
is_grouped = df['word'].isin(first_group_words)
df = df[~is_grouped]
df.shape

(94, 3)

In [91]:
# Count and list the distinct words still present in df.
remaining_words = df["word"].unique()
len(remaining_words), remaining_words

(12,
 array(['uphold', 'discard', 'honor', 'energy', 'state', 'play', 'justice',
        'labor', 'pass', 'fulfill', 'draw', 'keep'], dtype=object))

In [92]:
# Recompute the candidate groups on the current df, rank them from highest to
# lowest group metric, and print the 20 best-scoring ones.
candidate_list = sorted(
    get_candidate_words(df),
    key=lambda group: group.group_metric,
    reverse=True,
)
for candidate in candidate_list[:20]:
    print(candidate)

(94, 94)
(94, 94)
Group metric: 0.6919912754064516, Candidate words: ['fulfill', 'honor', 'keep', 'uphold']
	to carry out a task or duty as required or expected
	To fulfill or keep an agreement or commitment.
	to fulfill or adhere to a commitment or promise
	to maintain or support something, such as a decision or a law

Group metric: 0.6794160664441097, Candidate words: ['fulfill', 'honor', 'keep', 'uphold']
	to bring to completion or reality; achieve or realize
	To fulfill or keep an agreement or commitment.
	to fulfill or adhere to a commitment or promise
	to maintain or support something, such as a decision or a law

Group metric: 0.5979796574323007, Candidate words: ['honor', 'justice', 'keep', 'uphold']
	To fulfill or keep an agreement or commitment.
	The administration of the law or authority in maintaining this
	the act of maintaining or caring for something
	to maintain or support something, such as a decision or a law

Group metric: 0.5724557117326886, Candidate words: ['fulfi

In [69]:
# Drop from df every row whose word belongs to the first candidate group,
# then show the shape of what is left.
first_group_words = candidate_list[0].get_candidate_words()
is_grouped = df['word'].isin(first_group_words)
df = df[~is_grouped]
df.shape

(71, 3)

In [70]:
# Count and list the distinct words still present in df.
remaining_words = df["word"].unique()
len(remaining_words), remaining_words

(8,
 array(['discard', 'energy', 'state', 'play', 'justice', 'labor', 'pass',
        'draw'], dtype=object))

In [71]:
# Recompute the candidate groups on the current df, rank them from highest to
# lowest group metric, and print the 20 best-scoring ones.
candidate_list = sorted(
    get_candidate_words(df),
    key=lambda group: group.group_metric,
    reverse=True,
)
for candidate in candidate_list[:20]:
    print(candidate)

(71, 71)
(71, 71)
Group metric: 0.498632365050673, Candidate words: ['energy', 'labor', 'pass', 'play']
	noun: A person's or organization's active involvement and dynamism.
	noun: Physical or mental effort exerted to achieve a purpose, such as manual labor.
	noun: an act of passing a ball or puck to a teammate
	noun: The action or manner of engaging in a game or sport, such as 'His play on the field was impressive.'

Group metric: 0.48667413697238976, Candidate words: ['energy', 'labor', 'pass', 'play']
	noun: The strength and vitality required for sustained physical or mental activity.
	noun: Physical or mental effort exerted to achieve a purpose, such as manual labor.
	noun: a gesture or action that conveys a message or intention
	noun: The action or manner of engaging in a game or sport, such as 'His play on the field was impressive.'

Group metric: 0.4795463231072503, Candidate words: ['discard', 'draw', 'pass', 'play']
	noun: The act of throwing something away.
	noun: an act of se

In [72]:
# Drop from df every row whose word belongs to the first candidate group,
# then show the shape of what is left.
first_group_words = candidate_list[0].get_candidate_words()
is_grouped = df['word'].isin(first_group_words)
df = df[~is_grouped]
df.shape

(29, 3)

In [73]:
# Count and list the distinct words still present in df.
remaining_words = df["word"].unique()
len(remaining_words), remaining_words

(4, array(['discard', 'state', 'justice', 'draw'], dtype=object))