In [1]:
from dataclasses import dataclass, field
import hashlib
import itertools
from typing import List, Dict, Any, Tuple, Optional
import json
import os
import sys

import pandas as pd
import numpy as np

# import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Make the project's 'agent' directory importable so that 'embedvec_tools' can be imported below.
# The project directory is taken to be the parent of the current working directory.
current_dir = os.getcwd()
project_dir = os.path.dirname(current_dir)

# Add the 'agent' directory to the sys.path
# (appended unconditionally, so re-running this cell adds the same entry again)
other_dir = os.path.join(project_dir, 'agent')
sys.path.append(other_dir)

# confirm the 'agent' directory is in the sys.path
for p in sys.path:
    print(p)

from embedvec_tools import choose_embedvec_item

/usr/local/lib/python311.zip
/usr/local/lib/python3.11
/usr/local/lib/python3.11/lib-dynload

/usr/local/lib/python3.11/site-packages
/workspaces/connection_solver/src/agent


In [2]:
# Read the OpenAI credentials from a JSON file; the API key is stored under the "key" entry.
with open("/openai/api_key.json") as f:
    config = json.load(f)


# Expose the key through the environment.
# NOTE(review): presumably read by the client used inside embedvec_tools - confirm.
os.environ["OPENAI_API_KEY"] = config["key"]

In [3]:
# pickle file holding the vocabulary used for this run (path is relative to the working directory)
test_vocab = "word_list6.pkl"

# load the vocabulary
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source
df = pd.read_pickle(test_vocab)
# show (rows, columns) as the cell output
df.shape

(130, 3)

In [4]:
@dataclass
class ConnectionGroup:
    """
    A candidate group of up to four (word, connection) pairs.

    Entries are added one at a time with add_entry(). When the fourth entry is
    added, group_id is set to a checksum of the sorted words, so two groups holding
    the same words get the same id regardless of insertion order.
    """
    group_metric: float = field(default=0.0, metadata={"help": "Average cosine similarity of all combinations of words in the group"})
    root_word: str = field(default="", metadata={"help": "Root word of the group"})
    candidate_pairs: list = field(default_factory=list, metadata={"help": "List of candidate word with definition"})
    group_id: Optional[str] = field(default=None, metadata={"help": "Checksum identifer for the group"})

    def add_entry(self, word, connection):
        """
        Append a (word, connection) pair to the group.

        Args:
            word: the candidate word.
            connection: the definition text attached to the word.

        Raises:
            ValueError: if the group already holds four entries.
        """
        if len(self.candidate_pairs) >= 4:
            raise ValueError("Group is full, cannot add more entries")

        self.candidate_pairs.append((word, connection))

        # the id is only defined for a complete group, so compute it on the fourth entry
        if len(self.candidate_pairs) == 4:
            self.group_id = self._compute_group_id()

    def _sorted_pairs(self):
        """Return the stored pairs ordered by word (stable for equal words)."""
        return sorted(self.candidate_pairs, key=lambda pair: pair[0])

    def get_candidate_words(self):
        """Return the words of the group in sorted order."""
        return [word for word, _ in self._sorted_pairs()]

    def get_candidate_connections(self):
        """
        Return the connection texts, ordered to line up with get_candidate_words().

        Everything up to and including the first colon is dropped, which removes a
        leading part of speech tag such as "noun:" or "verb:". A connection without
        any colon is returned unchanged.
        """
        stripped_connections = []
        for _, connection in self._sorted_pairs():
            if ':' in connection:
                # keep only the text after the first colon
                connection = connection.split(':', 1)[1].strip()
            stripped_connections.append(connection)

        return stripped_connections

    def _compute_group_id(self):
        """Return the md5 hex digest of the sorted words joined together."""
        # NOTE(review): the words are joined without a separator, so different word
        # sets can concatenate to the same string (e.g. "ab"+"c" and "a"+"bc").
        # Left as is so that existing ids do not change.
        return hashlib.md5("".join(self.get_candidate_words()).encode()).hexdigest()

    def __repr__(self):
        return_string = f"group metric: {self.group_metric}, "
        return_string += f"root word: {self.root_word}, group id: {self.group_id}\n"
        return_string += f"candidate group: {self.get_candidate_words()}\n"
        for connection in self.get_candidate_connections():
            return_string += f"\t{connection}\n"

        return return_string

    # method to determine if the group is equal to another group
    def __eq__(self, other):
        """Two groups are equal when they contain the same set of words."""
        # let Python fall back to its default handling for foreign types instead of
        # failing with AttributeError on objects that are not groups (e.g. None)
        if not isinstance(other, ConnectionGroup):
            return NotImplemented
        return set(self.get_candidate_words()) == set(other.get_candidate_words())

In [5]:
# preview the first rows of the vocabulary: each row pairs a word with one definition and that definition's embedding
df.head(15)

Unnamed: 0,word,definition,embedding
0,grain,"noun: A small, hard seed of a cereal plant, su...","[-0.012600858695805073, 0.012784576043486595, ..."
1,grain,"noun: A single small particle or piece, such a...","[0.020823834463953972, -0.024603812023997307, ..."
2,grain,noun: The texture or pattern of fibers in wood...,"[-0.007330888416618109, -0.0005790892755612731..."
3,grain,noun: An individual element or component withi...,"[0.025177670642733574, 0.00015151509433053434,..."
4,grain,noun: A unit of weight in the avoirdupois syst...,"[0.01975935325026512, -0.03580138459801674, -0..."
5,grain,noun: A characteristic or essential quality.,"[0.019196202978491783, -0.0004823370254598558,..."
6,grain,"noun: A small amount or degree, e.g., 'a grain...","[-0.0131387859582901, 0.02310130186378956, -0...."
7,grain,noun: The alignment or direction of the fibers...,"[-0.017064416781067848, 0.010718798264861107, ..."
8,grain,verb: To form into grains or granules.,"[-0.01263611763715744, 0.011555520817637444, -..."
9,grain,verb: To give a texture resembling grain to a ...,"[-0.012740093283355236, 0.005322569981217384, ..."


In [6]:
def get_candidate_words(df: pd.DataFrame) -> list:
    """
    Generate a list of candidate word groups based on cosine similarity of their embeddings.

    Every row of the frame (one definition of one word) acts as a root. Its three
    most similar rows that belong to three other, mutually different words are
    collected, and the four rows form a ConnectionGroup whose metric is the mean
    cosine similarity over all six pairs in the group. Roots for which fewer than
    three other words exist produce no group.

    Args:
        df (pd.DataFrame): DataFrame with three columns: 'word', 'definition' and 'embedding', in that order.
            Word and definition are read by position, the embedding by column name.

    Returns:
        list: A list of unique candidate word groups sorted by their group metric in descending order.
            An empty frame yields an empty list.
    """

    # nothing to group; this also avoids handing an empty matrix to cosine_similarity
    if df.shape[0] == 0:
        return []

    # create cosine similarity matrix for all pairs of the vectors
    cosine_similarities = cosine_similarity(df['embedding'].tolist())
    print(cosine_similarities.shape)

    # for each row in the cosine similarity matrix, sort by the cosine similarity (ascending)
    sorted_cosine_similarites = np.argsort(cosine_similarities, axis=1)
    print(sorted_cosine_similarites.shape)

    # pull the two text columns out once instead of doing a scalar lookup per comparison
    words = df.iloc[:, 0].tolist()
    definitions = df.iloc[:, 1].tolist()

    candidate_list = []

    # group of words that are most similar to each other
    for r in range(len(words)):
        root_word = words[r]

        # walk the whole row from most to least similar; rows of the root word
        # (including the root row itself) are skipped by the word comparison, so no
        # assumption is needed about where the root row lands in the ordering
        connected_words = set()
        top3 = []
        for c in sorted_cosine_similarites[r, ::-1]:
            c = int(c)
            neighbour_word = words[c]
            if neighbour_word == root_word or neighbour_word in connected_words:
                continue
            connected_words.add(neighbour_word)
            top3.append(c)
            if len(top3) == 3:
                break

        # not enough distinct other words to fill a group of four
        if len(top3) < 3:
            continue

        # create candidate group for the current word and the top 3 closest words
        candidate_group = ConnectionGroup()
        candidate_group.root_word = root_word
        candidate_group.add_entry(root_word, definitions[r])
        for c in top3:
            candidate_group.add_entry(words[c], definitions[c])

        # group metric: mean similarity over every pair of rows in the group
        member_rows = [r] + top3
        candidate_group.group_metric = np.mean(
            [cosine_similarities[a, b] for a, b in itertools.combinations(member_rows, 2)]
        )

        candidate_list.append(candidate_group)

    # sort the candidate list by the group metric in descending order
    candidate_list.sort(key=lambda x: x.group_metric, reverse=True)

    # remove duplicate groups, keeping the first (highest metric) occurrence of each id
    found_groups = set()
    unique_candidate_list = []
    for candidate in candidate_list:
        if candidate.group_id not in found_groups:
            unique_candidate_list.append(candidate)
            found_groups.add(candidate.group_id)

    return unique_candidate_list

In [7]:
def find_recommendation(df: pd.DataFrame, max_candidates: int = 5) -> List[str]:
    """
    Pick one word group out of the highest ranked candidate groups.

    The candidate groups are built with get_candidate_words(), the first
    `max_candidates` of them are rendered as text and passed to
    choose_embedvec_item(), and the entry stored under "candidate_group" in its
    response is returned.

    Args:
        df (pd.DataFrame): vocabulary frame in the layout expected by get_candidate_words().
        max_candidates (int): number of top ranked groups offered for selection.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        List[str]: the words of the recommended group (only the words are returned,
            not the full response).
    """
    candidate_list = get_candidate_words(df)
    print(len(candidate_list))

    # each group is rendered through its string form, groups separated by a newline
    list_to_validate = "\n".join(str(group) for group in candidate_list[:max_candidates])

    recommended_group = choose_embedvec_item(list_to_validate)
    print(recommended_group)

    recommended_words = recommended_group["candidate_group"]
    print(recommended_words)
    return recommended_words



# Example usage: keep asking for a recommended group and drop its words until no words remain.
# The loop works on its own name so that `df` keeps the full vocabulary; the cells below
# call the functions on `df` again and their recorded output shows the full matrix.
remaining_df = df
while remaining_df.word.unique().size > 0:
    rw = find_recommendation(remaining_df)
    print(f"using these words {rw}, type {type(rw)}")

    # remove the words from the dataframe (filtering builds a new frame, `df` is not modified)
    rows_before = remaining_df.shape[0]
    remaining_df = remaining_df[~remaining_df.word.isin(rw)]

    # without progress the next round would see the same frame again and the loop would never end
    if remaining_df.shape[0] == rows_before:
        print("recommended words are not in the remaining vocabulary, stopping")
        break

# number of words left over (0 when every word was assigned to a group)
remaining_df.word.unique().size

In [8]:
# run a single recommendation round on the current contents of df and show the chosen words
rw = find_recommendation(df)
print(f"using these words {rw}, type {type(rw)}")

(130, 130)
(130, 130)
89
{'candidate_group': ['cube', 'grain', 'shred', 'tip'], 'explanation': 'The words are connected by the theme of small pieces or fragments.'}
['cube', 'grain', 'shred', 'tip']
using these words ['cube', 'grain', 'shred', 'tip'], type <class 'list'>


In [10]:
# build the candidate groups for the current df and display the ten with the highest group metric
candidate_list = get_candidate_words(df)
candidate_list[:10]

(130, 130)
(130, 130)


[group metric: 0.6668112426310061, root word: exhaust, group id: fb7dc317dbf562c0a79420118f823f50
 candidate group: ['drain', 'exhaust', 'sap', 'total']
 	To deplete or exhaust resources, energy, or vitality.
 	To drain of resources or strength; to wear out completely.
 	To drain someone of energy or vitality.
 	To destroy or wreck completely, especially in reference to vehicles.,
 group metric: 0.6468176127874155, root word: tax, group id: 1832357b3b2db5fa0d8e1d7bbc25a7ab
 candidate group: ['drain', 'exhaust', 'sap', 'tax']
 	To deplete or exhaust resources, energy, or vitality.
 	To drain of resources or strength; to wear out completely.
 	To drain someone of energy or vitality.
 	To make heavy demands on someone’s resources or abilities, e.g., 'The difficult project taxed his patience.',
 group metric: 0.6410907016394721, root word: sap, group id: 03fcf0339c681de880eaedad3c6ce82a
 candidate group: ['drain', 'empty', 'exhaust', 'sap']
 	To deplete or exhaust resources, energy, or vit