In [2]:
from dataclasses import dataclass, field
import hashlib
import itertools
from typing import List, Dict, Any, Tuple, Optional
import json
import os

import pandas as pd
import numpy as np

# import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

from rag_tools import choose_rag_item

In [3]:
# Load the OpenAI credentials from a JSON file kept outside the notebook.
# NOTE(review): the path is absolute and machine-specific — confirm it exists
# wherever this notebook is run.
with open("/openai/api_key.json") as f:
    config = json.load(f)


# Publish the key through the OPENAI_API_KEY environment variable
# (presumably read by the client code behind rag_tools — verify).
os.environ["OPENAI_API_KEY"] = config["key"]

In [32]:
# File name of the pickled vocabulary DataFrame used for this run.
test_vocab = "word_list4.pkl"

# load the vocabulary
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
df = pd.read_pickle(test_vocab)
# Show (rows, columns) as the cell output.
df.shape

(118, 3)

In [33]:
@dataclass
class ConnectionGroup:
    """A candidate group of up to four (word, connection) pairs.

    Pairs are added one at a time with ``add_entry``. As soon as the fourth
    pair is added, ``group_id`` is set to a checksum of the group's words so
    that groups holding the same words can be recognised regardless of the
    order in which the words were added.
    """

    group_metric: float = field(default=0.0, metadata={"help": "Average cosine similarity of all combinations of words in the group"})
    root_word: str = field(default="", metadata={"help": "Root word of the group"})
    candidate_pairs: list = field(default_factory=list, metadata={"help": "List of candidate word with definition"})
    group_id: Optional[str] = field(default=None, metadata={"help": "Checksum identifer for the group"})

    def add_entry(self, word, connection):
        """Append a (word, connection) pair to the group.

        Args:
            word: The candidate word.
            connection: The definition text associated with the word.

        Raises:
            ValueError: If the group already holds four (or more) pairs.
        """
        if len(self.candidate_pairs) >= 4:
            raise ValueError("Group is full, cannot add more entries")

        self.candidate_pairs.append((word, connection))

        # The id is only meaningful for a complete group, so compute it once
        # the fourth pair has arrived.
        if len(self.candidate_pairs) == 4:
            self.group_id = self._compute_group_id()

    def _sorted_pairs(self):
        """Return the pairs ordered by word (stable for equal words)."""
        return sorted(self.candidate_pairs, key=lambda pair: pair[0])

    def get_candidate_words(self):
        """Return the group's words in sorted order."""
        return [word for word, _ in self._sorted_pairs()]

    def get_candidate_connections(self):
        """Return the connections, ordered by their word, without the leading tag.

        A connection that looks like "noun: ..." or "verb: ..." has everything
        up to and including the first colon removed and the remainder
        stripped of surrounding whitespace. A connection without any colon is
        returned unchanged.
        """
        stripped_connections = []
        for _, connection in self._sorted_pairs():
            _, colon, remainder = connection.partition(':')
            stripped_connections.append(remainder.strip() if colon else connection)
        return stripped_connections

    def _compute_group_id(self):
        """Return the MD5 hex digest of the sorted words concatenated together.

        NOTE: the words are joined without a separator, so two different word
        sets whose concatenations coincide (e.g. "ab"+"c" and "a"+"bc") would
        share an id.
        """
        return hashlib.md5("".join(self.get_candidate_words()).encode()).hexdigest()

    def __repr__(self):
        """Render the metric, root word, id, words and one connection per line."""
        return_string = f"group metric: {self.group_metric}, "
        return_string += f"root word: {self.root_word}, group id: {self.group_id}\n"
        return_string += f"candidate group: {self.get_candidate_words()}\n"
        for connection in self.get_candidate_connections():
            return_string += f"\t{connection}\n"

        return return_string

    # method to determine if the group is equal to another group
    def __eq__(self, other):
        """Two groups are equal when they contain the same set of words.

        Returns NotImplemented for objects that are not ConnectionGroup
        instances so that comparisons such as ``group == None`` evaluate to
        False instead of raising AttributeError.
        """
        if not isinstance(other, ConnectionGroup):
            return NotImplemented
        return set(self.get_candidate_words()) == set(other.get_candidate_words())

In [34]:
# Preview the first rows of the vocabulary frame as the cell output.
df.head(15)

Unnamed: 0,word,definition,embedding
0,rumble,"noun: A deep, resonant sound typically associa...","[-0.006525901146233082, -0.00801665149629116, ..."
1,rumble,"noun: A street fight, particularly amongst gangs.","[-0.02613157406449318, -0.006333488505333662, ..."
2,rumble,"verb: To make a continuous deep, resonant sound.","[0.0348958894610405, -0.01895660161972046, -0...."
3,rumble,"verb: To move with a resonant, rolling sound, ...","[0.020638514310121536, -0.0015102694742381573,..."
4,rumble,"verb: To discover or expose something, often u...","[0.0066839889623224735, -0.03194846957921982, ..."
5,table,noun: A piece of furniture with a flat top and...,"[-0.0295570008456707, 0.0011174401734024286, -..."
6,table,noun: A set of data arranged in rows and colum...,"[-0.032239943742752075, -0.03164214640855789, ..."
7,table,noun: A group of people gathered for a meal or...,"[-0.012719403021037579, -0.007633887231349945,..."
8,table,verb: To postpone consideration of a proposal ...,"[0.033500708639621735, 0.007590617518872023, -..."
9,table,"verb: To present or submit for discussion, esp...","[-0.001278589479625225, -0.015538915060460567,..."


In [35]:
def get_candidate_words(df: pd.DataFrame) -> list:
    """
    Generate a list of candidate word groups based on cosine similarity of their embeddings.

    Every row of ``df`` is treated as a root. Walking its other rows from most
    to least similar, the first three rows that belong to three different
    words (none of them the root's own word) are collected. The root and those
    three rows form a ConnectionGroup whose metric is the mean cosine
    similarity over all six pairs of the four rows. Roots for which fewer
    than three such neighbours exist produce no group.

    Args:
        df (pd.DataFrame): DataFrame containing words and their corresponding embeddings. The word must be the first column and the definition the second column (both are read by position); the embedding is read from the column named 'embedding'.

    Returns:
        list: A list of unique candidate word groups (ConnectionGroup) sorted by their group metric in descending order. Empty when ``df`` has no rows.
    """

    # cosine_similarity rejects an empty sample list, and there is nothing to group anyway
    if df.shape[0] == 0:
        return []

    # read the word / definition columns once instead of once per comparison
    words = df.iloc[:, 0].tolist()
    definitions = df.iloc[:, 1].tolist()

    # create cosine similarity matrix for all pairs of the vectors
    cosine_similarities = cosine_similarity(df['embedding'].tolist())
    print(cosine_similarities.shape)

    # for each row in the cosine similarity matrix, sort by the cosine similarity (ascending)
    sorted_cosine_similarites = np.argsort(cosine_similarities, axis=1)
    print(sorted_cosine_similarites.shape)

    candidate_list = []

    # group of words that are most similar to each other
    for r, root_word in enumerate(words):

        # get the top 3 closest words that are not the same as the current word and are not already connected
        connected_words = set()
        top3 = []

        # Walk the whole row from most to least similar. The root row itself
        # is rejected by the same-word test below, so no position has to be
        # skipped by index and the least similar row is still reachable.
        for c in sorted_cosine_similarites[r, ::-1]:
            neighbour_word = words[c]
            if neighbour_word == root_word or neighbour_word in connected_words:
                continue
            connected_words.add(neighbour_word)
            top3.append(c)
            if len(top3) == 3:
                break

        # not enough distinct neighbours to complete a group of four
        if len(top3) < 3:
            continue

        # create candidate group for the current word and the top 3 closest words
        candidate_group = ConnectionGroup(root_word=root_word)
        candidate_group.add_entry(root_word, definitions[r])
        for c in top3:
            candidate_group.add_entry(words[c], definitions[c])

        # metric: mean similarity over every pair of the four member rows
        member_pairs = itertools.combinations([r] + top3, 2)
        candidate_group.group_metric = np.array(
            [cosine_similarities[a, b] for a, b in member_pairs]
        ).mean()

        candidate_list.append(candidate_group)

    # sort the candidate list by the group metric in descending order
    candidate_list.sort(key=lambda x: x.group_metric, reverse=True)

    # remove duplicate groups, keeping the first (highest metric) occurrence of each id
    found_groups = set()
    unique_candidate_list = []
    for candidate in candidate_list:
        if candidate.group_id not in found_groups:
            unique_candidate_list.append(candidate)
            found_groups.add(candidate.group_id)

    return unique_candidate_list

In [36]:
def find_recommendation(df: pd.DataFrame, top_n: int = 5) -> List[str]:
    """
    Pick the recommended group of words from the best candidate groups.

    The candidate groups produced by ``get_candidate_words`` are rendered as
    text (their ``str`` form, joined by newlines), the first ``top_n`` of them
    are passed to ``choose_rag_item``, and the value stored under the
    'candidate_group' key of its result is returned. The number of candidates,
    the raw result and the chosen words are printed along the way.

    Args:
        df (pd.DataFrame): Vocabulary frame accepted by ``get_candidate_words``.
        top_n (int): How many of the highest ranked candidate groups to submit. Defaults to 5.

    Returns:
        List[str]: The words of the recommended group.

    Raises:
        ValueError: If no candidate group can be formed from ``df``.
    """
    candidate_list = get_candidate_words(df)
    print(len(candidate_list))

    # Without candidates the prompt would be empty, so fail loudly instead of
    # asking for a recommendation about nothing.
    if not candidate_list:
        raise ValueError("No candidate groups could be formed from the given words")

    list_to_validate = "\n".join(str(x) for x in candidate_list[:top_n])
    recommended_group = choose_rag_item(list_to_validate)
    print(recommended_group)
    recommended_words = recommended_group["candidate_group"]
    print(recommended_words)
    return recommended_words



In [37]:
# Example usage:
# Keep asking for a recommended group and drop its words from the vocabulary
# until no words remain.
while df.word.unique().size > 0:
    rw = find_recommendation(df)
    print(f"using these words {rw}, type {type(rw)}")

    # remove the words from the dataframe
    df_remaining = df[~df.word.isin(rw)]

    # If nothing was removed (for example the recommendation named words that
    # are not in the vocabulary) the loop condition can never change on its
    # own, so stop instead of risking an endless loop.
    if len(df_remaining) == len(df):
        print(f"none of {rw} are in the remaining vocabulary, stopping")
        break

    df = df_remaining

(118, 118)
(118, 118)
77
{'candidate_group': ['clap', 'peal', 'roll', 'rumble'], 'explanation': 'This group is connected by the theme of producing a loud, resonant sound, often associated with thunder or clapping. The words describe different ways of generating or experiencing such sounds, making them uniquely unified by this auditory concept compared to the other groups which focus more on hair styling techniques.'}
['clap', 'peal', 'roll', 'rumble']
using these words ['clap', 'peal', 'roll', 'rumble'], type <class 'list'>
(86, 86)
(86, 86)
59
{'candidate_group': ['comb', 'dig', 'root', 'sift'], 'explanation': "The group 'comb', 'dig', 'root', 'sift' is the most unique because all four words are connected by the theme of searching or examining thoroughly. 'Comb' can mean to search thoroughly, 'dig' can mean to search deeply, 'root' involves digging or turning up soil, and 'sift' means to examine thoroughly to isolate important elements. This theme of thorough searching or examination 

In [38]:
# Number of distinct values still present in df's word column.
df.word.unique().size

0