In [1]:
import requests
import heapq
from tqdm import tqdm

In [4]:

words = [f.split(',')[0].strip() for f in open('../data/rmt_test.csv','r')][1:]

In [6]:
def get_related_concepts(word, limit=10):
    base_url = 'http://api.conceptnet.io/'
    search_url = f'{base_url}c/en/{word}?limit={limit}'
    related_concepts = []
    w_concepts = []

    response = requests.get(search_url)
    if response.status_code == 200:
        data = response.json()

        edges = data['edges']
        for edge in edges:
            related_concepts.append(edge['end']['label'])
            w_concepts.append(edge['weight'])
    return related_concepts, w_concepts

def bfs(word, depth=10, limit=10):
    visited = set()
    queue = [(word, 0)]
    concepts = []
    ct = 0
    while queue:
        current_word, current_depth = queue.pop(0)
        if current_word not in visited:
            visited.add(current_word)
            concepts.append(current_word)

            if len(concepts)==limit:
                break

            related,_ = get_related_concepts(current_word, limit)
            for related_word in related:
                queue.append((related_word, current_depth + 1))

    return concepts


def dfs(word, depth=16, limit=10, concepts = [], visited=None,):
    if visited is None:
        visited = set()

    if word not in visited:
        visited.add(word)
        concepts.append(word)

        if len(concepts)==limit:
            return

        if depth > 0:
            related,_ = get_related_concepts(word, limit)
            for related_word in related:
                dfs(related_word, depth - 1, limit, concepts,  visited)

    return

def priority(starting_word, limit=10):
    priority_queue = []
    visited = set()
    concepts = []
    heapq.heappush(priority_queue, (0, starting_word))
    ct = 0
    while priority_queue:
        current_w, current_word = heapq.heappop(priority_queue)
        ct += 1
        if ct > limit:
            break

        if current_word not in visited:
            visited.add(current_word)
            concepts.append(current_word)

            related,w = get_related_concepts(current_word, limit)
            for related_word,wt in zip(related,w):
                heapq.heappush(priority_queue, (w, related_word))
    return concepts


In [9]:
bfs_net = []
dfs_net = []
pq_net = []

# %%
for word in tqdm(words):
    bfs_concepts = bfs(word, depth=4, limit=8)[1:]
    dfs_concepts = []
    dfs(word, 4, 8, dfs_concepts)
    dfs_concepts = dfs_concepts[1:]
    pq_concepts = priority(word, 8)[1:]
    bfs_net.append(bfs_concepts)
    dfs_net.append(dfs_concepts)
    pq_net.append(pq_concepts)

 12%|█▏        | 103/869 [10:24<1:17:21,  6.06s/it]


KeyboardInterrupt: 

In [10]:
len(bfs_net), len(dfs_net), len(pq_net)

(103, 103, 103)

In [11]:
sub_words = words[:103]

In [33]:
import torch
from transformers import BertTokenizer, BertModel
import spacy
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate cosine similarity between two vectors
def cosine_similarity_vec(v1, v2):
    return cosine_similarity(v1.reshape(1, -1), v2.reshape(1, -1))[0][0]

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Load GloVe embeddings using spaCy
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    print('Downloading language model for the spaCy POS tagger\n'
        "(don't worry, this will only happen once)")
    from spacy.cli import download
    download('en')
    nlp = spacy.load('en_core_web_sm')

# Sample dictionary with words and their synonyms
word_synonyms = {
    'happy': ['joyful', 'content', 'pleased', 'glad', 'delighted'],
    'sad': ['unhappy', 'gloomy', 'miserable', 'sorrowful', 'depressed'],
    # Add more words and their synonyms as needed
}

keys = sub_words

# Calculate mean similarity for different k values
k_values = [2, 4, 8]

b = 0
g = 0

for k in k_values:
    print(f"\nFor k = {k}:")
    j = 0
    for i in tqdm(range(len(sub_words))):
        key = sub_words[i]
        bfs_cp = bfs_net[i][:k]
        dfs_cp = dfs_net[i][:k]
        pq_cp = pq_net[i][:k]
        bfs_cp.insert(0,key)
        for synonyms in [bfs_cp]:
            if len(synonyms) == 1:
                continue
            j += 1
            # BERT similarity
            bert_embeddings = []

            for synonym in synonyms:
                inputs = tokenizer(synonym, return_tensors="pt", padding=True, truncation=True)
                with torch.no_grad():
                    outputs = model(**inputs)
                embeddings = torch.mean(outputs.last_hidden_state, dim=1).numpy()
                bert_embeddings.append(embeddings)

            bert_mean_similarity = 0
            
            for j in range(1, len(bert_embeddings)):
                similarity = cosine_similarity_vec(bert_embeddings[0], bert_embeddings[j])
                bert_mean_similarity += similarity

            

            bert_mean_similarity /= (len(synonyms)-1)
            b += bert_mean_similarity
            # print(f"BERT Similarity for '{key}' with {k} synonyms: {bert_mean_similarity}")

            # GloVe similarity
            glove_embeddings = [nlp(synonym).vector for synonym in synonyms]

            glove_mean_similarity = 0
            
            for j in range(1, len(glove_embeddings)):
                similarity = cosine_similarity_vec(glove_embeddings[0], glove_embeddings[j])
                glove_mean_similarity += similarity

            glove_mean_similarity /= len(synonyms) 
            g += glove_mean_similarity
            # print(f"GloVe Similarity for '{key}' with {k} synonyms: {glove_mean_similarity}")
    print("Average Bert Sim: ",bert_mean_similarity/j)
    print("Average GloVe Sim: ", glove_mean_similarity/j)


For k = 2:


100%|██████████| 103/103 [01:46<00:00,  1.03s/it]


Average Bert Sim:  0.2699388712644577
Average GloVe Sim:  0.1277693510055542

For k = 4:


 41%|████      | 42/103 [01:45<04:43,  4.64s/it]

In [None]:
Priority Queue:
For k = 2:
100%|██████████| 103/103 [02:28<00:00,  1.44s/it]
Average Bert Sim:  0.3404962718486786
Average GloVe Sim:  0.0862845703959465

For k = 4:
100%|██████████| 103/103 [02:48<00:00,  1.64s/it]
Average Bert Sim:  0.3404962718486786
Average GloVe Sim:  0.0862845703959465

For k = 8:
100%|██████████| 103/103 [03:37<00:00,  2.11s/it]
Average Bert Sim:  0.3404962718486786
Average GloVe Sim:  0.0862845703959465