In [None]:
import sys
from functools import lru_cache as memoize
from os.path import join as join_path
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from nltk.corpus import wordnet as wn
from PyDictionary import PyDictionary
# Shared PyDictionary client; get_synonyms() queries it for thesaurus synonyms.
DICTIONARY = PyDictionary()

# Base data directory: get_related_words() reads the prob_<expect_pos>_<word_pos>.txt
# tables from here, and get_synonym_df() writes its CSVs to the syn_test_output subfolder.
OUTPUT_DIR = '../../data/output/trial_5'

In [None]:
# FIXME: this is a temporary function for searching plain-text files. Replace it if the data moves to sqlite.
def search_file(filename, string):
    """Collect the entries of a whitespace-delimited text table keyed by ``string``.

    Every useful line is expected to hold at least three whitespace-separated
    fields: ``<key> <name> <value>``. Lines whose first field equals
    ``string`` contribute ``name -> float(value)`` to the result.

    Arguments:
        filename (str): Path of the UTF-8 text file to scan.
        string (str): Key compared (exact match) against the first field.

    Returns:
        dict[str, float]: Second field mapped to the third field for every
        matching line. When a name repeats, the last occurrence wins. Empty
        if nothing matches.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the third field of a matching line is not a number.
    """
    results = {}
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate the handle lazily; readlines() would load the whole table.
        for line in f:
            fields = line.split()
            # Blank or truncated lines cannot yield an entry; indexing them
            # would raise IndexError, so skip them.
            if len(fields) < 3:
                continue
            if fields[0] == string:
                results[fields[1]] = float(fields[2])
    return results


@memoize(maxsize=None)
def get_synonyms(word, pos=None):
    """Gather synonyms of ``word`` from WordNet and from PyDictionary.

    Results are memoized per ``(word, pos)`` pair.

    Arguments:
        word (str): The word to look up.
        pos: WordNet part-of-speech constant (e.g. ``wn.NOUN``) used to
            restrict the WordNet lookup. Defaults to None (no restriction).

    Returns:
        set[str]: Union of the WordNet lemma names (excluding ``word``
        itself) and the thesaurus synonyms, when the thesaurus returns any.
    """
    # WordNet side: every lemma name of every matching synset, minus the query word.
    lemma_names = (
        lemma.name()
        for synset in wn.synsets(word, pos)
        for lemma in synset.lemmas()
    )
    wordnet_syns = {name for name in lemma_names if name != word}

    # Thesaurus side: may come back empty/None, in which case WordNet alone is used.
    thesaurus_syns = DICTIONARY.synonym(word)
    if not thesaurus_syns:
        return wordnet_syns
    return wordnet_syns | set(thesaurus_syns)

In [None]:
def get_related_words(word, word_pos, expect_pos):
    """
    Look up the words related to ``word`` together with their probabilities.

    The lookup table is the file ``prob_<expect_pos>_<word_pos>.txt`` inside
    ``OUTPUT_DIR``; it is scanned with ``search_file``.
    Args:
        word: the word to look up
        word_pos: part of speech of ``word`` (used in the table file name)
        expect_pos: part of speech of the related words wanted

    Returns:
        (dictionary) {related word: probability}
    """
    table_name = "".join(("prob_", expect_pos, "_", word_pos, ".txt"))
    table_path = join_path(OUTPUT_DIR, table_name)
    return search_file(table_path, word)


def get_wn_pos(pos):
    """Utility function mapping "noun"/"verb"/"adj" to WordNet's part-of-speech constant.

    Any other value yields None.
    """
    # (label, name of the matching constant on the wordnet module), checked in order
    pos_table = (("noun", "NOUN"), ("verb", "VERB"), ("adj", "ADJ"))
    for label, constant_name in pos_table:
        if pos == label:
            return getattr(wn, constant_name)
    return None


def get_synonyms_dict(word, word_pos, expect_pos):
    """
    Look up the related ``expect_pos`` words (with probabilities) for a word and for each of its synonyms.
    Args:
        word (str): the word whose synonyms are queried
        word_pos (str): the part of speech of the given word
        expect_pos (str): the part of speech of the expected related words
    Returns:
        word_dict (dictionary): {word: {expect: prob}}
        syn_dict (dictionary): {synonym: {expect: prob}}
    """
    synonyms = get_synonyms(word, get_wn_pos(word_pos))
    word_dict = {word: get_related_words(word, word_pos, expect_pos)}
    # Synonyms are looked up with the same part of speech as the original word.
    syn_dict = {
        synonym: get_related_words(synonym, word_pos, expect_pos)
        for synonym in synonyms
    }
    return word_dict, syn_dict

    
def get_synonym_df(word, word_pos, expect_pos, save=0):
    """
    Build a dataframe of related ``expect_pos`` words and their probabilities
    for the given word and each of its synonyms.

    Rows are the related words; the first column is ``word`` and the
    remaining columns are its synonyms. Rows are sorted by the ``word``
    column in descending order and missing probabilities are filled with 0.
    Args:
        word (str): the word whose synonyms are queried
        word_pos (str): the part of speech of the given word
        expect_pos (str): the part of speech of the expected related words
        save: when truthy, also write the frame as a tab-separated file to
            ``OUTPUT_DIR/syn_test_output/<word>_<word_pos>_<expect_pos>.csv``
            (the folder is created if it does not exist). Defaults to 0.
    Returns:
        (dataframe)
    """
    # Local import: the notebook only imports os.path.join at the top.
    import os

    word_dict, syn_dict = get_synonyms_dict(word, word_pos, expect_pos)

    word_df = pd.DataFrame.from_dict(word_dict)
    syn_df = pd.DataFrame.from_dict(syn_dict)
    df = (
        pd.concat([word_df, syn_df], axis=1, join='outer')
        .sort_values(by=[word], ascending=False)
        .fillna(0)
    )

    if save:
        out_dir = join_path(OUTPUT_DIR, "syn_test_output")
        # to_csv does not create missing folders, so make sure the target exists.
        os.makedirs(out_dir, exist_ok=True)
        file_name = join_path(out_dir, word + "_" + word_pos + "_" + expect_pos + ".csv")
        df.to_csv(file_name, sep='\t', encoding='utf-8')
    return df

def get_HC(dataframe):
    """Run Ward hierarchical clustering over the columns of ``dataframe`` and plot the dendrogram.

    The frame is transposed first, so each input column becomes one
    observation and its column label is used as the leaf label.

    Args:
        dataframe: table whose columns are to be clustered

    Returns:
        The linkage matrix returned by ``scipy.cluster.hierarchy.linkage``.
    """
    observations = dataframe.T
    linkage_matrix = linkage(observations, method='ward')
    plt.figure(figsize=(25, 10))
    dendrogram(linkage_matrix, labels=observations.index)
    plt.show()
    return linkage_matrix

In [None]:
# Export the synonym tables for a hand-picked set of probe words.
# save=1 makes get_synonym_df() write one CSV per call.
noun_ls = ["car", "knife", "pen", "key", "bottle", "wall", "glass"]
for noun in noun_ls:
    get_synonym_df(noun, word_pos="noun", expect_pos="adj", save=1)
    get_synonym_df(noun, word_pos="noun", expect_pos="verb", save=1)

# Every entry needs its own comma: adjacent string literals are silently
# concatenated ("open_with" "break" would become "open_withbreak").
verb_ls = ["open", "open_with", "break", "kill", "cut", "drink_from"]
for verb in verb_ls:
    get_synonym_df(verb, word_pos="verb", expect_pos="noun", save=1)

adj_ls = ["sharp", "delicious", "firm", "open"]
for adj in adj_ls:
    get_synonym_df(adj, word_pos="adj", expect_pos="noun", save=1)

In [None]:
# Preview the table for the verb "open" without writing a CSV (save=0);
# the trailing bare `df` is what the notebook displays as the cell output.
df = get_synonym_df("open", word_pos="verb", expect_pos="noun", save=0)
df