In [1]:
# Locations of the training sets (nouns and verbs live in the same directory).
training_data_dir = "D:/dialogue2020/taxonomy-enrichment/data/training_data"
nouns_path = f"{training_data_dir}/training_nouns.tsv"
verbs_path = f"{training_data_dir}/training_verbs.tsv"

In [2]:
from scoring_program.evaluate import *
from scoring_program.utils import *
import json
import numpy as np
import re
from operator import itemgetter
from scipy import spatial

In [3]:
from baselines.vectorizers.fasttext_vectorizer import FasttextVectorizer
from gensim.models import KeyedVectors
from ruwordnet.ruwordnet_reader import RuWordnet
from collections import Counter

In [43]:
# Each value column is JSON, so json.loads is handed over directly as the reader callback.
nouns = read_dataset(nouns_path, json.loads)
verbs = read_dataset(verbs_path, json.loads)

In [5]:
# fastText vectorizer over the cc.ru.300.bin model; `ft` supplies word vectors
# (ft.get_multiword_vectors) and the underlying model (ft.model) to later cells.
ft = FasttextVectorizer("baselines/models/cc.ru.300.bin")

Model loaded


In [6]:
# RuWordNet reader backed by dataset/ruwordnet.db; used below for hypernym lookups
# (get_hypernyms_by_id) and synset names (get_name_by_id).
rwn = RuWordnet(db_path="dataset/ruwordnet.db", ruwordnet_path=None)

In [12]:
# Shared directory prefixes for the vector files used in this notebook.
fasttext_vectors_dir = "baselines/models/vectors/fasttext/ru"
node2vec_dir = r"D:\dialogue2020\diachrony_for_taxonomy_enrichment\data\node2vec"

wiki_path = "wiki_ru.jsonlines"
wiki_vectors_path = f"{fasttext_vectors_dir}/wiki.txt"
rwn_vectors_path = f"{fasttext_vectors_dir}/rwn_full.txt"
node2vec_nouns_path = node2vec_dir + "\\node2vec_ru_nouns.txt"
node2vec_verbs_path = node2vec_dir + "\\node2vec_ru_verbs.txt"

In [8]:
# Removes every character that is not in the Cyrillic А-я range, a space or a hyphen.
# Written as a raw string: "\-" inside a normal literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on Python 3.12+); the compiled pattern is unchanged.
# NOTE(review): the А-я range does not contain ё/Ё, so those letters are stripped as well —
# confirm this matches how the wiki vector keys were built before widening the class.
pattern = re.compile(r"[^А-я \-]")
# Non-greedy match of a parenthesised fragment; used below to drop bracketed parts
# of RuWordNet synset names.
delete_bracets = re.compile(r"\(.+?\)")

In [9]:
def get_wiktionary(path):
    """Load a JSON-lines wiktionary dump into a dict keyed by word.

    Each non-empty line must be a JSON object with the keys 'word', 'hypernyms',
    'synonyms' and 'meanings'. A later line with the same 'word' overwrites an
    earlier one.

    Args:
        path: path to the .jsonlines file.

    Returns:
        dict mapping word -> {"hypernyms": ..., "synonyms": ..., "meanings": ...}.
    """
    wiktionary = {}
    # Explicit UTF-8: without it open() falls back to the locale codec, which on
    # Windows is not UTF-8 and fails or garbles non-ASCII (Cyrillic) text.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data = json.loads(line)
            wiktionary[data['word']] = {"hypernyms": data['hypernyms'], "synonyms": data['synonyms'],
                                        "meanings": data['meanings']}
    return wiktionary

In [10]:
# Wiktionary entries keyed by word (hypernyms / synonyms / meanings per entry).
wiktionary = get_wiktionary(wiki_path)
# Text-format word2vec files: wiki_model is indexed by cleaned wiktionary hypernym strings,
# rwn_model by RuWordNet synset ids (see compute_similarity / get_similarity).
wiki_model = KeyedVectors.load_word2vec_format(wiki_vectors_path, binary=False)
rwn_model = KeyedVectors.load_word2vec_format(rwn_vectors_path, binary=False)

In [11]:
from baselines.vectorizers.projection_vectorizer import ProjectionVectorizer

In [30]:
# NOTE(review): both projections are loaded from the same "../ru_projection_verbs" artefact.
# The noun projection presumably should point at a noun-specific file (looks like a
# copy-paste slip) — confirm the intended file name and fix the path, since the noun
# node2vec features below depend on it.
projection_nouns = ProjectionVectorizer(ft.model, "../ru_projection_verbs")

projection_verbs = ProjectionVectorizer(ft.model, "../ru_projection_verbs")

In [13]:
# node2vec embeddings in text word2vec format, one model per part of speech;
# they are indexed by candidate synset id in get_node2vec_similarity.
node2vec_nouns_model = KeyedVectors.load_word2vec_format(node2vec_nouns_path, binary=False)
node2vec_verbs_model = KeyedVectors.load_word2vec_format(node2vec_verbs_path, binary=False)

In [14]:
def generate_associates(neologism, topn):
    """Return the `topn` entries of rwn_model closest to the fastText vector of `neologism`.

    The result is whatever rwn_model.similar_by_vector returns; callers in this
    notebook consume it as (synset id, similarity) pairs.
    """
    query_vectors = ft.get_multiword_vectors([neologism])
    query_vector = query_vectors[0]
    return rwn_model.similar_by_vector(query_vector, topn)

def compute_hchs(neologism, topn):
    """Collect the hypernyms of the `topn` nearest associates of `neologism`.

    Hypernyms are concatenated in associate order and duplicates are kept, so the
    result can be counted by the caller.
    """
    collected = []
    for entry in generate_associates(neologism, topn):
        associate_id = entry[0]
        for hypernym in rwn.get_hypernyms_by_id(associate_id):
            collected.append(hypernym)
    return collected

def distance2vote(d, a=3.0, b=5.0, y=1.0):
    """Turn a distance into a vote weight.

    The vote is exp(-(d ** a)) * y * sim ** b, where sim = max(0, 1 - d**2 / 2)
    is the similarity recovered from the distance and clipped at zero.

    Args:
        d: distance (scalar or numpy array).
        a: exponent applied to the distance inside the exponential decay.
        b: exponent applied to the clipped similarity.
        y: multiplicative scale of the vote.
    """
    clipped_similarity = np.maximum(0, 1 - d ** 2 / 2)
    decay = np.exp(-d ** a)
    return decay * y * clipped_similarity ** b

def compute_distance(s):
    """Convert a cosine similarity into a distance: d = sqrt(2 * (1 - s)).

    This is the Euclidean distance between two unit vectors whose cosine
    similarity is `s`. The radicand is clamped at zero so that a similarity
    marginally above 1 (floating-point rounding) yields 0.0 instead of NaN.

    Args:
        s: cosine similarity (scalar or numpy array).

    Returns:
        The corresponding non-negative distance (numpy scalar or array).
    """
    return np.sqrt(np.maximum(0.0, 2 * (1 - s)))

In [15]:
def compute_candidates(neologism, topn):
    """Gather candidate hypernyms for `neologism` with occurrence counts and votes.

    Returns:
        (counts, votes):
        counts - Counter over the hypernyms of the `topn` nearest associates plus the
                 hypernyms of those hypernyms (second order).
        votes  - Counter of distance-based votes accumulated over the 100 nearest
                 associates; a direct hypernym receives a full vote, a second-order
                 hypernym a vote scaled by y=0.5.
    """
    first_order = compute_hchs(neologism, topn)
    second_order = []
    for hypernym in first_order:
        second_order.extend(rwn.get_hypernyms_by_id(hypernym))

    hypernym_counts = Counter(first_order)
    hypernym_counts.update(second_order)

    hypernym_votes = Counter()
    for associate_id, similarity in generate_associates(neologism, 100):
        distance = compute_distance(similarity)
        for parent in rwn.get_hypernyms_by_id(associate_id):
            hypernym_votes[parent] += distance2vote(distance)
            for grandparent in rwn.get_hypernyms_by_id(parent):
                hypernym_votes[grandparent] += distance2vote(distance, y=0.5)
    return hypernym_counts, hypernym_votes

In [16]:
def get_similarity(word, candidate):
    """Cosine similarity between the fastText vector of `word` and rwn_model[candidate]."""
    def _to_unit(vector):
        # Divide by the Euclidean norm (same arithmetic as the inline expression it replaces).
        return vector / (sum(vector ** 2) ** 0.5)

    word_vector = ft.get_multiword_vectors([word])[0]
    candidate_vector = rwn_model[candidate]
    unit_word = _to_unit(word_vector)
    unit_candidate = _to_unit(candidate_vector)
    return 1 - spatial.distance.cosine(unit_word, unit_candidate)

In [17]:
# Shared-task gold subgraphs: private and public splits, nouns and verbs.
shared_task_dataset_dir = "D:/dialogue2020/dialogue2020_shared_task_hypernyms/dataset"
private_nouns = read_dataset(f"{shared_task_dataset_dir}/private/nouns_private_subgraphs.tsv", json.loads)
private_verbs = read_dataset(f"{shared_task_dataset_dir}/private/verbs_private_subgraphs.tsv", json.loads)
public_nouns = read_dataset(f"{shared_task_dataset_dir}/public/nouns_public_subgraphs.tsv", json.loads)
public_verbs = read_dataset(f"{shared_task_dataset_dir}/public/verbs_public_subgraphs.tsv", json.loads)

In [18]:
def count_statistics(data):
    """Print how often gold hypernyms of a dataset show up in the wiktionary entry of the word.

    For every word of `data` that has a wiktionary entry (looked up lower-cased), each
    gold hypernym synset is turned into its name words (bracketed parts removed, split
    on commas, lower-cased). A word is counted at most once per category:
      * "in hyper"    - some name word is listed among the entry's hypernyms,
      * "in synonyms" - some name word is listed among the entry's synonyms,
      * "in def"      - some name word occurs as a substring of one of the meanings.

    NOTE(review): the pieces produced by split(',') are not stripped, so whitespace next
    to a comma or left behind by bracket removal stays part of the word being matched —
    confirm this is intended.

    Args:
        data: mapping word -> list of lists of gold hypernym synset ids.
    """
    found_in_wiki = 0
    with_hypernym_hit = 0
    with_synonym_hit = 0
    with_definition_hit = 0

    for word, hypernym_groups in data.items():
        gold_hypernyms = [synset for group in hypernym_groups for synset in group]

        lookup_key = word.lower()
        if lookup_key not in wiktionary:
            continue
        found_in_wiki += 1
        entry = wiktionary[lookup_key]

        hypernym_hit = False
        synonym_hit = False
        definition_hit = False
        for synset in gold_hypernyms:
            synset_name = rwn.get_name_by_id(synset)
            name_words = [part.lower() for part in delete_bracets.sub("", synset_name).split(',')]

            if any(name_word in entry['hypernyms'] for name_word in name_words):
                hypernym_hit = True
            if any(name_word in entry['synonyms'] for name_word in name_words):
                synonym_hit = True
            if any(name_word in meaning for meaning in entry['meanings'] for name_word in name_words):
                definition_hit = True

        with_hypernym_hit += int(hypernym_hit)
        with_synonym_hit += int(synonym_hit)
        with_definition_hit += int(definition_hit)

    print(f"All: {len(data)}, in wiki: {found_in_wiki},  in hyper: {with_hypernym_hit}, in synonyms: {with_synonym_hit}, in def: {with_definition_hit}")

# One summary line per dataset, in the order: private nouns, private verbs, public nouns, public verbs.
for gold_dataset in (private_nouns, private_verbs, public_nouns, public_verbs):
    count_statistics(gold_dataset)

All: 1525, in wiki: 1501,  in hyper: 292, in synonyms: 16, in def: 536
All: 350, in wiki: 350,  in hyper: 11, in synonyms: 10, in def: 99
All: 762, in wiki: 741,  in hyper: 143, in synonyms: 12, in def: 251
All: 175, in wiki: 173,  in hyper: 4, in synonyms: 4, in def: 46


In [31]:
def compute_weights(neologism, candidate, count, hyponym_count, node2vec_similarity):
    """Build the 12-element feature vector for a (neologism, candidate synset) pair.

    Feature order:
        0  count * fastText similarity between the neologism and the candidate
        1  mean similarity between the candidate and the cleaned wiktionary hypernyms
        2  1.0 when feature 1 is unavailable (no entry or no usable hypernym)
        3/4   candidate name word is / is not among the wiktionary synonyms
        5/6   candidate name word is / is not among the wiktionary hypernyms
        7/8   candidate name word occurs / does not occur inside a wiktionary meaning
        9  hyponym_count as passed in
        10 1.0 when hyponym_count == 0.0
        11 node2vec_similarity as passed in
    Features 3-8 are all 0.0 when the neologism has no wiktionary entry.

    NOTE(review): the pieces produced by split(',') are not stripped, so whitespace next
    to a comma or left behind by bracket removal stays part of the word being matched —
    confirm this is intended.

    Args:
        neologism: the new word.
        candidate: candidate hypernym synset id (key of rwn_model).
        count: multiplier for the fastText similarity.
        hyponym_count: passed through as a feature; the callers in this notebook supply
            the candidate's vote score here.
        node2vec_similarity: precomputed node2vec similarity, passed through.

    Returns:
        numpy array with the 12 features listed above.
    """
    similarity = get_similarity(neologism, candidate)
    not_hyponym_count = 1.0 if hyponym_count == 0.0 else 0.0

    synset_name = rwn.get_name_by_id(candidate)
    name_words = [part.lower() for part in delete_bracets.sub("", synset_name).split(',')]

    # Values used when the neologism has no wiktionary entry.
    wiki_similarity = 0.0
    not_wiki_similarity = 1.0
    in_synonyms = not_in_synonyms = 0.0
    in_hypernyms = not_in_hypernyms = 0.0
    in_definition = not_in_definition = 0.0

    lookup_key = neologism.lower()
    if lookup_key in wiktionary:
        entry = wiktionary[lookup_key]

        hypernym_hit = any(name_word in entry['hypernyms'] for name_word in name_words)
        in_hypernyms, not_in_hypernyms = (1.0, 0.0) if hypernym_hit else (0.0, 1.0)

        synonym_hit = any(name_word in entry['synonyms'] for name_word in name_words)
        in_synonyms, not_in_synonyms = (1.0, 0.0) if synonym_hit else (0.0, 1.0)

        definition_hit = any(name_word in meaning
                             for meaning in entry['meanings']
                             for name_word in name_words)
        in_definition, not_in_definition = (1.0, 0.0) if definition_hit else (0.0, 1.0)

        hypernym_similarities = []
        for raw_hypernym in entry['hypernyms']:
            cleaned = pattern.sub("", raw_hypernym.replace("|", " ").replace('--', ''))
            # Skip hypernyms that are empty or only spaces after cleaning.
            if cleaned.strip(" "):
                hypernym_similarities.append(compute_similarity(cleaned.replace(" ", "_"), candidate))

        if hypernym_similarities:
            wiki_similarity = sum(hypernym_similarities)/len(hypernym_similarities)
            not_wiki_similarity = 0.0

    features = [
        count*similarity,
        wiki_similarity, not_wiki_similarity,
        in_synonyms, not_in_synonyms,
        in_hypernyms, not_in_hypernyms,
        in_definition, not_in_definition,
        hyponym_count, not_hyponym_count,
        node2vec_similarity,
    ]
    return np.array(features)

In [32]:
def get_node2vec_similarity(model, v1, candidate):
    """Cosine similarity between the vector `v1` and `model[candidate]`.

    Args:
        model: anything indexable by `candidate` that yields a numpy vector.
        v1: query vector (numpy array).
        candidate: key looked up in `model`.
    """
    def _to_unit(vector):
        # Divide by the Euclidean norm (same arithmetic as the inline expression it replaces).
        return vector / (sum(vector ** 2) ** 0.5)

    candidate_vector = model[candidate]
    unit_query = _to_unit(v1)
    unit_candidate = _to_unit(candidate_vector)
    return 1 - spatial.distance.cosine(unit_query, unit_candidate)

In [33]:
def compute_similarity(wiki, candidate):
    """Cosine similarity between wiki_model[wiki] and rwn_model[candidate]."""
    def _to_unit(vector):
        # Divide by the Euclidean norm (same arithmetic as the inline expression it replaces).
        return vector / (sum(vector ** 2) ** 0.5)

    wiki_vector = wiki_model[wiki]
    candidate_vector = rwn_model[candidate]
    unit_wiki = _to_unit(wiki_vector)
    unit_candidate = _to_unit(candidate_vector)
    return 1 - spatial.distance.cosine(unit_wiki, unit_candidate)

In [34]:
import tqdm

In [45]:
def collect_training_examples(dataset, projection, node2vec_model, synset_suffix, require_wiktionary=False):
    """Build (neologism, candidate) pairs, feature vectors and labels for one part of speech.

    Only single-word neologisms longer than 3 characters are used. For each of them:
      * every generated candidate whose id ends with `synset_suffix` becomes an example,
        labelled 1 when it is a gold hypernym and 0 otherwise;
      * every gold hypernym that was not generated is appended as a positive example
        with count=1 and hyponym_count=1.

    Args:
        dataset: mapping neologism -> list of lists of gold hypernym synset ids.
        projection: object exposing predict_projection_word(word, model); the second
            element of its result is used as the neologism's node2vec-space vector.
        node2vec_model: node2vec vectors indexed by synset id.
        synset_suffix: id suffix of the candidates to keep ('-N' or '-V').
        require_wiktionary: when True, neologisms without a wiktionary entry are skipped.

    Returns:
        (pairs, vectors, targets) - three parallel lists.
    """
    pairs, vectors, targets = [], [], []

    for neologism, hypernym_groups in tqdm.tqdm(dataset.items()):
        if " " in neologism or len(neologism) <= 3:
            continue
        if require_wiktionary and neologism.lower() not in wiktionary:
            continue

        true_hypernyms = [hypernym for group in hypernym_groups for hypernym in group]
        # Set for O(1) label lookups; the list itself is still iterated below so that
        # the produced examples (including repeats) stay the same as before.
        true_hypernym_set = set(true_hypernyms)

        counts, votes = compute_candidates(neologism, 10)
        candidates = set(counts).union(set(votes))
        _, node2vec_vector = projection.predict_projection_word(neologism, node2vec_model)

        for candidate in candidates:
            if not candidate.endswith(synset_suffix):
                continue
            node2vec_similarity = get_node2vec_similarity(node2vec_model, node2vec_vector, candidate)
            weights = compute_weights(neologism, candidate, counts.get(candidate, 1),
                                      votes.get(candidate, 0.0), node2vec_similarity)

            pairs.append((neologism, candidate))
            vectors.append(weights)
            targets.append(int(candidate in true_hypernym_set))

        # Gold hypernyms the candidate generator missed are added as extra positives.
        for true_h in true_hypernyms:
            if true_h in candidates:
                continue
            node2vec_similarity = get_node2vec_similarity(node2vec_model, node2vec_vector, true_h)
            weights = compute_weights(neologism, true_h, 1, 1, node2vec_similarity)

            pairs.append((neologism, true_h))
            vectors.append(weights)
            targets.append(1)

    return pairs, vectors, targets


word_candidate_pair = []
feature_vectors = []
labels = []

# NOTE(review): only the verb pass requires a wiktionary entry for the neologism; this
# asymmetry is kept exactly as it was — confirm that it is intentional.
for pos_dataset, pos_projection, pos_node2vec_model, pos_suffix, pos_needs_wiki in (
    (nouns, projection_nouns, node2vec_nouns_model, '-N', False),
    (verbs, projection_verbs, node2vec_verbs_model, '-V', True),
):
    pos_pairs, pos_vectors, pos_targets = collect_training_examples(
        pos_dataset, pos_projection, pos_node2vec_model, pos_suffix, require_wiktionary=pos_needs_wiki)
    word_candidate_pair.extend(pos_pairs)
    feature_vectors.extend(pos_vectors)
    labels.extend(pos_targets)

100%|████████████████████████████████████████████████████████████████████████████| 25376/25376 [20:08<00:00, 21.00it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 6806/6806 [08:07<00:00,  7.85it/s]


In [46]:
# Persist the examples as JSON lines: one object per (neologism, candidate) pair.
with open("data_vectors_large_new_proper.jsonlines", 'w', encoding='utf-8', newline='\n') as out_file:
    for pair, vector, label in zip(word_candidate_pair, feature_vectors, labels):
        neologism, candidate = pair
        record = {"neologism": neologism, "candidate": candidate, "vector": list(vector), "label": label}
        out_file.write(json.dumps(record)+"\n")

In [47]:
# Class balance of the generated examples: 1 = candidate is a gold hypernym, 0 = it is not.
Counter(labels)

Counter({0: 2573751, 1: 59914})