In [1]:
import sys
sys.path.append('taxonomy-enrichment/baselines/ruwordnet')
sys.path.append('taxonomy-enrichment/baselines')

In [2]:
import re
import numpy as np
import pandas as pd
import fasttext
from ruwordnet.ruwordnet_reader import RuWordnet
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import xml.etree.ElementTree as ET

In [3]:
ruwordnet = RuWordnet(db_path="dataset/ruwordnet.db", ruwordnet_path=None)

In [6]:
public_test = []
with open('dataset/public/nouns_public_no_labels.tsv', 'r', encoding="utf-8") as f:
    for line in f:
        line = line.rstrip()
        public_test.append(line)

In [8]:
private_test = []
with open('dataset/private/nouns_private_no_labels.tsv', 'r', encoding="utf-8") as f:
    for line in f:
        line = line.rstrip()
        private_test.append(line)

In [9]:
public_test[:3], private_test[:3]

(['АБДОМИНОПЛАСТИКА', 'АБСОРБЕНТ', 'АВАЛЬ'],
 ['АБСЕНТЕИЗМ', 'АБСОЛЮТИЗАЦИЯ', 'АБСТРАКЦИОНИЗМ'])

In [10]:
nouns = {}
nouns_list = []
for sense_id, synset_id, text in ruwordnet.get_all_senses():
    if synset_id.endswith("N"):
        ltext = text.lower()
        if ltext not in nouns:
            nouns_list.append(ltext)
        nouns.setdefault(ltext, []).append(synset_id)
len(nouns), len(nouns_list)

(68392, 68392)

In [11]:
synset2words = {}
for sense_id, synset_id, text in ruwordnet.get_all_senses():
    if synset_id.endswith("N"):
        synset2words.setdefault(synset_id, []).append(text.lower())
len(synset2words)

29296

In [12]:
list(nouns.items())[:3]

[('крестный родитель', ['147272-N']),
 ('злоупотребления в торговле', ['7331-N']),
 ('нарушение правил продажи', ['7331-N'])]

In [13]:
df_test = pd.DataFrame(data={'word': public_test + private_test})
df_test.shape

(2287, 1)

In [14]:
df_test['private'] = [1 if x in private_test else 0 for x in df_test['word']]

In [15]:
df_test['private'].value_counts()

1    1525
0     762
Name: private, dtype: int64

In [16]:
df_test.head()

Unnamed: 0,word,private
0,АБДОМИНОПЛАСТИКА,0
1,АБСОРБЕНТ,0
2,АВАЛЬ,0
3,АВТАРКИЯ,0
4,АГНОСТИК,0


In [17]:
wiktionarydump = "dataset/ruwiktionary-20200120-pages-articles-multistream.xml"

In [18]:
title2doc = {}

In [20]:
doc = {}
fields = {
    "timestamp": "timestamp",
    "title": "title",
    "text": "text",
    "redirect title": "redirect_title",
}
cnt = 0
for _, elem in tqdm(ET.iterparse(wiktionarydump, events=("end",))):
    prefix, has_namespace, postfix = elem.tag.partition('}')
    tag = postfix if postfix else prefix
    if tag in fields:
        doc[fields[tag]] = elem.text
    if tag == "page":
        elem.clear()
        cnt += 1
        title2doc[doc["title"]] = doc
        doc = {}

35866269it [02:04, 288127.82it/s]


In [21]:
# longest article by lowercased word
ltitle2doc = {}
for x in title2doc.keys():
    if x.lower() in ltitle2doc:
        if len(title2doc[x]['text']) > len(ltitle2doc[x.lower()]['text']):
            ltitle2doc[x.lower()] = title2doc[x]
    else:
        ltitle2doc[x.lower()] = title2doc[x]
ltitle_list = list(ltitle2doc.keys())

In [22]:
# longest article by lowercased word
ltitle2docs = {}
for x in title2doc.keys():
    ltitle2docs.setdefault(x.lower(), []).append(title2doc[x])
ltitle_list = list(ltitle2docs.keys())

In [23]:
df_test['wikt_in'] = [1 if x.lower() in ltitle2doc else 0 for x in df_test['word']]
print(df_test[df_test['private']==0]['wikt_in'].value_counts())
print(df_test[df_test['private']==1]['wikt_in'].value_counts())

1    741
0     21
Name: wikt_in, dtype: int64
1    1507
0      18
Name: wikt_in, dtype: int64


In [28]:
ftmodel = fasttext.load_model("baselines/models/cc.ru.300.bin")




In [29]:
ftwords_list = ftmodel.get_words()
ftwords = set(ftwords_list)

In [30]:
lword2word = {word.lower(): word for word in ftwords_list}
len(lword2word), len(ftwords_list)

(1674899, 2000000)

In [31]:
df_test['ft_in'] = [1 if x.lower() in lword2word else 0 for x in df_test['word']]
print(df_test[df_test['private']==0]['ft_in'].value_counts())
print(df_test[df_test['private']==1]['ft_in'].value_counts())

1    734
0     28
Name: ft_in, dtype: int64
1    1460
0      65
Name: ft_in, dtype: int64


In [32]:
nouns_vectors = np.zeros((len(nouns_list), ftmodel.get_dimension()))
for i, word in enumerate(tqdm(nouns_list)):
    nouns_vectors[i] = ftmodel.get_sentence_vector(word)

100%|█████████████████████████████████████████████████████████████████████████| 68392/68392 [00:01<00:00, 52647.41it/s]


In [33]:
ltitle_vectors = np.zeros((len(ltitle_list), ftmodel.get_dimension()))
for i, word in enumerate(tqdm(ltitle_list)):
    ltitle_vectors[i] = ftmodel.get_sentence_vector(word)

100%|█████████████████████████████████████████████████████████████████████| 2177428/2177428 [01:17<00:00, 28224.31it/s]


In [34]:
def get_top_k_similar(vectors, vector, k=1):
    res = []
    dots = np.dot(vectors, vector)
    for i in range(k):
        idx = np.argmax(dots)
        res.append(idx)
        dots[idx] = 0
    return res

In [35]:
i = 1
lword = public_test[i].lower()
idxs = get_top_k_similar(ltitle_vectors, ftmodel.get_sentence_vector(lword), k=5)
for idx in idxs:
    print(lword, ltitle_list[idx])

абсорбент абсорбент
абсорбент адсорбент
абсорбент сорбент
абсорбент абсорбирующий
абсорбент энтеросорбент


In [36]:
df_test['wn_top10'] = [
    [nouns_list[x] for x in get_top_k_similar(nouns_vectors, ftmodel.get_sentence_vector(word.lower()), k=10)]
    for word in tqdm(df_test['word'])
]

100%|██████████████████████████████████████████████████████████████████████████████| 2287/2287 [00:23<00:00, 97.72it/s]


In [37]:
df_train = pd.DataFrame(data={'word': [x[0].upper() for x in df_test['wn_top10']]})
# df_train = pd.DataFrame(data={'word': []})
df_train.shape

(2287, 1)

In [38]:
# skip self
df_train['wn_top10'] = [
    [nouns_list[x] for x in get_top_k_similar(nouns_vectors, ftmodel.get_sentence_vector(word.lower()), k=11) if nouns_list[x] != word.lower()]
    for word in tqdm(df_train['word'])
]

100%|██████████████████████████████████████████████████████████████████████████████| 2287/2287 [00:23<00:00, 98.03it/s]


In [39]:
for df in [
    df_test,
    df_train
]:
    df['wikt_top10'] = [
        [ltitle_list[x] for x in get_top_k_similar(ltitle_vectors, ftmodel.get_sentence_vector(word.lower()), k=10)]
        for word in tqdm(df['word'])
    ]

100%|██████████████████████████████████████████████████████████████████████████████| 2287/2287 [12:48<00:00,  2.91it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2287/2287 [12:42<00:00,  2.96it/s]


In [40]:
def clean_markup(text):
    return text.replace("[[", "").replace("]]", "").replace("{{aslinks|", "")

def parse_item(text):
    items = []
    if text.startswith("# ") and len(line) > 2:
        items.extend([
            clean_markup(x).replace("?", "").replace(";", "").replace("'", "").strip() 
            for x in re.split(',|;', text[2:]) if x not in {'-', '?', '—', ''}
        ])
    return items

def parse_translation(trans):
    res = {}
    for line in trans.split('\n'):
        if line.startswith('|'):
            l, r = line.split('=')
            res[l[1:]] = r.replace('[[', '').replace(']]', '')
    return res

def parse_wiktionary(text):
    res = {'hypernym': [], 'synonym': [], 'meaning': []}
    h1 = ""
    texts = []
    for line in text.split("\n"):
        if line.startswith("= ") and line.endswith(" ="):
            h1 = line
        if h1 == '= {{-ru-}} =':
            texts.append(line)
    text = "\n".join(texts)
    for par in text.split("\n\n"):
        for h, f in [('==== Гиперонимы ====', 'hypernym'), ('==== Синонимы ====', 'synonym')]:
            if h in par:
                res[f] = [w for line in par.split("\n") for w in parse_item(line)]
        for h, f in [('==== Значение ====', 'meaning')]:
            if h in par:
                res[f] = [clean_markup(line[2:]) for line in par.split("\n") if line.startswith('# ') and len(line) > 2]
        if '=== Перевод ===' in par:
            res['translation'] = par.replace('=== Перевод ===\n', '')
    return res

In [41]:
for df in [df_test, df_train]:
    df['wikt_hypernyms_text'] = [
        parse_wiktionary(ltitle2doc[word.lower()]['text'])['hypernym'] if word.lower() in ltitle2doc else []
        for word in tqdm(df['word'])
    ]

100%|█████████████████████████████████████████████████████████████████████████████| 2287/2287 [00:03<00:00, 654.72it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 2287/2287 [00:02<00:00, 958.64it/s]


In [42]:
for df in [df_test, df_train]:
    df['wikt_top1_hypernyms_text'] = [
        parse_wiktionary(ltitle2doc[words[0].lower()]['text'])['hypernym']
        for words in tqdm(df['wikt_top10'])
    ]

100%|████████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 8369.05it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 6948.86it/s]


In [43]:
sum([len(x) for x in df_test['wikt_top1_hypernyms_text']]), sum([len(x) for x in df_train['wikt_top1_hypernyms_text']])

(2164, 2697)

In [46]:
for df in [df_test, df_train]:
    res = []
    for words in tqdm(df['wikt_top10']):
        res_el = []
        for doc in ltitle2docs[words[0].lower()]:
            res_el.extend(parse_wiktionary(doc['text'])['hypernym'])
        res.append(res_el)
    df['wikt_top1_hypernyms_text_docs'] = res

100%|███████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 10372.48it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 8525.86it/s]


In [47]:
sum([len(x) for x in df_test['wikt_top1_hypernyms_text_docs']]), sum([len(x) for x in df_train['wikt_top1_hypernyms_text_docs']])

(2254, 2828)

In [48]:
for df in [df_test, df_train]:
    for i in range(10):
        df['wn_top%d_hypernyms' % (i + 1)] = [
            [hyp for synset_id in nouns[words[i]] for hyp in ruwordnet.get_hypernyms_by_id(synset_id)]
            for words in tqdm(df['wn_top10'])
        ]

100%|████████████████████████████████████████████████████████████████████████████| 2287/2287 [00:01<00:00, 2171.36it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 14794.32it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 15920.82it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 17377.02it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 17776.00it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 17377.33it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 17500.02it/s]
100%|███████████████████████████████████████████████████████████████████████████| 2287/2287 [00:00<00:00, 17236.91it/s]
100%|███████████████████████████████████

In [49]:
import os
from collections import Counter
import pymorphy2
from nltk.tokenize import word_tokenize

In [50]:
morph = pymorphy2.MorphAnalyzer()

In [51]:
for df in [df_test, df_train]:
    cnt = 0
    lens = []
    wikt_top1_hypernyms = []
    for word, hypernyms_text in zip(df['word'], df['wikt_top1_hypernyms_text_docs']):
        res = []
        for hypernym in hypernyms_text:
            lhypernym = hypernym.lower().replace('ё', 'е')
            if lhypernym in nouns:
                res.extend(sorted(nouns[lhypernym]))
            else:
                parsed = morph.parse(lhypernym)
                if 'plur' in parsed[0].tag and parsed[0].normal_form in nouns:
                    res.extend(sorted(nouns[parsed[0].normal_form]))
                cnt += 1
        lens.append(len(res))
        wikt_top1_hypernyms.append(res)
    df['wikt_top1_hypernyms_docs'] = wikt_top1_hypernyms
    print(cnt)

206
325


In [52]:
df_test.head(6)

Unnamed: 0,word,private,wikt_in,ft_in,wn_top10,wikt_top10,wikt_hypernyms_text,wikt_top1_hypernyms_text,wikt_top1_hypernyms_text_docs,wn_top1_hypernyms,wn_top2_hypernyms,wn_top3_hypernyms,wn_top4_hypernyms,wn_top5_hypernyms,wn_top6_hypernyms,wn_top7_hypernyms,wn_top8_hypernyms,wn_top9_hypernyms,wn_top10_hypernyms,wikt_top1_hypernyms_docs
0,АБДОМИНОПЛАСТИКА,0,1,1,"[ринопластика, лапароскопия, подтяжка, хирурги...","[абдоминопластика, липосакция, блефаропластика...",[],[],[],[144594-N],[1063-N],"[118172-N, 129710-N, 144594-N]",[1047-N],[1064-N],[1047-N],"[108870-N, 110911-N]",[7529-N],"[118698-N, 7529-N]",[5390-N],[]
1,АБСОРБЕНТ,0,1,1,"[коагулянт, реагент, растворитель, теплоизолят...","[абсорбент, адсорбент, сорбент, абсорбирующий,...",[сорбент],[сорбент],[сорбент],[113920-N],[147758-N],[3675-N],[2409-N],"[130864-N, 820-N]",[56-N],"[142-N, 146017-N, 8432-N]","[820-N, 106613-N]",[121668-N],"[133238-N, 56-N, 8998-N]",[]
2,АВАЛЬ,0,1,1,"[акцепт, вексель, аккредитив, тратта, переводн...","[аваль, авалист, авал, акцепт, индоссамент, ве...","[гарантия, поручительство]","[гарантия, поручительство]","[гарантия, поручительство]","[1333-N, 107138-N]","[145804-N, 145983-N, 1792-N, 6424-N]","[1764-N, 457-N, 6424-N]",[1337-N],[1337-N],"[1278-N, 142951-N]","[121692-N, 566-N, 138373-N]","[116488-N, 1333-N, 151047-N]",[1333-N],[138167-N],"[147548-N, 150684-N, 9063-N]"
3,АВТАРКИЯ,0,1,1,"[самоизоляция, самодостаточность, суверенизаци...","[автаркия, автаркиям, автаркию, автаркиях, авт...",[],[],[],[113853-N],[119136-N],[111489-N],"[113853-N, 1208-N]","[136242-N, 3706-N, 7984-N]",[923-N],"[119563-N, 107258-N]",[125652-N],"[106825-N, 151436-N]","[4895-N, 107374-N, 125550-N]",[]
4,АГНОСТИК,0,1,1,"[атеист, материалист, протестант, идеалист, фу...","[агностик, атеист, агностицизм, теист, верующи...",[],[],[],[107524-N],[122232-N],[4544-N],"[126711-N, 107524-N, 122232-N]","[127024-N, 4474-N]",[4474-N],[107524-N],[4474-N],[107524-N],[2149-N],[]
5,АДЖИКА,0,1,1,"[приправа, алыча, кинза, пряная приправа, папр...","[аджика, ткемали, лечо, приправа, чихиртма, ба...",[приправа],[приправа],[приправа],[368-N],"[107956-N, 144253-N, 144322-N, 354-N]","[124049-N, 124081-N, 153883-N]","[107911-N, 370-N, 4681-N]","[107871-N, 348-N, 6878-N]","[350-N, 4789-N, 107911-N, 370-N]","[107641-N, 370-N]",[113199-N],"[107778-N, 106934-N, 106934-N]","[109109-N, 111436-N, 5731-N]",[107911-N]


In [53]:
df_train.head(6)

Unnamed: 0,word,wn_top10,wikt_top10,wikt_hypernyms_text,wikt_top1_hypernyms_text,wikt_top1_hypernyms_text_docs,wn_top1_hypernyms,wn_top2_hypernyms,wn_top3_hypernyms,wn_top4_hypernyms,wn_top5_hypernyms,wn_top6_hypernyms,wn_top7_hypernyms,wn_top8_hypernyms,wn_top9_hypernyms,wn_top10_hypernyms,wikt_top1_hypernyms_docs
0,РИНОПЛАСТИКА,"[хирургия, подтяжка, пластическая хирургия, пл...","[ринопластика, отопластика, блефаропластика, а...",[пластика],[пластика],[пластика],[1047-N],"[118172-N, 129710-N, 144594-N]",[1064-N],[5390-N],"[118698-N, 7529-N]","[113873-N, 125596-N]",[7529-N],[1063-N],"[123844-N, 5390-N]",[1063-N],"[113873-N, 118951-N]"
1,КОАГУЛЯНТ,"[реагент, коллоид, растворитель, раствор, конц...","[коагулянт, флокулянт, коагулятор, минерализат...","[химикат, лекарство, медикамент]","[химикат, лекарство, медикамент]","[химикат, лекарство, медикамент]",[147758-N],[115242-N],[3675-N],"[61-N, 109704-N, 3675-N, 111752-N]","[106507-N, 820-N, 154715-N, 3393-N]",[820-N],"[109704-N, 3675-N]","[131273-N, 135863-N, 461-N]","[109873-N, 2445-N, 56-N]",[56-N],"[6232-N, 1067-N, 1067-N]"
2,АКЦЕПТ,"[акцепт оферты, акцепт плательщика, акцепт без...","[акцепт, акцептант, акцептовать, индоссамент, ...","[принятие, подтверждение, подпись]","[принятие, подтверждение, подпись]","[принятие, подтверждение, подпись]",[107138-N],[1333-N],[107138-N],"[1764-N, 457-N, 6424-N]","[145804-N, 145983-N, 1792-N, 6424-N]",[118698-N],[1337-N],"[150944-N, 1554-N]",[112280-N],"[124851-N, 130027-N, 1333-N]","[124852-N, 134035-N, 146793-N, 106870-N, 13746..."
3,САМОИЗОЛЯЦИЯ,"[маргинализация, изоляция, милитаризация, конф...","[самоизоляция, самоизоляцию, самоизоляции, мар...",[],[],[],[139989-N],"[2409-N, 107417-N, 106595-N, 107325-N]","[1527-N, 833-N]",[107213-N],[118839-N],"[113853-N, 1208-N]","[137916-N, 143193-N]",[106633-N],[111489-N],[151843-N],[]
4,АТЕИСТ,"[сектант, христианин, безбожник, протестант, к...","[атеист, теист, верующий, сектант, христианин,...",[человек],[человек],[человек],[4474-N],[4474-N],[107524-N],[4544-N],[4544-N],[122232-N],[107524-N],[107524-N],[4544-N],[4474-N],[2149-N]
5,ПРИПРАВА,"[пряная приправа, специя, пряность, паприка, к...","[приправа, специя, пряность, аджика, специи, п...","[пищевая добавка, добавление, добавка]","[пищевая добавка, добавление, добавка]","[пищевая добавка, добавление, добавка]","[107911-N, 370-N, 4681-N]","[107911-N, 370-N, 4681-N, 368-N]","[107911-N, 370-N, 4681-N, 119371-N, 119844-N]","[107871-N, 348-N, 6878-N]","[124049-N, 124081-N, 153883-N]","[107778-N, 106934-N, 106934-N]","[350-N, 4789-N, 107911-N, 370-N]",[106509-N],[107911-N],[107911-N],"[111983-N, 118759-N, 130864-N, 138267-N, 13086..."


In [54]:
import codecs

def save_to_file(words_with_hypernyms, output_path, ruwordnet):
    with codecs.open(output_path, 'w', encoding='utf-8') as f:
        for word, hypernyms in words_with_hypernyms.items():
            for hypernym in hypernyms:
                f.write(f"{word}\t{hypernym}\t{ruwordnet.get_name_by_id(hypernym)}\n")

In [55]:
def get_top_hypernyms(l, sz=10):
    res_set = set()
    res = []
    for el in sorted(l):
        if el[1] not in res_set:
            res.append(el[1])
        res_set.add(el[1])
    return res[:sz]

In [56]:
from itertools import chain

In [57]:
# in df_train some items added to hypernyms multiple times
features = {word: {} for word in chain(df_test['word'], df_train['word'])}
hypernyms = {word: [] for word in chain(df_test['word'], df_train['word'])}

syn_priority_l1 = 4.
syn_priority_l2 = 2.
syn_priority_l3 = 3.
syntail_priority_l1 = 7.
syntail_priority_l2 = 5.
syntail_priority_l3 = 6.
wikhyp_priority2_l1 = 0.
wikhyp_priority2_l2 = 1.
wikhyp_priority3_l1 = 5.
wikhyp_priority3_l2 = 6.

for df in [df_test, df_train]:
    for word, hs in zip(df['word'], df['wikt_top1_hypernyms_docs']):
        for j, hypernym in enumerate(hs):
            features[word].setdefault(hypernym, {})['wikhyp_priority_l1'] = 1
            features[word].setdefault(hypernym, {})['wikhyp_priority_l1_pos'] = j
            for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                features[word].setdefault(hyphyp, {})['wikhyp_priority_l2'] = 1
                features[word].setdefault(hyphyp, {})['wikhyp_priority_l2_pos'] = j
        for j, hypernym in enumerate(hs[:2]):
            hypernyms[word].append((wikhyp_priority2_l1 + j*1e-3, hypernym))
            for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                hypernyms[word].append((wikhyp_priority2_l2 + j*1e-3, hyphyp))
        for j, hypernym in enumerate(hs[2:]):
            hypernyms[word].append((wikhyp_priority3_l1 + j*1e-3, hypernym))
            for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                hypernyms[word].append((wikhyp_priority3_l2 + j*1e-3, hyphyp))

    for i in range(2, 11):
        for word, hs in zip(df['word'], df['wn_top%d_hypernyms' % i]):
            for j, hypernym in enumerate(hs):
                features[word].setdefault(hypernym, {})['syn%d_priority_l2'%i] = 1
                features[word].setdefault(hypernym, {})['syn%d_priority_l2_pos'%i] = j
                hypernyms[word].append((syntail_priority_l2 + (i-2)*1e-3, hypernym))
                for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                    features[word].setdefault(hyphyp, {})['syn%d_priority_l3'%i] = 1
                    features[word].setdefault(hyphyp, {})['syn%d_priority_l3_pos'%i] = j
                    hypernyms[word].append((syntail_priority_l3 + (i-2)*1e-3, hyphyp))

    for word, hs in zip(df['word'], df['wn_top1_hypernyms']):
        for j, hypernym in enumerate(hs):
            hypernyms[word].append((syn_priority_l2, hypernym))
            features[word].setdefault(hypernym, {})['syn1_priority_l2'] = 1
            features[word].setdefault(hypernym, {})['syn1_priority_l2_pos'] = j
            for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                features[word].setdefault(hyphyp, {})['syn1_priority_l3'] = 1
                features[word].setdefault(hyphyp, {})['syn1_priority_l3_pos'] = j
                hypernyms[word].append((syn_priority_l3, hyphyp))

    for word, words in zip(df['word'], df['wn_top10']):
        for synset_id in nouns[words[0]]:
            hypernyms[word].append((syn_priority_l1, synset_id))
            features[word].setdefault(synset_id, {})['syn1_priority_l1'] = 1
            features[word].setdefault(synset_id, {})['syn1_priority_l1_pos'] = 0
        for i, word2 in enumerate(words[1:]):
            for j, synset_id in enumerate(nouns[word2]):
                hypernyms[word].append((syntail_priority_l1 + i*1e-3, synset_id))
                features[word].setdefault(synset_id, {})['syn%d_priority_l1'%(i+2)] = 1
                features[word].setdefault(synset_id, {})['syn%d_priority_l1_pos'%(i+2)] = j

In [60]:
hypernyms

{'АБДОМИНОПЛАСТИКА': [(5.0, '1063-N'),
  (6.0, '1062-N'),
  (6.0, '149151-N'),
  (5.001, '118172-N'),
  (6.001, '113825-N'),
  (5.001, '129710-N'),
  (6.001, '106817-N'),
  (6.001, '110474-N'),
  (5.001, '144594-N'),
  (6.001, '113873-N'),
  (5.002, '1047-N'),
  (6.002, '106562-N'),
  (5.003, '1064-N'),
  (6.003, '1047-N'),
  (5.004, '1047-N'),
  (6.004, '106562-N'),
  (5.005, '108870-N'),
  (6.005, '106640-N'),
  (6.005, '106938-N'),
  (5.005, '110911-N'),
  (6.005, '110908-N'),
  (5.006, '7529-N'),
  (6.006, '142491-N'),
  (5.007, '118698-N'),
  (6.007, '153471-N'),
  (5.007, '7529-N'),
  (6.007, '142491-N'),
  (5.008, '5390-N'),
  (6.008, '7529-N'),
  (2.0, '144594-N'),
  (3.0, '113873-N'),
  (4.0, '150611-N'),
  (7.0, '9808-N'),
  (7.001, '119152-N'),
  (7.001, '154733-N'),
  (7.002, '1064-N'),
  (7.003, '10123-N'),
  (7.004, '138981-N'),
  (7.005, '130808-N'),
  (7.006, '5390-N'),
  (7.007, '135045-N'),
  (7.007, '5390-N'),
  (7.008, '113873-N')],
 'АБСОРБЕНТ': [(5.0, '147758-N'),

In [62]:
from nltk.corpus import wordnet as wn
try:
    wn.all_synsets
except LookupError as e:
    import nltk
    nltk.download('wordnet')

In [63]:
def drop_trailing_dot(s):
    if s.endswith('.'):
        return s[:-1]
    return s

In [65]:
ru2en = {}
with open('dataset/ru.txt', 'r') as f_ru, open('dataset/en_ya.txt', 'r') as f_en_y:
    for i, r, ey in zip(range(100500), f_ru, f_en_y):
        r = drop_trailing_dot(r.strip())
        ey = drop_trailing_dot(ey.strip())
        ru2en[r] = ey
len(ru2en)

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/ru.txt'

In [51]:
en2ru = {}
with open('data/hyp_en.txt', 'r') as f_en, open('data/hyp_ru_ya.txt', 'r') as f_ru_y:
    for i, e, ruy, in zip(range(100500), f_en, f_ru_y):
        e = drop_trailing_dot(e.strip())
        ruy = drop_trailing_dot(ruy.strip())
        en2ru[e] = ruy
len(en2ru)

8481

In [52]:
missing = set()

hypernyms_en = {}
hypernyms_en_txt = {}
for df in [df_test, df_train]:
    cnt = 0
    for word in df["word"]:
        hypernyms_en[word] = set()
        hypernyms_en_txt[word] = set()
        lword = word.lower()
        if lword in ru2en:
            synsets = wn.synsets(ru2en[lword])
            if synsets:
                flag = False
                for sense in synsets:
                    for hyp in sense.hypernyms():
                        for name in hyp.lemma_names():
                            name = name.replace('_', ' ')
                            if name in en2ru:
                                if en2ru[name].lower() in nouns:
                                    flag = True
                                    hypernyms_en_txt[word].add(en2ru[name].lower())
                                    for id_ in nouns[en2ru[name].lower()]:
                                        hypernyms[word].append((0.0, id_))
                                        hypernyms_en[word].add((0.0, id_))
                            else:
                                missing.add(name)
            if hypernyms_en[word]:
                cnt += 1
    print(cnt / (len(df["word"])+1e-6))

0.5736773062642426
0.6974202008319107


In [61]:
for word in public_test[:2] + private_test[:2]:
    print(word, hypernyms_en_txt[word], hypernyms_en[word])

NameError: name 'hypernyms_en_txt' is not defined

In [54]:
for word in hypernyms_en:
    for score, hypernym in hypernyms_en[word]:
        features[word].setdefault(hypernym, {})['wordnet_en_l1'] = 1
        for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
            features[word].setdefault(hyphyp, {})['wordnet_en_l2'] = 1

In [55]:
norm_words = {}
def normalize(s):
    res = []
    for word in word_tokenize(s.lower()):
        if word in norm_words:
            res.append(norm_words[word])
        else:
            mp = morph.parse(word)
            if mp:
                norm_words[word] = mp[0].normal_form
                res.append(norm_words[word])
    return " ".join(res)

In [56]:
def innertext(tag):
    return (tag.text or '') + ''.join(innertext(e) for e in tag) + (tag.tail or '')

def get_serp_texts(xml_path, k=5):
    root = ET.parse(xml_path).getroot()
    res = []
    for e in root.find('response').find('results').find('grouping').findall('group')[:k]:
        res.append(innertext(e.find('doc').find('title')))
        if e.find('doc').find('passages'):
            for passage in e.find('doc').find('passages'):
                res.append(innertext(passage))
        if e.find('doc').find('headline'):
            res.append(innertext(e.find('doc').find('headline')))
    return " ".join(res)

In [57]:
synset_norm_serp_ya_cnt = Counter()
synset_norm_serp_g_cnt = Counter()
hypernyms_wserp = {}
serp_priority = -4.
serp_hyp_priority = -4.
meaning_priority = -1.

for df in [
    df_test,
    df_train
]:
    for word in tqdm(df["word"]):
        word_file_path = 'data/google_it_all/' + word.lower() + '.tsv'
        total_g_serp = ""
        if os.path.exists(word_file_path):
            with open(word_file_path, 'r') as f:
                for line in f:
                    text = line.split('\t')[1]
                    total_g_serp += text + " "
        norm_total_g_serp = normalize(total_g_serp)
        word_file_path = 'data/yandex_it_all/' + word.upper() + '.xml'
        total_ya_serp = get_serp_texts(word_file_path, k=10)
        norm_total_ya_serp = normalize(total_ya_serp)

        total_meaning = ""
        if word.lower() in ltitle2doc:
            for meaning in parse_wiktionary(ltitle2doc[word.lower()]['text'])['meaning']:
                total_meaning += meaning + " "
        norm_total_meaning = normalize(total_meaning)

        res = []
        for score, hypernym in hypernyms[word]:
            hypernym_texts = synset2words[hypernym] + [ruwordnet.get_name_by_id(hypernym)]
            for hypernym_text in hypernym_texts:
                norm_hypernym_text = normalize(hypernym_text)
                if norm_hypernym_text in norm_total_g_serp:
                    score += serp_priority
                    features[word][hypernym]['serp_g_norm'] = 1
                    for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                        if hyphyp in features[word]:
                            features[word][hyphyp]['serp_g_norm_l2'] = 1
                    synset_norm_serp_g_cnt[hypernym] += 1
                if hypernym_text in total_g_serp:
                    features[word][hypernym]['serp_g'] = 1

                if norm_hypernym_text in norm_total_ya_serp:
                    features[word][hypernym]['serp_ya_norm'] = 1
                    synset_norm_serp_ya_cnt[hypernym] += 1
                    for hyphyp in ruwordnet.get_hypernyms_by_id(hypernym):
                        if hyphyp in features[word]:
                            features[word][hyphyp]['serp_ya_norm_l2'] = 1
                if hypernym_text in total_ya_serp:
                    features[word][hypernym]['serp_ya'] = 1

                if norm_hypernym_text in norm_total_meaning:
                    score += meaning_priority
                    features[word][hypernym]['meaning_norm'] = 1
                if hypernym_text in total_meaning:
                    features[word][hypernym]['meaning'] = 1
            res.append((score, hypernym))
        hypernyms_wserp[word] = res

100%|██████████| 2287/2287 [01:25<00:00, 26.72it/s]
100%|██████████| 2287/2287 [01:57<00:00, 19.40it/s]


In [58]:
# feature_names = set()
# for word in features:
#     for synset_id in features[word]:
#         for key in features[word][synset_id]:
#             feature_names.add(key)
# feature_names = sorted(feature_names)
# len(feature_names)
feature_names = [
    "meaning", 
    "meaning_norm", 
    "serp_g", 
    "serp_g_norm", 
    "serp_g_norm_l2", 
    "serp_ya", 
    "serp_ya_norm", 
    "serp_ya_norm_l2", 
    "syn1_priority_l1", 
    "syn1_priority_l2", 
    "syn1_priority_l3", 
    "syn2_priority_l1", 
    "syn2_priority_l2", 
    "syn2_priority_l3", 
    "syn3_priority_l1", 
    "syn3_priority_l2", 
    "syn3_priority_l3", 
    "syn4_priority_l1", 
    "syn4_priority_l2", 
    "syn4_priority_l3", 
    "syn5_priority_l1", 
    "syn5_priority_l2", 
    "syn5_priority_l3", 
    "syn6_priority_l1", 
    "syn6_priority_l2", 
    "syn6_priority_l3", 
    "syn7_priority_l1", 
    "syn7_priority_l2", 
    "syn7_priority_l3", 
    "wikhyp_priority_l1", 
    "wikhyp_priority_l2", 
    "wordnet_en_l1", 
    "wordnet_en_l2", 
]

In [59]:
total = sum([len(features[x]) for x in df_train["word"]])
X = np.zeros( (total, len(feature_names)) )
y = np.zeros( total )
X.shape, y.shape

((96760, 33), (96760,))

In [60]:
pos = 0
for word in df_train["word"]:
    lword = word.lower()
    true_hypernyms = set()
    for synset_id in nouns[lword]:
        true_hypernyms.update(ruwordnet.get_hypernyms_by_id(synset_id))
    for synset_id in features[word]:
        y[pos] = 1 if synset_id in true_hypernyms else 0
        X[pos] = [features[word][synset_id].get(fn, 0) for fn in feature_names]
        pos += 1
pos

96760

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train.shape, y_test.shape

((77408,), (19352,))

In [63]:
for C in [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1.0, 3, 10, 30, 100, 300, 1000]:
    model = LogisticRegression(C=C)
    model.fit(X_train, y_train)
    print(C, roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

0.001 0.9301172877030781
0.003 0.9352008403698161
0.01 0.9377833242037193
0.03 0.939217898273835
0.1 0.9393266186137239
0.3 0.9392825292592988
1.0 0.939319186885421
3 0.9392707553976058
10 0.9392747635207354
30 0.9392499632588712
100 0.9392508817870885
300 0.9392502137665668
1000 0.939250297269132


In [64]:
model = LogisticRegression(C=0.03)
model.fit(X, y)

LogisticRegression(C=0.03, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
weights = {k:v for k,v in zip(feature_names, model.coef_[0])}

In [66]:
pos_weights = {}
for i in range(1, 11):
    for j in range(1,4):
        pos_weights['syn%d_priority_l%d_pos' % (i, j)] = -0.0001

def calc_score(d):
    score = 0.
    for feature, weight in weights.items():
        score += weight * d.get(feature, 0)
    for feature, weight in pos_weights.items():
        score += weight * d.get(feature, 0)
    return score

In [67]:
prefix = 'subm/subm106'
save_to_file({
    k: get_top_hypernyms( [(-calc_score(features[k][x]), x) for x in features[k]] )
    for k in public_test
}, prefix + '_public.tsv', ruwordnet)
save_to_file({
    k: get_top_hypernyms( [(-calc_score(features[k][x]), x) for x in features[k]] )
    for k in private_test
}, prefix + '_private.tsv', ruwordnet)

In [68]:
weights = {
    "meaning": 0.15,
    "meaning_norm": 0.15,
    "serp_g": 0.20,
    "serp_g_norm": 0.50,
    "serp_g_norm_l2": 0.20,
    "serp_ya": 0.15,
    "serp_ya_norm": 0.20,
    "serp_ya_norm_l2": 0.20,
    
    "syn1_priority_l1": 0.05,
    "syn1_priority_l2": 0.50,
    "syn1_priority_l3": 0.30,
    
    "syn2_priority_l1": 0.05,
    "syn2_priority_l2": 0.25,
    "syn2_priority_l3": 0.15,
    
    "syn3_priority_l1": 0.05,
    "syn3_priority_l2": 0.25,
    "syn3_priority_l3": 0.15,
    
    "syn4_priority_l1": 0.00,
    "syn4_priority_l2": 0.10,
    "syn4_priority_l3": 0.15,
    
    "syn5_priority_l1": 0.00,
    "syn5_priority_l2": 0.10,
    "syn5_priority_l3": 0.00,
    
    "syn6_priority_l1": 0.00,
    "syn6_priority_l2": 0.10,
    "syn6_priority_l3": 0.00,
    
    "syn7_priority_l1": 0.00,
    "syn7_priority_l2": 0.10,
    "syn7_priority_l3": 0.00,
    
    "wikhyp_priority_l1": 0.20,
    "wikhyp_priority_l2": 0.50,
    
    "wordnet_en_l1": 0.05,
    "wordnet_en_l2": 0.20,
}

In [69]:
prefix = 'subm/subm107'
save_to_file({
    k: get_top_hypernyms( [(-calc_score(features[k][x]), x) for x in features[k]] )
    for k in public_test
}, prefix + '_public.tsv', ruwordnet)
save_to_file({
    k: get_top_hypernyms( [(-calc_score(features[k][x]), x) for x in features[k]] )
    for k in private_test
}, prefix + '_private.tsv', ruwordnet)