In [9]:
import sys
# Make the taxonomy-enrichment baseline code importable from this notebook.
# Order matters for import resolution, so the two entries keep their order.
sys.path.extend([
    'taxonomy-enrichment/baselines/ruwordnet',
    'taxonomy-enrichment/baselines',
])

In [10]:
import json
import re
import xml.etree.ElementTree as ET

import fasttext
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

from ruwordnet.ruwordnet_reader import RuWordnet

In [11]:
# Open the RuWordNet database file; `ruwordnet_path` is left unset (None).
ruwordnet = RuWordnet(
    ruwordnet_path=None,
    db_path="dataset/ruwordnet.db",
)

In [23]:
# Read dataset/public/nouns_public_no_labels.tsv into `public_test`,
# one list entry per line with trailing whitespace removed.
# NOTE: `line` stays bound at notebook scope after this loop finishes.
public_test = []
with open('dataset/public/nouns_public_no_labels.tsv', 'r', encoding="utf-8") as f:
    for line in f:
        line = line.rstrip()
        public_test.append(line)

In [24]:
# Read dataset/private/nouns_private_no_labels.tsv into `private_test`,
# one list entry per line with trailing whitespace removed.
# NOTE: `line` stays bound at notebook scope after this loop finishes.
private_test = []
with open('dataset/private/nouns_private_no_labels.tsv', 'r', encoding="utf-8") as f:
    for line in f:
        line = line.rstrip()
        private_test.append(line)

In [12]:
# Path of the Wiktionary XML dump that is streamed with ET.iterparse below.
wiktionarydump = "dataset/ruwiktionary-20200120-pages-articles-multistream.xml"

In [13]:
# Maps a page title to the dict of fields collected for that page.
title2doc = {}

In [14]:
# Stream the XML dump and, for every closed <page>, store the text of the
# elements listed in `fields` in `title2doc`, keyed by the page title.
fields = {
    "timestamp": "timestamp",
    "title": "title",
    "text": "text",
    # NOTE(review): this key contains a space, so it presumably never equals
    # a parsed tag name — confirm whether redirects were meant to be captured.
    "redirect title": "redirect_title",
}
doc = {}
cnt = 0
for _, elem in tqdm(ET.iterparse(wiktionarydump, events=("end",))):
    # Tags may carry a "{namespace}" prefix; keep only the part after "}".
    prefix, has_namespace, postfix = elem.tag.partition('}')
    tag = postfix or prefix
    if tag in fields:
        doc[fields[tag]] = elem.text
    if tag != "page":
        continue
    # A page element just closed: drop its subtree, then file the record
    # collected so far and start a fresh one for the next page.
    elem.clear()
    cnt += 1
    title2doc[doc["title"]] = doc
    doc = {}

35866269it [02:06, 283402.44it/s]


In [21]:
# Pre-compiled patterns used by the clean_markup* helpers to strip Wiktionary
# template markup. Every pattern is a raw string so that regex escapes such as
# \{ , \| , \w and \s are not parsed as (invalid) Python string escapes.
SPECIAL = re.compile(r"\{\{=\|")                # literal "{{=|"
SPECIAL2 = re.compile(r"\{\{пример\|?.*?\}\}")  # "{{пример ... }}" template
SPECIALS = re.compile(r"\{\{.+?\|ru\}\}")       # "{{ ... |ru}}" template
SPECIALS2 = re.compile(r"\b\w+?\|(?=\w+? )")    # "word|" directly before "word "
SPECIALS4 = re.compile(r"\|.+?}}")              # "| ... }}" template tail
SPECIALS3 = re.compile(r"[^\w\s]")              # any char that is neither \w nor \s

def clean_markup(text):
    """Remove wiki link brackets and the ``{{aslinks|`` opener from ``text``.

    The result is whitespace-stripped first and only then cleared of
    non-breaking spaces (``\\xa0``) and soft hyphens (``\\xad``).
    """
    for token in ("[[", "]]", "{{aslinks|"):
        text = text.replace(token, "")
    stripped = text.strip()
    for invisible in ("\xa0", "\xad"):
        stripped = stripped.replace(invisible, "")
    return stripped


def clean_markup1(text):
    """Strip wiki markup, templates and punctuation from ``text``.

    Steps, in order: drop ``[[``, ``]]`` and ``{{aslinks|``; apply the
    module-level patterns SPECIAL, SPECIAL2, SPECIALS and SPECIALS2; expand
    ``{{хим-элем`` to "химический элемент"; apply SPECIALS4 and SPECIALS3;
    finally strip whitespace and remove ``\\xa0`` / ``\\xad`` characters.

    Returns the cleaned string.
    """
    text = text.replace("[[", "").replace("]]", "").replace("{{aslinks|", "")
    text = SPECIAL.sub("", text)
    text = SPECIAL2.sub("", text)
    text = SPECIALS.sub("", text)
    text = SPECIALS2.sub("", text)
    # Raw string: "\{" is a regex escape, not a valid Python string escape.
    text = re.sub(r"\{\{хим-элем", "химический элемент", text)
    text = SPECIALS4.sub("", text)
    text = SPECIALS3.sub("", text)
    return text.strip().replace("\xa0", "").replace("\xad", "")

def clean_markup2(text):
    """Strip wiki markup, templates and punctuation from ``text``.

    Runs the same steps as ``clean_markup1`` and additionally removes any
    ``}}`` left in the result: drop ``[[``, ``]]`` and ``{{aslinks|``; apply
    SPECIAL, SPECIAL2, SPECIALS and SPECIALS2; expand ``{{хим-элем`` to
    "химический элемент"; apply SPECIALS4 and SPECIALS3; strip whitespace and
    remove ``\\xa0`` / ``\\xad`` / ``}}``.

    Returns the cleaned string.
    """
    text = text.replace("[[", "").replace("]]", "").replace("{{aslinks|", "")
    text = SPECIAL.sub("", text)
    text = SPECIAL2.sub("", text)
    text = SPECIALS.sub("", text)
    text = SPECIALS2.sub("", text)
    # Raw string: "\{" is a regex escape, not a valid Python string escape.
    text = re.sub(r"\{\{хим-элем", "химический элемент", text)
    text = SPECIALS4.sub("", text)
    text = SPECIALS3.sub("", text)
    return text.strip().replace("\xa0", "").replace("\xad", "").replace("}}", "")


def parse_item(text):
    """Extract the individual entries from one ``# ``-prefixed wiki list line.

    Args:
        text: a single line of wikitext.

    Returns:
        A list of cleaned entries. The part after ``"# "`` is split on ``,``
        and ``;``; each piece is passed through ``clean_markup`` and has
        ``?``, ``;`` and ``'`` removed. Placeholder pieces (``-``, ``?``,
        ``—`` or empty, ignoring surrounding whitespace) and pieces that are
        empty after cleaning are dropped. Lines that do not start with
        ``"# "`` or carry nothing after it yield ``[]``.
    """
    # The length check must look at the argument itself; it previously read
    # a notebook-global `line` left over from an unrelated loop.
    if not text.startswith("# ") or len(text) <= 2:
        return []

    placeholders = {'-', '?', '—', ''}
    items = []
    for chunk in re.split(',|;', text[2:]):
        # Compare the stripped piece so that " -" or " ?" are also skipped.
        if chunk.strip() in placeholders:
            continue
        cleaned = clean_markup(chunk).replace("?", "").replace(";", "").replace("'", "").strip()
        if cleaned:
            items.append(cleaned)
    return items

def parse_wiktionary(text, cl):
    """Pull hypernyms, synonyms and meanings out of a page's Russian section.

    Args:
        text: full wikitext of one page.
        cl: callable applied to each meaning line (without its ``"# "``
            prefix) to clean it.

    Returns:
        A dict with the keys ``'hypernym'``, ``'synonym'`` and ``'meaning'``,
        each holding a list. Only lines under the ``= {{-ru-}} =`` heading
        are considered. If a section header occurs in several paragraphs the
        last occurrence wins. Translation sections are not extracted.
    """
    list_sections = (
        ('==== Гиперонимы ====', 'hypernym'),
        ('==== Синонимы ====', 'synonym'),
    )
    meaning_header = '==== Значение ===='
    parsed = {'hypernym': [], 'synonym': [], 'meaning': []}

    # Keep only the lines that sit under the Russian top-level heading.
    current_h1 = ""
    russian_rows = []
    for row in text.split("\n"):
        if row.startswith("= ") and row.endswith(" ="):
            current_h1 = row
        if current_h1 == '= {{-ru-}} =':
            russian_rows.append(row)

    # Paragraphs are separated by blank lines; a paragraph belongs to a
    # section when it contains that section's header.
    for paragraph in "\n".join(russian_rows).split("\n\n"):
        rows = paragraph.split("\n")
        for header, key in list_sections:
            if header not in paragraph:
                continue
            words = []
            for row in rows:
                words.extend(parse_item(row))
            parsed[key] = words
        if meaning_header in paragraph:
            parsed['meaning'] = [
                cl(row[2:]) for row in rows
                if row.startswith('# ') and len(row) > 2
            ]
    return parsed

In [24]:
# Write one JSON object per line to wiki_ru.jsonlines for every page in
# `title2doc` that has wikitext: the word, its synonyms and hypernyms, and its
# lower-cased, non-empty meanings.
with open("wiki_ru.jsonlines", 'w', encoding='utf-8', newline="\n") as w:
    for word, page in title2doc.items():
        text = page.get('text')
        if text is None:
            # Page without wikitext (or without a text field): nothing to parse.
            continue
        parsed_data = parse_wiktionary(text, clean_markup1)
        meanings = [meaning.lower() for meaning in parsed_data['meaning'] if meaning]
        record = {
            "word": word,
            "synonyms": parsed_data['synonym'],
            "hypernyms": parsed_data['hypernym'],
            "meanings": meanings,
        }
        w.write(json.dumps(record))
        w.write("\n")

In [243]:
# Write one JSON object per line to wiki_private_ru.jsonlines for every
# private-test word (lower-cased) that has a page with wikitext in `title2doc`.
with open("wiki_private_ru.jsonlines", 'w', encoding='utf-8', newline="\n") as w:
    for word in private_test:
        word = word.lower()
        page = title2doc.get(word)
        text = page.get('text') if page is not None else None
        if text is None:
            # Unknown word, or a page whose text is missing: parse_wiktionary
            # would fail on None, so skip it.
            continue
        parsed_data = parse_wiktionary(text, clean_markup1)
        meanings = [meaning.lower() for meaning in parsed_data['meaning'] if meaning]
        record = {
            "word": word,
            "synonyms": parsed_data['synonym'],
            "hypernyms": parsed_data['hypernym'],
            "meanings": meanings,
        }
        w.write(json.dumps(record))
        w.write("\n")

In [5]:
# Convert output.json (one JSON object per line) into wiki_en.jsonlines.
# Only entries that have a 'senses' key are kept; for each one the glosses and
# hypernym words of all senses are flattened, hypernyms containing ">" are
# dropped, and the "Thesaurus:" prefix is removed from synonym words.
COMP = re.compile(r">.+?>")  # NOTE(review): not referenced in this cell — confirm it is still needed
# Both files are opened with an explicit UTF-8 encoding so the result does not
# depend on the platform's default locale.
with open("output.json", 'r', encoding='utf-8') as f, \
        open("wiki_en.jsonlines", 'w', encoding='utf-8', newline="\n") as w:
    for line in f:
        data = json.loads(line)
        if 'senses' not in data:
            continue
        senses = data['senses']
        meanings = [gloss for sense in senses for gloss in sense.get('glosses', [])]
        hypernyms = [
            hyper['word']
            for sense in senses
            for hyper in sense.get('hypernyms', [])
            if ">" not in hyper['word']
        ]
        synonyms = [syn['word'].replace("Thesaurus:", "") for syn in data.get('synonyms', [])]
        word = data['word']
        w.write(json.dumps({"word": word, "synonyms": synonyms, "hypernyms": hypernyms, "meanings": meanings}))
        w.write("\n")

In [4]:
from wiktextract import WiktionaryConfig, parse_wiktionary
import json

In [177]:
# NOTE(review): despite the `en` prefix, this path points at the ruwiktionary
# dump (the same file as `wiktionarydump`) — confirm this is intended.
enwiktionarydump = "dataset/ruwiktionary-20200120-pages-articles-multistream.xml"

In [178]:
def word_cb(data):
    """Write one extracted entry to the open `fh` handle as a single JSON line."""
    fh.write(json.dumps(data))
    fh.write("\n")


config = WiktionaryConfig(
             capture_languages=["Russian"],
             capture_translations=False,
             capture_pronunciation=False,
             capture_linkages=True,
             capture_compounds=False,
             capture_redirects=False)

# `parse_wiktionary` here is the function imported from wiktextract, which
# shadows the notebook's own helper of the same name once that import has run.
# The `with` block guarantees output_ru.json is closed even when the run is
# interrupted or parsing raises; `fh` is bound at notebook scope so `word_cb`
# can write to it.
with open("output_ru.json", "w", encoding="utf-8") as fh:
    ctx = parse_wiktionary(enwiktionarydump, config, word_cb)

ё IN None/None: DEBUG: unrenderable template: unrecognized: {{з|I}}
ё IN None/None: DEBUG: unrenderable template: unrecognized: {{з|II}}
ё IN None/None: DEBUG: unrenderable template: unrecognized: {{з|III}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(существительное)}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(союз)}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(частица)}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(междометие)}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|I}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|II}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(числительное)}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(существительное)}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(наречие)}}
а IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(прилагательное)}}
а IN None/None: DEBU

retail IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I (наречие)}}
retail IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I (глагол)}}
retail IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
top IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
top IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
top IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
top IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(глагол)}}
top IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(прилагательное)}}
top IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(наречие)}}
top IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
top IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
head IN None/None: DEBUG: unrenderable template: unrecognized: {{

recall IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
recall IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
wall IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(прилагательное)}}
wall IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
wall IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(глагол)}}
carpet IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
carpet IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
garner IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
garner IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(глагол)}}
рожа IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
рожа IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
рожа IN None/None: DEBUG: unrenderable template: unreco

salt IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
salt IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
snake IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I (существительное)}}
snake IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II (глагол)}}
ship IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
ship IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(глагол)}}
это IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(частица)}}
это IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(существительное)}}
это IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(местоимение)}}
он IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(местоимение)}}
он IN None/None: DEBUG: unrenderable template: unrecognized: {{з|(существительное)}}
он IN None/None: DEBUG: unrenderable template: unrecognized:

quadrupede IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(прилагательное)}}
quadrupede IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
quadro IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
quadro IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(прилагательное)}}
load IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
load IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(глагол)}}
pesca IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
pesca IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
riccio IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
riccio IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
lot IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
lot IN 

vanilla IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I (прилагательное)}}
essential IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(прилагательное)}}
essential IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
paper IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
paper IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
paper IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|III}}
бумага IN None/None: DEBUG: unrenderable template: unrecognized: {{з|I}}
бумага IN None/None: DEBUG: unrenderable template: unrecognized: {{з|II}}
mention IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(существительное)}}
mention IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|(глагол)}}
pet IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I (существительное)}}
pet IN None/None: DEBUG: unre

sky IN None/None: DEBUG: unrenderable template: unrecognized: {{з|I}}
sky IN None/None: DEBUG: unrenderable template: unrecognized: {{з|II}}
sky IN None/None: DEBUG: unrenderable template: unrecognized: {{з|III}}
slave IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
slave IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
slave IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|III}}
sleep IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
sleep IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
sleeve IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
sleeve IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
mean IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|I}}
mean IN None/None: DEBUG: unrenderable template: unrecognized: {{заголовок|II}}
mean IN None/None: DEBUG: unrenderable template: unrecognized

KeyboardInterrupt: 