In [1]:
! pip install ruwordnet
! ruwordnet download

Collecting ruwordnet
  Downloading ruwordnet-0.0.3.tar.gz (13 kB)
Building wheels for collected packages: ruwordnet
  Building wheel for ruwordnet (setup.py) ... [?25l[?25hdone
  Created wheel for ruwordnet: filename=ruwordnet-0.0.3-py3-none-any.whl size=11621 sha256=758abb6d8cb969fd01b0e3893298cb3239c6124499716342708be8e93ee53e8d
  Stored in directory: /root/.cache/pip/wheels/44/82/4b/016af6cfab5855b87d3015eee1650f5560ee9fab885c7ef6b3
Successfully built ruwordnet
Installing collected packages: ruwordnet
Successfully installed ruwordnet-0.0.3
downloading a ruwordnet model from https://github.com/avidale/python-ruwordnet/releases/download/0.0.2/ruwordnet.db


In [2]:
! pip install wiktionaryparser

Collecting wiktionaryparser
  Downloading wiktionaryparser-0.0.97-py3-none-any.whl (19 kB)
Installing collected packages: wiktionaryparser
Successfully installed wiktionaryparser-0.0.97


In [3]:
import re

In [4]:
from tqdm.auto import tqdm
# Register tqdm's pandas integration; `progress_apply` is called on Series later in this notebook.
tqdm.pandas()

In [5]:
import gzip

WordNet

In [6]:
from ruwordnet import RuWordNet
import pandas as pd

# Shared RuWordNet handle; queried later through `wn.get_senses(...)`.
wn = RuWordNet()

Wiktionary

In [7]:
from wiktionaryparser import WiktionaryParser

# Parser instance reused by later cells; lookups default to the 'Russian' language section.
wp = WiktionaryParser()
wp.set_default_language('Russian')
# Smoke test: fetch a single word and print every entry returned for it.
entries = wp.fetch('лук')
for entry in entries:
    print(entry)

{'etymology': 'Inherited from Proto-Slavic *lukъ, borrowed from Proto-Germanic *laukaz (compare German Lauch, English leek, Old Norse laukr whence Danish løg). Cognates include Polabian lāuk, Bulgarian лук (luk) and Slovene and Serbo-Croatian luk.\n', 'definitions': [{'partOfSpeech': 'noun', 'text': ['лук • (luk)\xa0m\xa0inan (genitive лу́ка, nominative plural лу́ки, genitive plural лу́ков, related adjective лу́ковый)', '(usually uncountable, collectively) onion, onions'], 'relatedWords': [], 'examples': ['зелёный лук ― zeljónyj luk ― scallion', 'лук-поре́й ― luk-poréj ― leek']}], 'pronunciations': {'text': ['IPA: [ɫuk]', 'Homophone: луг (lug)'], 'audio': ['//upload.wikimedia.org/wikipedia/commons/4/49/Ru-%D0%BB%D1%83%D0%BA.ogg']}}
{'etymology': 'Inherited from Proto-Slavic *lǫkъ. Cognate with Lithuanian lankas.\n', 'definitions': [{'partOfSpeech': 'noun', 'text': ['лук • (luk)\xa0m\xa0inan (genitive лу́ка, nominative plural лу́ки, genitive plural лу́ков)', 'bow (weapon used for shooti

Частотный словарь Ляшевской и Шаровой

In [8]:
! unzip Freq2011.zip

Archive:  Freq2011.zip
  inflating: freqrnc2011.csv         
  inflating: freqrnc_readme.txt      


In [9]:
# Load the tab-separated frequency list and keep only the rows whose PoS tag is 's'.
freqrnc = (
    pd.read_csv('freqrnc2011.csv', sep='\t')
    .loc[lambda frame: frame['PoS'] == 's']
)

НКРЯ

In [10]:
def get_lemmas_count_rnc(filepath, encoding=None):
    """Count ``Cat == 'S'`` rows per lemma in a gzipped, semicolon-separated table.

    The first line of the file is used as the header and only the first 11
    fields of every line are kept.

    Lemmas are lower-cased *before* grouping so that case variants of one
    word (e.g. "Лук" / "лук") are merged into a single row. Lower-casing
    after the aggregation would leave duplicate ``Lemma`` values, which
    multiply rows in any later merge on ``Lemma``.

    Parameters
    ----------
    filepath : str or path-like
        Path to the ``.gz`` file.
    encoding : str, optional
        Text encoding forwarded to ``gzip.open``. ``None`` (the default)
        keeps the platform default encoding.

    Returns
    -------
    pandas.DataFrame
        Columns ``Lemma`` and ``rnc_num_senses``, sorted by count in
        descending order. Lemmas containing an ASCII lowercase letter or a
        digit are excluded.

    Raises
    ------
    IndexError
        If the file contains no lines (there is no header row to read).
    """
    with gzip.open(filepath, mode='rt', encoding=encoding) as f:
        # Iterate the handle directly instead of materialising every raw line first.
        table = [tuple(line.strip().split(';')[:11]) for line in f]

    df = pd.DataFrame(table[1:], columns=list(table[0]))

    # Only the two columns needed for counting are kept; .copy() makes the
    # selection safe to modify.
    nouns = df.loc[df['Cat'] == 'S', ['Lemma', 'Cat']].copy()
    nouns['Lemma'] = nouns['Lemma'].str.lower()

    counts = (
        nouns.groupby('Lemma')[['Cat']]
        .count()
        .sort_values('Cat', ascending=False)
        .reset_index()
        .rename(columns={'Cat': 'rnc_num_senses'})
    )

    # Drop lemmas written (partly) in Latin letters or containing digits.
    has_latin_or_digit = counts['Lemma'].str.contains('[a-z0-9]', regex=True)
    lemmas_data = counts[~has_latin_or_digit]

    return lemmas_data

In [12]:
# Per-lemma counts of Cat == 'S' rows, read from the gzipped table.
lemmas = get_lemmas_count_rnc('semantic.csv.gz')

In [13]:
# Combine corpus frequency with the RNC sense counts; the inner join keeps
# only lemmas present in both tables.
freq_and_count = freqrnc.merge(lemmas, on='Lemma', how='inner')

In [14]:
# 200 most frequent nouns (sorted ascending by Freq(ipm), so the most frequent come last).
# .copy() detaches the selection from the sorted frame, so columns can be added
# to it later without pandas raising SettingWithCopyWarning.
most_freq_lemmas = freq_and_count.sort_values('Freq(ipm)').tail(200).copy()

In [15]:
def fetch_with_catch_error(x):
    """Return how many entries ``wp.fetch`` yields for the word ``x``.

    If the lookup raises ``AttributeError``, the word and the error are
    printed and ``None`` is returned, so one failed lookup does not abort a
    bulk ``apply`` over many words.
    """
    try:
        entries = wp.fetch(x)
        n_entries = sum(1 for _ in entries)
    except AttributeError as E:
        print(x, E)
        return None
    return n_entries

def ruwordnet_wiki_comparison(data):
    """Add Wiktionary and RuWordNet sense counts for every lemma in ``data``.

    Works on a copy, so the caller's frame is left untouched. The caller
    often passes a slice of a larger frame, and assigning columns to a slice
    triggers pandas' SettingWithCopyWarning.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Lemma`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with two extra columns:
        ``wiktionary_num_senses`` (result of ``fetch_with_catch_error``,
        which may be ``None`` for failed lookups) and
        ``ruwordnet_num_senses`` (``len(wn.get_senses(lemma))``).
    """
    enriched = data.copy()
    lemmas = enriched['Lemma']

    # progress_apply is provided by tqdm.pandas() and shows one progress bar per column.
    enriched['wiktionary_num_senses'] = lemmas.progress_apply(fetch_with_catch_error)
    enriched['ruwordnet_num_senses'] = lemmas.progress_apply(
        lambda lemma: len(wn.get_senses(lemma))
    )

    return enriched

In [16]:
# Look up Wiktionary and RuWordNet sense counts for the selected lemmas (one progress bar per source).
result = ruwordnet_wiki_comparison(most_freq_lemmas)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/200 [00:00<?, ?it/s]

In [17]:
# Preview the first rows of the comparison table.
result.head()

Unnamed: 0,Lemma,PoS,Freq(ipm),R,D,Doc,rnc_num_senses,wiktionary_num_senses,ruwordnet_num_senses
8793,метод,s,197.0,100,83,4912,2,2,2
17936,тип,s,197.0,100,89,5666,6,1,1
13739,принцип,s,197.9,100,91,6654,3,1,3
17791,тема,s,198.1,100,94,6199,2,1,2
15329,рост,s,198.4,100,93,5936,6,1,4


In [18]:
# Average the three per-source sense counts row-wise and store the rounded integer as 'Mean'.
sense_count_columns = ['rnc_num_senses', 'wiktionary_num_senses', 'ruwordnet_num_senses']
row_means = result[sense_count_columns].mean(axis=1)
result['Mean'] = row_means.round().astype('int')

In [19]:
# Drop the R, D and Doc columns and renumber the rows from 0.
result = result.drop(columns=['R', 'D', 'Doc']).reset_index(drop=True)

In [20]:
# Preview the table after adding 'Mean' and dropping the unused columns.
result.head()

Unnamed: 0,Lemma,PoS,Freq(ipm),rnc_num_senses,wiktionary_num_senses,ruwordnet_num_senses,Mean
0,метод,s,197.0,2,2,2,2
1,тип,s,197.0,6,1,1,3
2,принцип,s,197.9,3,1,3,2
3,тема,s,198.1,2,1,2,2
4,рост,s,198.4,6,1,4,4


In [21]:
import random

In [22]:
# Group rows by their rounded mean sense count; `g.groups` maps each 'Mean' value to the row labels in that group.
g = result.groupby('Mean')

In [23]:
# Stratified sample of row ids: up to SAMPLE_SIZE words from each band of the
# mean sense count (Mean == 1, Mean in 2..4, everything else).
SAMPLE_SEED = 42
SAMPLE_SIZE = 20

# A dedicated, seeded RNG makes the sample reproducible on every run and
# independent of the global `random` state.
sample_rng = random.Random(SAMPLE_SEED)

first = []
second = []
third = []

for key, row_ids in g.groups.items():
    if key == 1:
        first.extend(row_ids)
    elif key in [2, 3, 4]:
        second.extend(row_ids)
    else:
        third.extend(row_ids)

word_ids = []
for pool in (first, second, third):
    # Cap the sample size at the pool size: random.sample raises ValueError
    # when asked for more items than the population holds.
    word_ids.extend(sample_rng.sample(pool, min(SAMPLE_SIZE, len(pool))))

In [24]:
# Renumber rows 0..n-1 in place so the row labels collected in `word_ids` coincide with positions for `iloc`.
# NOTE(review): the index was already reset when R/D/Doc were dropped, so this looks redundant — confirm.
result.reset_index(drop=True, inplace=True)

In [25]:
# Select the sampled rows by position and renumber them from 0.
# NOTE(review): `word_ids` holds index labels taken from `g.groups`; they match positions only while `result` has a default 0..n-1 index.
target = result.iloc[word_ids].reset_index(drop=True)

In [27]:
# Save the sampled words as a tab-separated file (the row index is written as the first column).
target.to_csv('num_senses_rnc_wiki.tsv', sep='\t')