In [52]:
!pip install pymorphy2



In [53]:
from collections import Counter, namedtuple
import pymorphy2
import numpy as np

In [54]:
# Part-of-speech codes (pymorphy2 / OpenCorpora grammemes) that are kept when
# collecting statistics; tokens tagged with any other POS are discarded.
USEFUL_POS = [
    'NOUN',  # noun
    'ADJF',  # adjective, full form
    'ADJS',  # adjective, short form
    'COMP',  # comparative
    'VERB',  # verb, finite form
    'INFN',  # verb, infinitive
    'PRTF',  # participle, full form
    'PRTS',  # participle, short form
    'GRND',  # gerund
    'ADVB',  # adverb
    'NPRO',  # pronoun
    'PRED',  # predicative
]

# Accumulator filled by process_text():
#   vocabulary          -- Counter of kept word forms
#   lemma_vocabulary    -- Counter of the corresponding normal forms
#   words_counts        -- one entry per text: number of kept words
#   unique_words_counts -- one entry per text: number of distinct kept words
Stats = namedtuple(
    "Stats",
    ["vocabulary", "lemma_vocabulary", "words_counts", "unique_words_counts"],
)

In [55]:
def collect_statistics(data, max_words):
    """Accumulate vocabulary and length statistics over a collection of texts.

    Args:
        data: iterable of texts; each item is handed to ``process_text``.
        max_words: maximum number of leading tokens of each text to consider.

    Returns:
        A ``Stats`` tuple whose counters and lists were filled by
        ``process_text`` for every item of ``data`` (in iteration order).
    """
    # One analyzer instance is created per call and shared by all texts.
    morph = pymorphy2.MorphAnalyzer()
    collected = Stats(
        vocabulary=Counter(),
        lemma_vocabulary=Counter(),
        words_counts=[],
        unique_words_counts=[],
    )
    for document in data:
        process_text(document, collected, max_words, morph)
    return collected


def process_text(text, text_stats, max_words, analyzer):
    """Tokenize one text and add its useful words to ``text_stats`` in place.

    Args:
        text: the text to tokenize with ``razdel.tokenize``.
        text_stats: ``Stats``-like object; its ``vocabulary`` and
            ``lemma_vocabulary`` counters are updated and one entry is
            appended to ``words_counts`` and ``unique_words_counts``.
        max_words: only the first ``max_words`` tokens are analysed; the cut
            happens before the part-of-speech filter.
        analyzer: object with a ``parse(word)`` method (a pymorphy2
            ``MorphAnalyzer`` in this notebook).

    Returns:
        None.
    """
    tokens = [token.text for token in razdel.tokenize(text)]
    tokens = tokens[:max_words]

    kept_forms = []
    kept_lemmas = []
    for token in tokens:
        # Only the first parse returned by the analyzer is used.
        parse = analyzer.parse(token)[0]
        if parse.tag.POS in USEFUL_POS:
            kept_forms.append(parse.word)
            kept_lemmas.append(parse.normal_form)

    text_stats.vocabulary.update(kept_forms)
    text_stats.lemma_vocabulary.update(kept_lemmas)
    text_stats.words_counts.append(len(kept_forms))
    text_stats.unique_words_counts.append(len(set(kept_forms)))


def draw_stat_hists(data, bins=10, label=None):
    """Show side-by-side histograms of word counts and unique-word counts.

    Args:
        data: object exposing ``words_counts`` and ``unique_words_counts``
            (e.g. a ``Stats`` tuple).
        bins: passed to ``Axes.hist`` as the bins argument.
        label: optional name of the unit being counted; when given, the
            panel titles get the suffix " in single <label>".

    Returns:
        None; the figure is displayed with ``plt.show()``.
    """
    panels = (
        ("Words", data.words_counts),
        ("Unique words", data.unique_words_counts),
    )
    suffix = "" if label is None else " in single " + label

    fig, axes = plt.subplots(1, 2, figsize=(18, 5))
    # Left panel: all kept words; right panel: distinct kept words.
    for axis, (heading, counts) in zip(axes, panels):
        axis.hist(counts, bins)
        axis.set_title(heading + suffix)
        axis.set_xlabel("Num of words")
        axis.set_ylabel("Examples")
    plt.show()

In [None]:
# Number of rows to analyse (fewer are used if the dataset is smaller).
examples = 30000
# Fixed seed so the sample, and every statistic derived from it, is reproducible.
sample_rng = np.random.default_rng(42)
# Draw distinct row positions: sampling with replacement would pick some rows
# more than once and double-count their words in the statistics.
indices = sample_rng.choice(
    len(cleaned_dataset),
    size=min(examples, len(cleaned_dataset)),
    replace=False,
)
# Positional selection on the underlying arrays keeps texts and headers aligned.
text_data = cleaned_dataset['Text'].values[indices]
header_data = cleaned_dataset['Head_title'].values[indices]

In [None]:
%%time
# Same token cap for article bodies and headers.
STATS_MAX_WORDS = 3000
text_stats = collect_statistics(text_data, max_words=STATS_MAX_WORDS)
header_stats = collect_statistics(header_data, max_words=STATS_MAX_WORDS)

In [None]:
# Report the size of each vocabulary (distinct word forms / distinct lemmas).
for _caption, _counter in (
    ("Texts vocabulary size: ", text_stats.vocabulary),
    ("Texts lemma vocabulary size: ", text_stats.lemma_vocabulary),
    ("Headers vocabulary size: ", header_stats.vocabulary),
    ("Headers lemma vocabulary size: ", header_stats.lemma_vocabulary),
):
    print(_caption, len(_counter))

# Lemmas that occur both in headers and in texts.
_shared_lemmas = header_stats.lemma_vocabulary.keys() & text_stats.lemma_vocabulary.keys()
print("Common lemmas headers vs texts: ", len(_shared_lemmas))

In [None]:
# The 30 most frequent lemmas in the sampled texts, as (lemma, count) pairs.
text_stats.lemma_vocabulary.most_common(30)

In [None]:
# The 30 most frequent lemmas in the sampled headers, as (lemma, count) pairs.
header_stats.lemma_vocabulary.most_common(30)

In [None]:
# Distribution of kept-word and unique-word counts per text, using 50 bins.
draw_stat_hists(text_stats, 50, label='text')

In [None]:
# Distribution of kept-word and unique-word counts per header, using 10 bins.
draw_stat_hists(header_stats, 10, label='header')