In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import collections
from pymystem3 import Mystem
from itertools import chain
import re
from tqdm import tqdm
from scipy import optimize
from scipy.stats import chisquare

# Создадим отсортированный словарь слов в тексте

In [2]:
wordCount_news = {}
with open('ru.txt', 'r') as f:
    lines = [re.sub("[^а-я\s]", "", line.lower()) for line in f.readlines()]
lines = [re.sub("\s+", " ", line).strip() for line in lines]
stemmer = Mystem()
words = chain.from_iterable([[word for word in stemmer.lemmatize(line) if re.match('[а-я]+', word)] for line in tqdm(lines)])
for word in words:
    if word.isalpha():
        if word in wordCount_news:
            wordCount_news[word.lower()] += 1
        else:
            wordCount_news[word.lower()] = 1
sortedWordCount_news = collections.OrderedDict(reversed(sorted(wordCount_news.items(), key= lambda t : t[1])))

100%|██████████| 1997/1997 [00:03<00:00, 543.24it/s]


In [3]:
wordCount_tolstoj = {}
with open('tolstoj_lew_nikolaewich-text_0080.txt', 'r') as f:
    lines = [re.sub("[^а-я\s]", "", line.lower()) for line in f.readlines()]
lines = [re.sub("\s+", " ", line).strip() for line in lines]
stemmer = Mystem()
words = chain.from_iterable([[word for word in stemmer.lemmatize(line) if re.match('[а-я]+', word)] for line in tqdm(lines)])
for word in words:
    if word.isalpha():
        if word in wordCount_tolstoj:
            wordCount_tolstoj[word.lower()] += 1
        else:
            wordCount_tolstoj[word.lower()] = 1
sortedWordCount_tolstoj = collections.OrderedDict(reversed(sorted(wordCount_tolstoj.items(), key= lambda t : t[1])))

100%|██████████| 18130/18130 [00:19<00:00, 936.22it/s] 


In [4]:
def logL(a, words):
    N = len(words)
    c = 1.0/np.sum([(1.0/k)**a for k in range(1, N+1)])
    lnL = np.log(c) - a*np.mean(np.log([words[x] for x in words]))
    return lnL
def maximaze_loglikelihood(words):
    return optimize.minimize(lambda a: -logL(a, words), x0=[1.5]).x[0]

In [5]:
a_news = maximaze_loglikelihood(sortedWordCount_news)
c_news = 1.0/np.sum([(1.0/k)**a_news for k in range(1, len(sortedWordCount_news)+1)])
news_exp = np.array([c_news/k**a_news for k in range(1, len(sortedWordCount_news)+1)])

In [6]:
a_tolstoj = maximaze_loglikelihood(sortedWordCount_tolstoj)
c_tolstoj = 1.0/np.sum([(1.0/k)**a_tolstoj for k in range(1, len(sortedWordCount_tolstoj)+1)])
tolstoj_exp = np.array([c_tolstoj/k**a_tolstoj for k in range(1, len(sortedWordCount_tolstoj)+1)])

# Проверка гипотез черех хи-квадрат

In [7]:
tolstojs = np.array([sortedWordCount_tolstoj[key] for key in sortedWordCount_tolstoj])
tolstojs = tolstojs/np.sum(tolstojs)
chisquare(tolstojs, tolstoj_exp, ddof=1)

Power_divergenceResult(statistic=6.01074109462997, pvalue=1.0)

In [8]:
news = np.array([sortedWordCount_news[key] for key in sortedWordCount_news])
news = news/np.sum(news)
chisquare(news, news_exp, ddof=1)

Power_divergenceResult(statistic=42.89100031101945, pvalue=1.0)

### Видим, что данные гипотезе о распределении по закону Ципфа данным не противоречат 

# Проверим, можно ли утверждать, что параметры закона Ципфа для этих двух корпусов совпадают

In [9]:
chisquare(news, tolstoj_exp[:len(news)], ddof=1)

Power_divergenceResult(statistic=12.9330745781218, pvalue=1.0)

### Проверили через хи-квадрат, подставив параметр из другого распределения, видим, что данные гипотезе не противоречат