In [41]:
import re
import itertools
from collections import Counter
from collections import OrderedDict
from pattern.web import Wikipedia, plaintext
from nltk.collocations import *
from nltk import regexp_tokenize

class WikiParser:
    def __init__(self):
        pass
    
    def text_cleaning(self, article):
        stop_words = ['on', 'in', 'at', 'near', 'over', 'under', 'between', 'to', 'from', 'into', 'out', 'of', 'off', 'with', 'since', 'by', 'as', 'for', 'on', 'a', 'the', 'an']
        text = article.plaintext()
        text = text.lower()
        tokens = regexp_tokenize(text, r'[\w]*', gaps=False)
        for token in tokens:
            if token in stop_words:
                tokens.remove(token)
        draft = ' '.join(tokens)
        remove_numbers = re.sub('[\d]{1,4}', '', draft)
        final = re.sub('[\s]{1,25}', ' ', remove_numbers)
        return final
    
    def get_articles(self, start):
        arr = []
        stop_words = ['on', 'in', 'at', 'near', 'over', 'under', 'between', 'to', 'from', 'into', 'out', 'of', 'off', 'with', 'since', 'by', 'as', 'for', 'on', 'a', 'the', 'an']
        wiki = Wikipedia(language="en")
        article = wiki.article(start)
        arr.append(self.text_cleaning(article))
        for title in article.links:
            article = wiki.article(title)
            arr.append(self.text_cleaning(article))
        list_of_strings = ' '.join(arr)
        return list_of_strings
            
            
       # for i in range(depth):
        #    for title in article.links:
         #       print(set(self.get_articles(title, depth-1)))
        
        

In [42]:
class TextStatistics:
    def __init__(self, articles):
        self.articles = articles
        
    def get_top_3grams(self, n):
        list_of_3grams_in_descending_order_by_freq = list()
        list_of_their_corresponding_freq = list()
        finder = TrigramCollocationFinder.from_words(self.articles.split())
        freqs = finder.ngram_fd
        dictionary = {}
        for key, value in list(freqs.items())[:n]:
            dictionary[key] = value
        list_of_3grams_in_descending_order_by_freq = OrderedDict(sorted(dictionary.items(), key=lambda t: t[1], reverse=True))
        return list_of_3grams_in_descending_order_by_freq
            
    def get_top_words(self, n):
        corpus = self.articles.split()
        list_of_words_in_descending_order_by_freq = Counter(corpus)
        return list_of_words_in_descending_order_by_freq.most_common(n)

In [52]:
class Experiment(TextStatistics, WikiParser):
    def __init__(self):
        self.articles = 'Natural language processing'
    def show_result(self):
        corpus = self.get_articles(self.articles)
        freq = TextStatistics(corpus)
        trigram = freq.get_top_3grams(20)
        word_top = freq.get_top_words(20)
        #По статье "Natural language processing" (только по ней) считает топ-5 3-грамм и топ-5 слов.
        stop_words = ['on', 'in', 'at', 'near', 'over', 'under', 'between', 'to', 'from', 'into', 'out', 'of', 'off', 'with', 'since', 'by', 'as', 'for', 'on', 'a', 'the', 'an']
        wiki = Wikipedia(language="en")
        article = wiki.article(self.articles)
        NLP_freq = TextStatistics(self.text_cleaning(article))
        NLP_trigram = NLP_freq.get_top_3grams(5)
        NLP_word_top = NLP_freq.get_top_words(5)
        output = 'Топ-20 3-грамм по корпусу текстов: \n' + str(trigram) + '\nТоп-20 слов по корпусу текстов: \n' + str(word_top)
        NLP_output = 'Топ-5 3-грамм по статье NLP: \n' + str(NLP_trigram) + '\nТоп-5 слов по статье NLP: \n' + str(NLP_word_top)
        return output + '\n' + NLP_output
        
        

## Результаты метода show_result
### Топ-20 3грамм по корпусу текстов: 

OrderedDict([(('natural', 'language', 'processing'), 285), (('this', 'article', 'is'), 23), (('article', 'is', 'about'), 17), (('language', 'processing', 'nlp'), 15), (('is', 'about', 'language'), 1), (('about', 'language', 'processing'), 1), (('language', 'processing', 'computers'), 1), (('processing', 'computers', 'processing'), 1), (('computers', 'processing', 'language'), 1), (('processing', 'language', 'human'), 1), (('language', 'human', 'brain'), 1), (('human', 'brain', 'see'), 1), (('brain', 'see', 'language'), 1), (('see', 'language', 'processing'), 1), (('language', 'processing', 'brain'), 1), (('processing', 'brain', 'natural'), 1), (('brain', 'natural', 'language'), 1), (('processing', 'nlp', 'is'), 1), (('nlp', 'is', 'field'), 1), (('is', 'field', 'computer'), 1)])

### Топ-20 слов по корпусу текстов: 

[('and', 16194), ('is', 8585), ('that', 4836), ('are', 4284), ('language', 3797), ('or', 3764), ('s', 3389), ('be', 3384), ('it', 2720), ('this', 2418), ('which', 2174), ('can', 1973), ('not', 1929), ('was', 1769), ('speech', 1755), ('such', 1733), ('retrieved', 1719), ('english', 1712), ('i', 1709), ('have', 1650)]


### Топ-5 3-грамм по статье NLP: 

OrderedDict([(('this', 'article', 'is'), 1), (('article', 'is', 'about'), 1), (('is', 'about', 'language'), 1), (('about', 'language', 'processing'), 1), (('language', 'processing', 'computers'), 1)])


### Топ-5 слов по статье NLP: 

[('and', 70), ('language', 59), ('is', 48), ('natural', 35), ('such', 30)]