In [1]:
import xml.etree.ElementTree as ET
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from pymorphy2 import MorphAnalyzer
from tqdm import tqdm_notebook
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
from time import time
import numpy as np
import sparsesvd

Parsing XML file

In [None]:
def parsing(filename):
    pages = ET.parse(filename).getroot()
    texts = [page.text for page in pages]
    morph = MorphAnalyzer()
    
    def process_text(text):
        # Removing punctuation, numbers and processing to lower case
        text = re.sub('[^а-яА-Я\s]', '', text).lower()
        words = text.split()
        # Removing stop words
        std_stopwords = stopwords.words('russian')
        expansion = [
        u'я', u'а', u'да', u'но', u'тебе', u'мне', u'ты', u'и', u'у', u'на', u'ща', u'ага',
        u'так', u'там', u'какие', u'который', u'какая', u'туда', u'давай', u'короче', u'кажется', u'вообще',
        u'ну', u'не', u'чет', u'неа', u'свои', u'наше', u'хотя', u'такое', u'например', u'кароч', u'как-то',
        u'нам', u'хм', u'всем', u'нет', u'да', u'оно', u'своем', u'про', u'вы', u'м', u'тд',
        u'вся', u'кто-то', u'что-то', u'вам', u'это', u'эта', u'эти', u'этот', u'прям', u'либо', u'как', u'мы',
        u'просто', u'блин', u'очень', u'самые', u'твоем', u'ваша', u'кстати', u'вроде', u'типа', u'пока', u'ок']
        std_stopwords.extend(expansion)
        
        stop_words = set(std_stopwords)
        
        words = [str(morph.parse(word)[0]) for word in words if not word in std_stopwords]
        return ' '.join(words)
    
    return [process_text(text) for text in tqdm_notebook(texts)]

file = 'wikipedia_2000_dump.xml'
texts = parsing(file)
print(texts[0])

Creating bag of words

In [3]:
start = time()
vectorizer = CountVectorizer(analyzer='word', min_df=5)

features = vectorizer.fit_transform(texts)
end = time()
print('elapsed time: {:.4f} s'.format(end - start))
print(features.shape)

elapsed time: 34.2149 s
(2000, 76116)


In [4]:
sp_matrix = csc_matrix(features, dtype=np.float32)
# _, s, _ = svds(sp_matrix, k=50)
_, s, _ = sparsesvd.sparsesvd(sp_matrix, sp_matrix.shape[1])
p = s / np.sum(s)
print(np.exp(-1 * np.sum([p_k * np.log(p_k) for p_k in p])))

(2000, 76116)
66.2432679606
