In [1]:
import artm
import jsonlines
import operator
import codecs
from collections import Counter
from tqdm import tqdm
from scipy.sparse import save_npz
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import logging
import numpy as np
import json

import os
os.environ['ARTM_SHARED_LIBRARY']='C:/BigARTM/bin/artm.dll'

In [4]:
texts = {item['id']: item['text'] for item in jsonlines.open("sources/normalized_texts.jl", 'r')}
#texts.sort(key=operator.itemgetter(1))

# Перобразовать в wabbit формат, если ещё не

In [17]:
with codecs.open("wabbit", "w", encoding='utf-8') as output:
    for text in tqdm(texts):
        cnt = Counter(texts[text].split())
        line = '\'' + text + ' |@words ' + ' '.join([item + (':' + str(cnt[item]) if cnt[item] != 1 else '') for item in cnt])
        line += '\n'
        output.write(line)

texts.clear()

100%|██████████████████████████████████████████████████████████████████████████| 96794/96794 [00:21<00:00, 4504.95it/s]


# Преобразовать в UCI формат, если ещё не

In [6]:
class BagOfWordsModel(object):
    OUT_FOLDER = 'out'

    def __init__(self, id_document_dict, max_features=None, max_df=1.0):
        """Builds bow model.
        Args:
            id_document_dict: ids of documents and theirs contents in format
                "{id: 'text', ...}"
            max_features: If not None, build a vocabulary that only consider the top
                max_features ordered by term frequency across the corpus.
                This parameter is ignored if vocabulary is not None.
            max_df: When building the vocabulary ignore terms that have a
                document frequency strictly higher than the given threshold
                (corpus-specific stop words). If float, the parameter
                represents a proportion of documents, integer absolute counts.
                This parameter is ignored if vocabulary is not None.
        """
        self.logger = logging.getLogger(__name__)
        self.logger.info(
            "Building bag-of-words model with max_features={0}, max_df={1}".format(
                max_features, max_df))
        self.logger.info("Size of data set: " + str(len(id_document_dict)))

        if len(id_document_dict) != 0:
            self.logger.info("Building pandas dataframe")
            df = pd.DataFrame.from_dict(data=id_document_dict, orient='index')
            self.logger.info("Built pandas dataframe")
            ids = df.index
            self.index2id = dict(enumerate(ids))
            self.id2index = {v: k for k, v in self.index2id.items()}
            documents_corpus = df[0].values  # 1-dim np.array.
            # documents_corpus = documents_corpus.astype(unicode)
            del df
            if max_features is None:
                self.logger.info(
                    "Training CountVectorizer with all {0} features".format(
                        len(ids)))
            else:
                self.logger.info(
                    "Training CountVectorizer with max {0} features".format(
                        max_features))
            vectorizer = CountVectorizer(max_features=max_features,
                                         max_df=max_df,
                                         stop_words='english').fit(
                documents_corpus)
            self.logger.info("Trained vectorizer with {0} features".format(
                len(vectorizer.get_feature_names())))
            self.logger.info("Building bag-of-words model")
            bow = vectorizer.transform(documents_corpus)
            self.logger.info("Done")

            self.url_ids = ids
            self.bow_sparse_matrix = bow
            self.feature_names = vectorizer.get_feature_names()  # mapping from url_id to url
            self.vocabulary = vectorizer.vocabulary_  # mapping from url to url_id
            self.shape = self.bow_sparse_matrix.shape

    def get_index(self, doc_id):
        return self.id2index[doc_id]

    def get_doc_id(self, index):
        return self.index2id[index]

    def get_feature_id(self, feature_name):
        return self.vocabulary.get(feature_name)

    def get_feature_name(self, feature_id):
        return self.feature_names[feature_id]

    def toarray(self):
        return self.bow_sparse_matrix.toarray()

    def to_uci(self, model_name='bow', save_folder=OUT_FOLDER):
        import os.path
        import codecs
        if self.bow_sparse_matrix is None:
            self.logger.error("Model is None.")
            return
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        filenames = model_name
        docword_name = os.path.join(save_folder,
                                    'docword.' + filenames + '.txt')
        vocab_name = os.path.join(save_folder, 'vocab.' + filenames + '.txt')
        with codecs.open(docword_name, 'w', encoding='utf-8') as docword_f, \
                codecs.open(vocab_name, 'w', encoding='utf-8') as vocab_f:
            urls_count = self.shape[0]
            words_count = self.shape[1]
            # Fill vocab_f file
            self.logger.info("Start filling {0}".format(vocab_name))
            print("Start filling {0}".format(vocab_name))
            for i in range(words_count):
                vocab_f.write(self.get_feature_name(i) + '\n')
            self.logger.info("Done.")
            print("Done")
            # Fill docword_f file
            self.logger.info("Start filling {0}".format(docword_name))
            print("Start filling {0}".format(docword_name))
            docword_f.write(str(urls_count) + '\n')
            docword_f.write(str(words_count) + '\n')
            docword_f.write(str(self.bow_sparse_matrix.nnz) + '\n')
            # nnz_position = docword_f.tell() # We fill this line later with nnz_counter.
            # nnz_counter = 0 # The number of nonzero counts in the bag-of-words.
            nnz_x, nnz_y = self.bow_sparse_matrix.nonzero()
            for x, y in tqdm(zip(nnz_x, nnz_y), total=len(nnz_x)):
                # nnz_counter += len(url_sparse_vector)
                docword_f.write(str(x + 1) + ' ' + str(y + 1) + ' ' + str(
                    self.bow_sparse_matrix[x, y]) + '\n')
            self.logger.info("Done.")
            print("Done")

In [7]:
bow = BagOfWordsModel(texts)
bow.to_uci()

Start filling out\vocab.bow.txt
Done
Start filling out\docword.bow.txt


100%|███████████████████████████████████████████████████████████████████| 19420712/19420712 [13:29<00:00, 23986.31it/s]


Done


# Применяем ARTM

In [2]:
num_topics = len([line for line in codecs.open("sources/accepted_categories.txt", "r", encoding='utf-8')])
num_topics = 1000

In [3]:
'''batch_vectorizer = artm.BatchVectorizer(data_path='out/',
                                        data_format='bow_uci',
                                        collection_name='nu_collection',
                                        target_folder='nu_collection_batches')'''

'''batch_vectorizer = artm.BatchVectorizer(data_path='wabbit',
                                        data_format='vowpal_wabbit',
                                        target_folder='my_collection_batches')'''

batch_vectorizer = artm.BatchVectorizer(data_format='batches',
                                        data_path='nu_collection_batches')

In [4]:
dictionary = artm.Dictionary()
dictionary.gather(data_path='nu_collection_batches')
#dictionary.gather(data_path='my_collection_batches')

In [5]:
model = artm.ARTM(num_topics=num_topics, dictionary=dictionary,
                  show_progress_bars=True,
                  cache_theta=True,
                  regularizers=[artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'),
                               artm.SmoothSparseThetaRegularizer(name='smooth_sparce', tau=-0.15)])

In [6]:
model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                      dictionary=dictionary))
model.scores.add(artm.SparsityPhiScore(name='sparcity_score',
                                       eps=1.0e-10))

In [7]:
model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=10)

KeyboardInterrupt: 

In [14]:
print(model.score_tracker['perplexity_score'].value)
print(model.score_tracker['sparcity_score'].value)

[459674.125, 8306.26171875, 4483.96484375, 2873.71142578125, 2374.592041015625, 2154.426025390625, 2035.2706298828125, 1961.7843017578125, 1912.0257568359375, 1876.1219482421875]
[0.2869090735912323, 0.47531643509864807, 0.7656528353691101, 0.8958291411399841, 0.9438644647598267, 0.9632195830345154, 0.9725461006164551, 0.9777270555496216, 0.9809162616729736, 0.9830437302589417]


In [61]:
theta = model.get_theta_sparse()

In [62]:
theta[0].transpose().shape

(96761, 1000)

In [63]:
model.save("sources/plsa_model2")

In [64]:
save_npz("sources/theta2", theta[0].transpose())

In [15]:
theta_df = model.get_theta().transpose()

In [58]:
theta_dict = {}
for index, row in theta_df.iterrows():
    theta_dict[bow.get_doc_id(index - 1)] = list(row)

In [60]:
with open("sources/plsa_res.json", "w") as output:
    output.write(json.dumps(theta_dict))

In [66]:
with open("sources/plsa_get_doc_id", "w") as output:
    output.write(json.dumps(bow.index2id))

In [46]:
#Это те, которые plsa пропустил

#len(bow.index2id.keys())-len(theta_df.index)
set(set(bow.index2id.keys())).difference(theta_df.index)

{0, 3252, 8365}