In [1]:
import sys
import numpy as np
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
# Make the project's helper modules importable. The relative path is built with path.join so the
# separator matches the host OS instead of hard-coding Windows backslashes (same result on Windows).
sys.path.insert(0, path.join('..', 'modules', 'helpers'))
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper

0.8.1


In [2]:
# Project helper objects shared by every cell below.
# `config` is built from 'config.cfg'; the notebook reads these attributes from it: models_file_name,
# experiment_path, models_archive_path, output_batches_path and dictionary_path.
config = ConfigPaths('config.cfg')
# `plot_maker` and `printer` are used inside fit_one_model (make_tm_plots and the print_* calls).
plot_maker = PlotMaker()
printer = PrintHelper()

In [3]:
# Display the path of the models summary file taken from the config.
print config.models_file_name

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\04_12_num_topics\models.txt


In [4]:
# Open the models summary file in append mode; fit_one_model passes this handle to
# printer.print_artm_model, and the last cell of the notebook closes it.
# NOTE(review): if a cell fails in between, the handle stays open until the kernel exits.
models_file = open(config.models_file_name, 'a')

In [5]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):
    """Build an ARTM model and attach the notebook's standard set of scores.

    Only the 'ngramm' class_id gets a non-zero weight (1.0); every other listed
    class_id is given weight 0.0.

    Args:
        current_dictionary: artm.Dictionary handed to the model and to its perplexity score.
        n_topics: number of topics (``num_topics``).
        n_doc_passes: value assigned to ``model.num_document_passes``.
        seed_value: ``seed`` passed to artm.ARTM.
        n_top_tokens: ``num_tokens`` of the top-tokens score.
        p_mass_threshold: ``probability_mass_threshold`` of the topic-kernel score.

    Returns:
        The configured, not yet fitted, artm.ARTM instance.
    """
    print('[{}] creating model'.format(datetime.now()))
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value,
                      class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0,
                                 'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                                 'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    # Hand over the same dictionary the model was built with, so the perplexity score
    # cannot silently pick up an unrelated notebook-level `dictionary`.
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold,
                        current_dictionary=current_dictionary)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold, current_dictionary=None):
    """Register perplexity, sparsity, topic-kernel and top-tokens scores on ``artm_model``.

    Args:
        artm_model: model whose ``scores`` collection is extended in place.
        n_top_tokens: ``num_tokens`` of the top-tokens score.
        p_mass_threshold: ``probability_mass_threshold`` of the topic-kernel score.
        current_dictionary: dictionary for the perplexity score. When omitted, the
            notebook-level ``dictionary`` is used, which keeps the old three-argument
            call form working.
    """
    print('[{}] adding scores'.format(datetime.now()))
    if current_dictionary is None:
        # Backward-compatible fallback: requires the global `dictionary` to exist already.
        current_dictionary = dictionary
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               use_unigram_document_model=False,
                                               dictionary=current_dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm',
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))

In [6]:
def process_one_model(dictionary, _n_topics, _n_doc_passes, _seed_value, _n_top_tokens, _p_mass_threshold, _n_iterations,
                     _model_name=''):
    """Create a model from the given hyper-parameters, fit it and return it.

    The first six arguments are forwarded to create_model; ``_n_iterations`` and
    ``_model_name`` are forwarded to fit_one_model.
    """
    print('[{}] processing model'.format(datetime.now()))
    creation_kwargs = dict(current_dictionary=dictionary,
                           n_topics=_n_topics,
                           n_doc_passes=_n_doc_passes,
                           seed_value=_seed_value,
                           n_top_tokens=_n_top_tokens,
                           p_mass_threshold=_p_mass_threshold)
    return fit_one_model(create_model(**creation_kwargs), _n_iterations, _model_name)
    
def fit_one_model(model, _n_iterations, _model_name=''):
    """Fit ``model`` offline and write its report, plots, scores and top tokens.

    Relies on notebook-level objects: ``batch_vectorizer`` (training data), ``printer``,
    ``plot_maker``, ``models_file`` and ``config.experiment_path``.

    Args:
        model: the model to fit (fitted in place).
        _n_iterations: passed to fit_offline as ``num_collection_passes``.
        _model_name: label handed to the printer and base name of the output files
            under ``config.experiment_path``.

    Returns:
        The same ``model`` object after fitting.
    """
    print('[{}] fitting'.format(datetime.now()))
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)

    print('[{}] outputting'.format(datetime.now()))
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)
    plot_maker.make_tm_plots(model, path.join(config.experiment_path, _model_name))

    # Scores and top tokens both go to the same '<model name>.txt' file.
    report_txt = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, report_txt)
    printer.print_top_tokens(model, report_txt)
    return model

In [7]:
def save_model(_model, _model_name):
    """Dump both Phi-like matrices of ``_model`` under ``config.models_archive_path``.

    Two files are written: ``<_model_name>_saved_p_wt`` and ``<_model_name>_saved_n_wt``.

    Args:
        _model: fitted artm.ARTM instance.
        _model_name: base name of the output files.
    """
    print('[{}] saving model'.format(datetime.now()))
    model_output_file_name = path.join(config.models_archive_path, _model_name)
    # `model_name` of ARTM.save selects WHICH matrix to store and must be 'p_wt' or 'n_wt';
    # prefixing it with the model's label (as before) names a matrix that does not exist.
    for matrix_name in ('p_wt', 'n_wt'):
        _model.save(filename=model_output_file_name + '_saved_' + matrix_name, model_name=matrix_name)

In [8]:
# batch_vectorizer = artm.BatchVectorizer(data_path=config.dataset_path,
#                                         data_format='bow_uci',
#                                         collection_name=config.collection_name,
#                                         target_folder=config.output_batches_path)
# dictionary = artm.Dictionary()
# dictionary.gather(data_path=config.output_batches_path,
#                   vocab_file_path=config.vocabulary_path)
# dictionary.save(dictionary_path=config.dictionary_path)
# dictionary.save_text(dictionary_path=config.dictionary_path + '.txt')
# dictionary.load_text(dictionary_path=config.dictionary_path + '.txt')

In [9]:
# Load the training data from batches that already exist under config.output_batches_path
# (presumably produced by the commented-out cell above - TODO confirm).
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
# Load the dictionary from '<config.dictionary_path>.dict'; it is passed to every create_model call below.
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [10]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [11]:
# Baseline without regularizers: 10 topics, 25 collection passes.
model_10 = fit_one_model(
    create_model(current_dictionary=dictionary, n_topics=10, n_doc_passes=5, seed_value=100,
                 n_top_tokens=15, p_mass_threshold=0.25),
    _n_iterations=25, _model_name='model_10')
tmp_model = None  # scratch name stays empty between experiments

[2016-12-04 23:25:11.403000] creating model
[2016-12-04 23:25:12.741000] adding scores
[2016-12-04 23:25:12.746000] fitting
[2016-12-04 23:25:25.608000] outputting
name = model_10, n_topics = 10, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [12]:
# Baseline without regularizers: 20 topics, 25 collection passes.
model_20 = fit_one_model(
    create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                 n_top_tokens=15, p_mass_threshold=0.25),
    _n_iterations=25, _model_name='model_20')
tmp_model = None  # scratch name stays empty between experiments

[2016-12-04 23:25:55.419000] creating model
[2016-12-04 23:25:56.596000] adding scores
[2016-12-04 23:25:56.601000] fitting
[2016-12-04 23:26:12.144000] outputting
name = model_20, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [13]:
# Baseline without regularizers: 50 topics, 25 collection passes.
model_50 = fit_one_model(
    create_model(current_dictionary=dictionary, n_topics=50, n_doc_passes=5, seed_value=100,
                 n_top_tokens=15, p_mass_threshold=0.25),
    _n_iterations=25, _model_name='model_50')
tmp_model = None  # scratch name stays empty between experiments

[2016-12-04 23:26:40.875000] creating model
[2016-12-04 23:26:42.083000] adding scores
[2016-12-04 23:26:42.087000] fitting
[2016-12-04 23:27:05.004000] outputting
name = model_50, n_topics = 50, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [14]:
# Baseline without regularizers: 100 topics, 25 collection passes.
model_100 = fit_one_model(
    create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                 n_top_tokens=15, p_mass_threshold=0.25),
    _n_iterations=25, _model_name='model_100')
tmp_model = None  # scratch name stays empty between experiments

[2016-12-04 23:27:28.032000] creating model
[2016-12-04 23:27:29.436000] adding scores
[2016-12-04 23:27:29.440000] fitting
[2016-12-04 23:28:05.010000] outputting
name = model_100, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [15]:
# Baseline without regularizers: 250 topics, 25 collection passes.
model_250 = fit_one_model(
    create_model(current_dictionary=dictionary, n_topics=250, n_doc_passes=5, seed_value=100,
                 n_top_tokens=15, p_mass_threshold=0.25),
    _n_iterations=25, _model_name='model_250')
tmp_model = None  # scratch name stays empty between experiments

[2016-12-04 23:28:27.618000] creating model
[2016-12-04 23:28:29.059000] adding scores
[2016-12-04 23:28:29.065000] fitting
[2016-12-04 23:29:57.543000] outputting
name = model_250, n_topics = 250, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [16]:
# Baseline without regularizers: 500 topics, 25 collection passes.
model_500 = fit_one_model(
    create_model(current_dictionary=dictionary, n_topics=500, n_doc_passes=5, seed_value=100,
                 n_top_tokens=15, p_mass_threshold=0.25),
    _n_iterations=25, _model_name='model_500')
tmp_model = None  # scratch name stays empty between experiments

[2016-12-04 23:30:18.706000] creating model
[2016-12-04 23:30:20.660000] adding scores
[2016-12-04 23:30:20.673000] fitting
[2016-12-04 23:33:19.137000] outputting
name = model_500, n_topics = 500, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [17]:
# Baseline without regularizers: 750 topics, 25 collection passes.
model_750 = fit_one_model(
    create_model(current_dictionary=dictionary, n_topics=750, n_doc_passes=5, seed_value=100,
                 n_top_tokens=15, p_mass_threshold=0.25),
    _n_iterations=25, _model_name='model_750')
tmp_model = None  # scratch name stays empty between experiments

[2016-12-04 23:33:40.724000] creating model
[2016-12-04 23:33:42.804000] adding scores
[2016-12-04 23:33:42.816000] fitting
[2016-12-04 23:38:23.825000] outputting
name = model_750, n_topics = 750, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [18]:
# Model with a small number of topics; try to sparsify it using only the decorrelator.

In [19]:
# Decorrelator-only run: 20 topics, DecorrelatorPhi on class_id 'ngramm' with tau = 10, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_reg_1')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_reg_1 = tmp_model; tmp_model = None

[2016-12-04 23:38:52.119000] creating model
[2016-12-04 23:38:53.425000] adding scores
[2016-12-04 23:38:53.430000] fitting
[2016-12-04 23:39:04.745000] outputting
name = model_20_decor_reg_1, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 10



In [20]:
# Decorrelator-only run: 20 topics, DecorrelatorPhi on class_id 'ngramm' with tau = 100, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_reg_2')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_reg_2 = tmp_model; tmp_model = None

[2016-12-04 23:39:20.369000] creating model
[2016-12-04 23:39:21.630000] adding scores
[2016-12-04 23:39:21.636000] fitting
[2016-12-04 23:39:33.695000] outputting
name = model_20_decor_reg_2, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 100



In [21]:
# model_20_decor_reg_2 differs little from model_20_decor_reg_1

In [22]:
# Decorrelator-only run: 20 topics, DecorrelatorPhi on class_id 'ngramm' with tau = 1000, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_reg_3')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_reg_3 = tmp_model; tmp_model = None

[2016-12-04 23:39:48.319000] creating model
[2016-12-04 23:39:49.482000] adding scores
[2016-12-04 23:39:49.487000] fitting
[2016-12-04 23:39:59.498000] outputting
name = model_20_decor_reg_3, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000



In [23]:
# Decorrelator-only run: 20 topics, DecorrelatorPhi on class_id 'ngramm' with tau = 1e+5, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+5
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_reg_4')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_reg_4 = tmp_model; tmp_model = None

[2016-12-04 23:40:13.729000] creating model
[2016-12-04 23:40:14.910000] adding scores
[2016-12-04 23:40:14.926000] fitting
[2016-12-04 23:40:25.096000] outputting
name = model_20_decor_reg_4, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 100000.0



In [24]:
# Decorrelator-only run: 20 topics, DecorrelatorPhi on class_id 'ngramm' with tau = 1e+7, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+7
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_reg_5')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_reg_5 = tmp_model; tmp_model = None

[2016-12-04 23:40:39.671000] creating model
[2016-12-04 23:40:40.839000] adding scores
[2016-12-04 23:40:40.844000] fitting
[2016-12-04 23:40:52.385000] outputting
name = model_20_decor_reg_5, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 10000000.0



In [25]:
# Many topics got mixed together

In [26]:
# Decorrelator-only run: 20 topics, DecorrelatorPhi on class_id 'ngramm' with tau = 1e+9, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+9
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_reg_6')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_reg_6 = tmp_model; tmp_model = None

[2016-12-04 23:41:03.027000] creating model
[2016-12-04 23:41:04.172000] adding scores
[2016-12-04 23:41:04.177000] fitting
[2016-12-04 23:41:15.299000] outputting
name = model_20_decor_reg_6, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000000000.0



In [27]:
# The model degenerated

In [28]:
# Decorrelator-only run: 20 topics, DecorrelatorPhi on class_id 'ngramm' with tau = 1e+8, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+8
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_reg_7')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_reg_7 = tmp_model; tmp_model = None

[2016-12-04 23:41:16.480000] creating model
[2016-12-04 23:41:17.782000] adding scores
[2016-12-04 23:41:17.782000] fitting
[2016-12-04 23:41:27.475000] outputting
name = model_20_decor_reg_7, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 100000000.0



In [29]:
# Bad model

In [30]:
# NOTE(review): this cell repeats the previous experiment cell exactly (tau = 1e+8, name
# 'model_20_decor_reg_7'). It rebinds model_20_decor_reg_7 and sends output to the same
# '<experiment_path>/model_20_decor_reg_7' paths and to models_file a second time.
# Looks like an accidental re-run - confirm and remove, or give it a distinct tau and name.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+8
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_reg_7')
model_20_decor_reg_7 = tmp_model; tmp_model = None

[2016-12-04 23:41:28.622000] creating model
[2016-12-04 23:41:29.648000] adding scores
[2016-12-04 23:41:29.648000] fitting
[2016-12-04 23:41:39.547000] outputting
name = model_20_decor_reg_7, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 100000000.0



In [31]:
# Decorrelator-only run: 20 topics, DecorrelatorPhi on class_id 'ngramm' with tau = 1e+6, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_reg_8')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_reg_8 = tmp_model; tmp_model = None

[2016-12-04 23:41:40.654000] creating model
[2016-12-04 23:41:41.836000] adding scores
[2016-12-04 23:41:41.836000] fitting
[2016-12-04 23:41:53.173000] outputting
name = model_20_decor_reg_8, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000000.0



In [32]:
# + sparse phi

In [33]:
# Decorrelator (tau = 1e+6) plus SmoothSparsePhi (tau = -0.5), both on class_id 'ngramm'; 20 topics, 15 passes.
# NOTE(review): despite the '_sst_' part of the model name, only Phi regularizers are added in this cell.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_1')
model_20_decor_sst_1 = tmp_model; tmp_model = None

[2016-12-04 23:42:07.662000] creating model
[2016-12-04 23:42:08.847000] adding scores
[2016-12-04 23:42:08.854000] fitting
[2016-12-04 23:42:20.794000] outputting
name = model_20_decor_sst_1, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000000.0
ss_phi_regularizer, tau = -0.5



In [34]:
# Decorrelator (tau = 1e+6) plus SmoothSparsePhi (tau = -0.7), both on class_id 'ngramm'; 20 topics, 15 passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.7
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_2')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_sst_2 = tmp_model; tmp_model = None

[2016-12-04 23:42:40.307000] creating model
[2016-12-04 23:42:41.784000] adding scores
[2016-12-04 23:42:41.800000] fitting
[2016-12-04 23:42:53.020000] outputting
name = model_20_decor_sst_2, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000000.0
ss_phi_regularizer, tau = -0.7



In [35]:
# Kernel scores improved and the kernel size shrank, but the topics are still mixed and poor

In [36]:
# Decorrelator (tau = 1e+6) plus SmoothSparsePhi (tau = -1.5), both on class_id 'ngramm'; 20 topics, 15 passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = -1.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_3')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_sst_3 = tmp_model; tmp_model = None

[2016-12-04 23:43:09] creating model
[2016-12-04 23:43:10.313000] adding scores
[2016-12-04 23:43:10.319000] fitting
[2016-12-04 23:43:24.441000] outputting
name = model_20_decor_sst_3, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000000.0
ss_phi_regularizer, tau = -1.5



In [37]:
# Topics seem somewhat better

In [38]:
# Decorrelator (tau = 1e+6) plus SmoothSparsePhi (tau = -2), both on class_id 'ngramm'; 20 topics, 15 passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = -2
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_4')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_sst_4 = tmp_model; tmp_model = None

[2016-12-04 23:43:37.683000] creating model
[2016-12-04 23:43:38.940000] adding scores
[2016-12-04 23:43:38.948000] fitting
[2016-12-04 23:43:50.977000] outputting
name = model_20_decor_sst_4, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000000.0
ss_phi_regularizer, tau = -2



In [39]:
# Topics seem fine here too

In [40]:
# Decorrelator (tau = 1e+6) plus SmoothSparsePhi (tau = -2.5), both on class_id 'ngramm'; 20 topics, 15 passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = -2.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_5')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_sst_5 = tmp_model; tmp_model = None

[2016-12-04 23:43:59.094000] creating model
[2016-12-04 23:44:00.271000] adding scores
[2016-12-04 23:44:00.278000] fitting
[2016-12-04 23:44:12.617000] outputting
name = model_20_decor_sst_5, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000000.0
ss_phi_regularizer, tau = -2.5



In [41]:
# Several topics merged into one

In [42]:
## + ss theta

In [43]:
# Decorrelator (tau = 1e+6) + SmoothSparsePhi (tau = -2) + SmoothSparseTheta (tau = -0.5); 20 topics, 15 passes.
# The two Phi regularizers are restricted to class_id 'ngramm'; the Theta regularizer gets no class_ids.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['ss_phi_regularizer'].tau = -2
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_ssp_1')
model_20_decor_sst_ssp_1 = tmp_model; tmp_model = None

[2016-12-04 23:44:19.584000] creating model
[2016-12-04 23:44:20.706000] adding scores
[2016-12-04 23:44:20.706000] fitting
[2016-12-04 23:44:31.742000] outputting
name = model_20_decor_sst_ssp_1, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 1000000.0
ss_phi_regularizer, tau = -2



In [44]:
# Topics are fine, but there are shared junk topics; sparsify theta further

In [45]:
# Decorrelator (tau = 1e+6) + SmoothSparsePhi (tau = -2) + SmoothSparseTheta (tau = -1.5); 20 topics, 15 passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['ss_phi_regularizer'].tau = -2
tmp_model.regularizers['ss_theta_regularizer'].tau = -1.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_ssp_2')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_sst_ssp_2 = tmp_model; tmp_model = None

[2016-12-04 23:44:38.448000] creating model
[2016-12-04 23:44:39.464000] adding scores
[2016-12-04 23:44:39.464000] fitting
[2016-12-04 23:44:50.696000] outputting
name = model_20_decor_sst_ssp_2, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1.5
decorrelator_phi_regularizer, tau = 1000000.0
ss_phi_regularizer, tau = -2



In [None]:
# Decorrelator (tau = 1e+7) + SmoothSparsePhi (tau = -2) + SmoothSparseTheta (tau = -1.5); 20 topics, 15 passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['ss_phi_regularizer'].tau = -2
tmp_model.regularizers['ss_theta_regularizer'].tau = -1.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+7
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_ssp_3')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_sst_ssp_3 = tmp_model; tmp_model = None

[2016-12-04 23:44:58.396000] creating model
[2016-12-04 23:44:59.659000] adding scores
[2016-12-04 23:44:59.668000] fitting
[2016-12-04 23:45:11.667000] outputting
name = model_20_decor_sst_ssp_3, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1.5
decorrelator_phi_regularizer, tau = 10000000.0
ss_phi_regularizer, tau = -2



In [None]:
# Decorrelator (tau = 1e+7) + SmoothSparsePhi (tau = -2) + SmoothSparseTheta (tau = -1.5); 20 topics, 15 passes.
# NOTE(review): every setting here equals the 'model_20_decor_sst_ssp_3' cell; only the model name differs.
# Presumably one of the tau values was meant to change - confirm before relying on this run.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['ss_phi_regularizer'].tau = -2
tmp_model.regularizers['ss_theta_regularizer'].tau = -1.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+7
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_ssp_4')
model_20_decor_sst_ssp_4 = tmp_model; tmp_model = None

[2016-12-04 23:45:14.683000] creating model
[2016-12-04 23:45:15.963000] adding scores
[2016-12-04 23:45:15.973000] fitting


In [None]:
# Got worse

In [None]:
# Decorrelator (tau = 1e+6) + SmoothSparsePhi (tau = -2) + SmoothSparseTheta (tau = -2.5); 20 topics, 15 passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['ss_phi_regularizer'].tau = -2
tmp_model.regularizers['ss_theta_regularizer'].tau = -2.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_ssp_5')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_sst_ssp_5 = tmp_model; tmp_model = None

In [None]:
# Decorrelator (tau = 5*1e+6) + SmoothSparsePhi (tau = -2) + SmoothSparseTheta (tau = -2.5); 20 topics, 15 passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['ss_phi_regularizer'].tau = -2
tmp_model.regularizers['ss_theta_regularizer'].tau = -2.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 5*1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_20_decor_sst_ssp_6')
# Keep the fitted model under its own name and clear the scratch variable.
model_20_decor_sst_ssp_6 = tmp_model; tmp_model = None

In [None]:
# Close the models summary file opened in append mode near the top of the notebook.
# fit_one_model must not be called after this cell without reopening models_file.
models_file.close()