In [1]:
import sys
import numpy as np
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper

0.8.1


In [2]:
# Notebook-wide helpers shared through module-level names:
#   config     - paths and names read by the data-loading cells and by fit_one_model / save_model
#   plot_maker - used by fit_one_model to draw the per-model plots
#   printer    - used by fit_one_model to write model summaries, scores and top tokens
config = ConfigPaths('config.cfg')
plot_maker = PlotMaker()
printer = PrintHelper()

In [3]:
# Show which file receives the model summaries (it is opened for appending in the next cell).
print config.models_file_name

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\24_11_decor\models.txt


In [119]:
# Append-mode handle used by fit_one_model through the module-level name `models_file`;
# it is closed by the last cell of the notebook.
# NOTE(review): the execution count (119) shows this cell was re-run after the early experiments,
# so on a fresh kernel it must be executed before any fit_one_model call.
models_file = open(config.models_file_name, 'a')

In [5]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):
    """Create an ARTM model and attach the standard set of scores to it.

    Only the 'ngramm' modality gets a non-zero weight (1.0); the other listed
    modalities are registered with weight 0.0.

    :param current_dictionary: artm.Dictionary used to initialise the model and
        to compute the perplexity score
    :param n_topics: number of topics
    :param n_doc_passes: value assigned to ``model.num_document_passes``
    :param seed_value: seed passed to artm.ARTM
    :param n_top_tokens: number of tokens tracked by the top-tokens score
    :param p_mass_threshold: probability mass threshold of the topic-kernel score
    :return: the configured (not yet fitted) artm.ARTM instance
    """
    print('[{}] creating model'.format(datetime.now()))
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value,
                      class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0,
                                 'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                                 'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    # Hand over the same dictionary the model was built with, so the perplexity
    # score does not silently depend on whatever the global `dictionary` is.
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold,
                        score_dictionary=current_dictionary)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold, score_dictionary=None):
    """Register perplexity, sparsity, topic-kernel and top-tokens scores on a model.

    :param artm_model: model whose ``scores`` collection is extended
    :param n_top_tokens: ``num_tokens`` of the top-tokens score
    :param p_mass_threshold: ``probability_mass_threshold`` of the topic-kernel score
    :param score_dictionary: dictionary for the perplexity score; when omitted
        the module-level ``dictionary`` is used (the original behaviour)
    """
    print('[{}] adding scores'.format(datetime.now()))
    if score_dictionary is None:
        # Backward-compatible fallback for callers that rely on the global.
        score_dictionary = dictionary
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               use_unigram_document_model=False,
                                               dictionary=score_dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm',
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))

In [6]:
def process_one_model(dictionary, _n_topics, _n_doc_passes, _seed_value, _n_top_tokens, _p_mass_threshold, _n_iterations,
                     _model_name=''):
    """Create a model with the given hyper-parameters, fit it and return it.

    Thin convenience wrapper: the model is built by create_model and then
    trained and reported by fit_one_model under the name ``_model_name``.
    """
    print('[{}] processing model'.format(datetime.now()))
    fresh_model = create_model(
        current_dictionary=dictionary,
        n_topics=_n_topics,
        n_doc_passes=_n_doc_passes,
        seed_value=_seed_value,
        n_top_tokens=_n_top_tokens,
        p_mass_threshold=_p_mass_threshold,
    )
    return fit_one_model(fresh_model, _n_iterations, _model_name)
    
def fit_one_model(model, _n_iterations, _model_name=''):
    """Fit ``model`` offline and write its summary, plots, scores and top tokens.

    Relies on the module-level ``batch_vectorizer``, ``printer``, ``plot_maker``,
    ``config`` and the open ``models_file`` handle.

    :param model: artm.ARTM instance to train (trained in place)
    :param _n_iterations: number of collection passes for this call
    :param _model_name: name used in the summary and as the base of the output file names
    :return: the same model object after fitting
    """
    def _stamp(stage):
        # Timestamped progress line, same format as the other helpers.
        print('[{}] {}'.format(datetime.now(), stage))

    _stamp('fitting')
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)

    _stamp('outputting')
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)

    # Plots are written under <experiment_path>/<model name>.
    plots_target = path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, plots_target)

    # Scores and top tokens go to <experiment_path>/<model name>.txt.
    report_target = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, report_target)
    printer.print_top_tokens(model, report_target)
    return model

In [7]:
def save_model(_model, _model_name):
    """Save the model's 'p_wt' and 'n_wt' matrices into the models archive folder.

    Two files are written: ``<models_archive_path>/<_model_name>_saved_p_wt``
    and ``<models_archive_path>/<_model_name>_saved_n_wt``.

    :param _model: fitted artm.ARTM instance
    :param _model_name: base name of the output files
    """
    print('[{}] saving model'.format(datetime.now()))
    archive_prefix = path.join(config.models_archive_path, _model_name)
    # ARTM.save expects `model_name` to be the name of the matrix to store
    # ('p_wt' or 'n_wt'); the experiment name belongs only in the file name.
    for matrix_name in ('p_wt', 'n_wt'):
        _model.save(filename=archive_prefix + '_saved_' + matrix_name, model_name=matrix_name)

In [None]:
# First-time preparation (this cell has no execution count, i.e. it was not run in the recorded session):
# convert the UCI bag-of-words collection into batches stored in config.output_batches_path,
# gather a dictionary from those batches, save it with save() and save_text(),
# then re-load it from the text file.
batch_vectorizer = artm.BatchVectorizer(data_path=config.dataset_path,
                                        data_format='bow_uci',
                                        collection_name=config.collection_name,
                                        target_folder=config.output_batches_path)
dictionary = artm.Dictionary()
dictionary.gather(data_path=config.output_batches_path,
                  vocab_file_path=config.vocabulary_path)
dictionary.save(dictionary_path=config.dictionary_path)
dictionary.save_text(dictionary_path=config.dictionary_path + '.txt')
dictionary.load_text(dictionary_path=config.dictionary_path + '.txt')

In [8]:
# Reuse previously prepared data: read the existing batches and load the saved dictionary.
# Both names are module-level and are read by fit_one_model / add_scores_to_model.
# NOTE(review): the dictionary is loaded from dictionary_path + '.dict' while the preparation
# cell saves to dictionary_path - presumably save() appends the extension; confirm.
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [None]:
# Simple model without regularizers.
# Topics look reasonable overall; some topics may contain odd outlier words (18).

In [9]:
# Baseline: 2000 topics, 5 document passes, seed 100, no regularizers, 25 collection passes.
model_no_reg2000_5 = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
model_no_reg2000_5 = fit_one_model(model_no_reg2000_5, _n_iterations=25, _model_name='model_no_reg2000_5')

[2016-11-24 11:41:24.162000] creating model
[2016-11-24 11:41:28.028000] adding scores
[2016-11-24 11:41:28.059000] fitting
[2016-11-24 11:52:03.585000] outputting
name = model_no_reg2000_5, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [10]:
# Same baseline as model_no_reg2000_5 but with seed 200 instead of 100
# (the "_ds" suffix presumably stands for "different seed").
model_no_reg2000_5_ds = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=200,
                            n_top_tokens=15, p_mass_threshold=0.25)
model_no_reg2000_5_ds = fit_one_model(model_no_reg2000_5_ds, _n_iterations=25, _model_name='model_no_reg2000_5_ds')

[2016-11-23 18:03:57.109000] creating model
[2016-11-23 18:04:00.854000] adding scores
[2016-11-23 18:04:00.876000] fitting
[2016-11-23 18:14:13.569000] outputting
name = model_no_reg2000_5_ds, n_topics = 2000, n_doc_passes = 5, seed_value = 200, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [None]:
# Model with sparsing regularizers

In [10]:
# Two-stage run: 25 collection passes without regularizers, then sparsing regularizers for
# Phi ('ngramm' class) and Theta are added with tau = -0.5 each and 20 more passes are run.
# NOTE(review): both fit_one_model calls use the name 'model_sparse_reg_1', so the second call
# targets the same plot/report paths as the first - confirm whether the helpers overwrite or append.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model = fit_one_model(tmp_model, _n_iterations=25, _model_name='model_sparse_reg_1')
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers['sparse_phi_regularizer'].tau = -0.5
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_sparse_reg_1')
# Keep the result under its own name and release the scratch name.
model_sparse_reg_1 = tmp_model; tmp_model = None

[2016-11-24 11:52:49.164000] creating model
[2016-11-24 11:52:53.432000] adding scores
[2016-11-24 11:52:53.460000] fitting
[2016-11-24 12:03:31.009000] outputting
name = model_sparse_reg_1, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25

[2016-11-24 12:04:17.238000] fitting
[2016-11-24 12:13:16.252000] outputting
name = model_sparse_reg_1, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5
sparse_phi_regularizer, tau = -0.5



In [None]:
# Model with all regularizers

In [11]:
# All three regularizers active from the first pass: sparse Phi (tau = -0.5), sparse Theta (tau = -0.5)
# and Phi decorrelator (tau = 0.1); 2000 topics, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['sparse_phi_regularizer'].tau = -0.5
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_reg_1')
model_reg_1 = tmp_model; tmp_model = None

[2016-11-24 12:14:39.602000] creating model
[2016-11-24 12:14:43.373000] adding scores
[2016-11-24 12:14:43.468000] fitting
[2016-11-24 12:21:01.558000] outputting
name = model_reg_1, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 0.1
sparse_phi_regularizer, tau = -0.5



In [None]:
# Model with the decorrelator regularizer only

In [12]:
# Decorrelator tau sweep, run 1: Phi decorrelator only, tau = 0.1; 2000 topics, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_1')
model_decor_reg_1 = tmp_model; tmp_model = None

[2016-11-24 12:21:21.948000] creating model
[2016-11-24 12:21:25.645000] adding scores
[2016-11-24 12:21:25.713000] fitting
[2016-11-24 12:28:07.729000] outputting
name = model_decor_reg_1, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 0.1



In [13]:
# Decorrelator tau sweep, run 2: same setup as model_decor_reg_1 with tau = 1.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_2')
model_decor_reg_2 = tmp_model; tmp_model = None

[2016-11-24 12:28:31.662000] creating model
[2016-11-24 12:28:35.369000] adding scores
[2016-11-24 12:28:35.444000] fitting
[2016-11-24 12:35:20.191000] outputting
name = model_decor_reg_2, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1



In [14]:
# Decorrelator tau sweep, run 3: same setup as model_decor_reg_1 with tau = 10.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_3')
model_decor_reg_3 = tmp_model; tmp_model = None

[2016-11-24 12:35:43.754000] creating model
[2016-11-24 12:35:47.456000] adding scores
[2016-11-24 12:35:47.494000] fitting
[2016-11-24 12:42:29.440000] outputting
name = model_decor_reg_3, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 10



In [15]:
# Decorrelator tau sweep, run 4: same setup as model_decor_reg_1 with tau = 100.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_4')
model_decor_reg_4 = tmp_model; tmp_model = None

[2016-11-24 12:42:52.945000] creating model
[2016-11-24 12:42:56.613000] adding scores
[2016-11-24 12:42:56.653000] fitting
[2016-11-24 12:49:36.869000] outputting
name = model_decor_reg_4, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 100



In [16]:
# Decorrelator tau sweep, run 5: same setup as model_decor_reg_1 with tau = 1e+8 (largest value tried).
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+8
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_5')
model_decor_reg_5 = tmp_model; tmp_model = None

[2016-11-24 12:50:00.911000] creating model
[2016-11-24 12:50:04.702000] adding scores
[2016-11-24 12:50:04.805000] fitting
[2016-11-24 12:56:13.853000] outputting
name = model_decor_reg_5, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 100000000.0



In [17]:
# Decorrelator tau sweep, run 6: same setup as model_decor_reg_1 with tau = 0.01.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-2
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_6')
model_decor_reg_6 = tmp_model; tmp_model = None

[2016-11-24 12:56:31.764000] creating model
[2016-11-24 12:56:36.234000] adding scores
[2016-11-24 12:56:36.272000] fitting
[2016-11-24 13:03:43.581000] outputting
name = model_decor_reg_6, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 0.01



In [18]:
# Decorrelator tau sweep, run 7: same setup as model_decor_reg_1 with tau = 0.001.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-3
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_7')
model_decor_reg_7 = tmp_model; tmp_model = None

[2016-11-24 13:04:09.337000] creating model
[2016-11-24 13:04:13.172000] adding scores
[2016-11-24 13:04:13.205000] fitting
[2016-11-24 13:11:57.369000] outputting
name = model_decor_reg_7, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 0.001



In [19]:
# Decorrelator tau sweep, run 8: same setup as model_decor_reg_1 with tau = 0.0001 (smallest value tried).
# The resulting model_decor_reg_8 is the one inspected by the top-topics cell near the end of the notebook.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-4
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_8')
model_decor_reg_8 = tmp_model; tmp_model = None

[2016-11-24 13:12:21.734000] creating model
[2016-11-24 13:12:25.757000] adding scores
[2016-11-24 13:12:25.796000] fitting
[2016-11-24 13:20:08.653000] outputting
name = model_decor_reg_8, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 0.0001



In [20]:
# Decorrelator tau sweep, run 9: same setup as model_decor_reg_1 with tau = 1e+4.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+4
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_9')
model_decor_reg_9 = tmp_model; tmp_model = None

[2016-11-24 13:20:33.877000] creating model
[2016-11-24 13:20:37.785000] adding scores
[2016-11-24 13:20:37.887000] fitting
[2016-11-24 13:27:25.505000] outputting
name = model_decor_reg_9, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 10000.0



In [21]:
# Decorrelator tau sweep, run 10: same setup as model_decor_reg_1 with tau = 1e+6.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_reg_10')
model_decor_reg_10 = tmp_model; tmp_model = None

[2016-11-24 13:27:44.898000] creating model
[2016-11-24 13:27:48.861000] adding scores
[2016-11-24 13:27:48.901000] fitting
[2016-11-24 13:35:15.475000] outputting
name = model_decor_reg_10, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000000.0



In [None]:
# Model with sparse + decorrelator regularizers

In [22]:
# Phi decorrelator (tau = 0.1) combined with sparse Theta (tau = -0.5); 2000 topics, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_1')
model_decor_sparse_t_reg_1 = tmp_model; tmp_model = None

[2016-11-24 13:35:34.443000] creating model
[2016-11-24 13:35:38.938000] adding scores
[2016-11-24 13:35:39.033000] fitting
[2016-11-24 13:43:03.471000] outputting
name = model_decor_sparse_t_reg_1, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 0.1



In [120]:
# Sparse Theta only (tau = -0.5); 2000 topics, 15 collection passes.
# NOTE(review): despite the "decor" in the name, no decorrelator regularizer is added in this cell -
# confirm whether that is intentional (ablation) or an omission.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_2')
model_decor_sparse_t_reg_2 = tmp_model; tmp_model = None

[2016-11-24 15:07:09.417000] creating model
[2016-11-24 15:07:14.274000] adding scores
[2016-11-24 15:07:14.316000] fitting
[2016-11-24 15:14:19.297000] outputting
name = model_decor_sparse_t_reg_2, n_topics = 2000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5



In [121]:
# Topic-count sweep with decorrelator (tau = 0.1) + sparse Theta (tau = -0.5): 2500 topics, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=2500, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_3')
model_decor_sparse_t_reg_3 = tmp_model; tmp_model = None

[2016-11-24 15:14:47.469000] creating model
[2016-11-24 15:14:53.403000] adding scores
[2016-11-24 15:14:53.465000] fitting
[2016-11-24 15:24:00.110000] outputting
name = model_decor_sparse_t_reg_3, n_topics = 2500, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 0.1



In [122]:
# Topic-count sweep with decorrelator (tau = 0.1) + sparse Theta (tau = -0.5): 3000 topics, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=3000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_4')
model_decor_sparse_t_reg_4 = tmp_model; tmp_model = None

[2016-11-24 15:24:32.023000] creating model
[2016-11-24 15:24:37.834000] adding scores
[2016-11-24 15:24:37.933000] fitting
[2016-11-24 15:35:30.929000] outputting
name = model_decor_sparse_t_reg_4, n_topics = 3000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 0.1



In [123]:
# Topic-count sweep with decorrelator (tau = 0.1) + sparse Theta (tau = -0.5): 3500 topics, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=3500, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_5')
model_decor_sparse_t_reg_5 = tmp_model; tmp_model = None

[2016-11-24 15:36:08.018000] creating model
[2016-11-24 15:36:14.945000] adding scores
[2016-11-24 15:36:15.040000] fitting
[2016-11-24 15:49:11.050000] outputting
name = model_decor_sparse_t_reg_5, n_topics = 3500, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 0.1



In [124]:
# Topic-count sweep with decorrelator (tau = 0.1) + sparse Theta (tau = -0.5): 4000 topics, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=4000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_6')
model_decor_sparse_t_reg_6 = tmp_model; tmp_model = None

[2016-11-24 15:49:58.683000] creating model
[2016-11-24 15:50:09.417000] adding scores
[2016-11-24 15:50:09.522000] fitting
[2016-11-24 16:04:58.532000] outputting
name = model_decor_sparse_t_reg_6, n_topics = 4000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 0.1



In [125]:
# Topic-count sweep with decorrelator (tau = 0.1) + sparse Theta (tau = -0.5): 4500 topics, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=4500, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_7')
model_decor_sparse_t_reg_7 = tmp_model; tmp_model = None

[2016-11-24 16:05:59.860000] creating model
[2016-11-24 16:06:40.855000] adding scores
[2016-11-24 16:06:40.967000] fitting
[2016-11-24 16:24:59.037000] outputting
name = model_decor_sparse_t_reg_7, n_topics = 4500, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 0.1



In [126]:
# Topic-count sweep with decorrelator (tau = 0.1) + sparse Theta (tau = -0.5): 5000 topics, 15 collection passes.
tmp_model = create_model(current_dictionary=dictionary, n_topics=5000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
tmp_model.regularizers['sparse_theta_regularizer'].tau = -0.5
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e-1
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_8')
model_decor_sparse_t_reg_8 = tmp_model; tmp_model = None

[2016-11-24 16:26:07.376000] creating model
[2016-11-24 16:26:20.109000] adding scores
[2016-11-24 16:26:20.301000] fitting
[2016-11-24 16:45:53.036000] outputting
name = model_decor_sparse_t_reg_8, n_topics = 5000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 0.1



In [118]:
# Print the 4 highest-weighted topics per Theta column for model_decor_reg_8.
# NOTE(review): get_doc_top_topics / doc_top_topics_to_str are defined in the cell that follows
# this one, so that cell has to be executed first on a fresh kernel.
tt = get_doc_top_topics(model_decor_reg_8, 4)
print doc_top_topics_to_str(tt)

1 | topic_1048 : 0.819385766983, topic_1696 : 0.0214816760272, topic_1433 : 0.00801280327141, topic_1741 : 0.00786730460823
2 | topic_1986 : 0.835175037384, topic_137 : 0.0566121675074, topic_1602 : 0.0087715536356, topic_1305 : 0.00742827821523
3 | topic_753 : 0.558131575584, topic_1836 : 0.0833358764648, topic_1643 : 0.0829107090831, topic_1452 : 0.0823532566428
4 | topic_1883 : 0.893652975559, topic_54 : 0.0102877896279, topic_555 : 0.00946168787777, topic_1956 : 0.00527089834213
5 | topic_1690 : 0.604127585888, topic_1541 : 0.0520543307066, topic_1203 : 0.0277139786631, topic_476 : 0.0264881849289
6 | topic_1824 : 0.175944134593, topic_897 : 0.100168392062, topic_1899 : 0.0868125110865, topic_1388 : 0.0756196379662
7 | topic_141 : 0.824461698532, topic_1645 : 0.0159749072045, topic_19 : 0.0131582152098, topic_993 : 0.00845735985786
8 | topic_1866 : 0.71635389328, topic_188 : 0.159392535686, topic_496 : 0.0425183735788, topic_820 : 0.0288405176252
9 | topic_1270 : 0.271973133087, to

In [115]:
def get_doc_top_topics(model, num_top_topics):
    """Return, for every column of the model's Theta matrix, its largest entries.

    :param model: object exposing ``get_theta()`` that returns a pandas DataFrame
    :param num_top_topics: how many of the largest values to keep per column
    :return: dict mapping each Theta column label to a pandas Series holding the
        ``num_top_topics`` largest values of that column, sorted in descending order
    """
    theta = model.get_theta()
    # Plain column selection replaces the deprecated (and later removed) `.ix[:, col]`.
    return {col: theta[col].sort_values(ascending=False).head(num_top_topics)
            for col in theta.columns}

def doc_top_topics_to_str(top_topics):
    """Render the result of get_doc_top_topics as text, one line per key.

    Each line has the form ``<key> | <label> : <value>, <label> : <value>, ...``
    and ends with a newline. Zero-valued entries are dropped; if nothing is
    left for a key the line reads ``<key> | None``.

    :param top_topics: dict mapping a key to a pandas Series of values indexed by label
    :return: the concatenated lines as a single string ('' for an empty dict)
    """
    lines = []
    for key in top_topics:
        series = top_topics[key]
        # Boolean mask instead of the deprecated Series.nonzero().
        nonzero = series[series != 0]
        if len(nonzero):
            value = ', '.join('{} : {}'.format(label, weight)
                              for label, weight in zip(nonzero.index, nonzero.values))
        else:
            value = 'None'
        lines.append('{} | {}\n'.format(key, value))
    # Joining once avoids repeated string concatenation and shadowing the builtin `str`.
    return ''.join(lines)

In [127]:
# Release the summary file handle opened earlier; fit_one_model must not be called after this
# cell unless the file is reopened.
models_file.close()