In [1]:
import sys
import numpy as np
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper

0.8.1


In [2]:
# Notebook-wide helpers: `config` supplies the paths used below
# (models_file_name, experiment_path, output_batches_path, dictionary_path);
# `plot_maker` and `printer` are used by fit_one_model to emit plots and reports.
config = ConfigPaths('config.cfg')
plot_maker = PlotMaker()
printer = PrintHelper()

In [3]:
# Show where the shared models log will be written.
print(config.models_file_name)

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\08_12_1000_topics_exp\models.txt


In [4]:
# Opened in append mode, so re-running the notebook adds to the existing log
# rather than overwriting it. fit_one_model writes to this handle; it is
# closed by the final cell of the notebook.
models_file = open(config.models_file_name, 'a')

In [10]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):
    """Create an ARTM model and attach the notebook's standard set of scores.

    The model is built with cache_theta=True and a fixed class_ids mapping in
    which only 'ngramm' has weight 1.0; every other listed class id is 0.0.

    Args:
        current_dictionary: dictionary passed to artm.ARTM and also used for
            the perplexity score, so both always refer to the same object.
        n_topics: value for ARTM's num_topics.
        n_doc_passes: value assigned to model.num_document_passes.
        seed_value: value for ARTM's seed.
        n_top_tokens: num_tokens for the top-tokens score.
        p_mass_threshold: probability_mass_threshold for the topic-kernel score.

    Returns:
        The configured (not yet fitted) artm.ARTM instance.
    """
    print('[{}] creating model'.format(datetime.now()))
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value,
                      class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0,
                                 'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                                 'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    # Hand over the same dictionary the model was built with; previously the
    # perplexity score always picked up the notebook-level `dictionary`.
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold,
                        score_dictionary=current_dictionary)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold, score_dictionary=None):
    """Register perplexity, sparsity, topic-kernel and top-tokens scores on a model.

    Args:
        artm_model: model whose `scores` collection receives the new scores.
        n_top_tokens: num_tokens for the 'top_tokens_score'.
        p_mass_threshold: probability_mass_threshold for 'topic_kernel_score'.
        score_dictionary: dictionary for 'perplexity_score'. When omitted
            (None), the notebook-level global `dictionary` is used, which
            keeps existing three-argument calls working as before.
    """
    print('[{}] adding scores'.format(datetime.now()))
    if score_dictionary is None:
        # Backward-compatible fallback to the notebook global.
        score_dictionary = dictionary
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               use_unigram_document_model=False,
                                               dictionary=score_dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm',
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''):
    """Fit `model` offline, then write its summary, plots, scores and top tokens.

    Reads the notebook-level globals `batch_vectorizer`, `printer`,
    `plot_maker`, `config` and `models_file`.

    Args:
        model: model exposing fit_offline(); fitted in place.
        _n_iterations: passed as num_collection_passes and echoed in reports.
        _model_name: label for the reports and base name of the output files
            created under config.experiment_path.

    Returns:
        The same `model` object that was passed in.
    """
    print('[{}] fitting'.format(datetime.now()))
    model.fit_offline(num_collection_passes=_n_iterations, batch_vectorizer=batch_vectorizer)

    print('[{}] outputting'.format(datetime.now()))
    # One summary entry per model goes to the shared models log.
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)

    plots_target = path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, plots_target)

    # Scores and top tokens share a single per-model text file.
    report_path = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, report_path)
    printer.print_top_tokens(model, report_path)
    return model

In [8]:
# Data is read from already-prepared batches under config.output_batches_path.
# `batch_vectorizer` is read as a global by fit_one_model.
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
# Load the dictionary from '<config.dictionary_path>.dict'; every experiment
# cell below passes it to create_model as current_dictionary.
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [None]:
# simple model without regularizers

In [11]:
# Baseline: no regularizers, 25 collection passes.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
# fit_one_model returns the model it was given, so bind the result directly.
model_no_reg_1000 = fit_one_model(tmp_model, _n_iterations=25, _model_name='model_no_reg_1000')
tmp_model = None  # drop the scratch reference

[2016-12-08 13:14:20.880000] creating model
[2016-12-08 13:14:23.516000] adding scores
[2016-12-08 13:14:23.532000] fitting
[2016-12-08 13:20:20.375000] outputting
name = model_no_reg_1000, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 25, n_top_tokens = 15, p_threshold = 0.25



In [None]:
# + decorrelator Phi regularizer (sweep over tau)

In [12]:
# Decorrelator on Phi only, tau = 0.1.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 0.1
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_1 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_1')
tmp_model = None  # drop the scratch reference

[2016-12-08 13:21:30.248000] creating model
[2016-12-08 13:21:32.624000] adding scores
[2016-12-08 13:21:32.645000] fitting
[2016-12-08 13:26:26.897000] outputting
name = model_decor_1, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 0.1



In [13]:
# Decorrelator on Phi only, tau = 10.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_2 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_2')
tmp_model = None  # drop the scratch reference

[2016-12-08 13:26:50.206000] creating model
[2016-12-08 13:26:53.090000] adding scores
[2016-12-08 13:26:53.121000] fitting
[2016-12-08 13:32:18.395000] outputting
name = model_decor_2, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 10



In [14]:
# Decorrelator on Phi only, tau = 100.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_3 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_3')
tmp_model = None  # drop the scratch reference

[2016-12-08 13:32:44.783000] creating model
[2016-12-08 13:32:47.457000] adding scores
[2016-12-08 13:32:47.494000] fitting
[2016-12-08 13:38:08.339000] outputting
name = model_decor_3, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 100



In [15]:
# Decorrelator on Phi only, tau = 1000.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_4 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_4')
tmp_model = None  # drop the scratch reference

[2016-12-08 13:40:29.122000] creating model
[2016-12-08 13:40:31.580000] adding scores
[2016-12-08 13:40:31.615000] fitting
[2016-12-08 13:44:57.921000] outputting
name = model_decor_4, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 1000



In [16]:
# Decorrelator on Phi only, tau = 10000.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10000
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_5 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_5')
tmp_model = None  # drop the scratch reference

[2016-12-08 13:45:19.688000] creating model
[2016-12-08 13:45:22.050000] adding scores
[2016-12-08 13:45:22.082000] fitting
[2016-12-08 13:49:35.226000] outputting
name = model_decor_5, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 10000



In [None]:
# + SmoothSparseTheta regularizer (sweep over tau)

In [26]:
# Smooth/sparse Theta only, tau = -0.1.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.1
# fit_one_model returns the model it was given, so bind the result directly.
model_sst_1 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_sst_1')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:40:42.248000] creating model
[2016-12-08 14:40:45.708000] adding scores
[2016-12-08 14:40:45.739000] fitting
[2016-12-08 14:47:13.988000] outputting
name = model_sst_1, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.1



In [19]:
# Smooth/sparse Theta only, tau = -0.5.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
# fit_one_model returns the model it was given, so bind the result directly.
model_sst_2 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_sst_2')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:00:28.978000] creating model
[2016-12-08 14:00:31.521000] adding scores
[2016-12-08 14:00:31.537000] fitting
[2016-12-08 14:05:15.667000] outputting
name = model_sst_2, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5



In [20]:
# Smooth/sparse Theta only, tau = -1.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['ss_theta_regularizer'].tau = -1
# fit_one_model returns the model it was given, so bind the result directly.
model_sst_3 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_sst_3')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:05:37.670000] creating model
[2016-12-08 14:05:39.987000] adding scores
[2016-12-08 14:05:40.009000] fitting
[2016-12-08 14:10:10.072000] outputting
name = model_sst_3, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1



In [None]:
# + SmoothSparsePhi regularizer (sweep over tau)

In [21]:
# Smooth/sparse Phi only, tau = -0.0001.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.0001
# fit_one_model returns the model it was given, so bind the result directly.
model_ssphi_1 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_ssphi_1')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:10:31.405000] creating model
[2016-12-08 14:10:33.809000] adding scores
[2016-12-08 14:10:33.831000] fitting
[2016-12-08 14:15:12.219000] outputting
name = model_ssphi_1, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_phi_regularizer, tau = -0.0001



In [22]:
# Smooth/sparse Phi only, tau = -0.01.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.01
# fit_one_model returns the model it was given, so bind the result directly.
model_ssphi_2 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_ssphi_2')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:15:34] creating model
[2016-12-08 14:15:36.373000] adding scores
[2016-12-08 14:15:36.395000] fitting
[2016-12-08 14:20:00.398000] outputting
name = model_ssphi_2, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_phi_regularizer, tau = -0.01



In [23]:
# Smooth/sparse Phi only, tau = -0.1.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.1
# fit_one_model returns the model it was given, so bind the result directly.
model_ssphi_3 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_ssphi_3')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:20:26.790000] creating model
[2016-12-08 14:20:29.045000] adding scores
[2016-12-08 14:20:29.076000] fitting
[2016-12-08 14:26:36.754000] outputting
name = model_ssphi_3, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_phi_regularizer, tau = -0.1



In [24]:
# Smooth/sparse Phi only, positive tau = 0.1.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = 0.1
# fit_one_model returns the model it was given, so bind the result directly.
model_ssphi_4 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_ssphi_4')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:27:03.198000] creating model
[2016-12-08 14:27:06.789000] adding scores
[2016-12-08 14:27:06.842000] fitting
[2016-12-08 14:33:50.365000] outputting
name = model_ssphi_4, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_phi_regularizer, tau = 0.1



In [25]:
# Smooth/sparse Phi only, positive tau = 0.5.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['ss_phi_regularizer'].tau = 0.5
# fit_one_model returns the model it was given, so bind the result directly.
model_ssphi_5 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_ssphi_5')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:34:06.945000] creating model
[2016-12-08 14:34:09.364000] adding scores
[2016-12-08 14:34:09.386000] fitting
[2016-12-08 14:40:09.658000] outputting
name = model_ssphi_5, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_phi_regularizer, tau = 0.5



In [27]:
# Decorrelator Phi (tau = 1000) combined with smooth/sparse Theta (tau = -0.5).
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_sst_1 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_sst_1')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:47:54.581000] creating model
[2016-12-08 14:47:58.645000] adding scores
[2016-12-08 14:47:58.693000] fitting
[2016-12-08 14:54:54.933000] outputting
name = model_decor_sst_1, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 1000



In [28]:
# Decorrelator Phi (tau = 100) combined with smooth/sparse Theta (tau = -0.5).
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_sst_2 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_sst_2')
tmp_model = None  # drop the scratch reference

[2016-12-08 14:55:57.038000] creating model
[2016-12-08 14:56:00.992000] adding scores
[2016-12-08 14:56:01.104000] fitting
[2016-12-08 15:03:06.353000] outputting
name = model_decor_sst_2, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 100



In [29]:
# Decorrelator Phi (tau = 100) combined with smooth/sparse Theta (tau = -1.5).
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -1.5
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_sst_3 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_sst_3')
tmp_model = None  # drop the scratch reference

[2016-12-08 15:05:38.859000] creating model
[2016-12-08 15:05:42.529000] adding scores
[2016-12-08 15:05:42.574000] fitting
[2016-12-08 15:15:18.146000] outputting
name = model_decor_sst_3, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1.5
decorrelator_phi_regularizer, tau = 100



In [30]:
# All three regularizers: decorrelator Phi (tau = 100),
# smooth/sparse Theta (tau = -0.5), smooth/sparse Phi (tau = -0.01).
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.01
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_sst_ssphi_1 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_sst_ssphi_1')
tmp_model = None  # drop the scratch reference

[2016-12-08 15:16:06.663000] creating model
[2016-12-08 15:16:11.409000] adding scores
[2016-12-08 15:16:11.605000] fitting
[2016-12-08 15:24:04.459000] outputting
name = model_decor_sst_ssphi_1, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 100
ss_phi_regularizer, tau = -0.01



In [31]:
# All three regularizers: decorrelator Phi (tau = 100),
# smooth/sparse Theta (tau = -0.5), smooth/sparse Phi (tau = -0.1).
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.1
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_sst_ssphi_2 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_sst_ssphi_2')
tmp_model = None  # drop the scratch reference

[2016-12-08 15:24:57.452000] creating model
[2016-12-08 15:25:02.187000] adding scores
[2016-12-08 15:25:02.389000] fitting
[2016-12-08 15:32:29.561000] outputting
name = model_decor_sst_ssphi_2, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 100
ss_phi_regularizer, tau = -0.1



In [None]:
# All three regularizers: decorrelator Phi (tau = 100),
# smooth/sparse Theta (tau = -0.5), smooth/sparse Phi (tau = -0.01).
# NOTE(review): this cell was never executed and its hyperparameters are
# identical to model_decor_sst_ssphi_1 -- presumably one tau was meant to
# differ; confirm before running.
tmp_model = create_model(
    current_dictionary=dictionary, n_topics=1000, n_doc_passes=5,
    seed_value=100, n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.01
# fit_one_model returns the model it was given, so bind the result directly.
model_decor_sst_ssphi_3 = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_decor_sst_ssphi_3')
tmp_model = None  # drop the scratch reference

In [32]:
# Close the shared models log that was opened in append mode near the top of
# the notebook; fit_one_model must not be called after this point.
models_file.close()