In [1]:
import pickle
import sys
import numpy as np
import pandas as pd
import artm
import seaborn as sns
import matplotlib.pyplot as plt
print artm.version()

from os import path, mkdir
from datetime import datetime
sys.path.insert(0, '..\\modules\\helpers')
import distances_helper as dh 
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper

0.8.1


In [2]:
config = ConfigPaths('config.cfg')
plot_maker = PlotMaker()
printer = PrintHelper()
print config.models_file_name

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\np_17_01\models.txt


In [17]:
models_file = open(config.models_file_name, 'a')

In [4]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):    
    print '[{}] creating model'.format(datetime.now())
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value, 
                  class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0, 
                             'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                             'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold)
    return model
def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold):
    print '[{}] adding scores'.format(datetime.now())
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                      dictionary=dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='ss_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='ss_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm', 
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''): 
    print '[{}] fitting'.format(datetime.now())
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)
    print '[{}] outputting'.format(datetime.now())
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)
    model_pics_file_name =  path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, model_pics_file_name)
    model_output_file_name = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, model_output_file_name)
    printer.print_top_tokens(model, model_output_file_name)
    return model
def save_pickle_file(dists, filename):
    pickle_filename = path.join(config.experiment_path, filename)
    pickle_file = open(pickle_filename, 'wb')
    pickle.dump(dists, pickle_file)
    pickle_file.close()
def load_pickle_file(filename):
    pickle_filename = path.join(config.experiment_path, filename)
    pickle_file = open(pickle_filename, 'rb')
    p_file = pickle.load(pickle_file)
    pickle_file.close()
    return p_file
def save_model_pickle(_model_name, _model, _save=True):
    phi = _model.get_phi()
    phi = phi[(phi.T != 0).any()]
    theta = _model.get_theta()    
    saved_top_tokens = _model.score_tracker['top_tokens_score'].last_tokens
    if _save:
        save_pickle_file(phi, 'phi_{}.p'.format(_model_name))
        save_pickle_file(theta, 'theta_{}.p'.format(_model_name))
        save_pickle_file(saved_top_tokens, 'saved_top_tokens_{}.p'.format(_model_name))
    return phi, theta, saved_top_tokens
def load_model_pickle(_model_name, _distance_name):
    phi = load_pickle_file('phi_{}.p'.format(_model_name))
    theta = load_pickle_file('theta_{}.p'.format(_model_name))
    saved_top_tokens = load_pickle_file('saved_top_tokens_{}.p'.format(_model_name))
    distances = load_pickle_file('{}.p'.format(_distance_name))
    return phi, theta, saved_top_tokens, distances

In [5]:
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [6]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_1')
model1 = tmp_model; tmp_model = None

[2017-01-17 17:27:30.048000] creating model
[2017-01-17 17:27:31.568000] adding scores
[2017-01-17 17:27:31.573000] fitting
[2017-01-17 17:28:03.547000] outputting
name = model_100_1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25



In [9]:
taus = [1e-1, 10, 100, 1e+3, 1e+5, 1e+7]
for idx, t in enumerate(taus):
    print '[{}] processing model {} / {}'.format(datetime.now(), idx, len(taus))
    tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                                n_top_tokens=15, p_mass_threshold=0.25)
    tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
    tmp_model.regularizers['decorrelator_phi_regularizer'].tau = t
    tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_decor_{}'.format(idx))

[2017-01-17 17:42:44.448000] processing model 0 / 6
[2017-01-17 17:42:44.449000] creating model
[2017-01-17 17:42:45.994000] adding scores
[2017-01-17 17:42:46.033000] fitting
[2017-01-17 17:43:21.728000] outputting
name = model_100_decor_0, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 0.1

[2017-01-17 17:43:38.897000] processing model 1 / 6
[2017-01-17 17:43:38.897000] creating model
[2017-01-17 17:43:40.265000] adding scores
[2017-01-17 17:43:40.302000] fitting
[2017-01-17 17:44:15.196000] outputting
name = model_100_decor_1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
decorrelator_phi_regularizer, tau = 10

[2017-01-17 17:44:32.554000] processing model 2 / 6
[2017-01-17 17:44:32.555000] creating model
[2017-01-17 17:44:33.944000] adding scores
[2017-01-17 17:44:33.979000] fitting
[2017-01-17 17:45:09.815000] outputting
n

In [10]:
taus = [-1e+5, -1e+3, -1e+1, -1, -1e-1, -1e-2, -1e-5, 1e-1, 10, 100, 1e+3, 1e+5, 1e+7]
for idx, t in enumerate(taus):
    print '[{}] processing model {} / {}'.format(datetime.now(), idx, len(taus))
    tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                                n_top_tokens=15, p_mass_threshold=0.25)
    tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
    tmp_model.regularizers['ss_theta_regularizer'].tau = t
    tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_ss_th_{}'.format(idx))

[2017-01-17 17:47:51.102000] processing model 0 / 13
[2017-01-17 17:47:51.104000] creating model
[2017-01-17 17:47:52.493000] adding scores
[2017-01-17 17:47:52.525000] fitting
[2017-01-17 17:48:21.335000] outputting
name = model_100_ss_th_0, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -100000.0

[2017-01-17 17:48:23.430000] processing model 1 / 13
[2017-01-17 17:48:23.431000] creating model
[2017-01-17 17:48:24.803000] adding scores
[2017-01-17 17:48:24.833000] fitting
[2017-01-17 17:48:53.993000] outputting
name = model_100_ss_th_1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1000.0

[2017-01-17 17:48:56.049000] processing model 2 / 13
[2017-01-17 17:48:56.050000] creating model
[2017-01-17 17:48:57.311000] adding scores
[2017-01-17 17:48:57.340000] fitting
[2017-01-17 17:49:30.084000] outputting
nam

In [11]:
taus = [-1e+5, -1e+3, -1e+1, -1, -1e-1, -1e-2, -1e-5, 1e-1, 10, 100, 1e+3, 1e+5, 1e+7]
for idx, t in enumerate(taus):
    print '[{}] processing model {} / {}'.format(datetime.now(), idx, len(taus))
    tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                                n_top_tokens=15, p_mass_threshold=0.25)
    tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
    tmp_model.regularizers['ss_phi_regularizer'].tau = t
    tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_ss_phi_{}'.format(idx))

[2017-01-17 17:57:19.738000] processing model 0 / 13
[2017-01-17 17:57:19.739000] creating model
[2017-01-17 17:57:21.143000] adding scores
[2017-01-17 17:57:21.172000] fitting
[2017-01-17 17:57:50.952000] outputting
name = model_100_ss_phi_0, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_phi_regularizer, tau = -100000.0

[2017-01-17 17:57:53.036000] processing model 1 / 13
[2017-01-17 17:57:53.036000] creating model
[2017-01-17 17:57:54.366000] adding scores
[2017-01-17 17:57:54.394000] fitting
[2017-01-17 17:58:23.833000] outputting
name = model_100_ss_phi_1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_phi_regularizer, tau = -1000.0

[2017-01-17 17:58:25.939000] processing model 2 / 13
[2017-01-17 17:58:25.939000] creating model
[2017-01-17 17:58:27.200000] adding scores
[2017-01-17 17:58:27.230000] fitting
[2017-01-17 17:58:58.619000] outputting
name 

In [14]:
taus_th = [-1e+5, -1e+3, -1e+1, -1, -1e-1, -1e-2, -1e-5, 1e-1, 10, 100, 1e+3, 1e+5, 1e+7]
taus_phi = [-1e+5, -1e+3, -1e+1, -1, -1e-1, -1e-2, -1e-5, 1e-1, 10, 100, 1e+3, 1e+5, 1e+7]
for idx_th, t_th in enumerate(taus_th):
    for idx_phi, t_phi in enumerate(taus_phi):
        print '[{}] processing model {} / {}'.format(datetime.now(), idx_th * len(taus_th) + idx_phi, len(taus_th) + len(taus_phi))
        tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                                    n_top_tokens=15, p_mass_threshold=0.25)
        tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
        tmp_model.regularizers['ss_theta_regularizer'].tau = t_th
        tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
        tmp_model.regularizers['ss_phi_regularizer'].tau = t_phi
        tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_ss_th_ss_phi_{}'.format(idx_th * len(taus_th) + idx_phi))

[2017-01-17 19:20:25.995000] processing model 0 / 26
[2017-01-17 19:20:25.995000] creating model
[2017-01-17 19:20:27.497000] adding scores
[2017-01-17 19:20:27.505000] fitting
[2017-01-17 19:20:58.780000] outputting
name = model_100_ss_th_ss_phi_0, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -100000.0
ss_phi_regularizer, tau = -100000.0

[2017-01-17 19:21:00.869000] processing model 1 / 26
[2017-01-17 19:21:00.869000] creating model
[2017-01-17 19:21:02.172000] adding scores
[2017-01-17 19:21:02.204000] fitting
[2017-01-17 19:21:32.829000] outputting
name = model_100_ss_th_ss_phi_1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -100000.0
ss_phi_regularizer, tau = -1000.0

[2017-01-17 19:21:34.971000] processing model 2 / 26
[2017-01-17 19:21:34.972000] creating model
[2017-01-17 19:21:36.345000] adding s

In [None]:
taus_th = [-1e+5, -1e+3, -1e+1, -1, -1e-1, -1e-2, -1e-5, 1e-1, 10, 100, 1e+3, 1e+5, 1e+7]
taus_phi = [-1e+5, -1e+3, -1e+1, -1, -1e-1, -1e-2, -1e-5, 1e-1, 10, 100, 1e+3, 1e+5, 1e+7]
taus_decor_th = [1e-1, 10, 100, 1e+3, 1e+5, 1e+7]
for idx_decor_th, t_decor_th in enumerate(taus_decor_th):
    for idx_th, t_th in enumerate(taus_th):
        for idx_phi, t_phi in enumerate(taus_phi):
            print '[{}] processing model {} / {}'.format(datetime.now(), idx_th + idx_phi, len(taus_th) + len(taus_phi))
            tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                                        n_top_tokens=15, p_mass_threshold=0.25)
            tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
            tmp_model.regularizers['ss_theta_regularizer'].tau = t_th
            tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
            tmp_model.regularizers['ss_phi_regularizer'].tau = t_phi
            tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
            tmp_model.regularizers['decorrelator_phi_regularizer'].tau = t_decor_th
            tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_ss_th_ss_phi_decor_{}'.format(idx_th + idx_phi + idx+_decor_th))

In [7]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -1
tmp_model.regularizers['ss_phi_regularizer'].tau = -1
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_2')
model2 = tmp_model; tmp_model = None

[2017-01-18 13:50:14.199000] creating model
[2017-01-18 13:50:15.808000] adding scores
[2017-01-18 13:50:15.828000] fitting
[2017-01-18 13:50:58.132000] outputting
name = model_100_2, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1
decorrelator_phi_regularizer, tau = 100
ss_phi_regularizer, tau = -1



In [8]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -1
tmp_model.regularizers['ss_phi_regularizer'].tau = -1
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_3')

[2017-01-18 13:51:44.164000] creating model
[2017-01-18 13:51:45.670000] adding scores
[2017-01-18 13:51:45.681000] fitting
[2017-01-18 13:52:20.233000] outputting
name = model_100_3, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -1



In [9]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -1.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -1.5
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_4')

[2017-01-18 14:10:41.023000] creating model
[2017-01-18 14:10:42.603000] adding scores
[2017-01-18 14:10:42.640000] fitting
[2017-01-18 14:11:17.749000] outputting
name = model_100_4, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1.5
decorrelator_phi_regularizer, tau = 100
ss_phi_regularizer, tau = -1.5



In [10]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -2
tmp_model.regularizers['ss_phi_regularizer'].tau = -2
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_5')
# BAD TOP

[2017-01-18 14:13:34.646000] creating model
[2017-01-18 14:13:36.221000] adding scores
[2017-01-18 14:13:36.268000] fitting
[2017-01-18 14:14:11.615000] outputting
name = model_100_5, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -2
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -2



In [11]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.5
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_6')

[2017-01-18 14:15:42.702000] creating model
[2017-01-18 14:15:44.216000] adding scores
[2017-01-18 14:15:44.255000] fitting
[2017-01-18 14:16:19.501000] outputting
name = model_100_6, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 100
ss_phi_regularizer, tau = -0.5



In [12]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.5
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_7')
# THIS ONE

[2017-01-18 14:18:07.942000] creating model
[2017-01-18 14:18:09.529000] adding scores
[2017-01-18 14:18:09.582000] fitting
[2017-01-18 14:18:45.054000] outputting
name = model_100_7, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 10
ss_phi_regularizer, tau = -0.5



In [13]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model.regularizers['ss_theta_regularizer'].tau = -1
tmp_model.regularizers['ss_phi_regularizer'].tau = -1
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_8')

[2017-01-18 14:22:17.846000] creating model
[2017-01-18 14:22:19.331000] adding scores
[2017-01-18 14:22:19.370000] fitting
[2017-01-18 14:22:54.246000] outputting
name = model_100_8, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1
decorrelator_phi_regularizer, tau = 10
ss_phi_regularizer, tau = -1



In [None]:
# top words better

In [14]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 5
tmp_model.regularizers['ss_theta_regularizer'].tau = -1
tmp_model.regularizers['ss_phi_regularizer'].tau = -1
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_9')

[2017-01-18 14:24:55.615000] creating model
[2017-01-18 14:24:57.161000] adding scores
[2017-01-18 14:24:57.201000] fitting
[2017-01-18 14:25:32.289000] outputting
name = model_100_9, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1
decorrelator_phi_regularizer, tau = 5
ss_phi_regularizer, tau = -1



In [18]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -1
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_10')

[2017-01-18 14:33:32.630000] creating model
[2017-01-18 14:33:33.978000] adding scores
[2017-01-18 14:33:33.992000] fitting
[2017-01-18 14:34:12.437000] outputting
name = model_100_10, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 10
ss_phi_regularizer, tau = -1



In [19]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -1.5
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_11')

[2017-01-18 14:35:42.327000] creating model
[2017-01-18 14:35:43.693000] adding scores
[2017-01-18 14:35:43.739000] fitting
[2017-01-18 14:36:18.083000] outputting
name = model_100_11, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 10
ss_phi_regularizer, tau = -1.5



In [20]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -2
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_12')
# THIS ONE

[2017-01-18 14:38:42.500000] creating model
[2017-01-18 14:38:44.198000] adding scores
[2017-01-18 14:38:44.287000] fitting
[2017-01-18 14:39:17.185000] outputting
name = model_100_12, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 10
ss_phi_regularizer, tau = -2



In [21]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -2.5
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_13')

[2017-01-18 14:41:00.920000] creating model
[2017-01-18 14:41:02.436000] adding scores
[2017-01-18 14:41:02.470000] fitting
[2017-01-18 14:41:37.578000] outputting
name = model_100_13, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 10
ss_phi_regularizer, tau = -2.5



In [22]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -3
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_14')

[2017-01-18 14:41:43.035000] creating model
[2017-01-18 14:41:44.697000] adding scores
[2017-01-18 14:41:44.741000] fitting
[2017-01-18 14:42:20.007000] outputting
name = model_100_14, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 10
ss_phi_regularizer, tau = -3



In [23]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10
tmp_model.regularizers['ss_theta_regularizer'].tau = -1
tmp_model.regularizers['ss_phi_regularizer'].tau = -2.5
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model_100_15')

[2017-01-18 14:43:35.376000] creating model
[2017-01-18 14:43:36.783000] adding scores
[2017-01-18 14:43:36.825000] fitting
[2017-01-18 14:44:08.948000] outputting
name = model_100_15, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1
decorrelator_phi_regularizer, tau = 10
ss_phi_regularizer, tau = -2.5



Выбираем 3 модели: без регуляриз,  7 (норм, но у ядра размер 90) и 12 (норм и ядро размера 24). 

In [24]:
models_file.close()