In [1]:
import pickle
import sys
import numpy as np
import pandas as pd
import artm
import seaborn as sns
import matplotlib.pyplot as plt
print artm.version()

from os import path, mkdir
from datetime import datetime
sys.path.insert(0, '..\\modules\\helpers')
import distances_helper as dh 
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull
from scipy.optimize import minimize
from sklearn.metrics.pairwise import cosine_distances
from numpy.linalg import norm as euclidean_norm

0.8.1


In [2]:
config = ConfigPaths('config_sample.cfg')
original_model_name = 'phi_model5'
plot_maker = PlotMaker()
printer = PrintHelper()
print config.models_file_name
print config.dataset_path
print config.dataset_folder_name
print config.output_batches_path

Q:\\topic_modeling\\csi_science_collections.git\experiments\ndw_model5\np_19_12\models.txt
Q:\\topic_modeling\\csi_science_collections.git\..\data\postnauka\UCI_collections\ndw_model5
ndw_model5
Q:\\topic_modeling\\csi_science_collections.git\..\data\postnauka\bigARTM_files\ndw_model5


In [4]:
models_file = open(config.models_file_name, 'a')

In [3]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):    
    print '[{}] creating model'.format(datetime.now())
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value, 
                  class_ids={'@default_class': 1.0})
    model.num_document_passes = n_doc_passes
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold)
    return model
def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold):
    print '[{}] adding scores'.format(datetime.now())
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score', dictionary=dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='@default_class'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='@default_class', 
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='@default_class', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''): 
    print '[{}] fitting'.format(datetime.now())
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)
    print '[{}] outputting'.format(datetime.now())
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)
    model_pics_file_name =  path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, model_pics_file_name)
    model_output_file_name = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, model_output_file_name)
    printer.print_top_tokens(model, model_output_file_name)
    return model
def save_pickle_file(dists, filename):
    pickle_filename = path.join(config.experiment_path, filename)
    pickle_file = open(pickle_filename, 'wb')
    pickle.dump(dists, pickle_file)
    pickle_file.close()
def load_pickle_file(filename, _path):
    pickle_filename = path.join(_path, filename)
    pickle_file = open(pickle_filename, 'rb')
    p_file = pickle.load(pickle_file)
    pickle_file.close()
    return p_file
def save_model_pickle(_model_name, _model, _save=True):
    phi = _model.get_phi()
    phi = phi[(phi.T != 0).any()]
    theta = _model.get_theta()    
    saved_top_tokens = _model.score_tracker['top_tokens_score'].last_tokens
    if _save:
        save_pickle_file(phi, 'phi_{}.p'.format(_model_name))
        save_pickle_file(theta, 'theta_{}.p'.format(_model_name))
        save_pickle_file(saved_top_tokens, 'saved_top_tokens_{}.p'.format(_model_name))
    return phi, theta, saved_top_tokens
def load_model_pickle(_model_name, _distance_name, _path=config.experiment_path):
    phi = load_pickle_file('phi_{}.p'.format(_model_name), _path)
    theta = load_pickle_file('theta_{}.p'.format(_model_name), _path)
    saved_top_tokens = load_pickle_file('saved_top_tokens_{}.p'.format(_model_name), _path)
    distances = load_pickle_file('{}.p'.format(_distance_name), _path)
    return phi, theta, saved_top_tokens, distances

In [5]:
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [14]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=500, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model1')
model1 = tmp_model; tmp_model = None
phi1, theta1, saved_top_tokens1 = save_model_pickle('model1', model1)

[2016-12-19 16:32:23.798000] creating model
[2016-12-19 16:32:25.109000] adding scores
[2016-12-19 16:32:25.125000] fitting
[2016-12-19 17:03:24.270000] outputting
name = model1, n_topics = 500, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25



In [15]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=500, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.1
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.05
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model2')
model2 = tmp_model; tmp_model = None
phi2, theta2, saved_top_tokens2 = save_model_pickle('model2', model2)

[2016-12-19 17:03:51.357000] creating model
[2016-12-19 17:03:52.678000] adding scores
[2016-12-19 17:03:52.795000] fitting
[2016-12-19 17:39:09.967000] outputting
name = model2, n_topics = 500, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.1
decorrelator_phi_regularizer, tau = 100
ss_phi_regularizer, tau = -0.05



In [17]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=50, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model3')
model3 = tmp_model; tmp_model = None
phi3, theta3, saved_top_tokens3 = save_model_pickle('model3', model3)

[2016-12-19 18:23:37.275000] creating model
[2016-12-19 18:23:38.217000] adding scores
[2016-12-19 18:23:38.217000] fitting
[2016-12-19 18:29:45.533000] outputting
name = model3, n_topics = 50, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25



In [18]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model4')
model4 = tmp_model; tmp_model = None
phi4, theta4, saved_top_tokens4 = save_model_pickle('model4', model4)

[2016-12-19 18:40:18.360000] creating model
[2016-12-19 18:40:19.373000] adding scores
[2016-12-19 18:40:19.373000] fitting
[2016-12-19 18:45:53.723000] outputting
name = model4, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25



In [16]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=500, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.1
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.05
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model5')
model5 = tmp_model; tmp_model = None
phi5, theta5, saved_top_tokens5 = save_model_pickle('model5', model5)

[2016-12-19 17:40:31.787000] creating model
[2016-12-19 17:40:34.055000] adding scores
[2016-12-19 17:40:34.227000] fitting
[2016-12-19 18:18:51.675000] outputting
name = model5, n_topics = 500, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.1
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -0.05



In [20]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.1
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.05
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model6')
model6 = tmp_model; tmp_model = None
phi6, theta6, saved_top_tokens6 = save_model_pickle('model6', model6)

[2016-12-19 20:07:59.923000] creating model
[2016-12-19 20:08:01.357000] adding scores
[2016-12-19 20:08:01.408000] fitting
[2016-12-19 20:12:17.817000] outputting
name = model6, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.1
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -0.05



In [21]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.5
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.1
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model7')
model7 = tmp_model; tmp_model = None
phi7, theta7, saved_top_tokens7 = save_model_pickle('model7', model7)

[2016-12-19 20:14:52.651000] creating model
[2016-12-19 20:14:53.671000] adding scores
[2016-12-19 20:14:53.677000] fitting
[2016-12-19 20:19:23.098000] outputting
name = model7, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.5
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -0.1



In [22]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -1
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.2
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model8')
model8 = tmp_model; tmp_model = None
phi8, theta8, saved_top_tokens8 = save_model_pickle('model8', model8)

[2016-12-19 20:38:18.228000] creating model
[2016-12-19 20:38:19.142000] adding scores
[2016-12-19 20:38:19.157000] fitting
[2016-12-19 20:41:54.015000] outputting
name = model8, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -0.2



In [7]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.2
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model9')
model9 = tmp_model; tmp_model = None
phi9, theta9, saved_top_tokens9 = save_model_pickle('model9', model9)

[2016-12-20 01:13:18.300000] creating model
[2016-12-20 01:13:19.285000] adding scores
[2016-12-20 01:13:19.304000] fitting
[2016-12-20 01:17:43.834000] outputting
name = model9, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -0.2



In [6]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -100
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.2
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model10')
model10 = tmp_model; tmp_model = None
phi10, theta10, saved_top_tokens10 = save_model_pickle('model10', model10)

[2016-12-20 00:59:59.526000] creating model
[2016-12-20 01:00:00.615000] adding scores
[2016-12-20 01:00:00.700000] fitting
[2016-12-20 01:04:15.469000] outputting
name = model10, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -100
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -0.2



In [8]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.5
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model11')
model11 = tmp_model; tmp_model = None
phi11, theta11, saved_top_tokens11 = save_model_pickle('model11', model11)

[2016-12-20 01:20:39.961000] creating model
[2016-12-20 01:20:40.962000] adding scores
[2016-12-20 01:20:40.970000] fitting
[2016-12-20 01:25:30.249000] outputting
name = model11, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -0.5



In [9]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -1
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model12')
model12 = tmp_model; tmp_model = None
phi12, theta12, saved_top_tokens12 = save_model_pickle('model12', model12)

[2016-12-20 01:27:42.571000] creating model
[2016-12-20 01:27:43.568000] adding scores
[2016-12-20 01:27:43.583000] fitting
[2016-12-20 01:32:31.449000] outputting
name = model12, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 10000
ss_phi_regularizer, tau = -1



In [10]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10000
tmp_model.regularizers['ss_theta_regularizer'].tau = -15
tmp_model.regularizers['ss_phi_regularizer'].tau = -1
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model13')
model13 = tmp_model; tmp_model = None
phi13, theta13, saved_top_tokens13 = save_model_pickle('model13', model13)

[2016-12-20 01:33:57.864000] creating model
[2016-12-20 01:33:58.872000] adding scores
[2016-12-20 01:33:58.872000] fitting
[2016-12-20 01:37:54.329000] outputting
name = model13, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -15
decorrelator_phi_regularizer, tau = 10000
ss_phi_regularizer, tau = -1



In [11]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.5
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model14')
model14 = tmp_model; tmp_model = None
phi14, theta14, saved_top_tokens14 = save_model_pickle('model14', model14)

[2016-12-20 01:42:31.101000] creating model
[2016-12-20 01:42:32.108000] adding scores
[2016-12-20 01:42:32.177000] fitting
[2016-12-20 01:45:44.032000] outputting
name = model14, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 1000000
ss_phi_regularizer, tau = -0.5



In [12]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -10
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model15')
model15 = tmp_model; tmp_model = None
phi15, theta15, saved_top_tokens15 = save_model_pickle('model15', model15)

[2016-12-20 01:53:49.251000] creating model
[2016-12-20 01:53:50.046000] adding scores
[2016-12-20 01:53:50.054000] fitting
[2016-12-20 01:56:35.309000] outputting
name = model15, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 10000
ss_phi_regularizer, tau = -10



In [13]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -20
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model16')
model16 = tmp_model; tmp_model = None
phi16, theta16, saved_top_tokens16 = save_model_pickle('model16', model16)

[2016-12-20 02:00:24.154000] creating model
[2016-12-20 02:00:24.960000] adding scores
[2016-12-20 02:00:24.976000] fitting
[2016-12-20 02:03:14.014000] outputting
name = model16, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 10000
ss_phi_regularizer, tau = -20



In [14]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer
                           (name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -30
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model17')
model17 = tmp_model; tmp_model = None
phi17, theta17, saved_top_tokens17 = save_model_pickle('model16', model17)

[2016-12-20 02:05:06.811000] creating model
[2016-12-20 02:05:07.624000] adding scores
[2016-12-20 02:05:07.624000] fitting
[2016-12-20 02:07:51.525000] outputting
name = model17, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 10000
ss_phi_regularizer, tau = -30



In [None]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer
                           (name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 10000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -40
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model18')
model18 = tmp_model; tmp_model = None
phi18, theta18, saved_top_tokens18 = save_model_pickle('model18', model18)

[2016-12-20 02:08:45.975000] creating model
[2016-12-20 02:08:46.974000] adding scores
[2016-12-20 02:08:46.974000] fitting


In [6]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer
                           (name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -40
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model19')
model19 = tmp_model; tmp_model = None
phi19, theta19, saved_top_tokens19 = save_model_pickle('model19', model19)

[2016-12-20 14:20:54.982000] creating model
[2016-12-20 14:20:55.797000] adding scores
[2016-12-20 14:20:56.628000] fitting
[2016-12-20 14:23:52.278000] outputting
name = model19, n_topics = 20, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -40



In [6]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=200,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer
                           (name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -40
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model20')
model20 = tmp_model; tmp_model = None
phi20, theta20, saved_top_tokens20 = save_model_pickle('model20', model20)

[2016-12-20 17:59:26.472000] creating model
[2016-12-20 17:59:27.511000] adding scores
[2016-12-20 17:59:27.526000] fitting
[2016-12-20 18:02:19.159000] outputting
name = model20, n_topics = 20, n_doc_passes = 5, seed_value = 200, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -40



In [7]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=300,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer
                           (name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -40
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model21')
model21 = tmp_model; tmp_model = None
phi21, theta21, saved_top_tokens21 = save_model_pickle('model21', model21)

[2016-12-20 20:51:02.460000] creating model
[2016-12-20 20:51:03.704000] adding scores
[2016-12-20 20:51:03.705000] fitting
[2016-12-20 20:54:04.858000] outputting
name = model21, n_topics = 20, n_doc_passes = 5, seed_value = 300, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -40



In [8]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=400,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer
                           (name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -10
tmp_model.regularizers['ss_phi_regularizer'].tau = -40
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model22')
model22 = tmp_model; tmp_model = None
phi22, theta22, saved_top_tokens22 = save_model_pickle('model22', model22)

[2016-12-20 20:54:08.102000] creating model
[2016-12-20 20:54:08.987000] adding scores
[2016-12-20 20:54:09.002000] fitting
[2016-12-20 20:57:36.795000] outputting
name = model22, n_topics = 20, n_doc_passes = 5, seed_value = 400, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -10
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -40



In [9]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=200,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model30')
model30 = tmp_model; tmp_model = None
phi30, theta30, saved_top_tokens30 = save_model_pickle('model30', model30)

[2016-12-20 20:57:40.388000] creating model
[2016-12-20 20:57:41.299000] adding scores
[2016-12-20 20:57:41.303000] fitting
[2016-12-20 21:01:39.365000] outputting
name = model30, n_topics = 20, n_doc_passes = 5, seed_value = 200, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25



In [10]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=300,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model31')
model31 = tmp_model; tmp_model = None
phi31, theta31, saved_top_tokens31 = save_model_pickle('model31', model31)

[2016-12-20 21:28:44.982000] creating model
[2016-12-20 21:28:46.284000] adding scores
[2016-12-20 21:28:46.295000] fitting
[2016-12-20 21:31:51.918000] outputting
name = model31, n_topics = 20, n_doc_passes = 5, seed_value = 300, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25



In [11]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=20, n_doc_passes=5, seed_value=400,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model32')
model32 = tmp_model; tmp_model = None
phi32, theta32, saved_top_tokens32 = save_model_pickle('model32', model32)

[2016-12-20 21:36:11.143000] creating model
[2016-12-20 21:36:11.970000] adding scores
[2016-12-20 21:36:11.974000] fitting
[2016-12-20 21:39:39.768000] outputting
name = model32, n_topics = 20, n_doc_passes = 5, seed_value = 400, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25



In [12]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=400,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model42')
save_model_pickle('model42', tmp_model)

[2016-12-21 00:41:59.190000] creating model
[2016-12-21 00:42:15.012000] adding scores
[2016-12-21 00:42:15.027000] fitting
[2016-12-21 00:55:37.348000] outputting
name = model42, n_topics = 100, n_doc_passes = 5, seed_value = 400, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25



(                                    topic_0       topic_1       topic_2  \
 витрина                        8.342185e-12  0.000000e+00  7.274721e-13   
 корнелльский_университет       4.766214e-08  1.278453e-05  2.326350e-05   
 определённый_место             1.219799e-07  3.103936e-06  2.834464e-06   
 блог                           1.453202e-05  3.709920e-07  9.008932e-09   
 житие_святой                   8.252379e-06  1.496534e-05  1.543561e-05   
 органический_синтез            2.422835e-08  1.325375e-08  1.517086e-08   
 переключение                   3.240941e-05  9.225660e-05  9.158411e-07   
 изменение_условие              0.000000e+00  1.437539e-09  1.115844e-09   
 самоидентификация              2.529630e-06  9.904209e-10  7.361529e-07   
 довольно_большой_количество    1.452967e-11  3.377774e-06  9.002301e-06   
 стратегический_взаимодействие  4.042059e-06  8.410272e-09  1.488758e-09   
 многий_страна                  9.468021e-06  7.573032e-06  6.042399e-05   
 дата       

In [13]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=400,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer
                           (name='ss_phi_regularizer', class_ids=['@default_class']))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1000
tmp_model.regularizers['ss_theta_regularizer'].tau = -1
tmp_model.regularizers['ss_phi_regularizer'].tau = -1
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model43')
save_model_pickle('model43', tmp_model)

[2016-12-21 00:56:07.053000] creating model
[2016-12-21 00:56:08.866000] adding scores
[2016-12-21 00:56:08.967000] fitting
[2016-12-21 01:07:22.854000] outputting
name = model43, n_topics = 100, n_doc_passes = 5, seed_value = 400, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -1
decorrelator_phi_regularizer, tau = 1000
ss_phi_regularizer, tau = -1



(                       topic_0   topic_1   topic_2   topic_3   topic_4  \
 определённый_место    0.000000  0.000000  0.000000  0.000000  0.000000   
 переключение          0.000326  0.000000  0.000000  0.000000  0.000000   
 самоидентификация     0.000000  0.000000  0.000000  0.000000  0.000000   
 многий_страна         0.000000  0.000000  0.000000  0.000000  0.000000   
 дата                  0.000000  0.000000  0.000000  0.000000  0.000000   
 закон_физика          0.000046  0.000000  0.000000  0.000000  0.000000   
 период                0.000498  0.000000  0.000000  0.000000  0.013598   
 настроение            0.000000  0.000000  0.000000  0.000000  0.000000   
 америка               0.000544  0.000000  0.000000  0.000000  0.000000   
 вопрос                0.014768  0.000000  0.000000  0.000000  0.003338   
 почка                 0.000000  0.000000  0.000000  0.000000  0.000000   
 химиотерапия          0.000000  0.000000  0.000000  0.000000  0.000000   
 блеск                 0.

In [5]:
def calculate_distances(dist_fun, _phi, _phi_other):
    print '[{}] take_distances between {} columns and {} columns'.format(datetime.now(), len(_phi.columns), len(_phi_other.columns))
    distances = pd.DataFrame(0, index = _phi.columns, columns=_phi_other.columns)
    for idx, col in enumerate(_phi.columns):
        print '[{}] column num {} of {}'.format(datetime.now(), idx, len(_phi.columns))
        for idx_other, col_other in enumerate(_phi_other.columns):
            distance = dist_fun(_phi[col], _phi_other[col_other])
            distances.iloc[idx, idx_other] = distance
    return distances
def get_optimization_result_one_matrix(dist_fn, jac_dist_fn, phi, distances):
    opt_results = {}
    for col_idx, col_name in enumerate(phi.columns):
        print '[{}] get_optimization_result for column {}'.format(datetime.now(), col_idx)
        column = phi[col_name]
        # delete col from phi
        phi_cut = phi.drop(col_name, axis=1)
        opt_results[col_name] = solve_optimization_problem(dist_fn, jac_dist_fn, column, col_name, phi_cut, distances)
    return opt_results
def get_optimization_result(dist_fn, jac_dist_fn, phi, phi_other, distances):
    opt_results = {}
    for col_idx, col_name in enumerate(phi.columns):
        print '[{}] get_optimization_result for column {}'.format(datetime.now(), col_idx)        
        column = phi[col_name]
        opt_results[col_name] = solve_optimization_problem(dist_fn, jac_dist_fn, column, column_name, phi_other, distances)
    return opt_results
def solve_optimization_problem(dist_fn, jac_dist_fn, column, column_name, phi, distances, verbose=False):
    max_iter = 50
    phi_columns = phi.columns
    # cut distances by phi columns 
    cut_distances = distances.loc[phi_columns]
    # get n closest topics
    closest_column_names = cut_distances[column_name].sort_values().head(N_CLOSEST_TOPICS).index.values
    phi_closest = phi[closest_column_names]
    
    # opt solver
    n_columns = phi_closest.shape[1] 
    bnds = [(0, 1)] * n_columns
    constraints = cons = ({'type': 'eq', 'fun': lambda x:  np.sum(x) - 1, 'jac': lambda x: [1] * n_columns})
    opt_fun = lambda x: dist_fn(column, phi_closest.dot(x))
    jac_fun = lambda x: jac_dist_fn(column, phi_closest, x)
    
    is_optimized = False
    it = 0
    while (not is_optimized) and it != 4:
        it += 1
        init_x = np.random.uniform(0, 1, (1, n_columns))
        init_x /= np.sum(init_x)
        if jac_dist_fn is not None:
            res = minimize(opt_fun, jac=jac_fun, x0=init_x, method='SLSQP', bounds=bnds, constraints=cons, options={'maxiter': max_iter, 'disp': verbose})
        else:
            res = minimize(opt_fun, x0=init_x, method='SLSQP', bounds=bnds, constraints=cons, options={'maxiter': max_iter, 'disp': verbose})
        is_optimized = res.success
    if not is_optimized:
        print 'Not optimized' 
    res['column_names'] = phi_closest.columns
    res['optimized_column'] = column_name
    return res

In [8]:
models_file.close()