In [1]:
import sys
import numpy as np
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
# Make the project's helper modules importable. The relative path is built with
# os.path.join so the notebook is not tied to the Windows path separator.
sys.path.insert(0, path.join('..', 'modules', 'helpers'))
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull

0.8.1


In [2]:
# Notebook-wide helper objects (classes come from ..\modules\helpers).
# `config` is built from 'config.cfg'; the cells below read every file and folder location from it.
config = ConfigPaths('config.cfg')
# `plot_maker` and `printer` are used by fit_one_model to write plots and text reports.
plot_maker = PlotMaker()
printer = PrintHelper()

In [3]:
# Show which file the model descriptions will be appended to.
print(config.models_file_name)

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\03_12_comp\models.txt


In [4]:
# Opened in append mode, so earlier content of the file is kept.
# The handle is shared with fit_one_model and stays open until the final
# models_file.close() cell is run.
models_file = open(config.models_file_name, 'a')

In [5]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):
    """Build an ARTM model with the notebook's standard set of scores attached.

    Only the 'ngramm' modality gets a non-zero weight (1.0); every other listed
    modality is weighted 0.0.

    Args:
        current_dictionary: dictionary passed to artm.ARTM and to the perplexity score.
        n_topics: number of topics.
        n_doc_passes: value assigned to model.num_document_passes.
        seed_value: seed passed to artm.ARTM.
        n_top_tokens: number of tokens tracked by the top-tokens score.
        p_mass_threshold: probability mass threshold of the topic-kernel score.

    Returns:
        The configured artm.ARTM instance (fitting is done separately).
    """
    print('[{}] creating model'.format(datetime.now()))
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value,
                      class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0,
                                 'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                                 'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    # Score the model against the same dictionary it was initialised with,
    # not against whatever the notebook-level `dictionary` happens to be.
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold,
                        score_dictionary=current_dictionary)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold, score_dictionary=None):
    """Attach perplexity, sparsity, topic-kernel and top-tokens scores to a model.

    Args:
        artm_model: model whose `scores` collection receives the new scores.
        n_top_tokens: number of tokens tracked by 'top_tokens_score'.
        p_mass_threshold: probability mass threshold of 'topic_kernel_score'.
        score_dictionary: dictionary used by 'perplexity_score'. When omitted,
            the notebook-level `dictionary` variable is used, which keeps
            existing three-argument calls working.
    """
    print('[{}] adding scores'.format(datetime.now()))
    if score_dictionary is None:
        score_dictionary = dictionary
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               use_unigram_document_model=False,
                                               dictionary=score_dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm',
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))


def fit_one_model(model, _n_iterations, _model_name=''):
    """Fit `model` offline and write its description, plots, scores and top tokens.

    Relies on the notebook-level `batch_vectorizer`, `printer`, `plot_maker`,
    `models_file` and `config` objects.

    Args:
        model: the model to fit.
        _n_iterations: number of collection passes for fit_offline.
        _model_name: name used in the reports and as the output file stem
            inside config.experiment_path.

    Returns:
        The same `model` object after fitting.
    """
    print('[{}] fitting'.format(datetime.now()))
    model.fit_offline(num_collection_passes=_n_iterations, batch_vectorizer=batch_vectorizer)

    print('[{}] outputting'.format(datetime.now()))
    # Model description goes to the shared, already opened models file.
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)

    # Plots are written under the experiment folder using the bare model name.
    plots_target = path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, plots_target)

    # Scores and top tokens share one '<model name>.txt' file.
    report_target = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, report_target)
    printer.print_top_tokens(model, report_target)
    return model

In [6]:
def save_model(_model, _model_name):
    """Save the model's p_wt and n_wt matrices into config.models_archive_path.

    Two files are written: '<_model_name>_saved_p_wt' and '<_model_name>_saved_n_wt'.

    Args:
        _model: model exposing save(filename=..., model_name=...).
        _model_name: name used as the stem of both output files.
    """
    print('[{}] saving model'.format(datetime.now()))
    model_output_file_name = path.join(config.models_archive_path, _model_name)
    # `model_name` selects WHICH matrix is saved ('p_wt' or 'n_wt'); it is not a
    # label for the saved file, so it must not be prefixed with the model's name.
    _model.save(filename=model_output_file_name + '_saved_p_wt', model_name='p_wt')
    _model.save(filename=model_output_file_name + '_saved_n_wt', model_name='n_wt')

In [None]:
# One-off preparation: read the collection in UCI bag-of-words format from
# config.dataset_path and write batches into config.output_batches_path.
# NOTE(review): this cell has no execution count in the saved notebook; the next
# cell loads the already prepared batches and dictionary instead.
batch_vectorizer = artm.BatchVectorizer(data_path=config.dataset_path,
                                        data_format='bow_uci',
                                        collection_name=config.collection_name,
                                        target_folder=config.output_batches_path)
# Gather a dictionary from the batches and the vocabulary file, then store it
# both in binary form and as text.
dictionary = artm.Dictionary()
dictionary.gather(data_path=config.output_batches_path,
                  vocab_file_path=config.vocabulary_path)
dictionary.save(dictionary_path=config.dictionary_path)
dictionary.save_text(dictionary_path=config.dictionary_path + '.txt')
# Reloads the text file that was just written.
# NOTE(review): looks redundant unless the .txt is edited by hand in between - confirm.
dictionary.load_text(dictionary_path=config.dictionary_path + '.txt')

In [7]:
# Reuse batches that already exist in config.output_batches_path.
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
# Load the stored dictionary from '<config.dictionary_path>.dict'.
# `batch_vectorizer` and `dictionary` are read as globals by fit_one_model and
# add_scores_to_model, so this cell must run before any model is created.
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [8]:
# Experiment 'model_decor_sparse_t_reg_21': 100 topics, Phi decorrelation on the
# 'ngramm' modality (tau = 100) plus a Theta smooth/sparse regularizer (tau = -0.01).
tmp_model = create_model(
    current_dictionary=dictionary,
    n_topics=100,
    n_doc_passes=5,
    seed_value=100,
    n_top_tokens=15,
    p_mass_threshold=0.25,
)

# Register both regularizers first, then set their coefficients by name.
tmp_model.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.01

# Fit, keep the result under its experiment name and release the temporary name.
model_decor_sparse_t_reg_21 = fit_one_model(
    tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_21')
tmp_model = None

[2016-12-03 23:04:49.610000] creating model
[2016-12-03 23:04:50.866000] adding scores
[2016-12-03 23:04:50.874000] fitting
[2016-12-03 23:05:13.166000] outputting
name = model_decor_sparse_t_reg_21, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 100



In [9]:
# Select the fitted model that the analysis cells below work with.
model = model_decor_sparse_t_reg_21

In [10]:
# Phi matrix of the selected model: one row per token, one column per topic
# (see the printed frame below).
phi = model.get_phi()
# Transposed view: one row per topic, one column per token.
phi_t = phi.transpose()
print(phi)

                           topic_0       topic_1       topic_2       topic_3  \
книги                 0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
fuchs                 0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
preobrazhensky        0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
tabachnikov           0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
автограф              6.728774e-14  5.483567e-15  0.000000e+00  1.812173e-09   
автор                 4.452065e-04  9.768548e-04  5.983442e-04  1.087942e-02   
кейс                  1.459912e-10  8.389613e-08  0.000000e+00  1.225691e-12   
математика            1.099461e-08  1.533901e-06  1.458226e-13  8.079139e-04   
образование           1.169069e-04  1.658698e-04  1.550982e-03  5.102773e-04   
красота               4.338063e-11  1.088937e-13  1.871415e-05  5.865199e-05   
книга                 1.224739e-03  3.290065e-04  4.612368e-14  1.705278e-03   
университетский_курс  0.000000e+00  0.00

In [11]:
# Johnson-Lindenstrauss bound: minimum number of dimensions for 100 samples at
# eps=0.1. 100 matches the number of topics of the model above - presumably the
# topics are the samples here; TODO confirm.
from sklearn.random_projection import johnson_lindenstrauss_min_dim
johnson_lindenstrauss_min_dim(n_samples=100, eps=0.1)

3947

In [None]:
# Project the rows of phi_t (one row per topic) onto 20 principal components.
points = phi_t
print('[{}] PCA fitting started '.format(datetime.now()))
# The fitted PCA gets its own name so that the ARTM `model` defined earlier is not
# overwritten; cells that call model.get_phi() stay re-runnable after this one.
pca = PCA(n_components=20).fit(points)
print('[{}] PCA fitting finished '.format(datetime.now()))
proj_vertices = pca.transform(points)
print('[{}] PCA transforming finished '.format(datetime.now()))

In [None]:
# Sanity check of the projection size; the second dimension should be 20 (n_components).
proj_vertices.shape

In [None]:
# Convex hull of the PCA-projected points.
# NOTE(review): the projection has 20 dimensions; hull computation can become very
# expensive as dimensionality grows - confirm this finishes in reasonable time.
hull = ConvexHull(proj_vertices)
# print hull.vertices

In [None]:
# Rebuilds `hull` from the raw phi matrix, replacing the hull of the projection
# computed in the previous cell.
# NOTE(review): phi has one row per token and one column per topic, so this treats
# tokens as points in topic space - confirm whether phi_t or proj_vertices was intended.
hull = ConvexHull(phi)
# Index arrays describing the hull facets and the points that are hull vertices.
simplices = hull.simplices
vertices = hull.vertices

In [None]:
# Coordinates of the points that form each hull facet. A ConvexHull object cannot be
# indexed directly; the input coordinates are exposed as `hull.points`.
hull.points[simplices, :]

In [None]:
# Close the models file opened near the top of the notebook; until this cell runs
# the handle stays open and later writes to it may not be flushed to disk.
models_file.close()