In [1]:
import sys
import numpy as np
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
sys.path.insert(0, '..\\modules\\completeness_solution')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper
from artm_experiments import Experiment, Pool, OptimizationTopicsFilter,  GreedyTopicsFilter, GreedyTopicsRanker, ConvexHullTopicsFilter

0.8.1


In [2]:
# Shared helper objects used by the cells below.
# ConfigPaths is built from 'config.cfg'; later cells read dataset, batch,
# dictionary and experiment output locations from its attributes.
config = ConfigPaths('config.cfg')
# Used by fit_one_model (make_tm_plots) for every fitted model.
plot_maker = PlotMaker()
# Used by fit_one_model (print_artm_model, print_scores, print_top_tokens).
printer = PrintHelper()

In [3]:
# Show which file the next cell opens for appending. The parenthesised
# single-argument form prints the same text under Python 2 and Python 3.
print(config.models_file_name)

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\03_12_comp\models.txt


In [4]:
# Open the models log in append mode; fit_one_model passes this handle to
# printer.print_artm_model as output_file. The handle is only closed by the
# final cell (models_file.close()), so it stays open if an earlier cell raises.
models_file = open(config.models_file_name, 'a')

In [5]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):
    """Build an artm.ARTM model and attach the notebook's set of scores to it.

    current_dictionary: dictionary handed to artm.ARTM and, through
        add_scores_to_model, to the perplexity score.
    n_topics: value for artm.ARTM's num_topics.
    n_doc_passes: assigned to model.num_document_passes.
    seed_value: value for artm.ARTM's seed.
    n_top_tokens: num_tokens of the top-tokens score.
    p_mass_threshold: probability_mass_threshold of the topic-kernel score.

    Returns the configured (not yet fitted) model.
    """
    print('[{}] creating model'.format(datetime.now()))
    # Only the 'ngramm' modality gets a non-zero weight (1.0); every other
    # listed modality is weighted 0.0.
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value,
                      class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0,
                                 'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                                 'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    # Pass the dictionary explicitly so the perplexity score uses the same
    # dictionary as the model rather than whatever the global currently holds.
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold,
                        score_dictionary=current_dictionary)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold, score_dictionary=None):
    """Register perplexity, sparsity, topic-kernel and top-tokens scores on artm_model.

    artm_model: model whose ``scores`` collection receives the new scores.
    n_top_tokens: num_tokens of 'top_tokens_score'.
    p_mass_threshold: probability_mass_threshold of 'topic_kernel_score'.
    score_dictionary: dictionary for 'perplexity_score'. When omitted, the
        notebook-level ``dictionary`` variable is used, which is what this
        function always did before the parameter existed.
    """
    print('[{}] adding scores'.format(datetime.now()))
    if score_dictionary is None:
        # Backward-compatible fallback to the global defined in a later cell.
        score_dictionary = dictionary
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               use_unigram_document_model=False,
                                               dictionary=score_dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm',
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''):
    """Fit ``model`` offline, then write its description, plots, scores and top tokens.

    model: object exposing fit_offline(batch_vectorizer=..., num_collection_passes=...).
    _n_iterations: number of collection passes; also forwarded to the printers.
    _model_name: name used in the printed description and as the base name of
        the output files under config.experiment_path.

    Relies on the notebook-level ``batch_vectorizer``, ``printer``,
    ``plot_maker``, ``config`` and ``models_file`` objects.
    Returns the same ``model`` object.
    """
    # Single-argument print(...) prints the same text under Python 2 and 3.
    print('[{}] fitting'.format(datetime.now()))
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)
    print('[{}] outputting'.format(datetime.now()))
    # The model description goes to the shared, already opened models log.
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)
    # Plots get the bare model name as their base path ...
    model_pics_file_name = path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, model_pics_file_name)
    # ... scores and top tokens share one '<model name>.txt' file.
    model_output_file_name = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, model_output_file_name)
    printer.print_top_tokens(model, model_output_file_name)
    return model

In [None]:
# Preparation cell (it carries no execution count in this saved notebook):
# convert the 'bow_uci' collection at config.dataset_path into batches written
# to config.output_batches_path.
batch_vectorizer = artm.BatchVectorizer(data_path=config.dataset_path,
                                        data_format='bow_uci',
                                        collection_name=config.collection_name,
                                        target_folder=config.output_batches_path)
# Gather a dictionary from those batches together with the vocabulary file.
dictionary = artm.Dictionary()
dictionary.gather(data_path=config.output_batches_path,
                  vocab_file_path=config.vocabulary_path)
# Persist the dictionary with save() and, under the same path plus '.txt',
# with save_text().
dictionary.save(dictionary_path=config.dictionary_path)
dictionary.save_text(dictionary_path=config.dictionary_path + '.txt')
# Load back the text file that was just written.
dictionary.load_text(dictionary_path=config.dictionary_path + '.txt')

In [6]:
# Read the batches already stored under config.output_batches_path
# (data_format='batches') rather than the raw collection.
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
# Load the dictionary from '<config.dictionary_path>.dict'. This is the global
# `dictionary` that process_one_model hands to create_model.
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [7]:
def display_points(phi):
    """Scatter-plot a 2-D projection of ``phi``, one labelled point per column.

    phi: table exposing ``values`` and ``columns`` (the commented-out call in
        this notebook passes model_artm.get_phi()). The transposed matrix is
        given to ConvexHullTopicsFilter.project_points with dim=2 and every
        resulting point is annotated with the matching column label.
    """
    # Imported here because no cell of this notebook binds the name `plt`
    # (`%matplotlib inline` only selects the backend).
    import matplotlib.pyplot as plt

    # `.values` replaces `as_matrix()`, which newer pandas releases no longer provide.
    points = ConvexHullTopicsFilter.project_points(phi.values.T, dim=2)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(points[:, 0], points[:, 1], marker='o', ls='')
    # NOTE(review): no xytext is passed together with textcoords='offset points';
    # confirm the resulting label placement is the intended one.
    for i, column_label in enumerate(phi.columns):
        ax.annotate(column_label, xy=points[i], textcoords='offset points')

    plt.show()
    
def save_hist(vals, bins, filename):
    """Write a histogram to ``filename`` as two comma-separated columns.

    vals: one value per bin (length k).
    bins: the k + 1 bin edges; numpy arrays and plain sequences are accepted.
    filename: target handed to np.savetxt.

    Each output row is ``bin centre, value``, where the centre is the midpoint
    of two neighbouring edges.
    """
    # Coerce first: with a plain list, `+` would concatenate instead of adding.
    edges = np.asarray(bins, dtype=float)
    centers = (edges[:-1] + edges[1:]) / 2.0
    np.savetxt(filename, np.vstack((centers, vals)).T, delimiter=',')

In [8]:
def process_one_model(name, seed):
    """Create, regularize and fit one 2000-topic model and return the fitted result.

    name: forwarded to fit_one_model as the model name.
    seed: forwarded to create_model as seed_value.

    The model is built from the notebook-level ``dictionary`` with 5 document
    passes, gets a phi decorrelator (tau 100, 'ngramm' only) and a theta
    smooth/sparse regularizer (tau -0.01), and is fitted for 15 iterations.
    """
    model = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5,
                         seed_value=seed, n_top_tokens=15, p_mass_threshold=0.25)

    # Register both regularizers first ...
    model.regularizers.add(
        artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
    model.regularizers.add(
        artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))

    # ... then assign their coefficients by name.
    for regularizer_name, tau_value in (('sparse_theta_regularizer', -0.01),
                                        ('decorrelator_phi_regularizer', 100)):
        model.regularizers[regularizer_name].tau = tau_value

    return fit_one_model(model, _n_iterations=15, _model_name=name)

In [9]:
# The experiment object and the model list are (re)created here so the cell
# runs on a fresh kernel; with these lines commented out the loop below fails
# with a NameError on `exp_models` / `exp`.
exp = Experiment(Pool(topics_filter=OptimizationTopicsFilter(eps=10 ** (-2.5), verbose=False),
                      save_topics=True))
exp_models = []
# range(2) iterates identically under Python 2 and 3.
for i in range(2):
    # The seed is drawn at random from [0, 1984) for every model.
    model_artm = process_one_model(name='model_{}'.format(i), seed=np.random.randint(0, 1984, 1)[0])
    exp_models.append(model_artm)
#     display_points(model_artm.get_phi())
    print('[{}] collecting topics'.format(datetime.now()))
    # Feed this model's phi and theta into the experiment.
    exp.collect_topics(model_artm.get_phi(), model_artm.get_theta())
    # Dump the (values, bins) pair returned by the filter's plot_hist to CSV.
    vals, bins = exp.topics_pool.topics_filter.plot_hist()
    save_hist(vals, bins, "data_iter_{}.csv".format(i))
    print(exp.topics_pool.get_basic_topics_count())

[2016-12-03 19:06:04.175000] creating model
[2016-12-03 19:06:09.656000] adding scores
[2016-12-03 19:06:09.737000] fitting
[2016-12-03 19:17:54.012000] outputting
name = model_0, n_topics = 2000, n_doc_passes = 5, seed_value = 548, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
sparse_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 100



NameError: global name 'ConvexHullTopicsFilter' is not defined

In [None]:
# Manual repeat of the statements that follow process_one_model inside the
# training loop. It relies on `model_artm`, `i`, `exp` and `exp_models` still
# being defined by that earlier cell.
exp_models.append(model_artm)
#     display_points(model_artm.get_phi())
exp.collect_topics(model_artm.get_phi(), model_artm.get_theta())
vals, bins = exp.topics_pool.topics_filter.plot_hist()
save_hist(vals, bins, "data_iter_{}.csv".format(i))
print exp.topics_pool.get_basic_topics_count()

In [None]:
# NOTE(review): `hea` looks like an unfinished attribute name; nothing else in
# this notebook reads such an attribute from topics_pool. Confirm the intended one.
print exp.topics_pool.hea

In [None]:
# No cell of this notebook defines a global `model`; the fitted model left
# behind by the training loop is `model_artm`, so phi/theta are read from it.
phi = model_artm.get_phi()
th = model_artm.get_theta()

In [None]:
# Column labels of phi (the labels display_points annotates its points with).
phi.columns

In [None]:
# Display every topic held by the experiment, with sort_by_closest_topic enabled.
exp.show_all_topics(sort_by_closest_topic=True)

In [None]:
# Close the models log handle that was opened in append mode near the top of
# the notebook.
models_file.close()