In [None]:
import sys
import numpy as np
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper

In [None]:
# Shared helper objects used by the functions and cells below.
# `config` is built from 'config.cfg'; later cells read paths from its attributes
# (e.g. config.models_file_name, config.experiment_path).
config = ConfigPaths('config.cfg')
# `plot_maker` is used by fit_one_model to produce plots for a fitted model.
plot_maker = PlotMaker()
# `printer` is used by fit_one_model to write model summaries, scores and top tokens.
printer = PrintHelper()

In [None]:
# Show which file the per-model summaries will be appended to.
print(config.models_file_name)

In [None]:
# Shared log handle, opened in append mode so existing content is kept.
# fit_one_model passes it to printer.print_artm_model; the final cell of the
# notebook closes it with models_file.close().
models_file = open(config.models_file_name, 'a')

In [None]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):
    """Create an ``artm.ARTM`` model and attach the notebook's standard scores.

    The model is created with ``cache_theta=True`` and a fixed set of class-id
    weights: 'ngramm' gets weight 1.0, every other listed class id gets 0.0.

    Args:
        current_dictionary: dictionary passed to ``artm.ARTM`` and used for the
            perplexity score.
        n_topics: value for ``num_topics``.
        n_doc_passes: value assigned to ``model.num_document_passes``.
        seed_value: value for ``seed``.
        n_top_tokens: number of tokens tracked by the top-tokens score.
        p_mass_threshold: probability mass threshold of the topic-kernel score.

    Returns:
        The configured model. It is not fitted here.
    """
    print('[{}] creating model'.format(datetime.now()))
    modality_weights = {'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0,
                        'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                        'following_users': 0.0}
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value,
                      class_ids=modality_weights)
    model.num_document_passes = n_doc_passes
    # Pass the model's own dictionary along so the perplexity score does not
    # silently bind to whatever global `dictionary` happens to exist.
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold,
                        score_dictionary=current_dictionary)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold, score_dictionary=None):
    """Register perplexity, sparsity, topic-kernel and top-tokens scores on a model.

    Args:
        artm_model: model whose ``scores`` collection is extended in place.
        n_top_tokens: ``num_tokens`` for the top-tokens score.
        p_mass_threshold: ``probability_mass_threshold`` for the topic-kernel score.
        score_dictionary: dictionary for the perplexity score. When omitted,
            the module-level ``dictionary`` is used, which matches the previous
            behaviour for callers that pass only three arguments.

    Returns:
        None.
    """
    print('[{}] adding scores'.format(datetime.now()))
    if score_dictionary is None:
        # Backward-compatible fallback: requires the global `dictionary` to be defined.
        score_dictionary = dictionary
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               use_unigram_document_model=False,
                                               dictionary=score_dictionary))
    # The Phi-based scores below are restricted to the 'ngramm' class id.
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm',
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))

In [None]:
def process_one_model(dictionary, _n_topics, _n_doc_passes, _seed_value, _n_top_tokens, _p_mass_threshold, _n_iterations,
                     _model_name=''):
    """Create a model with ``create_model`` and train it with ``fit_one_model``.

    Args:
        dictionary: forwarded to ``create_model`` as ``current_dictionary``.
        _n_topics, _n_doc_passes, _seed_value, _n_top_tokens, _p_mass_threshold:
            forwarded to the same-named (unprefixed) ``create_model`` arguments.
        _n_iterations: forwarded to ``fit_one_model``.
        _model_name: forwarded to ``fit_one_model``; defaults to ''.

    Returns:
        Whatever ``fit_one_model`` returns (the fitted model).
    """
    print('[{}] processing model'.format(datetime.now()))
    fresh_model = create_model(
        current_dictionary=dictionary,
        n_topics=_n_topics,
        n_doc_passes=_n_doc_passes,
        seed_value=_seed_value,
        n_top_tokens=_n_top_tokens,
        p_mass_threshold=_p_mass_threshold)
    return fit_one_model(fresh_model, _n_iterations, _model_name)
    
def fit_one_model(model, _n_iterations, _model_name=''):
    """Fit ``model`` offline and write its reports and plots.

    Relies on the module-level ``batch_vectorizer``, ``printer``, ``plot_maker``,
    ``config`` and ``models_file`` objects.

    Args:
        model: model to train; ``fit_offline`` is called on it.
        _n_iterations: passed as ``num_collection_passes`` and to the printers.
        _model_name: label passed to the printers and used to build the output
            paths under ``config.experiment_path``; defaults to ''.

    Returns:
        The same ``model`` object that was passed in.
    """
    print('[{}] fitting'.format(datetime.now()))
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)

    print('[{}] outputting'.format(datetime.now()))
    # Summary goes to the shared, already-open models file.
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)

    # Plots are written under the experiment directory, named after the model.
    plots_target = path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, plots_target)

    # Scores and top tokens share one '<model name>.txt' report path.
    report_target = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, report_target)
    printer.print_top_tokens(model, report_target)
    return model

In [None]:
def save_model(_model, _model_name):
    """Save the model's 'p_wt' and 'n_wt' matrices under ``config.models_archive_path``.

    Two files are written: ``<archive>/<_model_name>_saved_p_wt`` and
    ``<archive>/<_model_name>_saved_n_wt``.

    Args:
        _model: model exposing ``save(filename=..., model_name=...)``.
        _model_name: base name used for the output files.

    Returns:
        None.
    """
    print('[{}] saving model'.format(datetime.now()))
    archive_base = path.join(config.models_archive_path, _model_name)
    # `model_name` selects WHICH matrix to save and must be exactly 'p_wt' or
    # 'n_wt'. The previous code prefixed it with the notebook's model name
    # (e.g. 'model_3_regp_wt'), which does not name any matrix of the model.
    for matrix_name in ('p_wt', 'n_wt'):
        _model.save(filename=archive_base + '_saved_' + matrix_name, model_name=matrix_name)

In [None]:
# Build batches from the UCI bag-of-words collection; they are written to
# config.output_batches_path.
batch_vectorizer = artm.BatchVectorizer(
    data_path=config.dataset_path,
    data_format='bow_uci',
    collection_name=config.collection_name,
    target_folder=config.output_batches_path)

# Gather a dictionary from those batches together with the vocabulary file.
dictionary = artm.Dictionary()
dictionary.gather(data_path=config.output_batches_path, vocab_file_path=config.vocabulary_path)

# Persist the dictionary in both forms, then read the text form back in.
dictionary_text_path = config.dictionary_path + '.txt'
dictionary.save(dictionary_path=config.dictionary_path)
dictionary.save_text(dictionary_path=dictionary_text_path)
dictionary.load_text(dictionary_path=dictionary_text_path)

In [None]:
# Reuse batches that already exist in config.output_batches_path instead of
# re-parsing the source collection.
batch_vectorizer = artm.BatchVectorizer(data_format='batches',
                                        data_path=config.output_batches_path)

# Load the previously saved binary dictionary ('<dictionary_path>.dict').
saved_dictionary_path = config.dictionary_path + '.dict'
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=saved_dictionary_path)

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [None]:
# simple model without regularizers
# the topics look reasonable; some topics may contain odd outlier words (18)

In [None]:
# Unregularized baseline: 2000 topics, 5 document passes, seed 100.
model_no_reg2000_5 = create_model(
    current_dictionary=dictionary,
    n_topics=2000,
    n_doc_passes=5,
    seed_value=100,
    n_top_tokens=15,
    p_mass_threshold=0.25)

# 25 passes over the collection; reports are written under the model's name.
model_no_reg2000_5 = fit_one_model(
    model=model_no_reg2000_5,
    _n_iterations=25,
    _model_name='model_no_reg2000_5')

In [None]:
# Same unregularized setup as model_no_reg2000_5 but with seed 200.
model_no_reg2000_5_ds = create_model(
    current_dictionary=dictionary,
    n_topics=2000,
    n_doc_passes=5,
    seed_value=200,
    n_top_tokens=15,
    p_mass_threshold=0.25)

# 25 passes over the collection; reports are written under the model's name.
model_no_reg2000_5_ds = fit_one_model(
    model=model_no_reg2000_5_ds,
    _n_iterations=25,
    _model_name='model_no_reg2000_5_ds')

In [None]:
# simple model with sparse regularizers

In [None]:
# Two-stage training: 25 unregularized passes first, then sparsing
# regularizers are switched on for 20 more passes of the same model.
model_sparse_reg_1 = create_model(current_dictionary=dictionary, n_topics=2000, n_doc_passes=5, seed_value=100,
                                  n_top_tokens=15, p_mass_threshold=0.25)
model_sparse_reg_1 = fit_one_model(model_sparse_reg_1, _n_iterations=25, _model_name='model_sparse_reg_1')

# The Phi regularizer is limited to the 'ngramm' class id.
model_sparse_reg_1.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', class_ids=['ngramm']))
# The Theta regularizer takes no `class_ids` argument (Theta is not split by
# modality); passing one raised TypeError, so it is omitted here, as in every
# other cell of this notebook.
model_sparse_reg_1.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_sparse_reg_1.regularizers['sparse_phi_regularizer'].tau = -0.5
model_sparse_reg_1.regularizers['sparse_theta_regularizer'].tau = -0.5

# Second stage reuses the same model name for its reports.
model_sparse_reg_1 = fit_one_model(model_sparse_reg_1, _n_iterations=20, _model_name='model_sparse_reg_1')

In [None]:
# simple model with all regularizers

In [None]:
# 3000 topics with sparse Phi, sparse Theta and Phi decorrelator regularizers;
# none of them is restricted to a class id here.
model_3_reg = create_model(
    current_dictionary=dictionary,
    n_topics=3000,
    n_doc_passes=5,
    seed_value=100,
    n_top_tokens=15,
    p_mass_threshold=0.25)

model_3_reg.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
model_3_reg.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_3_reg.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer'))

# Coefficients are set after registration, in the same order as the adds.
for reg_name, reg_tau in (('sparse_phi_regularizer', -0.5),
                          ('sparse_theta_regularizer', -0.5),
                          ('decorrelator_phi_regularizer', 1e+1)):
    model_3_reg.regularizers[reg_name].tau = reg_tau

model_3_reg = fit_one_model(model=model_3_reg, _n_iterations=80, _model_name='model_3_reg')
# save_model(model_3_reg, 'model_sparse_reg')

In [None]:
# simple model, different seed

In [None]:
# Unregularized model with 500 topics, 15 document passes and seed 1984.
model_no_reg2 = create_model(
    current_dictionary=dictionary,
    n_topics=500,
    n_doc_passes=15,
    seed_value=1984,
    n_top_tokens=15,
    p_mass_threshold=0.25)

# 30 passes over the collection.
model_no_reg2 = fit_one_model(
    model=model_no_reg2,
    _n_iterations=30,
    _model_name='model_no_reg2')
# save_model(model_no_reg2, 'model_no_reg2')

In [None]:
# model with sparse regularizers, different coefficients, more passes over the collection, fewer over documents

In [None]:
# 500 topics with sparse Phi and sparse Theta regularizers (tau = -0.5 each).
model_sparse_reg = create_model(
    current_dictionary=dictionary,
    n_topics=500,
    n_doc_passes=5,
    seed_value=100,
    n_top_tokens=15,
    p_mass_threshold=0.25)

model_sparse_reg.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
model_sparse_reg.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))

# Coefficients are set after registration, in the same order as the adds.
for reg_name, reg_tau in (('sparse_phi_regularizer', -0.5),
                          ('sparse_theta_regularizer', -0.5)):
    model_sparse_reg.regularizers[reg_name].tau = reg_tau

# 100 passes over the collection.
model_sparse_reg = fit_one_model(model=model_sparse_reg, _n_iterations=100, _model_name='model_sparse_reg')

In [None]:
# model with sparse regularizers, 100 topics

In [None]:
# 100-topic variant of the sparse-regularized model.
model_sparse_reg = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                                n_top_tokens=15, p_mass_threshold=0.25)
model_sparse_reg.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer'))
model_sparse_reg.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_sparse_reg.regularizers['sparse_phi_regularizer'].tau = -0.5
model_sparse_reg.regularizers['sparse_theta_regularizer'].tau = -0.5
# Use a model name distinct from the 500-topic run above. Both runs used
# 'model_sparse_reg', so fit_one_model built the same plot/report paths for
# them and logged them under an identical label.
model_sparse_reg = fit_one_model(model_sparse_reg, _n_iterations=80, _model_name='model_sparse_reg_100')

In [None]:
# decorrelator

In [None]:
# 1000 topics, 10 document passes; decorrelator tau = 1e+4.
model_4_reg = create_model(
    current_dictionary=dictionary,
    n_topics=1000,
    n_doc_passes=10,
    seed_value=100,
    n_top_tokens=15,
    p_mass_threshold=0.25)

# The Phi regularizers are limited to the 'ngramm' class id.
model_4_reg.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', class_ids=['ngramm']))
model_4_reg.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_4_reg.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))

# Coefficients are set after registration, in the same order as the adds.
for reg_name, reg_tau in (('sparse_phi_regularizer', -0.5),
                          ('sparse_theta_regularizer', -0.5),
                          ('decorrelator_phi_regularizer', 1e+4)):
    model_4_reg.regularizers[reg_name].tau = reg_tau

# 150 passes over the collection.
model_4_reg = fit_one_model(model=model_4_reg, _n_iterations=150, _model_name='model_4_reg')
# NOTE(review): the disabled call below still refers to model_3_reg, not model_4_reg.
# save_model(model_3_reg, 'model_sparse_reg')

In [None]:
# 1000 topics, 10 document passes; decorrelator tau = 1e+3.
model_5_reg = create_model(
    current_dictionary=dictionary,
    n_topics=1000,
    n_doc_passes=10,
    seed_value=100,
    n_top_tokens=15,
    p_mass_threshold=0.25)

# The Phi regularizers are limited to the 'ngramm' class id.
model_5_reg.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', class_ids=['ngramm']))
model_5_reg.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_5_reg.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))

# Coefficients are set after registration, in the same order as the adds.
for reg_name, reg_tau in (('sparse_phi_regularizer', -0.5),
                          ('sparse_theta_regularizer', -0.5),
                          ('decorrelator_phi_regularizer', 1e+3)):
    model_5_reg.regularizers[reg_name].tau = reg_tau

# 100 passes over the collection.
model_5_reg = fit_one_model(model=model_5_reg, _n_iterations=100, _model_name='model_5_reg')

In [None]:
# 1000 topics, 5 document passes; decorrelator tau = 1e+2.
model_6_reg = create_model(
    current_dictionary=dictionary,
    n_topics=1000,
    n_doc_passes=5,
    seed_value=100,
    n_top_tokens=15,
    p_mass_threshold=0.25)

# The Phi regularizers are limited to the 'ngramm' class id.
model_6_reg.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', class_ids=['ngramm']))
model_6_reg.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_6_reg.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))

# Coefficients are set after registration, in the same order as the adds.
for reg_name, reg_tau in (('sparse_phi_regularizer', -0.5),
                          ('sparse_theta_regularizer', -0.5),
                          ('decorrelator_phi_regularizer', 1e+2)):
    model_6_reg.regularizers[reg_name].tau = reg_tau

# 100 passes over the collection.
model_6_reg = fit_one_model(model=model_6_reg, _n_iterations=100, _model_name='model_6_reg')

In [None]:
# 1000 topics, 5 document passes; decorrelator tau = 1e+1.
model_7_reg = create_model(
    current_dictionary=dictionary,
    n_topics=1000,
    n_doc_passes=5,
    seed_value=100,
    n_top_tokens=15,
    p_mass_threshold=0.25)

# The Phi regularizers are limited to the 'ngramm' class id.
model_7_reg.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', class_ids=['ngramm']))
model_7_reg.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_7_reg.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))

# Coefficients are set after registration, in the same order as the adds.
for reg_name, reg_tau in (('sparse_phi_regularizer', -0.5),
                          ('sparse_theta_regularizer', -0.5),
                          ('decorrelator_phi_regularizer', 1e+1)):
    model_7_reg.regularizers[reg_name].tau = reg_tau

# 100 passes over the collection.
model_7_reg = fit_one_model(model=model_7_reg, _n_iterations=100, _model_name='model_7_reg')

In [None]:
# 1000 topics, 5 document passes; decorrelator tau = 1e-1.
model_8_reg = create_model(
    current_dictionary=dictionary,
    n_topics=1000,
    n_doc_passes=5,
    seed_value=100,
    n_top_tokens=15,
    p_mass_threshold=0.25)

# The Phi regularizers are limited to the 'ngramm' class id.
model_8_reg.regularizers.add(
    artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer', class_ids=['ngramm']))
model_8_reg.regularizers.add(
    artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_8_reg.regularizers.add(
    artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))

# Coefficients are set after registration, in the same order as the adds.
for reg_name, reg_tau in (('sparse_phi_regularizer', -0.5),
                          ('sparse_theta_regularizer', -0.5),
                          ('decorrelator_phi_regularizer', 1e-1)):
    model_8_reg.regularizers[reg_name].tau = reg_tau

# 100 passes over the collection.
model_8_reg = fit_one_model(model=model_8_reg, _n_iterations=100, _model_name='model_8_reg')

In [None]:
# 1000 topics, 5 document passes; decorrelator tau = 1e-3.
model_9_reg = create_model(current_dictionary=dictionary, n_topics=1000, n_doc_passes=5, seed_value=100,
                           n_top_tokens=15, p_mass_threshold=0.25)
# The Phi regularizers are limited to the 'ngramm' class id.
model_9_reg.regularizers.add(artm.SmoothSparsePhiRegularizer(name='sparse_phi_regularizer',class_ids=['ngramm']))
model_9_reg.regularizers.add(artm.SmoothSparseThetaRegularizer(name='sparse_theta_regularizer'))
model_9_reg.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer',class_ids=['ngramm']))
model_9_reg.regularizers['sparse_phi_regularizer'].tau = -0.5
model_9_reg.regularizers['sparse_theta_regularizer'].tau = -0.5
model_9_reg.regularizers['decorrelator_phi_regularizer'].tau = 1e-3
# The model name now follows the 'model_N_reg' pattern used by the sibling
# runs (it was '9_reg'), so its reports and log entry sort and read
# consistently with model_4_reg ... model_8_reg.
model_9_reg = fit_one_model(model_9_reg, _n_iterations=100, _model_name='model_9_reg')

In [None]:
# Close the shared log handle that was opened in append mode near the top of
# the notebook; fit_one_model must not be called after this point without
# reopening it, because it passes models_file to the printer.
models_file.close()