In [1]:
import sys
import codecs
import numpy as np
import pandas as pd
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull
from scipy.optimize import minimize

0.8.1


In [2]:
config = ConfigPaths('config.cfg')
plot_maker = PlotMaker()
printer = PrintHelper()

In [4]:
print config.models_file_name

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\03_12_comp\models.txt


In [5]:
models_file = open(config.models_file_name, 'a')

In [6]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):    
    print '[{}] creating model'.format(datetime.now())
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value, 
                  class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0, 
                             'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                             'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold):
    print '[{}] adding scores'.format(datetime.now())
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                      use_unigram_document_model=False,
                                      dictionary=dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm', 
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''): 
    print '[{}] fitting'.format(datetime.now())
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)
    print '[{}] outputting'.format(datetime.now())
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)
    model_pics_file_name =  path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, model_pics_file_name)
    model_output_file_name = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, model_output_file_name)
    printer.print_top_tokens(model, model_output_file_name)
    return model

In [6]:
def save_model(_model, _model_name): 
    print '[{}] saving model'.format(datetime.now())
    model_output_file_name = path.join(config.models_archive_path, _model_name)
    _model.save(filename=model_output_file_name+'_saved_p_wt', model_name=_model_name+'p_wt')
    _model.save(filename=model_output_file_name+'_saved_n_wt', model_name=_model_name+'n_wt')

In [None]:
batch_vectorizer = artm.BatchVectorizer(data_path=config.dataset_path,
                                        data_format='bow_uci',
                                        collection_name=config.collection_name,
                                        target_folder=config.output_batches_path)
dictionary = artm.Dictionary()
dictionary.gather(data_path=config.output_batches_path,
                  vocab_file_path=config.vocabulary_path)
dictionary.save(dictionary_path=config.dictionary_path)
dictionary.save_text(dictionary_path=config.dictionary_path + '.txt')
dictionary.load_text(dictionary_path=config.dictionary_path + '.txt')

In [7]:
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [42]:
len(dictionary)

TypeError: object of type 'Dictionary' has no len()

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [8]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.01
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model1')
model1 = tmp_model; tmp_model = None

[2016-12-04 12:42:26.456000] creating model
[2016-12-04 12:42:28.436000] adding scores
[2016-12-04 12:42:28.453000] fitting
[2016-12-04 12:43:10.906000] outputting
name = model1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 100



In [9]:
phi1 = model1.get_phi()
theta1 = model1.get_theta()

In [48]:
def create_sample_dataset_matrix(_phi, _theta):
    _phi = _phi[(_phi.T != 0).any()]
    return _phi.dot(_theta)

In [49]:
nw = create_sample_dataset_matrix(phi1, theta1)

In [80]:
values = nw.iloc[:, nw.columns[448]]
q = values.iloc[np.where(values != 0)]

In [87]:
np.sum(nw, 0)

3001    0.999996
3002    1.000004
3003    0.999996
3004    1.000002
3005    0.999997
3006    0.999997
3007    1.000005
3008    0.999986
3009    1.000001
3010    1.000003
3011    1.000001
3012    0.999998
3013    0.999997
3014    0.999999
3015    1.000002
3016    1.000003
3017    0.999988
3018    0.999997
3019    1.000001
3020    0.999994
3021    1.000003
3022    0.999997
3023    0.999995
3024    0.999997
3025    0.999993
3026    1.000003
3027    1.000000
3028    0.999998
3029    0.999998
3030    0.999998
          ...   
971     1.000001
972     0.999992
973     1.000002
974     0.999992
975     0.999997
976     0.999990
977     0.999998
978     0.999999
979     0.999997
980     1.000000
981     0.999985
982     0.999997
983     0.999982
984     0.999999
985     1.000003
986     0.999995
987     0.999988
988     0.999992
989     0.999997
990     0.999994
991     1.000000
992     0.999991
993     0.999999
994     1.000001
995     0.999985
996     1.000000
997     0.999994
998     0.9999

In [71]:
print len(np.unique(nw.index)), nw.shape

17492 (17492, 3446)


In [74]:
def convert_to_vw(_nw, out_file):
    with codecs.open(out_file, 'w', 'utf-8') as fout:
        n = 0
        for col in _nw.columns:
            n += 1
            print '[{}] processing column no {} of {}'.format(datetime.now(), n, len(_nw.columns))
            fout.write(u'doc_{} |@default_class '.format(col))
            values = _nw.iloc[:, col]
            for idx in values.index.values:
                val = values[idx]
                if val != 0:
                    fout.write(u'{}:{} '.format(idx, val))
            fout.write(u'\n')

In [75]:
# output_batches_path = path.join(config.home_dir, '..\\data\postnauka\\bigARTM_files', 'sample_model1')
output_vw_path = path.join(config.home_dir, '..\\data\postnauka\\UCI_collections', 'sample_model1')
if not path.exists(output_vw_path):
    mkdir(output_vw_path)
convert_to_vw(nw, path.join(output_vw_path, 'model1.vw'))

[2016-12-04 13:09:15.136000] processing column no 1 of 3446
[2016-12-04 13:09:16.023000] processing column no 2 of 3446
[2016-12-04 13:09:16.881000] processing column no 3 of 3446
[2016-12-04 13:09:17.686000] processing column no 4 of 3446
[2016-12-04 13:09:18.516000] processing column no 5 of 3446
[2016-12-04 13:09:19.334000] processing column no 6 of 3446
[2016-12-04 13:09:20.104000] processing column no 7 of 3446
[2016-12-04 13:09:20.949000] processing column no 8 of 3446
[2016-12-04 13:09:21.767000] processing column no 9 of 3446
[2016-12-04 13:09:22.678000] processing column no 10 of 3446
[2016-12-04 13:09:23.679000] processing column no 11 of 3446
[2016-12-04 13:09:24.649000] processing column no 12 of 3446
[2016-12-04 13:09:25.638000] processing column no 13 of 3446
[2016-12-04 13:09:26.498000] processing column no 14 of 3446
[2016-12-04 13:09:27.287000] processing column no 15 of 3446
[2016-12-04 13:09:28.179000] processing column no 16 of 3446
[2016-12-04 13:09:29.121000] proc

IndexError: single positional indexer is out-of-bounds

In [25]:
models_file.close()