In [1]:
import sys
import codecs
import numpy as np
import pandas as pd
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull
from scipy.optimize import minimize

0.8.1


In [2]:
config = ConfigPaths('config.cfg')
plot_maker = PlotMaker()
printer = PrintHelper()

In [3]:
print config.models_file_name

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\04_12_num_topics\models.txt


In [4]:
models_file = open(config.models_file_name, 'a')

In [5]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):    
    print '[{}] creating model'.format(datetime.now())
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value, 
                  class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0, 
                             'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                             'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold):
    print '[{}] adding scores'.format(datetime.now())
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                      use_unigram_document_model=False,
                                      dictionary=dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm', 
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''): 
    print '[{}] fitting'.format(datetime.now())
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)
    print '[{}] outputting'.format(datetime.now())
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)
    model_pics_file_name =  path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, model_pics_file_name)
    model_output_file_name = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, model_output_file_name)
    printer.print_top_tokens(model, model_output_file_name)
    return model

In [None]:
# batch_vectorizer = artm.BatchVectorizer(data_path=config.dataset_path,
#                                         data_format='bow_uci',
#                                         collection_name=config.collection_name,
#                                         target_folder=config.output_batches_path)
# dictionary = artm.Dictionary()
# dictionary.gather(data_path=config.output_batches_path,
#                   vocab_file_path=config.vocabulary_path)
# dictionary.save(dictionary_path=config.dictionary_path)
# dictionary.save_text(dictionary_path=config.dictionary_path + '.txt')
# dictionary.load_text(dictionary_path=config.dictionary_path + '.txt')

In [6]:
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [7]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.01
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model1')
model1 = tmp_model; tmp_model = None

[2016-12-08 12:23:27.495000] creating model
[2016-12-08 12:23:28.714000] adding scores
[2016-12-08 12:23:28.723000] fitting
[2016-12-08 12:23:51.137000] outputting
name = model1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 100



In [8]:
phi1 = model1.get_phi()
theta1 = model1.get_theta()

In [9]:
def create_sample_dataset_matrix(_phi, _theta):
    _phi = _phi[(_phi.T != 0).any()]
    return _phi.dot(_theta)

In [10]:
nw = create_sample_dataset_matrix(phi1, theta1)

In [11]:
np.sum(nw, 0)

2001    0.999997
2002    0.999991
2003    0.999996
2004    0.999998
2005    1.000001
2006    1.000001
2007    0.999995
2008    0.999990
2009    0.999997
2010    0.999996
2011    0.999998
2012    1.000008
2013    0.999995
2014    1.000002
2015    0.999998
2016    1.000004
2017    0.999988
2018    1.000001
2019    0.999998
2020    1.000005
2021    0.999993
2022    1.000002
2023    0.999994
2024    0.999999
2025    0.999996
2026    1.000006
2027    0.999997
2028    0.999992
2029    0.999996
2030    1.000001
          ...   
1971    1.000000
1972    0.999993
1973    1.000003
1974    0.999996
1975    0.999997
1976    0.999999
1977    1.000001
1978    0.999994
1979    1.000000
1980    0.999994
1981    1.000006
1982    0.999990
1983    0.999999
1984    1.000002
1985    0.999998
1986    0.999996
1987    0.999991
1988    1.000000
1989    0.999999
1990    1.000003
1991    0.999999
1992    0.999995
1993    0.999995
1994    1.000002
1995    1.000001
1996    0.999987
1997    0.999997
1998    1.0000

In [40]:
def convert_to_vw(_nw, out_file):
    with codecs.open(out_file, 'w', 'utf-8') as fout:
        for idx, col in enumerate(_nw.columns):
            print '[{}] processing column no {} of {}'.format(datetime.now(), idx, len(_nw.columns))
            fout.write(u'doc_{} |@default_class '.format(col))
            values = _nw[col]
            for idx in values.index.values:
                val = values[idx]
                if val != 0:
                    fout.write(u'{}:{} '.format(idx, val))
            fout.write(u'\n')

In [13]:
# output_batches_path = path.join(config.home_dir, '..\\data\postnauka\\bigARTM_files', 'sample_model1')
output_vw_path = path.join(config.home_dir, '..\\data\postnauka\\UCI_collections', 'sample_model1')
if not path.exists(output_vw_path):
    mkdir(output_vw_path)
convert_to_vw(nw, path.join(output_vw_path, 'model1.vw'))

[2016-12-08 12:24:22.202000] processing column no 1 of 3446
[2016-12-08 12:24:22.622000] processing column no 2 of 3446
[2016-12-08 12:24:22.890000] processing column no 3 of 3446
[2016-12-08 12:24:23.152000] processing column no 4 of 3446
[2016-12-08 12:24:23.422000] processing column no 5 of 3446
[2016-12-08 12:24:23.689000] processing column no 6 of 3446
[2016-12-08 12:24:23.984000] processing column no 7 of 3446
[2016-12-08 12:24:24.248000] processing column no 8 of 3446
[2016-12-08 12:24:24.512000] processing column no 9 of 3446
[2016-12-08 12:24:24.772000] processing column no 10 of 3446
[2016-12-08 12:24:25.039000] processing column no 11 of 3446
[2016-12-08 12:24:25.301000] processing column no 12 of 3446
[2016-12-08 12:24:25.573000] processing column no 13 of 3446
[2016-12-08 12:24:25.832000] processing column no 14 of 3446
[2016-12-08 12:24:26.099000] processing column no 15 of 3446
[2016-12-08 12:24:26.366000] processing column no 16 of 3446
[2016-12-08 12:24:26.633000] proc

IndexError: single positional indexer is out-of-bounds

In [25]:
models_file.close()