In [1]:
import sys
import numpy as np
import pandas as pd
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull

0.8.1


In [2]:
# Notebook-wide helper objects (project-local classes imported from ..\modules\helpers).
# `config` supplies every path used below; `plot_maker` and `printer` are read as
# globals by fit_one_model.
config = ConfigPaths('config.cfg')
plot_maker = PlotMaker()
printer = PrintHelper()

In [3]:
# Sanity check: show the path of the models log file that is opened for appending in the next cell.
print config.models_file_name

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\03_12_comp\models.txt


In [4]:
# Shared log handle, opened in append mode. fit_one_model passes it to
# printer.print_artm_model, and it is closed only by the final cell.
# NOTE(review): if the run stops before that cell, the handle stays open.
models_file = open(config.models_file_name, 'a')

In [5]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):
    """Create an ARTM model and register the notebook's standard set of scores.

    Only the 'ngramm' modality gets a non-zero weight (1.0); every other
    modality listed in ``class_ids`` is given weight 0.0.

    Args:
        current_dictionary: artm.Dictionary used to initialise the model and
            to compute the perplexity score.
        n_topics: number of topics.
        n_doc_passes: value assigned to ``model.num_document_passes``.
        seed_value: seed passed to ``artm.ARTM``.
        n_top_tokens: number of tokens tracked by the top-tokens score.
        p_mass_threshold: probability mass threshold of the topic-kernel score.

    Returns:
        The configured (not yet fitted) ``artm.ARTM`` instance.
    """
    print('[{}] creating model'.format(datetime.now()))
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value,
                      class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0,
                                 'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                                 'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    # Score against the same dictionary the model was built with, not against
    # whatever the notebook-level `dictionary` variable currently holds.
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold,
                        score_dictionary=current_dictionary)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold, score_dictionary=None):
    """Attach perplexity, sparsity, topic-kernel and top-tokens scores to a model.

    Args:
        artm_model: model whose ``scores`` collection receives the new scores.
        n_top_tokens: ``num_tokens`` of the 'top_tokens_score'.
        p_mass_threshold: ``probability_mass_threshold`` of the 'topic_kernel_score'.
        score_dictionary: dictionary used by the perplexity score. When omitted,
            the notebook-level ``dictionary`` variable is used, which keeps
            existing three-argument calls working.
    """
    print('[{}] adding scores'.format(datetime.now()))
    if score_dictionary is None:
        # Backward-compatible fallback to the notebook global.
        score_dictionary = dictionary
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               use_unigram_document_model=False,
                                               dictionary=score_dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm',
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''):
    """Fit `model` offline, then log, plot and dump its scores and top tokens.

    Relies on the notebook globals `batch_vectorizer`, `printer`, `plot_maker`,
    `models_file` and `config`.

    Args:
        model: ARTM model to train (fitted in place).
        _n_iterations: number of collection passes for `fit_offline`.
        _model_name: name used in the log entry and as the base name of the
            plot and report files under `config.experiment_path`.

    Returns:
        The same `model` object after fitting.
    """
    print('[{}] fitting'.format(datetime.now()))
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)

    print('[{}] outputting'.format(datetime.now()))
    # Summary entry in the shared models log.
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)

    # Plots are written first, using the bare model name as the file stem.
    plots_target = path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, plots_target)

    # Scores and top tokens both go to the same per-model text report.
    report_target = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, report_target)
    printer.print_top_tokens(model, report_target)
    return model

In [6]:
def save_model(_model, _model_name):
    """Save the model's 'p_wt' and 'n_wt' matrices under config.models_archive_path.

    The files are named '<_model_name>_saved_p_wt' and '<_model_name>_saved_n_wt'.

    Args:
        _model: model object exposing ``save(filename=..., model_name=...)``.
        _model_name: base name of the output files.
    """
    print('[{}] saving model'.format(datetime.now()))
    model_output_file_name = path.join(config.models_archive_path, _model_name)
    # `model_name` selects WHICH matrix to save, so it must be the matrix name
    # itself ('p_wt' / 'n_wt'); prefixing it with the experiment name would
    # point at a matrix that does not exist.
    for matrix_name in ('p_wt', 'n_wt'):
        _model.save(filename='{}_saved_{}'.format(model_output_file_name, matrix_name),
                    model_name=matrix_name)

In [None]:
# One-off preprocessing (this cell has no execution count in the saved notebook):
# read the collection in 'bow_uci' format from config.dataset_path and write
# batches into config.output_batches_path.
batch_vectorizer = artm.BatchVectorizer(data_path=config.dataset_path,
                                        data_format='bow_uci',
                                        collection_name=config.collection_name,
                                        target_folder=config.output_batches_path)
# Gather a dictionary from the freshly written batches plus the vocabulary file.
dictionary = artm.Dictionary()
dictionary.gather(data_path=config.output_batches_path,
                  vocab_file_path=config.vocabulary_path)
# Persist the dictionary twice: once via save() and once as text ('.txt').
dictionary.save(dictionary_path=config.dictionary_path)
dictionary.save_text(dictionary_path=config.dictionary_path + '.txt')
# Reload from the text file that was just written (round-trip through the text form).
dictionary.load_text(dictionary_path=config.dictionary_path + '.txt')

In [7]:
# Reuse the batches already stored in config.output_batches_path.
# `batch_vectorizer` and `dictionary` are notebook globals read by
# fit_one_model and add_scores_to_model respectively.
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
dictionary = artm.Dictionary()
# NOTE(review): this loads dictionary_path + '.dict', while the preprocessing cell
# calls save(dictionary_path=config.dictionary_path) — presumably save() appends
# the '.dict' extension; confirm.
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [8]:
# Model 1: 100 topics, 5 document passes, seed 100; scores track 15 top tokens
# and use a 0.25 kernel probability-mass threshold.
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
# Regularizers: Phi decorrelation restricted to the 'ngramm' modality, plus a
# smooth/sparse Theta regularizer.
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
# Coefficients are set after registration. The negative Theta tau presumably
# acts as sparsing rather than smoothing — confirm against the BigARTM docs.
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.01
# Train for 15 collection passes; fit_one_model also logs, plots and dumps scores
# under the given model name.
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_1')
# Keep the fitted model under a stable name and drop the temporary reference.
model1 = tmp_model; tmp_model = None

[2016-12-03 23:20:02.954000] creating model
[2016-12-03 23:20:04.709000] adding scores
[2016-12-03 23:20:04.723000] fitting
[2016-12-03 23:20:33.626000] outputting
name = model_decor_sparse_t_reg_1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 100



In [9]:
# Model 2: 50 topics; the other arguments are 5 document passes, seed 100,
# 15 top tokens and a 0.25 kernel probability-mass threshold.
tmp_model = create_model(current_dictionary=dictionary, n_topics=50, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
# Regularizers: Phi decorrelation restricted to the 'ngramm' modality, plus a
# smooth/sparse Theta regularizer.
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
# Coefficients are set after registration. The negative Theta tau presumably
# acts as sparsing rather than smoothing — confirm against the BigARTM docs.
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.01
# Train for 15 collection passes; fit_one_model also logs, plots and dumps scores
# under the given model name.
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_2')
# Keep the fitted model under a stable name and drop the temporary reference.
model2 = tmp_model; tmp_model = None

[2016-12-03 23:20:49.807000] creating model
[2016-12-03 23:20:51.193000] adding scores
[2016-12-03 23:20:51.201000] fitting
[2016-12-03 23:21:08.759000] outputting
name = model_decor_sparse_t_reg_2, n_topics = 50, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 100



In [10]:
# Phi matrices of both fitted models; take_distances treats each column as one topic.
phi1 = model1.get_phi()
# NOTE(review): the transposed copies are not used by any later cell visible here.
phi1_t = phi1.transpose()
phi2 = model2.get_phi()
phi2_t = phi2.transpose()

In [11]:
def get_distance(topic, other_topic):
    """Symmetric divergence between two topics over their common support.

    Computes ``0.5 * sum((log p - log q) * (p - q))`` using only the positions
    where BOTH vectors are non-zero, i.e. half the sum of the two directed
    KL divergences restricted to the shared non-zero entries.

    Args:
        topic: 1-D sequence of non-negative weights (list, numpy array or
            pandas Series; a Series is read positionally, its index is ignored).
        other_topic: 1-D sequence of the same length as `topic`.

    Returns:
        The distance as a numpy float, or ``float('inf')`` when the two
        vectors have no position where both are non-zero.
    """
    # Work on plain arrays so the result does not depend on the kind of index
    # a pandas Series carries (integer positions vs. labels).
    p = np.asarray(topic)
    q = np.asarray(other_topic)

    # take elements that are both nonzero
    common = (p != 0) & (q != 0)
    if not common.any():
        return float('inf')

    p_common = p[common]
    q_common = q[common]
    return np.sum(0.5 * (np.log(p_common) - np.log(q_common)) * (p_common - q_common))

In [12]:
def take_distances(_phi, _phi_other):
    """Compute the pairwise topic distance table between two Phi matrices.

    Every column of each frame is treated as one topic and compared with
    `get_distance`.

    Args:
        _phi: DataFrame whose columns become the rows of the result.
        _phi_other: DataFrame whose columns become the columns of the result.

    Returns:
        A float DataFrame indexed by ``_phi.columns`` with columns
        ``_phi_other.columns``; entry (i, j) is the distance between column i
        of `_phi` and column j of `_phi_other`.
    """
    n_cols = len(_phi.columns)
    n_other_cols = len(_phi_other.columns)
    print('[{}] take_distances between {} columns and {} columns'.format(datetime.now(), n_cols, n_other_cols))

    # Accumulate in a float array: distances are real-valued and may be inf,
    # so an integer-initialised frame would need a dtype upcast on assignment,
    # and per-cell DataFrame writes are far slower than array writes.
    values = np.zeros((n_cols, n_other_cols), dtype=float)
    for col_idx in range(n_cols):
        print('[{}] column num {} of {}'.format(datetime.now(), col_idx, n_cols))
        topic = _phi.iloc[:, col_idx]  # invariant for the inner loop
        for other_col_idx in range(n_other_cols):
            values[col_idx, other_col_idx] = get_distance(topic, _phi_other.iloc[:, other_col_idx])
    return pd.DataFrame(values, index=_phi.columns, columns=_phi_other.columns)
def distances_to_str_by_rows(distances, _n_topics):
    """Render, for every row, its `_n_topics` smallest distances as text.

    Args:
        distances: DataFrame of distances (rows and columns are topic labels).
        _n_topics: how many nearest columns to list per row.

    Returns:
        One line per row, each terminated by a newline, of the form
        ``'<row label> | <col label> : <distance>, <col label> : <distance>'``
        with the entries in ascending order of distance.
    """
    lines = []
    for n_row in range(len(distances.index)):
        nearest = distances.iloc[n_row, :].sort_values().head(_n_topics)
        # Pair labels with values explicitly instead of indexing the Series
        # with integers, which depends on pandas' positional fallback for
        # label-indexed Series.
        pairs = ', '.join('{} : {}'.format(label, value)
                          for label, value in zip(nearest.index, nearest.values))
        lines.append('{} | {}\n'.format(distances.index[n_row], pairs))
    return ''.join(lines)

In [13]:
# Rows of the result are model2 topics (phi2 columns), columns are model1 topics (phi1 columns).
distances = take_distances(phi2, phi1)

[2016-12-03 23:21:22.900000] take_distances between 50 columns and 100 columns
[2016-12-03 23:21:22.901000] column num 0 of 50
[2016-12-03 23:21:23.654000] column num 1 of 50
[2016-12-03 23:21:24.299000] column num 2 of 50
[2016-12-03 23:21:24.926000] column num 3 of 50
[2016-12-03 23:21:25.604000] column num 4 of 50
[2016-12-03 23:21:26.273000] column num 5 of 50
[2016-12-03 23:21:26.875000] column num 6 of 50
[2016-12-03 23:21:27.554000] column num 7 of 50
[2016-12-03 23:21:28.218000] column num 8 of 50
[2016-12-03 23:21:28.842000] column num 9 of 50
[2016-12-03 23:21:29.516000] column num 10 of 50
[2016-12-03 23:21:30.214000] column num 11 of 50
[2016-12-03 23:21:30.885000] column num 12 of 50
[2016-12-03 23:21:31.530000] column num 13 of 50
[2016-12-03 23:21:32.179000] column num 14 of 50
[2016-12-03 23:21:32.839000] column num 15 of 50
[2016-12-03 23:21:33.477000] column num 16 of 50
[2016-12-03 23:21:34.142000] column num 17 of 50
[2016-12-03 23:21:34.707000] column num 18 of 50


In [14]:
# For every model2 topic, list its 3 closest model1 topics with their distances.
print distances_to_str_by_rows(distances, 3)

topic_0 | topic_0 : 2.05394411087, topic_57 : 3.52333402634, topic_18 : 4.42354679108
topic_1 | topic_1 : 0.957847476006, topic_75 : 1.60966968536, topic_15 : 4.0636100769
topic_2 | topic_2 : 0.831335067749, topic_64 : 2.59058761597, topic_89 : 3.10039997101
topic_3 | topic_98 : 2.35483384132, topic_73 : 2.44401478767, topic_92 : 3.33433294296
topic_4 | topic_68 : 2.93214654922, topic_55 : 3.63667440414, topic_78 : 4.349609375
topic_5 | topic_5 : 2.64490389824, topic_81 : 3.2126262188, topic_61 : 3.36071968079
topic_6 | topic_6 : 2.45645022392, topic_87 : 3.45866441727, topic_68 : 3.65049815178
topic_7 | topic_7 : 1.73223555088, topic_8 : 4.45220947266, topic_22 : 4.65224123001
topic_8 | topic_8 : 3.6936712265, topic_57 : 3.7868950367, topic_90 : 4.60551548004
topic_9 | topic_9 : 3.01174807549, topic_64 : 4.23945713043, topic_86 : 4.43224334717
topic_10 | topic_10 : 1.49747776985, topic_69 : 4.73697042465, topic_73 : 4.77277517319
topic_11 | topic_11 : 0.871329724789, topic_57 : 3.3406

In [None]:
# Release the models log handle that was opened in append mode near the top of the notebook.
models_file.close()