In [183]:
import sys
import numpy as np
import pandas as pd
import artm
import pickle
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper
from sklearn.metrics.pairwise import cosine_distances,  cosine_similarity, euclidean_distances
from sklearn.metrics import jaccard_similarity_score
from numpy.linalg import norm as euclidean_norm

0.8.1


In [2]:
# Project helpers shared by the cells below: `config` supplies the file and
# directory paths, `plot_maker` draws model plots, `printer` writes text reports.
config = ConfigPaths('config.cfg')
plot_maker = PlotMaker()
printer = PrintHelper()

In [3]:
# Show the path of the models log file that the next cell opens.
print config.models_file_name

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\np_10_12_500_dists\models.txt


In [4]:
# Opened in append mode and kept open across cells: fit_one_model passes it to
# the printer, and the final cell of the notebook closes it.
models_file = open(config.models_file_name, 'a')

In [182]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):
    """Build an artm.ARTM model with the notebook's standard scores attached.

    Only the 'ngramm' modality gets a non-zero weight (1.0); every other
    listed modality is given weight 0.0.

    Args:
        current_dictionary: dictionary passed to artm.ARTM and to the
            perplexity score.
        n_topics: number of topics of the model.
        n_doc_passes: value assigned to `model.num_document_passes`.
        seed_value: seed passed to artm.ARTM.
        n_top_tokens: number of tokens tracked by the top-tokens score.
        p_mass_threshold: probability mass threshold of the topic kernel score.

    Returns:
        The configured artm.ARTM instance (fitting is done elsewhere).
    """
    print('[{}] creating model'.format(datetime.now()))
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value,
                      class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0,
                                 'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                                 'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    # Hand the dictionary over explicitly so the perplexity score is computed
    # against the same dictionary the model was created with, not against
    # whatever the global `dictionary` happens to be.
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold,
                        current_dictionary=current_dictionary)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold, current_dictionary=None):
    """Attach perplexity, sparsity, topic-kernel and top-tokens scores to a model.

    Args:
        artm_model: model whose `scores` collection is extended in place.
        n_top_tokens: `num_tokens` of the top-tokens score.
        p_mass_threshold: `probability_mass_threshold` of the topic kernel score.
        current_dictionary: dictionary for the perplexity score. When omitted,
            the module-level `dictionary` is used, which keeps existing
            three-argument calls working.
    """
    print('[{}] adding scores'.format(datetime.now()))
    if current_dictionary is None:
        # Backward-compatible fallback to the notebook-level global.
        current_dictionary = dictionary
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                               use_unigram_document_model=False,
                                               dictionary=current_dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm',
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''):
    """Fit `model` offline, then write its summary, plots, scores and top tokens.

    Uses the notebook-level `batch_vectorizer`, `printer`, `plot_maker`,
    `models_file` and `config` objects.

    Args:
        model: the model to fit.
        _n_iterations: number of collection passes for `fit_offline`.
        _model_name: name used in the reports and as the base of the output
            file names inside `config.experiment_path`.

    Returns:
        The same `model` object after fitting.
    """
    print('[{}] fitting'.format(datetime.now()))
    model.fit_offline(num_collection_passes=_n_iterations, batch_vectorizer=batch_vectorizer)

    print('[{}] outputting'.format(datetime.now()))
    # One-line summary goes to the shared models log.
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)

    # Plots use the bare model name as their base path.
    plots_base = path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, plots_base)

    # Scores and top tokens both go to '<model name>.txt'.
    report_txt = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, report_txt)
    printer.print_top_tokens(model, report_txt)
    return model
def save_pickle_file(dists, filename):
    """Pickle `dists` into `filename` inside the experiment directory.

    Args:
        dists: any picklable object (the notebook passes distance tables).
        filename: file name, joined onto `config.experiment_path`.
    """
    pickle_filename = path.join(config.experiment_path, filename)
    # `with` guarantees the handle is closed even when pickling raises.
    with open(pickle_filename, 'wb') as pickle_file:
        pickle.dump(dists, pickle_file)

In [7]:
# Load the batches and the dictionary from the paths in `config`.
# Both names are read as module-level globals by the helper functions defined
# in the cell above (batch_vectorizer in fit_one_model, dictionary in add_scores_to_model).
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [8]:
# Model 1: 500 topics, fitted for 20 collection passes with three regularizers
# restricted (where applicable) to the 'ngramm' modality.
tmp_model = create_model(current_dictionary=dictionary, n_topics=500, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
# Regularization coefficients are set after the regularizers are registered.
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.1
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.05
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model1')
# Keep the fitted model under its own name and drop the temporary alias.
model1 = tmp_model; tmp_model = None
# Phi table of the fitted model; its columns are what the distance cells compare.
phi1 = model1.get_phi()

[2016-12-10 19:18:10.667000] creating model
[2016-12-10 19:18:12.502000] adding scores
[2016-12-10 19:18:12.567000] fitting
[2016-12-10 19:20:29.678000] outputting
name = model1, n_topics = 500, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.1
decorrelator_phi_regularizer, tau = 100
ss_phi_regularizer, tau = -0.05



In [16]:
# Model 2: same settings as model 1 except for the number of topics (10).
tmp_model = create_model(current_dictionary=dictionary, n_topics=10, n_doc_passes=5, seed_value=100,
                            n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers.add(artm.SmoothSparsePhiRegularizer(name='ss_phi_regularizer', class_ids=['ngramm']))
# Regularization coefficients are set after the regularizers are registered.
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.1
tmp_model.regularizers['ss_phi_regularizer'].tau = -0.05
tmp_model = fit_one_model(tmp_model, _n_iterations=20, _model_name='model2')
# Keep the fitted model under its own name and drop the temporary alias.
model2 = tmp_model; tmp_model = None
# NOTE(review): phi2 is not referenced by any cell visible in this notebook export.
phi2 = model2.get_phi()

[2016-12-10 19:26:07.900000] creating model
[2016-12-10 19:26:08.922000] adding scores
[2016-12-10 19:26:08.922000] fitting
[2016-12-10 19:26:19.675000] outputting
name = model2, n_topics = 10, n_doc_passes = 5, seed_value = 100, n_iterations = 20, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.1
decorrelator_phi_regularizer, tau = 100
ss_phi_regularizer, tau = -0.05



In [157]:
# Number of highest-valued entries per topic column that jaccard_dist compares.
N_TOP_WORDS = 25
def euc_dist(p, q):
    """Return the Euclidean (L2) norm of the difference `p - q`."""
    difference = p - q
    return euclidean_norm(difference)
def cos_dist(p, q):
    """Cosine distance between two 1-D vectors via sklearn's `cosine_distances`.

    Args:
        p, q: 1-D array-likes (numpy arrays or pandas Series) of equal length.

    Returns:
        The single entry of the 1x1 distance matrix.
    """
    # Convert first: pandas Series no longer has a `reshape` method, while
    # numpy arrays pass through `np.asarray` unchanged.
    p = np.asarray(p).reshape(1, -1)
    q = np.asarray(q).reshape(1, -1)
    return cosine_distances(p, q)[0][0]
def cos_dist2(p, q):
    """Cosine distance computed from the dot product `p.T.dot(q)`."""
    dot_product = p.T.dot(q)
    norms_product = euclidean_norm(p) * euclidean_norm(q)
    return 1 - dot_product / norms_product
def cos_dist3(p, q):
    """Cosine distance computed from the element-wise product summed with numpy."""
    dot_product = np.sum(p * q)
    norms_product = euclidean_norm(p) * euclidean_norm(q)
    return 1 - dot_product / norms_product
def hellinger_dist(p, q):
    """Hellinger distance: L2 norm of the difference of square roots, over sqrt(2)."""
    root_difference = np.sqrt(p) - np.sqrt(q)
    squared_sum = np.sum(root_difference ** 2)
    return np.sqrt(squared_sum) / np.sqrt(2)
def kullback_leibler_dist(p, q):
    """Symmetrised Kullback-Leibler divergence over the shared support of p and q.

    Computes `0.5 * sum((log q - log p) * (q - p))` using only the positions
    where both vectors are non-zero.

    Args:
        p, q: 1-D array-likes (numpy arrays or pandas Series) of equal length,
            compared position by position.

    Returns:
        The divergence, or `float('inf')` when no position is non-zero in both.
    """
    # Work on plain arrays: integer-array indexing through `Series[...]` is
    # label-based for integer indexes and a deprecated positional fallback
    # otherwise, so it is not a reliable way to select positions.
    p_values = np.asarray(p)
    q_values = np.asarray(q)
    shared_support = (p_values != 0) & (q_values != 0)
    if not shared_support.any():
        return float('inf')
    p_cut = p_values[shared_support]
    q_cut = q_values[shared_support]
    return np.sum(0.5 * (np.log(q_cut) - np.log(p_cut)) * (q_cut - p_cut))
def jaccard_dist(p, q, n_top_words=None):
    """Jaccard distance between the sets of top-valued labels of two Series.

    The previous implementation fed the two label arrays to sklearn's
    `jaccard_similarity_score`, which for such input compares them position
    by position (accuracy) instead of as sets. Here the sets are intersected
    and united explicitly.

    Args:
        p, q: pandas Series whose index holds the labels (e.g. tokens).
        n_top_words: how many highest-valued labels to take from each Series;
            defaults to the module-level `N_TOP_WORDS`.

    Returns:
        `1 - |A & B| / |A | B|`, or 0.0 when both label sets are empty.
    """
    if n_top_words is None:
        n_top_words = N_TOP_WORDS
    p_top = set(p.sort_values().tail(n_top_words).index)
    q_top = set(q.sort_values().tail(n_top_words).index)
    union = p_top | q_top
    if not union:
        # Two empty sets are identical; avoid dividing by zero.
        return 0.0
    # float() keeps the division exact under Python 2 as well.
    return 1 - float(len(p_top & q_top)) / len(union)

In [163]:
def take_distances(dist_fun, _phi, _phi_other):
    """Compute `dist_fun` for every pair of columns of two DataFrames.

    Args:
        dist_fun: callable taking two columns (pandas Series) and returning a number.
        _phi: DataFrame whose columns label the rows of the result.
        _phi_other: DataFrame whose columns label the columns of the result.

    Returns:
        A float DataFrame indexed by `_phi.columns` with columns
        `_phi_other.columns`; entry (i, j) is `dist_fun(_phi[i], _phi_other[j])`.
    """
    n_cols = len(_phi.columns)
    n_cols_other = len(_phi_other.columns)
    print('[{}] take_distances between {} columns and {} columns'.format(datetime.now(), n_cols, n_cols_other))
    # Extract the right-hand columns once instead of once per left-hand column.
    other_columns = [_phi_other[col_other] for col_other in _phi_other.columns]
    # Fill a float array and wrap it at the end: per-cell `.iloc` writes into an
    # int-initialised DataFrame are slow and rely on implicit dtype upcasting.
    values = np.zeros((n_cols, n_cols_other), dtype=float)
    for idx, col in enumerate(_phi.columns):
        print('[{}] column num {} of {}'.format(datetime.now(), idx, n_cols))
        column = _phi[col]
        for idx_other, other_column in enumerate(other_columns):
            values[idx, idx_other] = dist_fun(column, other_column)
    return pd.DataFrame(values, index=_phi.columns, columns=_phi_other.columns)
def distances_to_str_by_rows(distances, _n_topics):
    """Format, for every row, its `_n_topics` smallest distances as text.

    Args:
        distances: DataFrame of distances (rows and columns labelled).
        _n_topics: how many smallest entries to list per row.

    Returns:
        One line per row, `'<row label> | <col label> : <value>, ...\\n'`,
        with the entries in ascending order of value.
    """
    lines = []
    for n_row in range(len(distances.index)):
        nearest = distances.iloc[n_row, :].sort_values().head(_n_topics)
        # Pair labels with values positionally; `nearest[ind]` would be a label
        # lookup whenever the column labels are integers.
        pairs = ', '.join('{} : {}'.format(label, value)
                          for label, value in zip(nearest.index, nearest.values))
        lines.append('{} | {}\n'.format(distances.index[n_row], pairs))
    return ''.join(lines)

In [159]:
# Pairwise distances between the topics (columns) of model1's Phi, one table per metric.
# Each call evaluates the metric for every ordered pair of columns, so this cell is slow.
phi = phi1
distances_kl = take_distances(kullback_leibler_dist, phi, phi)
distances_cos = take_distances(cos_dist, phi, phi)
distances_hel = take_distances(hellinger_dist, phi, phi)
distances_euc = take_distances(euc_dist, phi, phi)
distances_jac = take_distances(jaccard_dist, phi, phi)

[2016-12-10 20:03:11.429000] take_distances between 500 columns and 500 columns
[2016-12-10 20:03:11.431000] column num 0 of 500
[2016-12-10 20:03:14.096000] column num 1 of 500
[2016-12-10 20:03:15.893000] column num 2 of 500
[2016-12-10 20:03:17.631000] column num 3 of 500
[2016-12-10 20:03:19.284000] column num 4 of 500
[2016-12-10 20:03:20.986000] column num 5 of 500
[2016-12-10 20:03:22.754000] column num 6 of 500
[2016-12-10 20:03:24.543000] column num 7 of 500
[2016-12-10 20:03:26.526000] column num 8 of 500
[2016-12-10 20:03:28.224000] column num 9 of 500
[2016-12-10 20:03:29.981000] column num 10 of 500
[2016-12-10 20:03:31.684000] column num 11 of 500
[2016-12-10 20:03:33.381000] column num 12 of 500
[2016-12-10 20:03:35.016000] column num 13 of 500
[2016-12-10 20:03:36.747000] column num 14 of 500
[2016-12-10 20:03:38.486000] column num 15 of 500
[2016-12-10 20:03:40.268000] column num 16 of 500
[2016-12-10 20:03:42.134000] column num 17 of 500
[2016-12-10 20:03:43.815000] c

In [181]:
# Labels of the two smallest entries in the 'topic_0' column of the KL table.
distances_kl[u'topic_0'].sort_values().head(2).index.values

array([u'topic_0', u'topic_467'], dtype=object)

In [184]:
# Persist each distance table as a pickle file inside config.experiment_path.
save_pickle_file(distances_cos, 'distances_cos.p')
save_pickle_file(distances_euc, 'distances_euc.p')
save_pickle_file(distances_hel, 'distances_hel.p')
save_pickle_file(distances_kl, 'distances_kl.p')
save_pickle_file(distances_jac, 'distances_jac.p')


In [164]:
# Print the 3 smallest cosine distances for every topic; the same report for
# the other metrics is left commented out below.
print distances_to_str_by_rows(distances_cos, 3)
# print distances_to_str_by_rows(distances_euc, 3)
# print distances_to_str_by_rows(distances_hel, 3)
# print distances_to_str_by_rows(distances_kl, 3)
# print distances_to_str_by_rows(distances_jac, 3)

topic_0 | topic_0 : -2.38418579102e-07, topic_244 : 0.871605157852, topic_493 : 0.912051320076
topic_1 | topic_1 : -2.38418579102e-07, topic_46 : 0.752894937992, topic_454 : 0.804853856564
topic_2 | topic_2 : 0.0, topic_396 : 0.874725878239, topic_449 : 0.890277445316
topic_3 | topic_3 : 2.38418579102e-07, topic_219 : 0.653403162956, topic_335 : 0.766615748405
topic_4 | topic_4 : 0.0, topic_50 : 0.813055813313, topic_497 : 0.88613319397
topic_5 | topic_5 : 1.19209289551e-07, topic_395 : 0.869332909584, topic_439 : 0.905907392502
topic_6 | topic_6 : -1.19209289551e-07, topic_24 : 0.883314728737, topic_15 : 0.884427726269
topic_7 | topic_7 : 0.0, topic_289 : 0.939492821693, topic_415 : 0.941816866398
topic_8 | topic_8 : 3.57627868652e-07, topic_251 : 0.89293807745, topic_418 : 0.908919751644
topic_9 | topic_9 : 2.98023223877e-07, topic_172 : 0.902606368065, topic_209 : 0.934935927391
topic_10 | topic_10 : -1.19209289551e-07, topic_237 : 0.933019995689, topic_84 : 0.937622845173
topic_11 

In [None]:
# Close the models log file that was opened in append mode near the top of the notebook.
models_file.close()