In [3]:
import sys
import numpy as np
import pandas as pd
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull
from scipy.optimize import minimize
from sklearn.metrics.pairwise import cosine_distances

0.8.1


In [5]:
config = ConfigPaths('config.cfg')
plot_maker = PlotMaker()
printer = PrintHelper()

In [6]:
print config.models_file_name

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\03_12_comp\models.txt


In [7]:
models_file = open(config.models_file_name, 'a')

In [8]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):    
    print '[{}] creating model'.format(datetime.now())
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value, 
                  class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0, 
                             'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                             'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold):
    print '[{}] adding scores'.format(datetime.now())
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                      use_unigram_document_model=False,
                                      dictionary=dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm', 
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''): 
    print '[{}] fitting'.format(datetime.now())
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)
    print '[{}] outputting'.format(datetime.now())
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)
    model_pics_file_name =  path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, model_pics_file_name)
    model_output_file_name = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, model_output_file_name)
    printer.print_top_tokens(model, model_output_file_name)
    return model

In [9]:
def save_model(_model, _model_name): 
    print '[{}] saving model'.format(datetime.now())
    model_output_file_name = path.join(config.models_archive_path, _model_name)
    _model.save(filename=model_output_file_name+'_saved_p_wt', model_name=_model_name+'p_wt')
    _model.save(filename=model_output_file_name+'_saved_n_wt', model_name=_model_name+'n_wt')

In [None]:
batch_vectorizer = artm.BatchVectorizer(data_path=config.dataset_path,
                                        data_format='bow_uci',
                                        collection_name=config.collection_name,
                                        target_folder=config.output_batches_path)
dictionary = artm.Dictionary()
dictionary.gather(data_path=config.output_batches_path,
                  vocab_file_path=config.vocabulary_path)
dictionary.save(dictionary_path=config.dictionary_path)
dictionary.save_text(dictionary_path=config.dictionary_path + '.txt')
dictionary.load_text(dictionary_path=config.dictionary_path + '.txt')

In [10]:
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [11]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.01
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_1')
model1 = tmp_model; tmp_model = None

[2016-12-04 17:22:44.953000] creating model
[2016-12-04 17:22:46.366000] adding scores
[2016-12-04 17:22:46.375000] fitting
[2016-12-04 17:23:24.465000] outputting
name = model_decor_sparse_t_reg_1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 100



In [12]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=50, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.01
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_2')
model2 = tmp_model; tmp_model = None

[2016-12-04 17:23:42.752000] creating model
[2016-12-04 17:23:44.127000] adding scores
[2016-12-04 17:23:44.134000] fitting
[2016-12-04 17:24:08.258000] outputting
name = model_decor_sparse_t_reg_2, n_topics = 50, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 100



In [13]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=1000, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 1e+6
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.01
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_3')
model3 = tmp_model; tmp_model = None

[2016-12-04 17:24:28.257000] creating model
[2016-12-04 17:24:32.023000] adding scores
[2016-12-04 17:24:32.096000] fitting
[2016-12-04 17:27:58.548000] outputting
name = model_decor_sparse_t_reg_3, n_topics = 1000, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 1000000.0



In [14]:
phi1 = model1.get_phi()
phi1_t = phi1.transpose()
phi2 = model2.get_phi()
phi2_t = phi2.transpose()

In [15]:
phi3 = model3.get_phi()

In [16]:
def cos_dist(p, q):
    return cosine_distances(p, q)
def hellinger_dist(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2) 
def hellinger_dist_grad(A, x, b):
    y = A.dot(x)
#     print np.sqrt(y) - np.sqrt(b)
#     print  np.divide(np.sqrt(y) - np.sqrt(b), np.sqrt(y))
    nom = np.divide(np.sqrt(y) - np.sqrt(b), np.sqrt(y)).dot(A)
#     print nom
    denom = 2 * hellinger_dist(y, b) * np.sqrt(2)
#     print denom
    res = nom / denom 
#     print res
    return res

In [17]:
m = np.array([[0.1, 0.3, 0.6], [1, 0, 0], [0, 0.4, 0.6]]).transpose()

In [54]:
r = calculate_distance(hellinger_dist, 0, pd.DataFrame(m))

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.0464352308334
            Iterations: 7
            Function evaluations: 31
            Gradient evaluations: 7


In [69]:
r = calculate_distance(hellinger_dist, hellinger_dist_grad, 0, pd.DataFrame(m))

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.0464353319068
            Iterations: 6
            Function evaluations: 27
            Gradient evaluations: 6


In [21]:
def calculate_distances(dist, jac_dist, phi):
    distances = pd.DataFrame(0, index = range(1), columns=phi.columns)
    col_idx = 0
    for col in phi.columns:
        print '[{}] caclulating dist for column {}'.format(datetime.now(), col_idx)
        distances[0, col] = calculate_distance(dist, jac_dist, col_idx, phi).fun
        col_idx += 1
    return distances

In [22]:
def calculate_distance(dist, jac_dist, col_idx, phi):
    col = phi.iloc[:, col_idx]
    phi_cut = phi.drop(phi.columns[col_idx], axis=1)
    n_columns = phi_cut.shape[1] 
    bnds = [(0, 1)] * n_columns
    constraints = cons = ({'type': 'eq', 'fun': lambda x:  np.sum(x) - 1, 'jac': lambda x: [1] * n_columns})
    fun = lambda x: dist(col, phi_cut.dot(x))
    jac = lambda x: jac_dist(phi_cut, x, col)
    is_optimized = False
    it = 0
    while (not is_optimized) and it != 4:
        it += 1
        init_x = np.random.uniform(0, 1, (1, n_columns))
        init_x /= np.sum(init_x)
        res = minimize(fun, x0=init_x, method='SLSQP', bounds=bnds, constraints=cons, options={'maxiter': 50, 'disp': True})
#         res = minimize(fun, jac = jac, x0=init_x, method='SLSQP', bounds=bnds, constraints=cons, options={'maxiter': 1, 'disp': True})
        is_optimized = res.success
    return res

In [None]:
ds1 = calculate_distances(hellinger_dist, hellinger_dist_grad, phi1)

[2016-12-04 17:28:11.780000] caclulating dist for column 0
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.66140634661
            Iterations: 44
            Function evaluations: 4531
            Gradient evaluations: 44
[2016-12-04 17:29:17.810000] caclulating dist for column 1
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.637194455294
            Iterations: 40
            Function evaluations: 4158
            Gradient evaluations: 40
[2016-12-04 17:30:17.556000] caclulating dist for column 2
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.635067347089
            Iterations: 24
            Function evaluations: 2518
            Gradient evaluations: 24
[2016-12-04 17:30:53.081000] caclulating dist for column 3
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.660403828247
            Iterations: 34
     

In [None]:
ds3 = calculate_distances(hellinger_dist, hellinger_dist_grad, phi3)

[2016-12-04 18:50:29.889000] caclulating dist for column 0


In [31]:
res = calculate_distance(hellinger_dist, hellinger_dist_grad, 1, phi3)

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 5.43369060226e-08
            Iterations: 2
            Function evaluations: 2002
            Gradient evaluations: 2


In [32]:
res = calculate_distance(hellinger_dist, hellinger_dist_grad, 0, phi3)

Positive directional derivative for linesearch    (Exit mode 8)
            Current function value: 0.0
            Iterations: 6
            Function evaluations: 2002
            Gradient evaluations: 2
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 2.38834212516e-06
            Iterations: 3
            Function evaluations: 2002
            Gradient evaluations: 2


In [36]:
ds

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,"(0, 90)","(0, 91)","(0, 92)","(0, 93)","(0, 94)","(0, 95)","(0, 96)","(0, 97)","(0, 98)","(0, 99)"
0,0,0,0,0,0,0,0,0,0,0,...,0.666669,0.647873,0.61969,0.60602,0.644372,0.65338,0.673503,0.658374,0.631308,0.646836


In [37]:
ds.values

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [16]:
ds = calculate_distances(hellinger_dist, hellinger_dist_grad, phi1)

[2016-12-04 12:27:37.430000] caclulating dist for column 0
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.661608400826
            Iterations: 21
            Function evaluations: 2198
            Gradient evaluations: 21
[2016-12-04 12:28:28.651000] caclulating dist for column 1
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.638050801887
            Iterations: 18
            Function evaluations: 1939
            Gradient evaluations: 18
[2016-12-04 12:29:10.195000] caclulating dist for column 2
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.633732505283
            Iterations: 41
            Function evaluations: 4253
            Gradient evaluations: 41
[2016-12-04 12:30:23.421000] caclulating dist for column 3
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.661394847988
            Iterations: 19
    

In [None]:
ds = calculate_distances(hellinger_dist, hellinger_dist_grad, phi)

In [58]:
r = calculate_distance(hellinger_dist, 0, phi1)

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.664200345949
            Iterations: 3
            Function evaluations: 324
            Gradient evaluations: 3


In [75]:
r = calculate_distance(hellinger_dist, hellinger_dist_grad, 0, phi1)

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.661793461949
            Iterations: 37
            Function evaluations: 3849
            Gradient evaluations: 37


In [25]:
models_file.close()