In [18]:
import sys
import numpy as np
import pandas as pd
import artm
print artm.version()

from os import path, mkdir
from datetime import datetime
%matplotlib inline
sys.path.insert(0, '..\\modules\\helpers')
from plot_helper import PlotMaker
from config_helper import ConfigPaths
from print_helper import PrintHelper
from sklearn.decomposition import PCA
from scipy.spatial import ConvexHull
from scipy.optimize import minimize
from sklearn.metrics.pairwise import cosine_distances
from numpy.linalg import norm as euclidean_norm

0.8.1


In [3]:
config = ConfigPaths('config.cfg')
plot_maker = PlotMaker()
printer = PrintHelper()

In [4]:
print config.models_file_name

Q:\\topic_modeling\\csi_science_collections.git\experiments\UCI_filtered_ngramm_trimmed_without_names\08_12_500_tmp\models.txt


In [5]:
models_file = open(config.models_file_name, 'a')

In [6]:
def create_model(current_dictionary, n_topics, n_doc_passes, seed_value, n_top_tokens, p_mass_threshold):    
    print '[{}] creating model'.format(datetime.now())
    model = artm.ARTM(num_topics=n_topics, dictionary=current_dictionary, cache_theta=True, seed=seed_value, 
                  class_ids={'ngramm': 1.0, 'author_id': 0.0, 'author': 0.0, 
                             'post_tag': 0.0, 'projects': 0.0, 'category': 0.0,
                             'following_users': 0.0})
    model.num_document_passes = n_doc_passes
    add_scores_to_model(model, n_top_tokens=n_top_tokens, p_mass_threshold=p_mass_threshold)
    return model


def add_scores_to_model(artm_model, n_top_tokens, p_mass_threshold):
    print '[{}] adding scores'.format(datetime.now())
    artm_model.scores.add(artm.PerplexityScore(name='perplexity_score',
                                      use_unigram_document_model=False,
                                      dictionary=dictionary))
    artm_model.scores.add(artm.SparsityPhiScore(name='sparsity_phi_score', class_id='ngramm'))
    artm_model.scores.add(artm.SparsityThetaScore(name='sparsity_theta_score'))
    artm_model.scores.add(artm.TopicKernelScore(name='topic_kernel_score', class_id='ngramm', 
                                                probability_mass_threshold=p_mass_threshold))
    artm_model.scores.add(artm.TopTokensScore(name='top_tokens_score', class_id='ngramm', num_tokens=n_top_tokens))
def fit_one_model(model, _n_iterations, _model_name=''): 
    print '[{}] fitting'.format(datetime.now())
    model.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=_n_iterations)
    print '[{}] outputting'.format(datetime.now())
    printer.print_artm_model(model, _model_name, _n_iterations, output_file=models_file)
    model_pics_file_name =  path.join(config.experiment_path, _model_name)
    plot_maker.make_tm_plots(model, model_pics_file_name)
    model_output_file_name = path.join(config.experiment_path, _model_name + '.txt')
    printer.print_scores(model, _model_name, _n_iterations, model_output_file_name)
    printer.print_top_tokens(model, model_output_file_name)
    return model

In [7]:
batch_vectorizer = artm.BatchVectorizer(data_path=config.output_batches_path,
                                        data_format='batches')
dictionary = artm.Dictionary()
dictionary.load(dictionary_path=config.dictionary_path + '.dict')

In [None]:
# dictionary.filter(min_tf=5, max_tf=2000, min_df_rate=0.01, max_df_rate=0.9)

In [11]:
tmp_model = create_model(current_dictionary=dictionary, n_topics=100, n_doc_passes=5, seed_value=100,
                         n_top_tokens=15, p_mass_threshold=0.25)
tmp_model.regularizers.add(artm.DecorrelatorPhiRegularizer(name='decorrelator_phi_regularizer', class_ids=['ngramm']))
tmp_model.regularizers.add(artm.SmoothSparseThetaRegularizer(name='ss_theta_regularizer'))
tmp_model.regularizers['decorrelator_phi_regularizer'].tau = 100
tmp_model.regularizers['ss_theta_regularizer'].tau = -0.01
tmp_model = fit_one_model(tmp_model, _n_iterations=15, _model_name='model_decor_sparse_t_reg_1')
model1 = tmp_model; tmp_model = None

[2016-12-04 17:22:44.953000] creating model
[2016-12-04 17:22:46.366000] adding scores
[2016-12-04 17:22:46.375000] fitting
[2016-12-04 17:23:24.465000] outputting
name = model_decor_sparse_t_reg_1, n_topics = 100, n_doc_passes = 5, seed_value = 100, n_iterations = 15, n_top_tokens = 15, p_threshold = 0.25
ss_theta_regularizer, tau = -0.01
decorrelator_phi_regularizer, tau = 100



In [211]:
def euc_dist(p, q):
    return euclidean_norm(p - q)
def euc_dist_grad(b, A, x):
    x = x.reshape(-1, 1)
    b = b.reshape(-1, 1)
    return A.T.dot(A.dot(x) - b) / euc_dist(A.dot(x), b) 
def cos_dist(p, q):
    p = p.reshape(-1, 1)
    q = q.reshape(-1, 1)
    return cosine_distances(p, q)[0][0]
def cos_dist_grad(b, A, x):
    x = x.reshape(-1, 1)
    b = b.reshape(-1, 1)
    y = A.dot(x)
    u = b.T.dot(y) # number
    deriv_u = A.T.dot(b) * x
    v = euclidean_norm(y) * euclidean_norm(b)
    nom = deriv_u * v - A.T.dot(A).dot(x) * u[0][0] * euclidean_norm(b) / euclidean_norm(y)
    denom = v * v
    if denom != 0:
        res = nom / denom
    else:
        res = nom
    return -res
def hellinger_dist(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2) 
def hellinger_dist_grad(b, A, x):
    y = A.dot(x)
    nom = np.divide(np.sqrt(y) - np.sqrt(b), np.sqrt(y)).dot(A)
    denom = 2 * hellinger_dist(y, b) * np.sqrt(2)
    res = nom / denom 
    return res
def hellinger_dist_grad_nan(b, A, x):
    y = A.dot(x)
    tmp = np.divide(np.sqrt(y) - np.sqrt(b), np.sqrt(y))
    tmp[np.isnan(tmp)] = 0
    nom = tmp.dot(A)
    denom = 2 * hellinger_dist(y, b) * np.sqrt(2)
    res = nom / denom 
    return res
def hellinger_dist_grad_eps(b, A, x):
    y = A.dot(x)
    y[y == 0] = 1e-3
    tmp = np.divide(np.sqrt(y) - np.sqrt(b), np.sqrt(y))
    nom = tmp.dot(A)
    denom = 2 * hellinger_dist(y, b) * np.sqrt(2)
    res = nom / denom 
    return res

In [93]:
from sklearn.preprocessing import normalize

In [144]:
cos_dist(np.array([1, 0, 0]).reshape(1, -1), np.array([1, 0, 0]).reshape(1, -1))

ch 0.0 [[ 1.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]


0.0

In [145]:
np.array([1, 0, 0]).reshape(1, -1)


array([[1, 0, 0]])

In [20]:
m = np.array([[0.1, 0.3, 0.6], [1, 0, 0], [0, 1, 0],  [0, 0, 0], [0, 0.4, 0.6]]).transpose()
print m

[[ 0.1  1.   0.   0.   0. ]
 [ 0.3  0.   1.   0.   0.4]
 [ 0.6  0.   0.   0.   0.6]]


In [46]:
col_idx = 0
r1 = calculate_distance(hellinger_dist, None, col_idx, pd.DataFrame(m))
r2= calculate_distance(hellinger_dist, hellinger_dist_grad_eps, col_idx, pd.DataFrame(m))
r3 = calculate_distance(hellinger_dist, hellinger_dist_grad_nan, col_idx, pd.DataFrame(m))

(3, 4) (3L,)
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.0464103055431
            Iterations: 19
            Function evaluations: 124
            Gradient evaluations: 19
(3, 4) (3L,)
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.0464102050772
            Iterations: 8
            Function evaluations: 10
            Gradient evaluations: 8
(3, 4) (3L,)
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.219168347645
            Iterations: 5
            Function evaluations: 15
            Gradient evaluations: 5


In [52]:
col_idx = 0
c1 = calculate_distance(cos_dist, None, col_idx, pd.DataFrame(m))
c2 = calculate_distance(cos_dist, cos_dist_grad, col_idx, pd.DataFrame(m))

(3, 4) (3L,)
Optimization terminated successfully.    (Exit mode 0)
            Current function value: [[ 0.00755361]]
            Iterations: 6
            Function evaluations: 36
            Gradient evaluations: 6
(3, 4) (3L,)




error: failed in converting 8th argument `g' of _slsqp.slsqp to C/Fortran array

In [218]:
col_idx = 4
c1 = calculate_distance(euc_dist, None, col_idx, pd.DataFrame(m))
c2 = calculate_distance(euc_dist, euc_dist_grad, col_idx, pd.DataFrame(m))

(3, 4) (3L,)
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.112063105259
            Iterations: 4
            Function evaluations: 25
            Gradient evaluations: 4
(3, 4) (3L,)
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.112063124581
            Iterations: 5
            Function evaluations: 6
            Gradient evaluations: 5


(3, 4) (3L,)




error: failed in converting 8th argument `g' of _slsqp.slsqp to C/Fortran array

(3, 4) (3L,)
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.54772368343
            Iterations: 5
            Function evaluations: 9
            Gradient evaluations: 5


In [126]:
r = calculate_distance(hellinger_dist, hellinger_dist_grad_nan, 2, pd.DataFrame(m))

sqrt(y) = 
0    0.611195
1    0.474564
2    0.616331
dtype: float64
div = 
0    1.000000
1   -1.107197
2    1.000000
dtype: float64
sqrt(y) = 
0    0.263259
1    0.520931
2    0.670888
dtype: float64
div = 
0    1.000000
1   -0.919641
2    1.000000
dtype: float64
sqrt(y) = 
0    0.124213
1    0.483760
2    0.611702
dtype: float64
div = 
0    1.000000
1   -1.067139
2    1.000000
dtype: float64
sqrt(y) = 
0    0.000000
1    0.420294
2    0.514754
dtype: float64
div = 
0         NaN
1   -1.379284
2    1.000000
dtype: float64
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.548192332527
            Iterations: 4
            Function evaluations: 15
            Gradient evaluations: 4


In [128]:
r = calculate_distance(hellinger_dist, hellinger_dist_grad, 2, pd.DataFrame(m))

0    0.546619
1   -0.628365
2    0.512808
dtype: float64
sqrt(y) = 
0    0.546619
1    0.371635
2    0.512808
dtype: float64
div = 
0    1.000000
1   -1.690811
2    1.000000
dtype: float64
0    0.195068
1   -0.553812
2    0.596409
dtype: float64
sqrt(y) = 
0    0.195068
1    0.446188
2    0.596409
dtype: float64
div = 
0    1.000000
1   -1.241206
2    1.000000
dtype: float64
0    0.161495
1   -0.578166
2    0.553206
dtype: float64
sqrt(y) = 
0    0.161495
1    0.421834
2    0.553206
dtype: float64
div = 
0    1.000000
1   -1.370601
2    1.000000
dtype: float64
0    7.450581e-09
1   -6.269552e-01
2    4.568848e-01
dtype: float64
sqrt(y) = 
0    7.450581e-09
1    3.730448e-01
2    4.568848e-01
dtype: float64
div = 
0    1.000000
1   -1.680643
2    1.000000
dtype: float64
0    0.000000
1   -0.619912
2    0.465511
dtype: float64
sqrt(y) = 
0    0.000000
1    0.380088
2    0.465511
dtype: float64
div = 
0         NaN
1   -1.630968
2    1.000000
dtype: float64
0   NaN
1   NaN
2   NaN
dtype: 

In [210]:
distances = {}
distances[phi4.columns[0]] = dists0
distances[phi4.columns[3]] = dists3
print distances

{u'topic_3':      fun: 0.54557244888173628
     jac: array([-44.68370899,   0.21733832,   0.13720262, ...,   0.45824335,
         0.2352745 ,   0.        ])
 message: 'Optimization terminated successfully.'
    nfev: 24
     nit: 3
    njev: 3
  status: 0
 success: True
       x: array([  3.69441135e-12,   7.16860823e-12,   8.93742763e-12, ...,
         2.33573199e-12,   8.51821816e-12,   2.75879599e-12]), u'topic_0':      fun: 0.72394868116776179
     jac: array([-3230.20047165, -3807.24719362, -4687.7050514 , ..., -2589.57342948,
       -3300.85809179,     0.        ])
 message: 'Optimization terminated successfully.'
    nfev: 24
     nit: 3
    njev: 3
  status: 0
 success: True
       x: array([  8.96344384e-15,   9.57152047e-15,   0.00000000e+00, ...,
         0.00000000e+00,   2.94065009e-10,   1.22850110e-15])}


In [213]:
dists0

     fun: 0.72394868116776179
     jac: array([-3230.20047165, -3807.24719362, -4687.7050514 , ..., -2589.57342948,
       -3300.85809179,     0.        ])
 message: 'Optimization terminated successfully.'
    nfev: 24
     nit: 3
    njev: 3
  status: 0
 success: True
       x: array([  8.96344384e-15,   9.57152047e-15,   0.00000000e+00, ...,
         0.00000000e+00,   2.94065009e-10,   1.22850110e-15])

In [14]:
def calculate_distances(dist, jac_dist, phi):
    distances = pd.DataFrame(0, index = range(1), columns=phi.columns)
    col_idx = 0
    for col in phi.columns:
        print '[{}] caclulating dist for column {}'.format(datetime.now(), col_idx)
        distances[0, col] = calculate_distance(dist, jac_dist, col_idx, phi).fun
        col_idx += 1
    return distances
def calculate_distance(dist, jac_dist, col_idx, phi):
    max_iter = 50
    col = phi.iloc[:, col_idx]
    phi_cut = phi.drop(phi.columns[col_idx], axis=1)
    n_columns = phi_cut.shape[1] 
    print phi_cut.shape, col.shape
    bnds = [(0, 1)] * n_columns
    constraints = cons = ({'type': 'eq', 'fun': lambda x:  np.sum(x) - 1, 'jac': lambda x: [1] * n_columns})
    fun = lambda x: dist(col, phi_cut.dot(x))
    jac = lambda x: jac_dist(col, phi_cut, x)
    is_optimized = False
    it = 0
    while (not is_optimized) and it != 4:
        it += 1
        init_x = np.random.uniform(0, 1, (1, n_columns))
        init_x /= np.sum(init_x)
        if jac_dist is not None:
            res = minimize(fun, jac=jac, x0=init_x, method='SLSQP', bounds=bnds, constraints=cons, options={'maxiter': max_iter, 'disp': True})
        else:
            res = minimize(fun, x0=init_x, method='SLSQP', bounds=bnds, constraints=cons, options={'maxiter': max_iter, 'disp': True})
        is_optimized = res.success
    res['column_names'] = phi_cut.columns
    return res

In [146]:
ds1 = calculate_distances(hellinger_dist, hellinger_dist_grad_nan, phi4)

[2016-12-08 14:35:20.606000] caclulating dist for column 0
Iteration limit exceeded    (Exit mode 9)
            Current function value: 0.711719367499
            Iterations: 26
            Function evaluations: 162
            Gradient evaluations: 26
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.718855040779
            Iterations: 3
            Function evaluations: 24
            Gradient evaluations: 3
[2016-12-08 15:20:02.963000] caclulating dist for column 1
Iteration limit exceeded    (Exit mode 9)
            Current function value: 0.646866484112
            Iterations: 26
            Function evaluations: 55
            Gradient evaluations: 26
Iteration limit exceeded    (Exit mode 9)
            Current function value: 0.646807454396
            Iterations: 26
            Function evaluations: 60
            Gradient evaluations: 26
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.6

KeyboardInterrupt: 

In [147]:
dists3 = calculate_distance(hellinger_dist, hellinger_dist_grad_nan, 3, phi4)

Iteration limit exceeded    (Exit mode 9)
            Current function value: 0.483701022242
            Iterations: 26
            Function evaluations: 86
            Gradient evaluations: 25
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.545572448882
            Iterations: 3
            Function evaluations: 24
            Gradient evaluations: 3


In [190]:
dists0 = calculate_distance(hellinger_dist, hellinger_dist_grad_nan, 0, phi4)

(19321, 1499) (19321L,)
Iteration limit exceeded    (Exit mode 9)
            Current function value: 0.0
            Iterations: 26
            Function evaluations: 266
            Gradient evaluations: 26
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.723948681168
            Iterations: 3
            Function evaluations: 24
            Gradient evaluations: 3


In [189]:
dists3.x[np.where(dists3.x > 1e-5)]

array([ 0.51216124,  0.48783487])

In [None]:
# 3 topic = 778 and 866

In [206]:
th = 1e-5
xopt = dists3.x[np.where(dists3.x > th)]
print xopt.shape, np.sum(xopt)
print np.where(dists3.x > th)[0]
print dists3.x[np.where(dists3.x > th)]

(2L,) 0.999996114758
[778 866]
[ 0.51216124  0.48783487]


In [204]:
th = 1e-3
xopt = dists0.x[np.where(dists0.x > th)]
print xopt.shape, np.sum(xopt)
print np.where(dists0.x > th)[0]
print dists0.x[np.where(dists0.x > th)]

(4L,) 0.999910846092
[  83  493  929 1326]
[ 0.32955331  0.16189175  0.13508679  0.37337899]


In [150]:
dists3.x.shape

(1499L,)

In [None]:
ds1 = calculate_distances(hellinger_dist, None, phi4)

In [None]:
ds1 = calculate_distances(hellinger_dist, hellinger_dist_grad, phi1)

[2016-12-04 17:28:11.780000] caclulating dist for column 0
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.66140634661
            Iterations: 44
            Function evaluations: 4531
            Gradient evaluations: 44
[2016-12-04 17:29:17.810000] caclulating dist for column 1
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.637194455294
            Iterations: 40
            Function evaluations: 4158
            Gradient evaluations: 40
[2016-12-04 17:30:17.556000] caclulating dist for column 2
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.635067347089
            Iterations: 24
            Function evaluations: 2518
            Gradient evaluations: 24
[2016-12-04 17:30:53.081000] caclulating dist for column 3
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.660403828247
            Iterations: 34
     

In [None]:
ds3 = calculate_distances(hellinger_dist, hellinger_dist_grad, phi3)

[2016-12-04 18:50:29.889000] caclulating dist for column 0


In [31]:
res = calculate_distance(hellinger_dist, hellinger_dist_grad, 1, phi3)

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 5.43369060226e-08
            Iterations: 2
            Function evaluations: 2002
            Gradient evaluations: 2


In [32]:
res = calculate_distance(hellinger_dist, hellinger_dist_grad, 0, phi3)

Positive directional derivative for linesearch    (Exit mode 8)
            Current function value: 0.0
            Iterations: 6
            Function evaluations: 2002
            Gradient evaluations: 2
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 2.38834212516e-06
            Iterations: 3
            Function evaluations: 2002
            Gradient evaluations: 2


In [36]:
ds

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,"(0, 90)","(0, 91)","(0, 92)","(0, 93)","(0, 94)","(0, 95)","(0, 96)","(0, 97)","(0, 98)","(0, 99)"
0,0,0,0,0,0,0,0,0,0,0,...,0.666669,0.647873,0.61969,0.60602,0.644372,0.65338,0.673503,0.658374,0.631308,0.646836


In [37]:
ds.values

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

In [16]:
ds = calculate_distances(hellinger_dist, hellinger_dist_grad, phi1)

[2016-12-04 12:27:37.430000] caclulating dist for column 0
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.661608400826
            Iterations: 21
            Function evaluations: 2198
            Gradient evaluations: 21
[2016-12-04 12:28:28.651000] caclulating dist for column 1
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.638050801887
            Iterations: 18
            Function evaluations: 1939
            Gradient evaluations: 18
[2016-12-04 12:29:10.195000] caclulating dist for column 2
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.633732505283
            Iterations: 41
            Function evaluations: 4253
            Gradient evaluations: 41
[2016-12-04 12:30:23.421000] caclulating dist for column 3
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.661394847988
            Iterations: 19
    

In [None]:
ds = calculate_distances(hellinger_dist, hellinger_dist_grad, phi)

In [58]:
r = calculate_distance(hellinger_dist, 0, phi1)

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.664200345949
            Iterations: 3
            Function evaluations: 324
            Gradient evaluations: 3


In [75]:
r = calculate_distance(hellinger_dist, hellinger_dist_grad, 0, phi1)

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.661793461949
            Iterations: 37
            Function evaluations: 3849
            Gradient evaluations: 37


In [25]:
models_file.close()