# Оптимальная питоновская реализация оптимизации ARTM

# + Формулы


In [36]:
import numpy as np
from numpy.core.umath_tests import inner1d
import scipy
import scipy.sparse
from sklearn.datasets import fetch_20newsgroups
import gensim
from collections import Counter
import heapq
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import time
%matplotlib inline

# Разные функции потерь

In [132]:
class LogFunction(object):
    """Logarithmic loss f(x) = log(x + 1e-20).

    The tiny additive constant keeps the logarithm and its derivative
    finite when x is exactly zero.
    """

    def calc(self, x):
        """Return log(x + 1e-20)."""
        shifted = x + 1e-20
        return np.log(shifted)

    def calc_der(self, x):
        """Return 1 / (x + 1e-20), the derivative of ``calc``."""
        shifted = x + 1e-20
        return 1. / shifted
    

class IdFunction(object):
    """Identity loss f(x) = x + 1e-20 with a constant unit derivative."""

    def calc(self, x):
        """Return x shifted by 1e-20."""
        shifted = x + 1e-20
        return shifted

    def calc_der(self, x):
        """Return an array of ones with the same shape and dtype as x."""
        unit_derivative = np.ones_like(x)
        return unit_derivative
    

class SquareFunction(object):
    """Quadratic loss f(x) = (x + 1e-20) ** 2."""

    def calc(self, x):
        """Return (x + 1e-20) squared."""
        return (x + 1e-20) ** 2

    def calc_der(self, x):
        """Return the derivative of ``calc``: 2 * (x + 1e-20).

        The previous implementation squared the argument here as well,
        i.e. it returned 2 * f(x) instead of f'(x).
        """
        return 2. * (x + 1e-20)
    

class CubeLogFunction(object):
    """Loss f(x) = log(x + 1e-20) ** 3."""

    def calc(self, x):
        """Return the cube of log(x + 1e-20)."""
        log_x = np.log(x + 1e-20)
        return log_x ** 3

    def calc_der(self, x):
        """Return 3 * log(x + 1e-20) ** 2 / (x + 1e-20)."""
        shifted = x + 1e-20
        log_x = np.log(shifted)
        return 3. * log_x ** 2 / shifted
    

class SquareLogFunction(object):
    """Sign-preserving squared logarithm f(x) = log(x) * |log(x)| (x shifted by 1e-20)."""

    def calc(self, x):
        """Return log(x + 1e-20) multiplied by its own absolute value."""
        log_x = np.log(x + 1e-20)
        return log_x * np.abs(log_x)

    def calc_der(self, x):
        """Return 2 * |log(x + 1e-20)| / (x + 1e-20)."""
        shifted = x + 1e-20
        abs_log = np.abs(np.log(shifted))
        return 2. * abs_log / shifted

    
class FiveLogFunction(object):
    """Loss f(x) = log(x + 1e-20) ** 5."""

    def calc(self, x):
        """Return the fifth power of log(x + 1e-20)."""
        log_x = np.log(x + 1e-20)
        return log_x ** 5

    def calc_der(self, x):
        """Return 5 * log(x + 1e-20) ** 4 / (x + 1e-20)."""
        shifted = x + 1e-20
        log_x = np.log(shifted)
        return 5. * log_x ** 4 / shifted
    

class CubeRootLogFunction(object):
    """Loss f(x) = cbrt(log(x + 1e-20))."""

    def calc(self, x):
        """Return the real cube root of log(x + 1e-20)."""
        log_x = np.log(x + 1e-20)
        return np.cbrt(log_x)

    def calc_der(self, x):
        """Return 1 / (3 * cbrt(log(x + 1e-20)) ** 2 * (x + 1e-20))."""
        shifted = x + 1e-20
        root = np.cbrt(np.log(shifted))
        # Same left-to-right division chain as the formula above.
        return 1. / 3 / (root ** 2) / shifted
    
    
class SquareRootLogFunction(object):
    """Loss f(x) = -sqrt(-log(x + 1e-20)), defined for 0 <= x < 1.

    ``calc`` used to return +sqrt(-log x), whose true derivative is
    negative, while ``calc_der`` returned a positive value. The optimizers
    in this file multiply the counters by ``calc_der`` and clip negatives,
    so the positive (increasing-function) derivative is kept and ``calc``
    is negated to match it.
    NOTE(review): confirm that the increasing variant is the intended one.
    """

    def calc(self, x):
        """Return -sqrt(-log(x + 1e-20))."""
        return - np.sqrt(- np.log(x + 1e-20))

    def calc_der(self, x):
        """Return the derivative of ``calc``: 1 / (2 * sqrt(-log(x)) * x), with x shifted by 1e-20."""
        return 1. / 2. / np.sqrt(- np.log(x + 1e-20)) / (x + 1e-20)
    

class ExpFunction(object):
    """Exponential loss f(x) = exp(x); the function equals its own derivative."""

    def calc(self, x):
        """Return exp(x)."""
        value = np.exp(x)
        return value

    def calc_der(self, x):
        """Return exp(x), the derivative of ``calc``."""
        derivative = np.exp(x)
        return derivative

    
class EntropyFunction(object):
    """Entropy-like loss f(x) = (log(x + 1e-20) + 50) * (x + 1e-20)."""

    def calc(self, x):
        """Return (log(x + 1e-20) + 50) * (x + 1e-20)."""
        return (np.log(x + 1e-20) + 50.) * (x + 1e-20)

    def calc_der(self, x):
        """Return the derivative of ``calc``: log(x + 1e-20) + 51.

        By the product rule d/dx[(log u + 50) * u] = log u + 50 + 1; the
        previous implementation dropped the ``+ 1`` term.
        NOTE(review): if the old derivative was the intended one, change
        the constant in ``calc`` to 49 instead.
        """
        return np.log(x + 1e-20) + 51.

# Разные регуляризации

In [133]:
def trivial_regularization(n_tw, n_dt):
    """No-op regularizer: contribute zero to both the phi and the theta counters."""
    phi_term, theta_term = 0., 0.
    return phi_term, theta_term

def create_reg_decorr(tau, theta_alpha=0.):
    """Create a topic-decorrelation regularizer for the EM optimizers.

    The returned callable takes the counters ``n_tw`` (topics x words) and
    ``n_dt`` (documents x topics) and returns ``(r_tw, r_dt)``:

    * ``r_tw[t, w] = -tau * phi[t, w] * sum_{s != t} phi[s, w]``, where
      ``phi`` is ``n_tw`` with every topic row normalized to sum to one;
    * ``r_dt`` is the constant ``theta_alpha``.

    Fixes relative to the previous version: the result keeps the
    (topics x words) layout of ``n_tw`` (it used to be transposed and could
    not be added to ``n_tw``), and the aggregation runs over topics rather
    than over words (over words it is identically one after normalization).
    """
    def fun(n_tw, n_dt):
        phi_matrix = n_tw / np.sum(n_tw, axis=1)[:, np.newaxis]
        # Total weight of each word across all topics, shape (W,).
        aggr_phi = np.sum(phi_matrix, axis=0)
        # Subtracting phi itself leaves the weight of the *other* topics.
        return - tau * phi_matrix * (aggr_phi[np.newaxis, :] - phi_matrix), theta_alpha
    return fun

def create_reg_lda(phi_alpha, theta_alpha):
    """Create a regularizer that returns the constant pair (phi_alpha, theta_alpha).

    The returned callable ignores its ``n_tw`` / ``n_dt`` arguments.
    """
    constant_terms = (phi_alpha, theta_alpha)

    def fun(n_tw, n_dt):
        return constant_terms

    return fun


# Подготовка Датасета

Нужно скачать некоторые коллекции данных и установить библиотеки (nltk, gensim)

In [3]:
# Fetch the NLTK stop-word corpus; it is read below via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/tylorn/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
# Stored as a set: prepare_dataset does one membership test per token.
english_stopwords = set(stopwords.words('english'))

In [33]:
def prepare_dataset(dataset):
    # remove stopwords
    occurences = Counter()
    for i, doc in enumerate(dataset.data):
        tokens = gensim.utils.lemmatize(doc)
        for token in set(tokens):
            occurences[token] += 1
        if i % 500 == 0:
            print 'Processed: ', i, 'documents from', len(dataset.data)
    
    row, col, data = [], [], []
    token_2_num = {}
    not_empty_docs_number = 0
    doc_targets = []
    for doc, target in zip(dataset.data, dataset.target):
        tokens = gensim.utils.lemmatize(doc)
        cnt = Counter()
        for token in tokens:
            word = token.split('/')[0]
            if word not in english_stopwords and 3 <= occurences[token]:
                if token not in token_2_num:
                    token_2_num[token] = len(token_2_num)
                cnt[token_2_num[token]] += 1
        
        if len(cnt) > 0:
            for w, c in cnt.iteritems():
                row.append(not_empty_docs_number)
                col.append(w)
                data.append(c)
            not_empty_docs_number += 1
            doc_targets.append(target)
        
    num_2_token = {
        v: k
        for k, v in token_2_num.iteritems()
    }
    print 'Nonzero values:', len(data)
    return scipy.sparse.csr_matrix((data, (row, col))), token_2_num, num_2_token, doc_targets


In [34]:
# Load six 20newsgroups categories (four sci.*, two rec.sport.*) from both
# the train and the test split, with headers, footers and quotes stripped.
dataset = fetch_20newsgroups(
    subset='all',
    categories=['sci.electronics', 'sci.med', 'sci.space', 'sci.crypt', 'rec.sport.baseball', 'rec.sport.hockey'],
    remove=('headers', 'footers', 'quotes')
)

In [35]:
%%time
# Document-word counts, vocabulary mappings and the class label of every kept document.
origin_n_dw_matrix, token_2_num, num_2_token, doc_targets = prepare_dataset(dataset)

Processed:  0 documents from 5945
Processed:  500 documents from 5945
Processed:  1000 documents from 5945
Processed:  1500 documents from 5945
Processed:  2000 documents from 5945
Processed:  2500 documents from 5945
Processed:  3000 documents from 5945
Processed:  3500 documents from 5945
Processed:  4000 documents from 5945
Processed:  4500 documents from 5945
Processed:  5000 documents from 5945
Processed:  5500 documents from 5945
Nonzero values: 322664
CPU times: user 4min 26s, sys: 64 ms, total: 4min 26s
Wall time: 4min 26s


# Вычисление функций, подобных правдоподобию

### имеется в виду вычисление функций вида $\sum_{dw} n_{dw} f(\sum_{t} \phi_{wt} \theta_{td})$

In [17]:
def create_calculate_likelihood_like_function(n_dw_matrix, loss_function=None):
    """Create an evaluator of sum_{d,w} n_dw * f(sum_t phi_tw * theta_dt).

    Parameters:
        n_dw_matrix: scipy CSR matrix of document-word counts (D x W).
        loss_function: object with a ``calc(x)`` method implementing f;
            defaults to ``LogFunction()`` (log-likelihood).

    Returns a function ``fun(phi_matrix, theta_matrix)`` taking a (T x W)
    phi and a (D x T) theta. Only the stored entries of ``n_dw_matrix``
    take part in the sum.
    """
    if loss_function is None:
        loss_function = LogFunction()

    D = n_dw_matrix.shape[0]
    # Document index of every stored entry: row d repeated nnz(d) times.
    docptr = np.repeat(np.arange(D), np.diff(n_dw_matrix.indptr))
    wordptr = n_dw_matrix.indices

    def fun(phi_matrix, theta_matrix):
        # Row-wise dot products; einsum replaces inner1d from the deprecated
        # private module numpy.core.umath_tests.
        s_dw = np.einsum(
            'ij,ij->i',
            theta_matrix[docptr, :],
            np.transpose(phi_matrix)[wordptr, :]
        )
        return np.sum(n_dw_matrix.data * loss_function.calc(s_dw))

    return fun

# EM алгоритм

## Общая схема:
#### Необходимо сначала вычислить $p_{tdw} = \frac{\phi_{wt} \theta_{td}}{\sum_s \phi_{ws} \theta_{sd}}$
#### Считаем $n_{wt} = \sum_d n_{dw} p_{tdw}$ и $n_{td} = \sum_w n_{dw} p_{tdw}$
#### Вычисляем $r_{wt}, r_{td}$ как функцию от $n_{wt}, n_{td}$
#### Прибавляем, делаем положительную срезку и нормируем

## Оптимизация вычисления:
#### Обозначим за $s_{dw}$ следующее выражение $\sum_t \phi_{wt} \theta_{td}$, фактически это наше предсказание для вероятности
####  Тогда $p_{tdw} = \frac{\phi_{wt} \theta_{td}}{s_{dw}}$
#### Подставим это выражение, например, в $n_{wt}$
#### И получим, что $n_{wt} = \sum_d n_{dw} \frac{\phi_{wt} \theta_{td}}{s_{dw}} = \phi_{wt} \sum_d \theta_{td} \cdot \frac{n_{dw}}{s_{dw}}$, аналогично $n_{td} = \theta_{td} \sum_w \phi_{wt} \cdot \frac{n_{dw}}{s_{dw}}$
#### Таким образом, мы видим, что фактически нам нужно знать матрицу $\frac{n_{dw}}{s_{dw}}$, а она очень разреженная, поэтому и $s_{dw}$ нужно не для всех пар вычислять, а только там, где $n_{dw} > 0$. 
#### То есть нам нужно эффективно реализовать вычисление разреженной матрицы $s_{dw}$ (матрица $n_{dw}$ уже есть в разреженном виде, так как подаётся на вход алгоритма), а затем просто поэлементно поделить
#### Причём хочется, чтобы промежуточные значения $p_{tdw}$ не сохранялись (как мы увидели, они в конечном варианте не важны)
#### Обозначим эту матрицу за $A$. Тогда $n_{wt} = \phi_{wt} (A^T \Theta^T)_{wt}$, а $n_{td} = \theta_{td} (\Phi^T A^T)_{td}$.
#### Перемножить разреженную матрицу на плотную можно быстро, если правильно её хранить (по строкам, или по столбцам)
#### Если оптимизируется не правдоподобие, а какая-то другая функция вида $\sum_{dw} n_{dw} f(s_{dw})$ (правдоподобие получается при $f(x) = \ln x$), то в этом случае нужно определить матрицу $A$ как $A_{dw} = n_{dw} f'(s_{dw})$

In [134]:
def em_optimization(
    n_dw_matrix,
    phi_matrix,
    theta_matrix,
    regularization_list,
    iters_count=100,
    loss_function=None,
    iteration_callback=None
):
    """Run regularized EM iterations for a topic model.

    Parameters:
        n_dw_matrix: scipy CSR matrix of document-word counts (D x W).
        phi_matrix: initial (T x W) topic-word matrix; not modified.
        theta_matrix: initial (D x T) document-topic matrix; not modified.
        regularization_list: indexable; item ``it`` is called as
            ``reg(n_tw, n_dt)`` on iteration ``it`` and must return the
            additive terms ``(r_tw, r_dt)``.
        iters_count: number of iterations.
        loss_function: object with ``calc_der(x)``; defaults to
            ``LogFunction()`` (likelihood maximization).
        iteration_callback: optional ``callback(it, phi, theta)`` invoked
            after every iteration.

    Returns the pair ``(phi_matrix, theta_matrix)`` after the last
    iteration. The elapsed time of the loop is printed.

    Bug fix: the theta counters are now incremented by ``r_dt``. They used
    to be incremented by themselves (``n_dt += n_dt``), which the row
    normalization cancelled, so the theta regularizer had no effect.
    """
    if loss_function is None:
        loss_function = LogFunction()

    D = n_dw_matrix.shape[0]
    phi_matrix = np.copy(phi_matrix)
    theta_matrix = np.copy(theta_matrix)
    # Document index of every stored entry: row d repeated nnz(d) times.
    docptr = np.repeat(np.arange(D), np.diff(n_dw_matrix.indptr))
    wordptr = n_dw_matrix.indices

    start_time = time.time()
    for it in range(iters_count):
        phi_matrix_tr = np.transpose(phi_matrix)
        # The next line takes about 60% of the run time (author's estimate).
        # einsum replaces inner1d from the deprecated private module
        # numpy.core.umath_tests.
        s_data = loss_function.calc_der(
            np.einsum('ij,ij->i', theta_matrix[docptr, :], phi_matrix_tr[wordptr, :])
        )
        # Building A takes about 25% of the run time (author's estimate).
        A = scipy.sparse.csr_matrix(
            (
                n_dw_matrix.data * s_data,
                n_dw_matrix.indices,
                n_dw_matrix.indptr
            ),
            shape=n_dw_matrix.shape
        )
        A_tr = A.tocsc().transpose()
        # The rest takes about 15% of the run time (author's estimate).
        n_tw = np.transpose(A_tr.dot(theta_matrix)) * phi_matrix
        n_dt = A.dot(phi_matrix_tr) * theta_matrix

        r_tw, r_dt = regularization_list[it](n_tw, n_dt)
        n_tw += r_tw
        n_dt += r_dt
        # Positive cut-off before normalization.
        n_tw[n_tw < 0] = 0
        n_dt[n_dt < 0] = 0

        phi_matrix = n_tw / np.sum(n_tw, axis=1)[:, np.newaxis]
        theta_matrix = n_dt / np.sum(n_dt, axis=1)[:, np.newaxis]

        if iteration_callback is not None:
            iteration_callback(it, phi_matrix, theta_matrix)

    # Single-argument form prints the same text under Python 2 and 3.
    print('Iters time %s' % (time.time() - start_time))
    return phi_matrix, theta_matrix

# Naive thetaless EM

In [135]:
def naive_thetaless_em_optimization(
    n_dw_matrix,
    phi_matrix,
    regularization_list,
    iters_count=100,
    iteration_callback=None
):
    """Run EM iterations in which theta is recomputed from phi on every step.

    On each iteration theta is set to the row-normalized product of
    ``n_dw_matrix`` with the column-normalized, transposed phi, and only
    phi is then updated by the usual M-step.

    Parameters:
        n_dw_matrix: scipy CSR matrix of document-word counts (D x W).
        phi_matrix: initial (T x W) topic-word matrix; not modified.
        regularization_list: indexable; item ``it`` is called as
            ``reg(n_tw, theta_matrix)``; only its first result is used.
        iters_count: number of iterations (must be at least 1, otherwise
            no theta exists to return).
        iteration_callback: optional ``callback(it, phi, theta)``.

    Returns ``(phi_matrix, theta_matrix)``; theta is the one computed at
    the start of the last iteration, before the final phi update.

    Changes: ``inner1d`` (deprecated private numpy module) is replaced by
    ``np.einsum``, the Python-level docptr loop by ``np.repeat``, and the
    Python-2-only ``xrange`` / print statement by portable equivalents.
    """
    D = n_dw_matrix.shape[0]
    phi_matrix = np.copy(phi_matrix)
    # Document index of every stored entry: row d repeated nnz(d) times.
    docptr = np.repeat(np.arange(D), np.diff(n_dw_matrix.indptr))
    wordptr = n_dw_matrix.indices

    start_time = time.time()
    for it in range(iters_count):
        # Normalize every word column of phi over topics, then transpose to (W x T).
        phi_rev_matrix = np.transpose(phi_matrix / np.sum(phi_matrix, axis=0))
        theta_matrix = n_dw_matrix.dot(phi_rev_matrix)
        theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]
        phi_matrix_tr = np.transpose(phi_matrix)

        s_data = 1. / np.einsum(
            'ij,ij->i', theta_matrix[docptr, :], phi_matrix_tr[wordptr, :]
        )
        A = scipy.sparse.csr_matrix(
            (
                n_dw_matrix.data * s_data,
                n_dw_matrix.indices,
                n_dw_matrix.indptr
            ),
            shape=n_dw_matrix.shape
        ).tocsc()

        n_tw = (A.T.dot(theta_matrix)).T * phi_matrix
        r_tw, _ = regularization_list[it](n_tw, theta_matrix)
        n_tw += r_tw
        n_tw[n_tw < 0] = 0
        phi_matrix = n_tw / np.sum(n_tw, axis=1)[:, np.newaxis]

        if iteration_callback is not None:
            iteration_callback(it, phi_matrix, theta_matrix)

    # Single-argument form prints the same text under Python 2 and 3.
    print('Iters time %s' % (time.time() - start_time))
    return phi_matrix, theta_matrix

# Градиентный спуск

In [136]:
def gradient_optimization(
    n_dw_matrix,
    phi_matrix,
    theta_matrix,
    regularization_gradient_list,
    iters_count=100,
    loss_function=None,
    iteration_callback=None,
    learning_rate=1.
):
    """Optimize phi and theta by gradient steps with clipping and renormalization.

    Parameters:
        n_dw_matrix: scipy CSR matrix of document-word counts (D x W).
        phi_matrix: initial (T x W) float matrix; not modified.
        theta_matrix: initial (D x T) float matrix; not modified.
        regularization_gradient_list: indexable; item ``it`` is called as
            ``reg(phi_matrix, theta_matrix)`` and returns the gradient
            terms ``(r_tw, r_dt)`` added to the loss gradients.
        iters_count: number of iterations.
        loss_function: object with ``calc_der(x)``; defaults to
            ``LogFunction()``.
        iteration_callback: optional ``callback(it, phi, theta)``.
        learning_rate: step multiplier.

    Returns ``(phi_matrix, theta_matrix)``. The elapsed time is printed.

    Changes: ``inner1d`` (deprecated private numpy module) is replaced by
    ``np.einsum``, the Python-level docptr loop by ``np.repeat``, the
    Python-2-only ``xrange`` / print statement by portable equivalents, and
    the default loss is created lazily instead of at definition time.
    """
    if loss_function is None:
        loss_function = LogFunction()

    D = n_dw_matrix.shape[0]
    phi_matrix = np.copy(phi_matrix)
    theta_matrix = np.copy(theta_matrix)
    # Document index of every stored entry: row d repeated nnz(d) times.
    docptr = np.repeat(np.arange(D), np.diff(n_dw_matrix.indptr))
    wordptr = n_dw_matrix.indices

    start_time = time.time()
    for it in range(iters_count):
        phi_matrix_tr = np.transpose(phi_matrix)
        # The next line takes about 60% of the run time (author's estimate).
        s_data = loss_function.calc_der(
            np.einsum('ij,ij->i', theta_matrix[docptr, :], phi_matrix_tr[wordptr, :])
        )
        # Building A takes about 25% of the run time (author's estimate).
        A = scipy.sparse.csr_matrix(
            (
                n_dw_matrix.data * s_data,
                n_dw_matrix.indices,
                n_dw_matrix.indptr
            ),
            shape=n_dw_matrix.shape
        )
        A_tr = A.tocsc().transpose()
        # The rest takes about 15% of the run time (author's estimate).
        g_tw = np.transpose(A_tr.dot(theta_matrix))
        g_dt = A.dot(phi_matrix_tr)

        r_tw, r_dt = regularization_gradient_list[it](phi_matrix, theta_matrix)
        g_tw += r_tw
        g_dt += r_dt

        # Subtract, per row, the gradient averaged with the current row as weights.
        g_tw -= np.sum(g_tw * phi_matrix, axis=1)[:, np.newaxis]
        g_dt -= np.sum(g_dt * theta_matrix, axis=1)[:, np.newaxis]

        phi_matrix += g_tw * learning_rate
        theta_matrix += g_dt * learning_rate

        # Clip negatives, then renormalize every row to sum to one.
        phi_matrix[phi_matrix < 0] = 0
        theta_matrix[theta_matrix < 0] = 0

        phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]
        theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

        if iteration_callback is not None:
            iteration_callback(it, phi_matrix, theta_matrix)

    # Single-argument form prints the same text under Python 2 and 3.
    print('Iters time %s' % (time.time() - start_time))
    return phi_matrix, theta_matrix

# Оценка качества классификации

In [160]:
def svm_score(theta, targets):
    svm_score1 = np.mean(cross_val_score(SVC(C=1.), theta, targets, scoring='accuracy', cv=4))
    svm_score2 = np.mean(cross_val_score(SVC(C=10.), theta, targets, scoring='accuracy', cv=4))
    svm_score3 = np.mean(cross_val_score(SVC(C=100.), theta, targets, scoring='accuracy', cv=4))

    print 'SVM score1\t\t\t', round(svm_score1, 4)
    print 'SVM score2\t\t\t', round(svm_score2, 4)
    print 'SVM score3\t\t\t', round(svm_score3, 4)

# Примеры запусков

# PLSA: EM optimization

In [205]:
# PLSA experiment: EM with a no-op regularizer and T = 20 topics.
D, W = origin_n_dw_matrix.shape
T = 20

# Fixed seed so the random initialization is reproducible.
np.random.seed(5242)

# Random initial phi (T x W); every topic row is normalized to sum to one.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# Random initial theta (D x T); every document row is normalized to sum to one.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# One regularizer per iteration; here the same no-op for all 200 of them.
regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=origin_n_dw_matrix
)

total_words_number = origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    # Prints exp(-log-likelihood / total word count), i.e. the perplexity.
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

# First run: prints the perplexity after every iteration.
phi, theta = em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction(),
    iteration_callback=callback
)

# Second run from the same initial matrices without the callback, so the
# 'Iters time' printed by em_optimization excludes the perplexity computation.
phi, theta = em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction()
)

0 3963.77611396
1 3871.11825179
2 3757.26001995
3 3602.8680496
4 3398.79376697
5 3157.09414192
6 2906.89377013
7 2676.96987429
8 2482.79845124
9 2326.76695924
10 2204.08584392
11 2108.03533951
12 2032.3938713
13 1971.98983408
14 1922.91891256
15 1883.06125508
16 1850.41154779
17 1823.20951744
18 1800.25735835
19 1780.68763463
20 1763.80071039
21 1749.0271493
22 1735.8882955
23 1724.05179733
24 1713.30099236
25 1703.65812942
26 1695.08010808
27 1687.25700936
28 1679.9796253
29 1673.25823185
30 1667.0503938
31 1661.22047657
32 1655.77892377
33 1650.69903548
34 1645.86957296
35 1641.33690178
36 1637.17373058
37 1633.24788145
38 1629.46945493
39 1625.80019189
40 1622.22902023
41 1618.73863753
42 1615.31189267
43 1611.9695308
44 1608.76221369
45 1605.72734951
46 1602.85733335
47 1600.10051332
48 1597.48640011
49 1595.00446598
50 1592.54455603
51 1590.16809718
52 1588.02699654
53 1586.11117379
54 1584.32247213
55 1582.61690319
56 1580.96408378
57 1579.36191093
58 1577.78940614
59 1576.264371

In [199]:
# Keep the PLSA EM result; later cells use it as a starting point.
phi_plsa_em = phi
theta_plsa_em = theta

In [206]:
# Classification quality of the PLSA EM theta used as document features.
svm_score(theta, doc_targets)

SVM score1			0.7831
SVM score2			0.8043
SVM score3			0.8158


# LDA: EM optimization

In [195]:
# LDA experiment: EM with a constant regularizer and T = 10 topics.
D, W = origin_n_dw_matrix.shape
T = 10

# Fixed seed so the random initialization is reproducible.
np.random.seed(42)

# Random initial phi (T x W); every topic row is normalized to sum to one.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# Random initial theta (D x T); every document row is normalized to sum to one.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# The same regularizer on every iteration: it returns -0.1 as the phi term
# and 0 as the theta term.
regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = create_reg_lda(-0.1, 0.)

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=origin_n_dw_matrix
)

total_words_number = origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    # Prints exp(-log-likelihood / total word count), i.e. the perplexity.
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

# First run: prints the perplexity after every iteration.
phi, theta = em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction(),
    iteration_callback=callback
)

# Second run from the same initial matrices without the callback, so the
# 'Iters time' printed by em_optimization excludes the perplexity computation.
phi, theta = em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction()
)

0 3987.37646827
1 3897.4074752
2 3798.04728921
3 3680.90916256
4 3546.86075721
5 3404.40082914
6 3264.35983653
7 3133.42614365
8 3015.05899483
9 2911.23100587
10 2822.68526189
11 2748.84376085
12 2687.63946797
13 2636.38528098
14 2592.68187264
15 2555.07686584
16 2523.26596425
17 2497.10117303
18 2475.8452058
19 2458.50015838
20 2444.08698345
21 2431.83457879
22 2421.18913973
23 2411.9411239
24 2403.92060151
25 2396.88272169
26 2390.67929499
27 2385.21190858
28 2380.39755883
29 2376.12819046
30 2372.32169245
31 2368.94321943
32 2365.92689874
33 2363.16733183
34 2360.63018304
35 2358.32615751
36 2356.24209776
37 2354.33902549
38 2352.5797764
39 2350.94714679
40 2349.42551463
41 2348.00372283
42 2346.66906665
43 2345.42598409
44 2344.2645335
45 2343.1731887
46 2342.13828756
47 2341.14797914
48 2340.19949656
49 2339.29619816
50 2338.44005861
51 2337.62109798
52 2336.83933634
53 2336.09686008
54 2335.38957787
55 2334.70918016
56 2334.04792875
57 2333.42233786
58 2332.83523389
59 2332.27917

In [196]:
# Classification quality of the LDA EM theta used as document features.
svm_score(theta, doc_targets)

SVM score1			0.6425
SVM score2			0.6613
SVM score3			0.6722


# PLSA: naive thetaless EM optimization

In [203]:
# PLSA experiment with the thetaless EM variant and T = 20 topics.
D, W = origin_n_dw_matrix.shape
T = 20

# Fixed seed so the random initialization is reproducible.
np.random.seed(5242)

# Random initial phi (T x W); every topic row is normalized to sum to one.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# NOTE: this theta is built but not passed to the optimizer below, which
# derives theta from phi on every iteration.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# One regularizer per iteration; here the same no-op for all 200 of them.
regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=origin_n_dw_matrix
)

total_words_number = origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    # Prints exp(-log-likelihood / total word count), i.e. the perplexity.
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

# First run: prints the perplexity after every iteration.
phi, theta = naive_thetaless_em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    iteration_callback=callback
)

# Second run from the same initial phi without the callback, so the printed
# 'Iters time' excludes the perplexity computation.
phi, theta = naive_thetaless_em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    regularization_list=regularization_list,
    iters_count=200
)

0 4028.00095306
1 4021.45072655
2 4011.18688353
3 3994.07330628
4 3964.53042104
5 3915.20945482
6 3838.08998487
7 3727.57974401
8 3588.26757141
9 3430.36332105
10 3264.50223105
11 3099.82842549
12 2940.77840888
13 2791.06900645
14 2655.84334731
15 2538.84529843
16 2440.25384977
17 2357.81397196
18 2288.9171412
19 2231.63278429
20 2184.45859242
21 2145.73542902
22 2113.76753245
23 2087.16279415
24 2064.83090253
25 2045.9194067
26 2029.77465874
27 2015.87414185
28 2003.78564552
29 1993.17287511
30 1983.76428924
31 1975.34428237
32 1967.76468881
33 1960.91566793
34 1954.69990572
35 1949.03268039
36 1943.84058175
37 1939.08426273
38 1934.73661208
39 1930.75218485
40 1927.06917997
41 1923.63482096
42 1920.41594212
43 1917.39896728
44 1914.57596872
45 1911.91453587
46 1909.34591822
47 1906.77450152
48 1904.09967824
49 1901.28386092
50 1898.55110758
51 1896.28933816
52 1894.4549964
53 1892.78561896
54 1891.21443239
55 1889.73483093
56 1888.3373967
57 1887.01232866
58 1885.74811364
59 1884.531

In [204]:
# Classification quality of the thetaless EM theta used as document features.
svm_score(theta, doc_targets)

SVM score1			0.8351
SVM score2			0.8441
SVM score3			0.8483


In [201]:
# Continue from the phi found by the ordinary PLSA EM run: ten thetaless
# iterations, no callback.
phi, theta = naive_thetaless_em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_plsa_em,
    regularization_list=regularization_list,
    iters_count=10
)

Iters time 0.64034986496


In [202]:
# Classification quality after the ten thetaless iterations started from phi_plsa_em.
svm_score(theta, doc_targets)

SVM score1			0.7326
SVM score2			0.7563
SVM score3			0.7629


# PLSA: gradient optimization

In [181]:
# PLSA experiment with the gradient optimizer.
D, W = origin_n_dw_matrix.shape
T = 10

# Fixed seed so the random initialization is reproducible.
np.random.seed(42)

# Random initial phi (T x W) and theta (D x T) with rows normalized to one.
# NOTE: the active call below starts from phi_plsa_em / theta_plsa_em instead;
# these random matrices are only referenced by the commented-out call.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# One regularizer per iteration; here the same no-op for all 1000 slots.
regularization_list = np.zeros(1000, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=origin_n_dw_matrix
)

total_words_number = origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    # Prints exp(-log-likelihood / total word count), i.e. the perplexity.
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

# 100 gradient steps with a very small learning rate, starting from the
# matrices saved after the PLSA EM run.
phi, theta = gradient_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_plsa_em,
    theta_matrix=theta_plsa_em,
    regularization_gradient_list=regularization_list,
    iters_count=100,
    loss_function=LogFunction(),
    iteration_callback=callback,
    learning_rate=1e-10
)

# Disabled alternative: start from the random matrices, no callback.
#phi, theta = gradient_optimization(
#    n_dw_matrix=origin_n_dw_matrix, 
#    phi_matrix=phi_matrix,
#    theta_matrix=theta_matrix,
#    regularization_gradient_list=regularization_list,
#    iters_count=100,
#    loss_function=LogFunction(),
#    learning_rate=5e-10
#)

0 1914.9026249
1 1914.2653754
2 1913.79998342
3 1913.42439294
4 1913.10713959
5 1912.83362646
6 1912.59398608
7 1912.38321058
8 1912.19684832
9 1912.02595436
10 1911.86846318
11 1911.72306023
12 1911.58666145
13 1911.45858134
14 1911.33918252
15 1911.22778298
16 1911.12306141
17 1911.02398721
18 1910.93057578
19 1910.84245467
20 1910.76025546
21 1910.6822062
22 1910.60825447
23 1910.53786597
24 1910.47062655
25 1910.40679524
26 1910.34527284
27 1910.28619898
28 1910.22988519
29 1910.17582045
30 1910.12362137
31 1910.07338208
32 1910.02473611
33 1909.97755458
34 1909.93184904
35 1909.88858387
36 1909.84710407
37 1909.80686568
38 1909.76782855
39 1909.72994397
40 1909.69311686
41 1909.65744203
42 1909.6227316
43 1909.58901042
44 1909.5561465
45 1909.52418854
46 1909.49355963
47 1909.46386661
48 1909.43508521
49 1909.40761286
50 1909.38091782
51 1909.35512825
52 1909.33050167
53 1909.30655437
54 1909.28309085
55 1909.26019213
56 1909.23780068
57 1909.21594505
58 1909.19461983
59 1909.1738

In [182]:
# Classification quality of the theta produced by the gradient optimizer.
svm_score(theta, doc_targets)

SVM score1			0.6994
SVM score2			0.7166
SVM score3			0.7251


## Исходники, можно пропускать

In [55]:
def perform_e_step_update(it_num, freq_matrix, docptr, phi_matrix, theta_matrix, params):
    """Accumulate the n_wt and n_dt counters, processing documents in blocks.

    Parameters:
        it_num: not used in the body.
        freq_matrix: CSR document-word count matrix (D x W); its
            ``indices``, ``indptr`` and ``data`` arrays are read directly.
        docptr: array giving the document index of every stored entry.
        phi_matrix: (T x W) matrix.
        theta_matrix: (D x T) matrix.
        params: dict; ``'block_size'`` (default 1) is the number of
            documents per block, ``'custom_function'`` (default
            ``LogFunction()``) supplies ``calc_der``.

    Returns ``(n_wt, n_dt)`` of shapes (W, T) and (D, T).
    """
    block_size = params.get('block_size', 1)
    custom_function = params.get('custom_function', LogFunction())
    
    D, W = freq_matrix.shape
    T = phi_matrix.shape[0]
    n_wt, n_dt = np.zeros((W, T)), np.zeros((D, T))
    transposed_phi_matrix = np.transpose(phi_matrix)
    
    indices = freq_matrix.indices
    indptr = freq_matrix.indptr
    data = freq_matrix.data
    
    # Number of blocks is ceil(D / block_size); relies on Python 2 integer division.
    for block_num in xrange((D + block_size - 1) / block_size):
        block_start = block_num * block_size
        block_finish = min(D, block_start + block_size)
        # Range of stored entries that belong to the documents of this block.
        ind_start, ind_finish = indptr[block_start], indptr[block_finish]
        
        datas = data[ind_start:ind_finish]
        words = indices[ind_start:ind_finish]
        docs = docptr[ind_start:ind_finish]
        
        # One row per stored (d, w) entry, one column per topic: phi_tw * theta_dt.
        p_dwt = transposed_phi_matrix[words] * theta_matrix[docs, :]
        p_dw = np.sum(p_dwt, axis=1)
        mult = custom_function.calc_der(p_dw)
        if np.sum(np.isnan(mult)) > 0:
            # Debug aid: dump the offending inputs, then abort; evaluating
            # 1/0 raises ZeroDivisionError.
            print transposed_phi_matrix[words]
            print theta_matrix[docs, :]
            print 1/0
        p_dwt *= mult[:, np.newaxis]
        p_dwt *= datas[:, np.newaxis]
        
        for doc_num in xrange(block_start, block_finish):
            doc_start, doc_finish = indptr[doc_num], indptr[doc_num + 1]
            # Rows of p_dwt that belong to this document (offsets are relative to the block).
            doc_p_dwt = p_dwt[(doc_start - ind_start):(doc_finish - ind_start), :]
            n_dt[doc_num, :] += np.sum(doc_p_dwt, axis=0)
            # NOTE(review): fancy-index += applies one update per distinct row,
            # so this assumes a document stores each word index at most once.
            n_wt[indices[doc_start:doc_finish], :] += doc_p_dwt
            
    return n_wt, n_dt

In [52]:
def launch_em(
    freq_matrix,
    phi_matrix,
    theta_matrix,
    iters_count=100,
    loss_function=LogFunction()
):
    """Run unregularized EM iterations and return the final (phi, theta).

    ``freq_matrix`` is a CSR document-word count matrix (D x W),
    ``phi_matrix`` is (T x W) and ``theta_matrix`` is (D x T); the inputs
    are copied, not modified. ``loss_function.calc_der`` provides the
    per-entry weights.
    """
    docs_count = freq_matrix.shape[0]
    phi = np.copy(phi_matrix)
    theta = np.copy(theta_matrix)

    # Document index of every stored entry of the CSR matrix.
    row_starts = freq_matrix.indptr
    doc_of_entry = []
    for d in xrange(docs_count):
        doc_of_entry += [d] * (row_starts[d + 1] - row_starts[d])
    doc_of_entry = np.array(doc_of_entry)
    word_of_entry = freq_matrix.indices

    for _ in xrange(iters_count):
        phi_tr = np.transpose(phi)
        # Model value for every stored (d, w) entry.
        products = theta[doc_of_entry, :] * phi_tr[word_of_entry, :]
        weights = loss_function.calc_der(np.sum(products, axis=1))
        A = scipy.sparse.csr_matrix(
            (freq_matrix.data * weights, freq_matrix.indices, freq_matrix.indptr),
            shape=freq_matrix.shape
        )
        A_tr = A.tocsc().transpose()

        n_tw = np.transpose(A_tr.dot(theta)) * phi
        n_dt = A.dot(phi_tr) * theta

        # Row-normalize the counters to get the next phi and theta.
        phi = n_tw / np.sum(n_tw, axis=1)[:, np.newaxis]
        theta = n_dt / np.sum(n_dt, axis=1)[:, np.newaxis]

    return phi, theta

In [66]:
def launch_em_without_theta(
    freq_matrix,
    phi_matrix,
    iters_count=100,
    loss_function=LogFunction()
):
    """Run EM iterations in which theta is recomputed from phi on every step.

    ``freq_matrix`` is a CSR document-word count matrix (D x W) and
    ``phi_matrix`` is (T x W); the input phi is copied, not modified.
    Returns the final phi and the theta computed at the start of the last
    iteration.
    """
    docs_count = freq_matrix.shape[0]
    phi = np.copy(phi_matrix)

    # Document index of every stored entry of the CSR matrix.
    row_starts = freq_matrix.indptr
    doc_of_entry = []
    for d in xrange(docs_count):
        doc_of_entry += [d] * (row_starts[d + 1] - row_starts[d])
    doc_of_entry = np.array(doc_of_entry)
    word_of_entry = freq_matrix.indices

    for _ in xrange(iters_count):
        # Column-normalize phi over topics, then project the counts onto it.
        word_topic = np.transpose(phi / np.sum(phi, axis=0))
        theta_matrix = freq_matrix.dot(word_topic)
        theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

        products = theta_matrix[doc_of_entry, :] * np.transpose(phi)[word_of_entry, :]
        weights = loss_function.calc_der(np.sum(products, axis=1))
        A = scipy.sparse.csr_matrix(
            (freq_matrix.data * weights, freq_matrix.indices, freq_matrix.indptr),
            shape=freq_matrix.shape
        )
        A_tr = A.tocsc().transpose()

        n_tw = np.transpose(A_tr.dot(theta_matrix)) * phi
        phi = n_tw / np.sum(n_tw, axis=1)[:, np.newaxis]

    return phi, theta_matrix

In [38]:
def trivial_regularization(n_wt, n_dt, phi_matrix, theta_matrix):
    """No-op regularizer (four-argument form): zero terms for both phi and theta."""
    phi_term, theta_term = 0., 0.
    return phi_term, theta_term

def calculate_decorr(phi_matrix):
    """Return sum(phi * (row_sum(phi) - phi)) over all entries of phi.

    The row sums are taken along axis 1 and broadcast back over the columns.
    """
    row_totals = np.sum(phi_matrix, axis=1)
    complement = row_totals[:, np.newaxis] - phi_matrix
    return np.sum(phi_matrix * complement)

def create_reg_decorr_naive(tau, theta_alpha=0.):
    """Create a regularizer computed from the current phi matrix.

    The returned callable ignores the counters and theta; its phi term is
    ``-tau * transpose(phi * (row_sum(phi) - phi))`` with row sums taken
    along axis 1, and its theta term is the constant ``theta_alpha``.
    """
    def fun(n_wt, n_dt, phi_matrix, theta_matrix):
        row_totals = np.sum(phi_matrix, axis=1)
        complement = row_totals[:, np.newaxis] - phi_matrix
        phi_term = - tau * np.transpose(phi_matrix * complement)
        return phi_term, theta_alpha

    return fun

def create_reg_lda(phi_alpha, theta_alpha):
    """Create a four-argument regularizer returning the constant pair (phi_alpha, theta_alpha)."""
    constant_terms = (phi_alpha, theta_alpha)

    def fun(n_wt, n_dt, phi_matrix, theta_matrix):
        return constant_terms

    return fun

def create_reg_decorr_unbiased(tau, theta_alpha=0.):
    """Create a regularizer computed from the counters instead of the current phi.

    The returned callable normalizes ``n_wt`` along axis 0, then returns
    ``-tau * p * (column_sum(p) - p)`` as the phi term and the constant
    ``theta_alpha`` as the theta term.
    """
    def fun(n_wt, n_dt, phi_matrix, theta_matrix):
        normalized = n_wt / np.sum(n_wt, axis=0)
        column_totals = np.sum(normalized, axis=0)
        complement = column_totals[np.newaxis, :] - normalized
        return - tau * normalized * complement, theta_alpha

    return fun

def calculate_likelihood(freq_matrix, docptr, phi_matrix, theta_matrix, block_size=1, custom_function=LogFunction()):
    """Return sum over stored (d, w) entries of n_dw * f(sum_t phi_tw * theta_dt).

    Documents are processed ``block_size`` at a time. ``docptr`` gives the
    document index of every stored entry of the CSR ``freq_matrix``;
    ``custom_function.calc`` implements f.
    """
    docs_count = freq_matrix.shape[0]
    phi_tr = np.transpose(phi_matrix)

    word_ids = freq_matrix.indices
    row_starts = freq_matrix.indptr
    counts = freq_matrix.data

    total = 0.
    # Ceil division; // matches the Python 2 integer / of the original.
    blocks_count = (docs_count + block_size - 1) // block_size
    for block_index in xrange(blocks_count):
        first_doc = block_index * block_size
        last_doc = min(docs_count, first_doc + block_size)
        lo, hi = row_starts[first_doc], row_starts[last_doc]

        block_counts = counts[lo:hi]
        block_words = word_ids[lo:hi]
        block_docs = docptr[lo:hi]

        per_topic = phi_tr[block_words] * theta_matrix[block_docs, :]
        values = custom_function.calc(np.sum(per_topic, axis=1))
        total += np.sum(values * block_counts)

    return total


In [39]:
def prepare_dataset(dataset):
    # remove stopwords
    occurences = Counter()
    for i, doc in enumerate(dataset.data):
        tokens = gensim.utils.lemmatize(doc)
        for token in set(tokens):
            occurences[token] += 1
        if i % 500 == 0:
            print 'Processed: ', i, 'documents from', len(dataset.data)
    
    row, col, data = [], [], []
    token_2_num = {}
    for i, doc in enumerate(dataset.data):
        tokens = gensim.utils.lemmatize(doc)
        cnt = Counter()
        for token in tokens:
            word = token.split('/')[0]
            if word not in english_stopwords and 10 <= occurences[token] < len(dataset.data) / 2:
                if token not in token_2_num:
                    token_2_num[token] = len(token_2_num)
                cnt[token_2_num[token]] += 1
        for w, c in cnt.iteritems():
            row.append(i)
            col.append(w)
            data.append(c)
        
    num_2_token = {
        v: k
        for k, v in token_2_num.iteritems()
    }
    print 'Nonzero values:', len(data)
    return scipy.sparse.csr_matrix((data, (row, col))), token_2_num, num_2_token


In [10]:
def prepare_dataset_with_cooccurences(dataset):
    # remove stopwords
    occurences = Counter()
    cooccurences = Counter()
    for i, doc in enumerate(dataset.data):
        tokens = gensim.utils.lemmatize(doc)
        for token in set(tokens):
            occurences[token] += 1
        if i % 500 == 0:
            print 'Preprocessed: ', i, 'documents from', len(dataset.data)
    
    row, col, data = [], [], []
    token_2_num = {}
    for i, doc in enumerate(dataset.data):
        tokens = gensim.utils.lemmatize(doc)
        cnt = Counter()
        words_set = set()
        for token in tokens:
            word = token.split('/')[0]
            if word not in english_stopwords and 10 <= occurences[token] < len(dataset.data) / 2:
                if token not in token_2_num:
                    token_2_num[token] = len(token_2_num)
                words_set.add(token_2_num[token])
                cnt[token_2_num[token]] += 1
        for w, c in cnt.iteritems():
            row.append(i)
            col.append(w)
            data.append(c)
            
        for w1 in words_set:
            for w2 in words_set:
                cooccurences[(w1, w2)] += 1
                
        if i % 500 == 0:
            print 'Processed: ', i, 'documents from', len(dataset.data)
        
    num_2_token = {
        v: k
        for k, v in token_2_num.iteritems()
    }
    print 'Nonzero values:', len(data)
    return scipy.sparse.csr_matrix((data, (row, col))), token_2_num, num_2_token, cooccurences


### Используем коллекцию 20newsgroups

Возьмём шесть тем (четыре научные и две спортивные), очистим данные, лемматизируем слова, удалим редкие и частые слова (реализация этого в клетке выше).

In [54]:
# Load six 20newsgroups categories (four sci.*, two rec.sport.*) from the
# whole collection (subset='all'), asking the loader to strip headers,
# footers and quoted text from every post.
dataset = fetch_20newsgroups(
    subset='all',
    categories=['sci.electronics', 'sci.med', 'sci.space', 'sci.crypt', 'rec.sport.baseball', 'rec.sport.hockey'],
    remove=('headers', 'footers', 'quotes')
)

In [55]:
%%time
# Build the document-term count matrix and the token <-> column-index maps.
origin_freq_matrix, token_2_num, num_2_token = prepare_dataset(dataset)

Processed:  0 documents from 5945
Processed:  500 documents from 5945
Processed:  1000 documents from 5945
Processed:  1500 documents from 5945
Processed:  2000 documents from 5945
Processed:  2500 documents from 5945
Processed:  3000 documents from 5945
Processed:  3500 documents from 5945
Processed:  4000 documents from 5945
Processed:  4500 documents from 5945
Processed:  5000 documents from 5945
Processed:  5500 documents from 5945
Nonzero values: 280223
CPU times: user 0 ns, sys: 4min 20s, total: 4min 20s
Wall time: 4min 20s


In [25]:
def evaluate(freq_matrix, phi, theta, dataset):
    T, W = phi.shape
    D, T = theta.shape
    
    docptr = []
    indptr = freq_matrix.indptr
    for doc_num in xrange(D):
        docptr.extend([doc_num] * (indptr[doc_num + 1] - indptr[doc_num]))
    docptr = np.array(docptr)
    
    correlation_on_topic = calculate_decorr(phi) / (T * (T - 1))
    log_likelihood = calculate_likelihood(origin_freq_matrix, docptr, phi, theta, 50)
    phi_non_zeros = np.sum(phi > 1e-20)
    phi_size = W * T
    theta_non_zeros = np.sum(theta > 1e-20)
    theta_size = D * T

    theta_copy = theta.copy()
    theta_copy[np.isnan(theta_copy)] = 1. / T
    svm_score1 = np.mean(cross_val_score(SVC(C=1.), theta_copy, dataset.target, 'accuracy', cv=4))
    svm_score2 = np.mean(cross_val_score(SVC(C=10.), theta_copy, dataset.target, 'accuracy', cv=4))
    svm_score3 = np.mean(cross_val_score(SVC(C=100.), theta_copy, dataset.target, 'accuracy', cv=4))

    print 'L\t\t\t\t', round(log_likelihood)
    print 'Average topic correlation\t', round(correlation_on_topic, 4)
    print 'Phi nonzeros\t\t\t', phi_non_zeros, '\tfrac\t', 1. * phi_non_zeros / phi_size
    print 'Theta nonzeros\t\t\t', theta_non_zeros, '\tfrac\t', 1. * theta_non_zeros / theta_size
    print 'SVM score1\t\t\t', round(svm_score1, 4)
    print 'SVM score2\t\t\t', round(svm_score2, 4)
    print 'SVM score3\t\t\t', round(svm_score3, 4)

In [56]:
# Experiment setup: T = 15 topics, random initialisation with a fixed seed.

D, W = origin_freq_matrix.shape
T = 15

# Fixed seed so that re-running the cell yields the same initial matrices.
np.random.seed(42)

# phi: T x W, uniform random values, every row rescaled to sum to 1.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# theta: D x T, uniform random values, every row rescaled to sum to 1.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

In [57]:
%%time
# Run launch_em from the initial phi/theta for 150 iterations, passing
# CubeRootLogFunction as the loss function.
phi, theta = launch_em(
    freq_matrix=origin_freq_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    iters_count=150,
    loss_function=CubeRootLogFunction()
)

CPU times: user 0 ns, sys: 20.7 s, total: 20.7 s
Wall time: 20.7 s


In [58]:
# Print the quality metrics for the phi/theta returned by launch_em.
evaluate(origin_freq_matrix, phi, theta, dataset)

L				-2962316.0
Average topic correlation	0.0709
Phi nonzeros			32211 	frac	0.390933915893
Theta nonzeros			34362 	frac	0.385332211943
SVM score1			0.761
SVM score2			0.7859
SVM score3			0.7941


In [67]:
# Experiment setup: T = 15 topics; only phi is initialised here because the
# run below does not take an initial theta.

D, W = origin_freq_matrix.shape
T = 15

# Fixed seed so that re-running the cell yields the same initial phi.
np.random.seed(42)

# phi: T x W, uniform random values, every row rescaled to sum to 1.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

In [68]:
%%time
# Run launch_em_without_theta from the initial phi for 150 iterations,
# passing CubeRootLogFunction as the loss function.
phi, theta = launch_em_without_theta(
    freq_matrix=origin_freq_matrix, 
    phi_matrix=phi_matrix,
    iters_count=150,
    loss_function=CubeRootLogFunction()
)

CPU times: user 0 ns, sys: 21.3 s, total: 21.3 s
Wall time: 21.3 s


In [None]:
# Print the quality metrics for the phi/theta returned by launch_em_without_theta.
evaluate(origin_freq_matrix, phi, theta, dataset)

In [None]:
# Experiment: T = 10 topics, SquareRootLogFunction, 50 iterations.

D, W = origin_freq_matrix.shape
T = 10

# Fixed seed so that re-running the cell yields the same initial matrices.
np.random.seed(42)

# phi: T x W, uniform random values, every row rescaled to sum to 1.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# theta: D x T, uniform random values, every row rescaled to sum to 1.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# Settings handed to launch_em through params_list.
no_selection_params = {
    'method': perform_e_step_update,
    'custom_function': SquareRootLogFunction()
}

# Object arrays of length 50 (equal to iters_count below; presumably one
# entry per iteration -- confirm against launch_em).
regularizations_list = np.zeros(50, dtype=object)
params_list = np.zeros(50, dtype=object)

# Every slot gets the same regularizer and the same parameter dict.
regularizations_list[:] = trivial_regularization
params_list[:] = no_selection_params

phi, theta = launch_em(
    freq_matrix=origin_freq_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularizations_list=regularizations_list,
    params_list=params_list,
    iters_count=50
)

In [111]:
# Print the quality metrics for the phi/theta currently in scope.
evaluate(origin_freq_matrix, phi, theta, dataset)

L				-3019839.0
Average topic correlation	0.1106
Phi nonzeros			35537 	frac	0.646950664482
Theta nonzeros			45567 	frac	0.766476030278
SVM score			0.6878


In [164]:
# Experiment: T = 15 topics, CubeRootLogFunction, 150 iterations.

D, W = origin_freq_matrix.shape
T = 15

# Fixed seed so that re-running the cell yields the same initial matrices.
np.random.seed(42)

# phi: T x W, uniform random values, every row rescaled to sum to 1.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# theta: D x T, uniform random values, every row rescaled to sum to 1.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# Settings handed to launch_em through params_list.
no_selection_params = {
    'method': perform_e_step_update,
    'custom_function': CubeRootLogFunction()
}

# Object arrays of length 150 (equal to iters_count below; presumably one
# entry per iteration -- confirm against launch_em).
regularizations_list = np.zeros(150, dtype=object)
params_list = np.zeros(150, dtype=object)

# Every slot gets the same regularizer and the same parameter dict.
regularizations_list[:] = trivial_regularization
params_list[:] = no_selection_params

phi, theta = launch_em(
    freq_matrix=origin_freq_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularizations_list=regularizations_list,
    params_list=params_list,
    iters_count=150
)

0 -863085.068147
1 -832362.717672
2 -831478.005335
3 -830691.791854
4 -829588.301521
5 -828047.217125
6 -826016.106592
7 -823575.703833
8 -820969.30652
9 -818475.724691
10 -816265.56712
11 -814383.099255
12 -812805.544631
13 -811487.746935
14 -810385.10334
15 -809462.676561
16 -808687.204787
17 -808024.901697
18 -807449.949004
19 -806949.90113
20 -806520.535589
21 -806152.403912
22 -805832.114154
23 -805550.179228
24 -805298.859172
25 -805073.682443
26 -804871.549985
27 -804690.385142
28 -804528.33159
29 -804382.348882
30 -804249.261742
31 -804128.613392
32 -804020.322699
33 -803921.766717
34 -803830.865457
35 -803746.789699
36 -803668.142597
37 -803593.465659
38 -803522.008612
39 -803453.470427
40 -803387.113308
41 -803323.648824
42 -803264.872507
43 -803211.098921
44 -803160.776436
45 -803112.875969
46 -803066.976077
47 -803022.849376
48 -802981.006167
49 -802941.085532
50 -802902.715018
51 -802865.667499
52 -802828.969759
53 -802793.442198
54 -802759.315464
55 -802726.657141
56 -802

In [165]:
# Print the quality metrics for the phi/theta currently in scope.
evaluate(origin_freq_matrix, phi, theta, dataset)

L				-2962316.0
Average topic correlation	0.0709
Phi nonzeros			32211 	frac	0.390933915893
Theta nonzeros			34362 	frac	0.385332211943
SVM score1			0.761
SVM score2			0.7859
SVM score3			0.7941


In [155]:
# Experiment: T = 15 topics, CubeLogFunction, 150 iterations.

D, W = origin_freq_matrix.shape
T = 15

# Fixed seed so that re-running the cell yields the same initial matrices.
np.random.seed(42)

# phi: T x W, uniform random values, every row rescaled to sum to 1.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# theta: D x T, uniform random values, every row rescaled to sum to 1.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# Settings handed to launch_em through params_list.
no_selection_params = {
    'method': perform_e_step_update,
    'custom_function': CubeLogFunction()
}

# Object arrays of length 150 (equal to iters_count below; presumably one
# entry per iteration -- confirm against launch_em).
regularizations_list = np.zeros(150, dtype=object)
params_list = np.zeros(150, dtype=object)

# Every slot gets the same regularizer and the same parameter dict.
regularizations_list[:] = trivial_regularization
params_list[:] = no_selection_params

phi, theta = launch_em(
    freq_matrix=origin_freq_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularizations_list=regularizations_list,
    params_list=params_list,
    iters_count=150
)

0 -269885456.135
1 -217302815.51
2 -212604707.897
3 -210501827.554
4 -207953778.515
5 -204628912.188
6 -200604178.059
7 -196189503.251
8 -191797434.935
9 -187764816.355
10 -184269031.938
11 -181318260.914
12 -178838070.364
13 -176752051.653
14 -174991676.133
15 -173500893.808
16 -172237356.172
17 -171162345.974
18 -170240463.064
19 -169443668.245
20 -168754278.982
21 -168158613.295
22 -167641713.049
23 -167186015.236
24 -166776482.396
25 -166406229.485
26 -166072706.436
27 -165771228.865
28 -165496338.027
29 -165244613.282
30 -165014059.839
31 -164802321.489
32 -164606983.876
33 -164426611.542
34 -164259344.714
35 -164103035.303
36 -163957596.508
37 -163820802.943
38 -163693020.229
39 -163575223.247
40 -163465623.104
41 -163363359.972
42 -163267227.537
43 -163175572.223
44 -163087260.739
45 -163002138.875
46 -162920413.459
47 -162842157.46
48 -162767849.706
49 -162698057.84
50 -162632810.047
51 -162571929.167
52 -162515357.057
53 -162462592.862
54 -162412629.296
55 -162364224.673
56 -1

In [160]:
# Print the quality metrics for the phi/theta currently in scope.
evaluate(origin_freq_matrix, phi, theta, dataset)

L				-2980121.0
Average topic correlation	0.0713
Phi nonzeros			33797 	frac	0.410182656715
Theta nonzeros			35599 	frac	0.399203812728
SVM score1			0.7832
SVM score2			0.7931
SVM score3			0.7965


In [116]:
# Experiment: T = 10 topics, SquareLogFunction, 50 iterations.

D, W = origin_freq_matrix.shape
T = 10

# Fixed seed so that re-running the cell yields the same initial matrices.
np.random.seed(42)

# phi: T x W, uniform random values, every row rescaled to sum to 1.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# theta: D x T, uniform random values, every row rescaled to sum to 1.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# Settings handed to launch_em through params_list.
no_selection_params = {
    'method': perform_e_step_update,
    'custom_function': SquareLogFunction()
}

# Object arrays of length 50 (equal to iters_count below; presumably one
# entry per iteration -- confirm against launch_em).
regularizations_list = np.zeros(50, dtype=object)
params_list = np.zeros(50, dtype=object)

# Every slot gets the same regularizer and the same parameter dict.
regularizations_list[:] = trivial_regularization
params_list[:] = no_selection_params

phi, theta = launch_em(
    freq_matrix=origin_freq_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularizations_list=regularizations_list,
    params_list=params_list,
    iters_count=50
)

0 -31350339.8416
1 -26406477.8397
2 -26216373.8224
3 -26084325.3124
4 -25908417.6427
5 -25672515.7077
6 -25377032.3351
7 -25037651.555
8 -24681974.9241
9 -24338840.9888
10 -24031064.4806
11 -23771439.1377
12 -23560573.7802
13 -23391642.7372
14 -23255108.4616
15 -23142952.646
16 -23049513.0136
17 -22970648.3967
18 -22903235.8597
19 -22844883.8049
20 -22793759.9773
21 -22748331.9082
22 -22707935.1705
23 -22672826.9465
24 -22642397.1861
25 -22615621.4664
26 -22591755.8917
27 -22570332.5027
28 -22551071.5056
29 -22533710.8167
30 -22517941.7935
31 -22503489.988
32 -22490307.3704
33 -22478404.5234
34 -22467652.6415
35 -22458006.9024
36 -22449374.4405
37 -22441504.5875
38 -22434228.2823
39 -22427362.5796
40 -22420759.4922
41 -22414437.7122
42 -22408582.6362
43 -22403191.8587
44 -22398166.0645
45 -22393446.1312
46 -22389035.6336
47 -22384909.4459
48 -22381011.9291
49 -22377318.206


In [117]:
# Print the quality metrics for the phi/theta currently in scope.
evaluate(origin_freq_matrix, phi, theta, dataset)

L				-3024289.0
Average topic correlation	0.1109
Phi nonzeros			35953 	frac	0.654523939559
Theta nonzeros			47986 	frac	0.80716568545
SVM score			0.6846


In [118]:
# Experiment: T = 10 topics, FiveLogFunction, 50 iterations.

D, W = origin_freq_matrix.shape
T = 10

# Fixed seed so that re-running the cell yields the same initial matrices.
np.random.seed(42)

# phi: T x W, uniform random values, every row rescaled to sum to 1.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# theta: D x T, uniform random values, every row rescaled to sum to 1.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# Settings handed to launch_em through params_list.
no_selection_params = {
    'method': perform_e_step_update,
    'custom_function': FiveLogFunction()
}

# Object arrays of length 50 (equal to iters_count below; presumably one
# entry per iteration -- confirm against launch_em).
regularizations_list = np.zeros(50, dtype=object)
params_list = np.zeros(50, dtype=object)

# Every slot gets the same regularizer and the same parameter dict.
regularizations_list[:] = trivial_regularization
params_list[:] = no_selection_params

phi, theta = launch_em(
    freq_matrix=origin_freq_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularizations_list=regularizations_list,
    params_list=params_list,
    iters_count=50
)

0 -20248904982.0
1 -15840456607.4
2 -14776791460.4
3 -14363274526.2
4 -14061542642.3


KeyboardInterrupt: 

In [None]:
# NOTE: if the preceding launch_em call was interrupted (KeyboardInterrupt),
# its `phi, theta = ...` assignment never completed, so this evaluates
# whatever phi/theta an earlier cell left in scope.
evaluate(origin_freq_matrix, phi, theta, dataset)

In [142]:
# Experiment: T = 7 topics, IdFunction, 90 iterations.

D, W = origin_freq_matrix.shape
T = 7

# Fixed seed so that re-running the cell yields the same initial matrices.
np.random.seed(42)

# phi: T x W, uniform random values, every row rescaled to sum to 1.
phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

# theta: D x T, uniform random values, every row rescaled to sum to 1.
theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

# Settings handed to launch_em through params_list.
no_selection_params = {
    'method': perform_e_step_update,
    'custom_function': IdFunction()
}

# NOTE(review): these arrays have 150 slots while iters_count below is 90;
# presumably only the first 90 entries are consumed -- confirm against
# launch_em.
regularizations_list = np.zeros(150, dtype=object)
params_list = np.zeros(150, dtype=object)

# Every slot gets the same regularizer and the same parameter dict.
regularizations_list[:] = trivial_regularization
params_list[:] = no_selection_params

phi, theta = launch_em(
    freq_matrix=origin_freq_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularizations_list=regularizations_list,
    params_list=params_list,
    iters_count=90
)

0 77.5218486778
1 388.466195528
2 1135.96895613
3 1825.39339986
4 2344.26171731
5 2820.35447112
6 3344.50704904
7 3940.77016547
8 4534.19617448
9 5006.29777811
10 5323.01186288
11 5533.93193365
12 5694.63182658
13 5846.82821447
14 6035.38264581
15 6274.55071238
16 6512.64695513
17 6694.65555046
18 6818.43172169
19 6914.03207768
20 6983.9616659
21 7034.56757577
22 7076.3443992
23 7116.54952586
24 7151.07960532
25 7182.64870305
26 7216.93997287
27 7252.73666453
28 7290.84303738
29 7332.86365081
30 7377.66997032
31 7421.44373165
32 7461.44416384
33 7496.55329575
34 7528.39422609
35 7555.03797171
36 7577.24871447
37 7594.86011949
38 7607.97048449
39 7618.80002273
40 7626.02493456
41 7632.33344766
42 7637.80905185
43 7642.28681696
44 7645.85770669
45 7648.9324097
46 7651.86230822
47 7654.80221005
48 7657.82485437
49 7661.02449018
50 7664.49404073
51 7668.21700968
52 7671.99294822
53 7675.52302472
54 7678.59172
55 7681.13886902
56 7683.19820677
57 7684.8345484
58 7686.12803861
59 7687.166472

In [144]:
# Print the quality metrics for the phi/theta currently in scope.
evaluate(origin_freq_matrix, phi, theta, dataset)

L				-18974717.0
Average topic correlation	0.0
Phi nonzeros			7 	frac	0.000182049881668
Theta nonzeros			8379 	frac	0.201345668629
SVM score			0.2267
