# Оптимальные питоновские реализация оптимизации ARTM

# + Формулы


In [36]:
import numpy as np
from numpy.core.umath_tests import inner1d
import scipy
import scipy.sparse
from sklearn.datasets import fetch_20newsgroups
import gensim
from collections import Counter
import heapq
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import time
%matplotlib inline

# Разные функции потерь

In [132]:
class LogFunction(object):
    def calc(self, x):
        return np.log(x + 1e-20)
    def calc_der(self, x):
        return 1. / (x + 1e-20)
    

class IdFunction(object):
    def calc(self, x):
        return x + 1e-20
    def calc_der(self, x):
        return np.ones_like(x)
    

class SquareFunction(object):
    def calc(self, x):
        return (x + 1e-20) ** 2
    def calc_der(self, x):
        return 2. * (x + 1e-20) ** 2
    

class CubeLogFunction(object):
    def calc(self, x):
        return np.log(x + 1e-20) ** 3
    def calc_der(self, x):
        return 3. * np.log(x + 1e-20) ** 2 / (x + 1e-20)
    

class SquareLogFunction(object):
    def calc(self, x):
        return np.log(x + 1e-20) * np.abs(np.log(x + 1e-20))
    def calc_der(self, x):
        return 2. * np.abs(np.log(x + 1e-20)) / (x + 1e-20)

    
class FiveLogFunction(object):
    def calc(self, x):
        return np.log(x + 1e-20) ** 5
    def calc_der(self, x):
        return 5. * np.log(x + 1e-20) ** 4 / (x + 1e-20)
    

class CubeRootLogFunction(object):
    def calc(self, x):
        return np.cbrt(np.log(x + 1e-20))
    def calc_der(self, x):
        return 1. / 3 / (np.cbrt(np.log(x + 1e-20)) ** 2) / (x + 1e-20)
    
    
class SquareRootLogFunction(object):
    def calc(self, x):
        return np.sqrt(- np.log(x + 1e-20))
    def calc_der(self, x):
        return 1. / 2. / np.sqrt(- np.log(x + 1e-20)) / (x + 1e-20)
    

class ExpFunction(object):
    def calc(self, x):
        return np.exp(x)
    def calc_der(self, x):
        return np.exp(x)

    
class EntropyFunction(object):
    def calc(self, x):
        return (np.log(x + 1e-20) + 50.) * (x + 1e-20)
    def calc_der(self, x):
        return np.log(x + 1e-20) + 50.

# Разные регуляризации

In [133]:
def trivial_regularization(n_tw, n_dt):
    return 0., 0.

def create_reg_decorr(tau, theta_alpha=0.):
    def fun(n_tw, n_dt):
        phi_matrix = n_tw / np.sum(n_tw, axis=1)[:, np.newaxis]
        theta_matrix = n_dt / np.sum(n_dt, axis=1)[:, np.newaxis]
        aggr_phi = np.sum(phi_matrix, axis=1)
        return - tau * np.transpose(phi_matrix * (aggr_phi[:, np.newaxis] - phi_matrix)), theta_alpha
    return fun

def create_reg_lda(phi_alpha, theta_alpha):
    def fun (n_tw, n_dt):
        return phi_alpha, theta_alpha
    return fun


# Подготовка Датасета

Нужно скачать некоторые коллекции данных и установить библиотеки (nltk, gensim)

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/tylorn/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
english_stopwords = set(stopwords.words('english'))

In [33]:
def prepare_dataset(dataset):
    # remove stopwords
    occurences = Counter()
    for i, doc in enumerate(dataset.data):
        tokens = gensim.utils.lemmatize(doc)
        for token in set(tokens):
            occurences[token] += 1
        if i % 500 == 0:
            print 'Processed: ', i, 'documents from', len(dataset.data)
    
    row, col, data = [], [], []
    token_2_num = {}
    not_empty_docs_number = 0
    doc_targets = []
    for doc, target in zip(dataset.data, dataset.target):
        tokens = gensim.utils.lemmatize(doc)
        cnt = Counter()
        for token in tokens:
            word = token.split('/')[0]
            if word not in english_stopwords and 3 <= occurences[token]:
                if token not in token_2_num:
                    token_2_num[token] = len(token_2_num)
                cnt[token_2_num[token]] += 1
        
        if len(cnt) > 0:
            for w, c in cnt.iteritems():
                row.append(not_empty_docs_number)
                col.append(w)
                data.append(c)
            not_empty_docs_number += 1
            doc_targets.append(target)
        
    num_2_token = {
        v: k
        for k, v in token_2_num.iteritems()
    }
    print 'Nonzero values:', len(data)
    return scipy.sparse.csr_matrix((data, (row, col))), token_2_num, num_2_token, doc_targets


In [34]:
dataset = fetch_20newsgroups(
    subset='all',
    categories=['sci.electronics', 'sci.med', 'sci.space', 'sci.crypt', 'rec.sport.baseball', 'rec.sport.hockey'],
    remove=('headers', 'footers', 'quotes')
)

In [35]:
%%time
origin_n_dw_matrix, token_2_num, num_2_token, doc_targets = prepare_dataset(dataset)

Processed:  0 documents from 5945
Processed:  500 documents from 5945
Processed:  1000 documents from 5945
Processed:  1500 documents from 5945
Processed:  2000 documents from 5945
Processed:  2500 documents from 5945
Processed:  3000 documents from 5945
Processed:  3500 documents from 5945
Processed:  4000 documents from 5945
Processed:  4500 documents from 5945
Processed:  5000 documents from 5945
Processed:  5500 documents from 5945
Nonzero values: 322664
CPU times: user 4min 26s, sys: 64 ms, total: 4min 26s
Wall time: 4min 26s


In [209]:
big_dataset = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes')
)

In [210]:
%%time
big_origin_n_dw_matrix, _, _, big_doc_targets = prepare_dataset(big_dataset)

Processed:  0 documents from 18846
Processed:  500 documents from 18846
Processed:  1000 documents from 18846
Processed:  1500 documents from 18846
Processed:  2000 documents from 18846
Processed:  2500 documents from 18846
Processed:  3000 documents from 18846
Processed:  3500 documents from 18846
Processed:  4000 documents from 18846
Processed:  4500 documents from 18846
Processed:  5000 documents from 18846
Processed:  5500 documents from 18846
Processed:  6000 documents from 18846
Processed:  6500 documents from 18846
Processed:  7000 documents from 18846
Processed:  7500 documents from 18846
Processed:  8000 documents from 18846
Processed:  8500 documents from 18846
Processed:  9000 documents from 18846
Processed:  9500 documents from 18846
Processed:  10000 documents from 18846
Processed:  10500 documents from 18846
Processed:  11000 documents from 18846
Processed:  11500 documents from 18846
Processed:  12000 documents from 18846
Processed:  12500 documents from 18846
Processed:

# Вычисление правдоподобных функций

### имеется в виду вычисление функций вида $\sum_{dw} n_{dw} f(\sum_{t} \phi_{wt} \theta_{td})$

In [17]:
def create_calculate_likelihood_like_function(n_dw_matrix, loss_function=LogFunction()):
    D, W = n_dw_matrix.shape
    docptr = []
    indptr = n_dw_matrix.indptr
    for doc_num in xrange(D):
        docptr.extend([doc_num] * (indptr[doc_num + 1] - indptr[doc_num]))
    docptr = np.array(docptr)
    wordptr = n_dw_matrix.indices
    
    def fun(phi_matrix, theta_matrix):
        s_data = loss_function.calc(inner1d(theta_matrix[docptr, :], np.transpose(phi_matrix)[wordptr, :]))
        return np.sum(n_dw_matrix.data * s_data)

    return fun

# EM алгоритм

## Общая схема:
#### Неоходимо сначала вычислить $p_{tdw} = \frac{\phi_{wt} \theta_{td}}{\sum_s \phi_{ws} \theta_{sd}}$
#### Считаем $n_{wt} = \sum_d n_{dw} p_{tdw}$ и $n_{td} = \sum_w n_{dw} p_{tdw}$
#### Вычисляем $r_{wt}, r_{td}$ как функцию от $n_{wt}, n_{td}$
#### Прибавляем, делаем положительную срезку и нормируем

## Оптимизация вычисления:
#### Обозначим за $s_{dw}$ следующее выражение $\sum_t \phi_{wt} \theta_{td}$, фактически это наше предсказание для вероятности
####  Тогда $p_{tdw} = \frac{\phi_{wt} \theta_{td}}{s_{dw}}$
#### Подставим это выражение например в $n_wt$
#### И получим, что $n_{wt} = \sum_d n_{dw} \frac{\phi_{wt} \theta_{td}}{s_{dw}} = \phi_{wt} \sum_d \theta_{td} \cdot \frac{n_{dw}}{s_{dw}}$, аналогично $n_{td} = \theta_{td} \sum_w \phi_{wt} \cdot \frac{n_{dw}}{s_{dw}}$
#### Таким образом, мы видим, что фактически нам нужно знать матрицу $\frac{n_{dw}}{s_{dw}}$, а она очень разреженная, поэтому и $s_{dw}$ нужно не для всех пар вычислять, а только там, где $n_{dw} > 0$. 
#### То есть нам нужно эффективно закодить вычисление разженной матрицы $s_{dw}$ (матрица $n_{dw}$ уже есть в разреженном виде, так как подаётся на вход алгоритма), а затем просто поэлементно поделить
#### Причём хочется, чтобы промежуточные значения $p_{tdw}$ не сохранялись (как мы увидели, они в конечном варианте не важны)
#### Обозначим эту матрицу за $A$. Тогда $n_{wt} = \phi_{wt} (\Theta A)_{tw}$, а $n_{td} = \theta_{td} (A \Phi^T)_{dt}$.
#### Перемножить разреженную матрицу на плотную можно быстро, если правильно её хранить (по строкам, или по столбцам)
#### Если оптимизируется не правдоподобие, какая-то другая функция вида $\sum_{dw} n_{dw} f(s_{dw})$ (правдоподобие будет, если $f(x) = \ln x$ ) , то в этом случае нужно определить матрицу $A$ как $A_{dw} = n_{dw} f'(s_{dw})$

In [134]:
def em_optimization(
    n_dw_matrix, 
    phi_matrix,
    theta_matrix,
    regularization_list,
    iters_count=100,
    loss_function=LogFunction(),
    iteration_callback=None
):
    D, W = n_dw_matrix.shape
    T = phi_matrix.shape[0]
    phi_matrix = np.copy(phi_matrix)
    theta_matrix = np.copy(theta_matrix)
    docptr = []
    indptr = n_dw_matrix.indptr
    for doc_num in xrange(D):
        docptr.extend([doc_num] * (indptr[doc_num + 1] - indptr[doc_num]))
    docptr = np.array(docptr)
    wordptr = n_dw_matrix.indices
    
    start_time = time.time()
    for it in xrange(iters_count):
        phi_matrix_tr = np.transpose(phi_matrix)
        # следующая строчка это 60% времени работы алгоритма
        s_data = loss_function.calc_der(inner1d(theta_matrix[docptr, :], phi_matrix_tr[wordptr, :]))
        # следующая часть это 25% времени работы алгоритма
        A = scipy.sparse.csr_matrix(
            (
                n_dw_matrix.data * s_data, 
                n_dw_matrix.indices, 
                n_dw_matrix.indptr
            ), 
            shape=n_dw_matrix.shape
        )
        A_tr = A.tocsc().transpose()
        # Остальное это 15% времени
        n_tw = np.transpose(A_tr.dot(theta_matrix)) * phi_matrix
        n_dt = A.dot(phi_matrix_tr) * theta_matrix
        
        r_tw, r_dt = regularization_list[it](n_tw, n_dt)
        n_tw += r_tw
        n_dt += n_dt
        n_tw[n_tw < 0] = 0
        n_dt[n_dt < 0] = 0
        
        phi_matrix = n_tw / np.sum(n_tw, axis=1)[:, np.newaxis]
        theta_matrix = n_dt / np.sum(n_dt, axis=1)[:, np.newaxis]
        
        if iteration_callback is not None:
            iteration_callback(it, phi_matrix, theta_matrix)
    
    print 'Iters time', time.time() - start_time
    return phi_matrix, theta_matrix

# Naive thetaless EM


### Основная идея: давайте вообще не хранить $\Theta$, а вместо этого вычислять её на лету одной итерацией ЕМ алгоритма, которую можно легко выписать.

##### Пусть тематический профиль документа инициализирован равномерно, то для этого документа $p_{tdw} = \frac{\phi_{wt}}{\sum_s \phi_s} \equiv \overline{\phi}_{wt} \equiv (\overline{\Phi})_{wt} \equiv p(t~|~w)$ . Эту матрицу легко рассчитать.
##### На первой итерации  будет подсчитано $n_{td} = \sum_{d} n_{dw} p_{tdw} = \sum_{d} n_{dw} (\overline{\Phi})_{wt} = (N\overline{\Phi})_{dt}$
##### И, соответственно, $\theta_{td} = \frac{n_{td}}{\sum_t n_{td}} =  \frac{n_{td}}{n_d}$
##### Введём матрицу $B_{dw} \equiv \frac{n_{dw}}{n_d}$, тогда $\Theta = B \Phi^T$ 

In [135]:
def naive_thetaless_em_optimization(
    n_dw_matrix, 
    phi_matrix,
    regularization_list,
    iters_count=100,
    iteration_callback=None
):
    D, W = n_dw_matrix.shape
    T = phi_matrix.shape[0]
    phi_matrix = np.copy(phi_matrix)
    docptr = []
    indptr = n_dw_matrix.indptr
    for doc_num in xrange(D):
        docptr.extend([doc_num] * (indptr[doc_num + 1] - indptr[doc_num]))
    docptr = np.array(docptr)
    wordptr = n_dw_matrix.indices
    
    start_time = time.time()
    for it in xrange(iters_count):
        phi_rev_matrix = np.transpose(phi_matrix / np.sum(phi_matrix, axis=0))
        theta_matrix = n_dw_matrix.dot(phi_rev_matrix)
        theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]
        phi_matrix_tr = np.transpose(phi_matrix)
        
        s_data = 1. / inner1d(theta_matrix[docptr, :], phi_matrix_tr[wordptr, :])
        A = scipy.sparse.csr_matrix(
            (
                n_dw_matrix.data  * s_data , 
                n_dw_matrix.indices, 
                n_dw_matrix.indptr
            ), 
            shape=n_dw_matrix.shape
        ).tocsc()
            
        n_tw = (A.T.dot(theta_matrix)).T * phi_matrix
        r_tw, _ = regularization_list[it](n_tw, theta_matrix)
        n_tw += r_tw
        n_tw[n_tw < 0] = 0
        phi_matrix = n_tw / np.sum(n_tw, axis=1)[:, np.newaxis]

        if iteration_callback is not None:
            iteration_callback(it, phi_matrix, theta_matrix)
    
    print 'Iters time', time.time() - start_time    
    return phi_matrix, theta_matrix

# ARTM thetaless EM optimization

In [297]:
def artm_thetaless_em_optimization(
    n_dw_matrix, 
    phi_matrix,
    iters_count=100,
    iteration_callback=None
):
    D, W = n_dw_matrix.shape
    T = phi_matrix.shape[0]
    phi_matrix = np.copy(phi_matrix)
    docptr = []
    docsizes = []
    indptr = n_dw_matrix.indptr
    for doc_num in xrange(D):
        size = indptr[doc_num + 1] - indptr[doc_num]
        docptr.extend([doc_num] * size)
        docsizes.extend([size] * size)
    docptr = np.array(docptr)
    wordptr = n_dw_matrix.indices
    docsizes = np.array(docsizes)
    
    B = scipy.sparse.csr_matrix(
        (
            1. * n_dw_matrix.data  / docsizes, 
            n_dw_matrix.indices, 
            n_dw_matrix.indptr
        ), 
        shape=n_dw_matrix.shape
    ).tocsc()
    
    start_time = time.time()
    for it in xrange(iters_count):
        word_norm = np.sum(phi_matrix, axis=0)
        phi_rev_matrix = np.transpose(phi_matrix / word_norm)
        
        theta_matrix = n_dw_matrix.dot(phi_rev_matrix)
        theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]
        phi_matrix_tr = np.transpose(phi_matrix)
        
        s_data = 1. / inner1d(theta_matrix[docptr, :], phi_matrix_tr[wordptr, :])
        A = scipy.sparse.csr_matrix(
            (
                n_dw_matrix.data  * s_data , 
                n_dw_matrix.indices, 
                n_dw_matrix.indptr
            ), 
            shape=n_dw_matrix.shape
        ).tocsc()
            
        n_tw = A.T.dot(theta_matrix).T * phi_matrix
        g_dt = A.dot(phi_matrix_tr)
        tmp = g_dt.T * B
        r_tw = (tmp / word_norm - np.einsum('ij,ji->i', phi_matrix_tr, tmp) / (word_norm ** 2)) * phi_matrix
        
        n_tw += r_tw
        n_tw[n_tw < 0] = 0
        phi_matrix = n_tw / np.sum(n_tw, axis=1)[:, np.newaxis]

        if iteration_callback is not None:
            iteration_callback(it, phi_matrix, theta_matrix)
    
    print 'Iters time', time.time() - start_time    
    return phi_matrix, theta_matrix

# Gradient Descent

In [313]:
def gradient_optimization(
    n_dw_matrix, 
    phi_matrix,
    theta_matrix,
    regularization_gradient_list,
    iters_count=100,
    loss_function=LogFunction(),
    iteration_callback=None,
    learning_rate=1.
):
    D, W = n_dw_matrix.shape
    T = phi_matrix.shape[0]
    phi_matrix = np.copy(phi_matrix)
    theta_matrix = np.copy(theta_matrix)
    docptr = []
    indptr = n_dw_matrix.indptr
    for doc_num in xrange(D):
        docptr.extend([doc_num] * (indptr[doc_num + 1] - indptr[doc_num]))
    docptr = np.array(docptr)
    wordptr = n_dw_matrix.indices
    
    start_time = time.time()
    for it in xrange(iters_count):
        phi_matrix_tr = np.transpose(phi_matrix)
        # следующая строчка это 60% времени работы алгоритма
        s_data = loss_function.calc_der(inner1d(theta_matrix[docptr, :], phi_matrix_tr[wordptr, :]))
        # следующая часть это 25% времени работы алгоритма
        A = scipy.sparse.csr_matrix(
            (
                n_dw_matrix.data * s_data, 
                n_dw_matrix.indices, 
                n_dw_matrix.indptr
            ), 
            shape=n_dw_matrix.shape
        ).tocsc()
        # Остальное это 15% времени
        g_tw = theta_matrix.T * A
        g_dt = A.dot(phi_matrix_tr)
        
        r_tw, r_dt = regularization_gradient_list[it](phi_matrix, theta_matrix)
        g_tw += r_tw
        g_dt += r_dt
        
        g_tw -= np.sum(g_tw * phi_matrix, axis=1)[:, np.newaxis]
        g_dt -= np.sum(g_dt * theta_matrix, axis=1)[:, np.newaxis]
        
        phi_matrix += g_tw * learning_rate
        theta_matrix += g_dt * learning_rate
        
        phi_matrix[phi_matrix < 0] = 0
        theta_matrix[theta_matrix < 0] = 0
        
        phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]
        theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]
        
        if iteration_callback is not None:
            iteration_callback(it, phi_matrix, theta_matrix)
    
    print 'Iters time', time.time() - start_time  
    return phi_matrix, theta_matrix

# Оценка качества классификации

In [233]:
def svm_score(theta, targets):
    C_2d_range = [1e0, 1e1, 1e2, 1e3, 1e4]
    gamma_2d_range = [1e-3, 1e-2, 1e-1, 1, 1e1]
    for C in C_2d_range:
        for gamma in gamma_2d_range:
            print 'SVM(C={}, gamma={}) score: {}'.format(
                C,
                gamma,
                np.mean(cross_val_score(SVC(C=C, gamma=gamma), theta, targets, scoring='accuracy', cv=4))
            )

# Примеры запусков

# PLSA: EM optimization

In [319]:
D, W = origin_n_dw_matrix.shape
T = 10

np.random.seed(42)

phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=origin_n_dw_matrix
)

total_words_number = origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

phi, theta = em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction(),
    iteration_callback=callback
)

phi, theta = em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction()
)

0 3986.0704926
1 3906.19009155
2 3816.50258883
3 3706.06393687
4 3572.34675498
5 3422.22011816
6 3268.08384344
7 3119.83183247
8 2983.70125846
9 2863.18846394
10 2759.69951822
11 2672.42541819
12 2599.05123244
13 2536.97312372
14 2483.83221649
15 2437.75378852
16 2397.31633049
17 2361.2891356
18 2328.82670318
19 2299.34993209
20 2272.58561051
21 2248.73220363
22 2227.65996306
23 2208.70765597
24 2191.35635596
25 2174.89348508
26 2159.11967244
27 2144.42764304
28 2131.28250351
29 2119.31240838
30 2108.36589959
31 2098.46248252
32 2089.30733262
33 2081.12016961
34 2073.44101797
35 2066.20339851
36 2059.38450385
37 2053.0078317
38 2046.96650212
39 2041.17463077
40 2035.4610121
41 2029.96956402
42 2024.71547536
43 2019.67900783
44 2014.82612812
45 2010.11252984
46 2005.54425925
47 2001.10582942
48 1996.92925567
49 1993.06135312
50 1989.37842384
51 1985.82831163
52 1982.42779562
53 1979.20828384
54 1976.22776342
55 1973.49514204
56 1970.91173228
57 1968.36860342
58 1965.85370816
59 1963.405

In [320]:
phi_plsa_em = phi
theta_plsa_em = theta

In [321]:
1. * np.sum(phi < 1e-20) / np.sum(phi >= 0)

0.672531979977753

In [322]:
svm_score(theta, doc_targets)

SVM(C=1.0, gamma=0.001) score: 0.520351804446
SVM(C=1.0, gamma=0.01) score: 0.681636535081
SVM(C=1.0, gamma=0.1) score: 0.722342929618
SVM(C=1.0, gamma=1) score: 0.744088708797
SVM(C=1.0, gamma=10.0) score: 0.745307372089
SVM(C=10.0, gamma=0.001) score: 0.681113886998
SVM(C=10.0, gamma=0.01) score: 0.722341961247
SVM(C=10.0, gamma=0.1) score: 0.742002348356
SVM(C=10.0, gamma=1) score: 0.750695724108
SVM(C=10.0, gamma=10.0) score: 0.732265501282
SVM(C=100.0, gamma=0.001) score: 0.722167745219
SVM(C=100.0, gamma=0.01) score: 0.738346111969
SVM(C=100.0, gamma=0.1) score: 0.748613120209
SVM(C=100.0, gamma=1) score: 0.749136975602
SVM(C=100.0, gamma=10.0) score: 0.711388382576
SVM(C=1000.0, gamma=0.001) score: 0.738520932913
SVM(C=1000.0, gamma=0.01) score: 0.745133034069
SVM(C=1000.0, gamma=0.1) score: 0.750003457869
SVM(C=1000.0, gamma=1) score: 0.746527606146
SVM(C=1000.0, gamma=10.0) score: 0.693119555581
SVM(C=10000.0, gamma=0.001) score: 0.742871855207
SVM(C=10000.0, gamma=0.01) score

# LDA: EM optimization

In [292]:
D, W = origin_n_dw_matrix.shape
T = 10

np.random.seed(42)

phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = create_reg_lda(-0.1, 0.)

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=origin_n_dw_matrix
)

total_words_number = origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

phi, theta = em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction(),
    iteration_callback=callback
)

phi, theta = em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction()
)

0 3987.37646827
1 3897.4074752
2 3798.04728921
3 3680.90916256
4 3546.86075721
5 3404.40082914
6 3264.35983653
7 3133.42614365
8 3015.05899483
9 2911.23100587
10 2822.68526189
11 2748.84376085
12 2687.63946797
13 2636.38528098
14 2592.68187264
15 2555.07686584
16 2523.26596425
17 2497.10117303
18 2475.8452058
19 2458.50015838
20 2444.08698345
21 2431.83457879
22 2421.18913973
23 2411.9411239
24 2403.92060151
25 2396.88272169
26 2390.67929499
27 2385.21190858
28 2380.39755883
29 2376.12819046
30 2372.32169245
31 2368.94321943
32 2365.92689874
33 2363.16733183
34 2360.63018304
35 2358.32615751
36 2356.24209776
37 2354.33902549
38 2352.5797764
39 2350.94714679
40 2349.42551463
41 2348.00372283
42 2346.66906665
43 2345.42598409
44 2344.2645335
45 2343.1731887
46 2342.13828756
47 2341.14797914
48 2340.19949656
49 2339.29619816
50 2338.44005861
51 2337.62109798
52 2336.83933634
53 2336.09686008
54 2335.38957787
55 2334.70918016
56 2334.04792875
57 2333.42233786
58 2332.83523389
59 2332.27917

In [293]:
1. * np.sum(phi < 1e-20) / np.sum(phi >= 0)

0.8196398776418242

In [267]:
svm_score(theta, doc_targets)

SVM(C=1.0, gamma=0.001) score: 0.169276275902
SVM(C=1.0, gamma=0.01) score: 0.600039134497
SVM(C=1.0, gamma=0.1) score: 0.642485498488
SVM(C=1.0, gamma=1) score: 0.670317711202
SVM(C=1.0, gamma=10.0) score: 0.675196603837
SVM(C=10.0, gamma=0.001) score: 0.599690702441
SVM(C=10.0, gamma=0.01) score: 0.640049379212
SVM(C=10.0, gamma=0.1) score: 0.661270332102
SVM(C=10.0, gamma=1) score: 0.680413644943
SVM(C=10.0, gamma=10.0) score: 0.650666639307
SVM(C=100.0, gamma=0.001) score: 0.640223958695
SVM(C=100.0, gamma=0.01) score: 0.660573589984
SVM(C=100.0, gamma=0.1) score: 0.672237234086
SVM(C=100.0, gamma=1) score: 0.677459353464
SVM(C=100.0, gamma=10.0) score: 0.612393217468
SVM(C=1000.0, gamma=0.001) score: 0.659530108567
SVM(C=1000.0, gamma=0.01) score: 0.662487427154
SVM(C=1000.0, gamma=0.1) score: 0.67293397368
SVM(C=1000.0, gamma=1) score: 0.670677995083
SVM(C=1000.0, gamma=10.0) score: 0.589072486595
SVM(C=10000.0, gamma=0.001) score: 0.661096479529
SVM(C=10000.0, gamma=0.01) score:

# PLSA: naive thetaless EM optimization

In [290]:
D, W = origin_n_dw_matrix.shape
T = 10

np.random.seed(42)

phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=origin_n_dw_matrix
)

total_words_number = origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

phi, theta = naive_thetaless_em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    iteration_callback=callback
)

phi, theta = naive_thetaless_em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    regularization_list=regularization_list,
    iters_count=200
)

0 4028.35177178
1 4021.70284106
2 4010.85417715
3 3992.47680023
4 3962.57281777
5 3918.66130822
6 3858.60944736
7 3779.02320026
8 3678.00890305
9 3557.60111366
10 3424.09415717
11 3286.93928091
12 3155.88858099
13 3037.63331739
14 2934.65975247
15 2846.60272155
16 2771.81820433
17 2708.09139679
18 2653.23728614
19 2605.52734057
20 2563.82805547
21 2527.47902652
22 2496.01808851
23 2468.9673896
24 2445.77497288
25 2425.86697009
26 2408.70801891
27 2393.82768905
28 2380.85744338
29 2369.4991576
30 2359.4620215
31 2350.46863365
32 2342.29522488
33 2334.77867514
34 2327.84692884
35 2321.57292316
36 2316.02284294
37 2311.15125784
38 2306.84215015
39 2302.97900869
40 2299.47429011
41 2296.27278533
42 2293.34037007
43 2290.65809191
44 2288.1943657
45 2285.92025384
46 2283.81069361
47 2281.83589404
48 2279.96375713
49 2278.17297431
50 2276.45806591
51 2274.82519468
52 2273.27885688
53 2271.79915634
54 2270.35432729
55 2268.92474786
56 2267.50940967
57 2266.12473908
58 2264.8033004
59 2263.5754

In [291]:
1. * np.sum(phi < 1e-20) / np.sum(phi >= 0)

0.7207244160177976

In [269]:
svm_score(theta, doc_targets)

SVM(C=1.0, gamma=0.001) score: 0.169276275902
SVM(C=1.0, gamma=0.01) score: 0.805159600936
SVM(C=1.0, gamma=0.1) score: 0.822387866773
SVM(C=1.0, gamma=1) score: 0.832828722538
SVM(C=1.0, gamma=10.0) score: 0.838566713923
SVM(C=10.0, gamma=0.001) score: 0.805159600936
SVM(C=10.0, gamma=0.01) score: 0.822039434717
SVM(C=10.0, gamma=0.1) score: 0.833349797334
SVM(C=10.0, gamma=1) score: 0.839791073024
SVM(C=10.0, gamma=10.0) score: 0.831952559079
SVM(C=100.0, gamma=0.001) score: 0.822213650745
SVM(C=100.0, gamma=0.01) score: 0.833521352232
SVM(C=100.0, gamma=0.1) score: 0.83944058728
SVM(C=100.0, gamma=1) score: 0.838917814679
SVM(C=100.0, gamma=10.0) score: 0.810548679865
SVM(C=1000.0, gamma=0.001) score: 0.833173525093
SVM(C=1000.0, gamma=0.01) score: 0.837179166956
SVM(C=1000.0, gamma=0.1) score: 0.839788289902
SVM(C=1000.0, gamma=1) score: 0.829168859174
SVM(C=1000.0, gamma=10.0) score: 0.780973834044
SVM(C=10000.0, gamma=0.001) score: 0.835612069084
SVM(C=10000.0, gamma=0.01) score:

# PLSA: ARTM thetaless optimization

In [298]:
D, W = origin_n_dw_matrix.shape
T = 10

np.random.seed(42)

phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=origin_n_dw_matrix
)

total_words_number = origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

phi, theta = artm_thetaless_em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    iters_count=200,
    iteration_callback=callback
)

phi, theta = artm_thetaless_em_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    iters_count=200
)

0 4024.0085702
1 3991.50885265
2 3911.89488681
3 3756.1893153
4 3493.3368556
5 3171.316171
6 2915.60988676
7 2746.45369508
8 2635.80508596
9 2560.07395308
10 2505.06796354
11 2462.82826338
12 2429.00455576
13 2400.8921662
14 2377.21732813
15 2358.5833659
16 2343.64807915
17 2330.81632703
18 2319.79565166
19 2310.80408502
20 2303.31341941
21 2297.08052617
22 2291.69432531
23 2286.87792991
24 2282.7277651
25 2278.97319887
26 2275.56004815
27 2272.38892231
28 2269.57526931
29 2267.0551283
30 2264.7551971
31 2262.62370833
32 2260.66444089
33 2258.79769608
34 2257.03570419
35 2255.33731105
36 2253.74794326
37 2252.13412785
38 2250.58292352
39 2249.03407418
40 2247.53274389
41 2245.99435917
42 2244.35455335
43 2242.70489286
44 2241.0986306
45 2239.54583799
46 2238.1159281
47 2236.85119384
48 2235.71748512
49 2234.62317682
50 2233.62029424
51 2232.56358515
52 2231.63564977
53 2230.71625656
54 2230.02567909
55 2229.29650379
56 2228.6848943
57 2227.98746718
58 2227.44599568
59 2226.81180812
60 

In [299]:
1. * np.sum(phi < 1e-20) / np.sum(phi >= 0)

0.8232758620689655

In [300]:
svm_score(theta, doc_targets)

SVM(C=1.0, gamma=0.001) score: 0.169276275902
SVM(C=1.0, gamma=0.01) score: 0.763220998524
SVM(C=1.0, gamma=0.1) score: 0.794023342451
SVM(C=1.0, gamma=1) score: 0.809857267576
SVM(C=1.0, gamma=10.0) score: 0.813160715497
SVM(C=10.0, gamma=0.001) score: 0.762873171385
SVM(C=10.0, gamma=0.01) score: 0.795067911708
SVM(C=10.0, gamma=0.1) score: 0.810033783802
SVM(C=10.0, gamma=1) score: 0.812464819757
SVM(C=10.0, gamma=10.0) score: 0.803590210438
SVM(C=100.0, gamma=0.001) score: 0.79489369568
SVM(C=100.0, gamma=0.01) score: 0.809685956663
SVM(C=100.0, gamma=0.1) score: 0.81281639839
SVM(C=100.0, gamma=1) score: 0.812288788979
SVM(C=100.0, gamma=10.0) score: 0.786367884395
SVM(C=1000.0, gamma=0.001) score: 0.809511135718
SVM(C=1000.0, gamma=0.01) score: 0.811597493636
SVM(C=1000.0, gamma=0.1) score: 0.810376649614
SVM(C=1000.0, gamma=1) score: 0.805855870227
SVM(C=1000.0, gamma=10.0) score: 0.762350515729
SVM(C=10000.0, gamma=0.001) score: 0.811075691931
SVM(C=10000.0, gamma=0.01) score: 

# PLSA: gradient optimization

In [315]:
D, W = origin_n_dw_matrix.shape
T = 10

np.random.seed(42)

phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=origin_n_dw_matrix
)

total_words_number = origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

phi, theta = gradient_optimization(
    n_dw_matrix=origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_gradient_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction(),
    iteration_callback=callback,
    learning_rate=1e-10
)

0 10077.0201076
1 9000.64995398
2 8289.15908634
3 7760.50577064
4 7342.67075424
5 6999.46812428
6 6709.96890275
7 6461.04857097
8 6243.80641146
9 6052.10960786
Iters time 1.23072099686
0 4744.99428753
1 4306.63558239
2 4151.32249368
3 4079.85504326
4 4062.07593218
5 1.17069327399e+14
6 11465.8188711
7 29660.5365078
8 7.83410614953e+15
9 16559.559841
10 17028976319.6
11 5199236221.76
12 70325.7605461
13 18274.1864573
14 5.11136483788e+15
15 27374.0089902
16 2306294156.96
17 15074493884.5
18 677606.721354
19 16185.5916663
20 4.0062959194e+15
21 29529.6108698
22 1129932083.67
23 46022288470.4
24 166171.442295
25 21157.1392253
26 1.27099406952e+16
27 17725.0676726
28 36847389347.8
29 1347481389.94
30 248214.08597
31 14198.86946
32 5.98509703326e+15
33 17770.1129828
34 2577992927.69
35 20001470946.4
36 82782.5048572
37 15688.0297908
38 4.74796193647e+15
39 40389.7578963
40 1588210356.15
41 58005856046.3
42 42491.5431418
43 56419.2937038
44 2.29298387942e+15
45 105230.073947
46 1535393518.5


KeyboardInterrupt: 

In [308]:
1. * np.sum(phi < 1e-20) / np.sum(phi >= 0)

0.7000208565072302

In [309]:
svm_score(theta, doc_targets)

SVM(C=1.0, gamma=0.001) score: 0.169276275902
SVM(C=1.0, gamma=0.01) score: 0.169276275902
SVM(C=1.0, gamma=0.1) score: 0.169276275902
SVM(C=1.0, gamma=1) score: 0.174668379414
SVM(C=1.0, gamma=10.0) score: 0.181637749963
SVM(C=10.0, gamma=0.001) score: 0.169276275902
SVM(C=10.0, gamma=0.01) score: 0.169276275902
SVM(C=10.0, gamma=0.1) score: 0.174321035199
SVM(C=10.0, gamma=1) score: 0.174499607637
SVM(C=10.0, gamma=10.0) score: 0.172241110096
SVM(C=100.0, gamma=0.001) score: 0.169276275902
SVM(C=100.0, gamma=0.01) score: 0.173624412549
SVM(C=100.0, gamma=0.1) score: 0.171713119563
SVM(C=100.0, gamma=1) score: 0.182677118955
SVM(C=100.0, gamma=10.0) score: 0.170674734086


KeyboardInterrupt: 

In [None]:
D, W = big_origin_n_dw_matrix.shape
T = 30

np.random.seed(5242)

phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=big_origin_n_dw_matrix
)

total_words_number = big_origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

phi, theta = em_optimization(
    n_dw_matrix=big_origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    theta_matrix=theta_matrix,
    regularization_list=regularization_list,
    iters_count=200,
    loss_function=LogFunction(),
)

In [None]:
1. * np.sum(phi < 1e-20) / np.sum(phi >= 0)

In [None]:
svm_score(theta, big_doc_targets)

In [None]:
1

In [None]:
D, W = big_origin_n_dw_matrix.shape
T = 30

np.random.seed(42)

phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=big_origin_n_dw_matrix
)

total_words_number = big_origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

phi, theta = naive_thetaless_em_optimization(
    n_dw_matrix=big_origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    regularization_list=regularization_list,
    iters_count=200,
)

In [None]:
1. * np.sum(phi < 1e-20) / np.sum(phi >= 0)

In [None]:
svm_score(theta, big_doc_targets)

In [323]:
D, W = big_origin_n_dw_matrix.shape
T = 30

np.random.seed(42)

phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

regularization_list = np.zeros(200, dtype=object)
regularization_list[:] = trivial_regularization

calc_log_likelihood = create_calculate_likelihood_like_function(
    loss_function=LogFunction(),
    n_dw_matrix=big_origin_n_dw_matrix
)

total_words_number = big_origin_n_dw_matrix.sum()
def callback(it, phi, theta):
    print it,  np.exp(- calc_log_likelihood(phi, theta) / total_words_number)

phi, theta = artm_thetaless_em_optimization(
    n_dw_matrix=big_origin_n_dw_matrix, 
    phi_matrix=phi_matrix,
    iters_count=200,
)

Iters time 191.425027847


In [None]:
1. * np.sum(phi < 1e-20) / np.sum(phi >= 0)

0.9317146165104747

In [None]:
svm_score(theta, big_doc_targets)

SVM(C=1.0, gamma=0.001) score: 0.0534981417557
SVM(C=1.0, gamma=0.01) score: 0.353752725393
SVM(C=1.0, gamma=0.1) score: 0.528856637738
SVM(C=1.0, gamma=1) score: 0.552428969197
SVM(C=1.0, gamma=10.0) score: 0.568894035616
