In [1]:
import numpy as np

In [3]:
import scipy
import scipy.sparse

In [108]:
FOLDER_TEMPLATE = "E:/Championat_com/Lemmatized_Spoiled_Filtered_Numerized/";

In [109]:
from collections import Counter

def create_frequency_matrix(path):
    row, col, data = [], [], []
    with open(path, 'r') as f:
        for i, line in enumerate(f.xreadlines()):
            tokens = map(int, line.split(' '))
            assert tokens[0] == len(tokens) - 1
            words_cnt = Counter(tokens[1:])
            for w, c in words_cnt.iteritems():
                row.append(i)
                col.append(w)
                data.append(c)
    return scipy.sparse.csr_matrix((data, (row, col)))

In [110]:
origin_freq_matrix = create_frequency_matrix(FOLDER_TEMPLATE + 'numerized.dat')

In [571]:
D, W = origin_freq_matrix.shape
T = 10

In [573]:
np.random.seed(42)

phi_matrix = np.random.uniform(size=(T, W)).astype(np.float64)
phi_matrix /= np.sum(phi_matrix, axis=1)[:, np.newaxis]

theta_matrix = np.random.uniform(size=(D, T)).astype(np.float64)
theta_matrix /= np.sum(theta_matrix, axis=1)[:, np.newaxis]

In [686]:
from itertools import izip

def perform_e_step_update(freq_matrix, docptr, phi_matrix, theta_matrix, block_size=1, e_selection_level=0.01):
    D, W = freq_matrix.shape
    T = phi_matrix.shape[0]
    n_wt, n_dt = np.zeros((W, T)), np.zeros((D, T))
    transposed_phi_matrix = np.transpose(phi_matrix)
    
    indices = freq_matrix.indices
    indptr = freq_matrix.indptr
    data = freq_matrix.data
    
    for block_num in xrange((D + block_size - 1) / block_size):
        block_start = block_num * block_size
        block_finish = min(D, block_start + block_size)
        ind_start, ind_finish = indptr[block_start], indptr[block_finish]
        
        datas = data[ind_start:ind_finish]
        words = indices[ind_start:ind_finish]
        docs = docptr[ind_start:ind_finish]
        
        p_dwt = transposed_phi_matrix[words] * theta_matrix[docs, :]
        maximums = np.max(p_dwt, axis=1)
        p_dwt[p_dwt < e_selection_level * maximums[:, np.newaxis]] = 0.
        p_dwt /= (np.sum(p_dwt, axis=1)[:, np.newaxis] + 1e-10)
        p_dwt *= datas[:, np.newaxis]
        
        for doc_num in xrange(block_start, block_finish):
            doc_start, doc_finish = indptr[doc_num], indptr[doc_num + 1]
            doc_p_dwt = p_dwt[(doc_start - ind_start):(doc_finish - ind_start), :]
            n_dt[doc_num, :] += np.sum(doc_p_dwt, axis=0)
            n_wt[indices[doc_start:doc_finish], :] += doc_p_dwt
    return n_wt, n_dt

def trivial_regularization(n_wt, n_dt, phi_matrix, theta_matrix):
    return 0., 0.


def calculate_decorr(phi_matrix):
    aggr_phi = np.sum(phi_matrix, axis=1)
    return np.sum(phi_matrix * (aggr_phi[:, np.newaxis] - phi_matrix))

def create_reg_decorr_naive(tau):
    def fun (n_wt, n_dt, phi_matrix, theta_matrix):
        aggr_phi = np.sum(phi_matrix, axis=1)
        return - tau * np.transpose(phi_matrix * (aggr_phi[:, np.newaxis] - phi_matrix)), -0.1
    return fun

def create_reg_decorr_unbiased(tau):
    def fun (n_wt, n_dt, phi_matrix, theta_matrix):
        tmp_phi =  n_wt / np.sum(n_wt, axis=0)
        aggr_phi = np.sum(tmp_phi, axis=0)
        return - tau * tmp_phi * (aggr_phi[np.newaxis, :] - tmp_phi), -0.1
    return fun

def calculate_likelihood(freq_matrix, docptr, phi_matrix, theta_matrix, block_size=1):
    D, W = freq_matrix.shape
    T = phi_matrix.shape[0]
    transposed_phi_matrix = np.transpose(phi_matrix)
    
    indices = freq_matrix.indices
    indptr = freq_matrix.indptr
    data = freq_matrix.data
    
    res = 0.
    for block_num in xrange((D + block_size - 1) / block_size):
        block_start = block_num * block_size
        block_finish = min(D, block_start + block_size)
        ind_start, ind_finish = indptr[block_start], indptr[block_finish]
        
        datas = data[ind_start:ind_finish]
        words = indices[ind_start:ind_finish]
        docs = docptr[ind_start:ind_finish]
        
        p_dwt = transposed_phi_matrix[words] * theta_matrix[docs, :]
        res += np.sum(np.log(np.sum(p_dwt, axis=1) + 1e-10) * datas)
    
    return res


def create_decorr_logger(tau):
    def fun(freq_matrix, docptr, phi_matrix, theta_matrix):
        likelihood = calculate_likelihood(freq_matrix, docptr, phi_matrix, theta_matrix, 50)
        decorr = calculate_decorr(phi_matrix)
        print 'L', likelihood
        print 'decorr', decorr
        print 'L + tau R', likelihood - tau * decorr
        print 'Zeros', np.sum(phi_matrix <= 1e-20)
    return fun
        

def launch_em(freq_matrix, phi_matrix, theta_matrix, logger, regularization=trivial_regularization, iters_count=100, e_selection_level=0.0001):
    phi_matrix = np.array(phi_matrix)
    theta_matrix = np.array(theta_matrix)
    docptr = []
    indptr = freq_matrix.indptr
    for doc_num in xrange(D):
        docptr.extend([doc_num] * (indptr[doc_num + 1] - indptr[doc_num]))
    docptr = np.array(docptr)
    
    
    for it in xrange(iters_count):
        print it
        n_wt, n_dt = perform_e_step_update(freq_matrix, docptr, phi_matrix, theta_matrix, 50, e_selection_level=e_selection_level)
        r_wt, r_dt = regularization(n_wt, n_dt, phi_matrix, theta_matrix)
        n_wt = np.maximum(n_wt + r_wt, 0)
        n_dt = np.maximum(n_dt + r_dt, 0)
        n_wt /= np.sum(n_wt, axis=0)
        n_dt /= np.sum(n_dt, axis=1)[:, np.newaxis]
        phi_matrix = np.transpose(n_wt)
        theta_matrix = n_dt
        logger(freq_matrix, docptr, phi_matrix, theta_matrix)
        
    return phi_matrix, theta_matrix


In [698]:
%%time
_ = launch_em(
    origin_freq_matrix,
    phi_matrix, theta_matrix,
    create_decorr_logger(1e5),
    create_reg_decorr_unbiased(1e5),
    50,
    e_selection_level=0.5
)

0
L -42532902.8965
decorr 9.98291994257
L + tau R -43531194.8907
Zeros 43867
1
L -42811428.1155
decorr 9.97958949002
L + tau R -43809387.0645
Zeros 74319
2
L -42929201.834
decorr 9.97849571803
L + tau R -43927051.4058
Zeros 90198
3
L -42951047.1659
decorr 9.97826220945
L + tau R -43948873.3869
Zeros 98857
4
L -42945877.2078
decorr 9.97822483339
L + tau R -43943699.6912
Zeros 103805
5
L -42932852.2481
decorr 9.97820330449
L + tau R -43930672.5786
Zeros 106750
6
L -42917261.2069
decorr 9.97816588762
L + tau R -43915077.7956
Zeros 108669
7
L -42900735.3102
decorr 9.97812686409
L + tau R -43898547.9966
Zeros 109949
8
L -42884528.5585
decorr 9.97809804751
L + tau R -43882338.3633
Zeros 110868
9
L -42868913.6814
decorr 9.97807569655
L + tau R -43866721.251
Zeros 111562
10
L -42854135.7527
decorr 9.97806385384
L + tau R -43851942.1381
Zeros 112080
11
L -42840821.992
decorr 9.9780576592
L + tau R -43838627.7579
Zeros 112487
12
L -42829272.0524
decorr 9.97804167507
L + tau R -43827076.2199
Zero

In [695]:
%%time
new_phi, new_theta = launch_em(
    origin_freq_matrix,
    phi_matrix, theta_matrix,
    create_decorr_logger(1e5),
    create_reg_decorr_unbiased(1e5),
    50,
    e_selection_level=0.
)

0
L -42319446.8746
decorr 9.98810687418
L + tau R -43318257.5621
Zeros 0
1
L -42301186.169
decorr 9.98821443901
L + tau R -43300007.6129
Zeros 0
2
L -42283310.9197
decorr 9.98828966701
L + tau R -43282139.8864
Zeros 0
3
L -42263512.1913
decorr 9.98833579793
L + tau R -43262345.7711
Zeros 0
4
L -42239068.6731
decorr 9.98835143128
L + tau R -43237903.8162
Zeros 0
5
L -42206051.8284
decorr 9.9883290578
L + tau R -43204884.7342
Zeros 2
6
L -42158277.5061
decorr 9.98825109464
L + tau R -43157102.6156
Zeros 4
7
L -42086105.7353
decorr 9.98808223309
L + tau R -43084913.9586
Zeros 6
8
L -41976286.4217
decorr 9.98775884382
L + tau R -42975062.306
Zeros 10
9
L -41815847.9308
decorr 9.98718728166
L + tau R -42814566.659
Zeros 10
10
L -41601783.907
decorr 9.98628463234
L + tau R -42600412.3703
Zeros 17
11
L -41350052.4847
decorr 9.98507507356
L + tau R -42348559.992
Zeros 27
12
L -41091100.9597
decorr 9.98373382117
L + tau R -42089474.3419
Zeros 49
13
L -40853500.4684
decorr 9.98248473047
L + tau 

In [702]:
%%time
new_phi2, new_theta2 = launch_em(
    origin_freq_matrix,
    new_phi, new_theta,
    create_decorr_logger(1e5),
    create_reg_decorr_unbiased(1e5),
    10,
    e_selection_level=0.99
)

0
L -39905252.5689
decorr 9.97659495099
L + tau R -40902912.064
Zeros 120729
1
L -39922785.853
decorr 9.97694640201
L + tau R -40920480.4932
Zeros 121261
2
L -39938269.2097
decorr 9.9770283254
L + tau R -40935972.0422
Zeros 121622
3
L -39952455.7592
decorr 9.97698955004
L + tau R -40950154.7142
Zeros 121888
4
L -39964825.5156
decorr 9.97690844591
L + tau R -40962516.3601
Zeros 122121
5
L -39975641.4504
decorr 9.97682740988
L + tau R -40973324.1914
Zeros 122289
6
L -39985210.8665
decorr 9.97674167138
L + tau R -40982885.0336
Zeros 122434
7
L -39993640.4164
decorr 9.97665756512
L + tau R -40991306.1729
Zeros 122539
8
L -40001040.6908
decorr 9.97657563845
L + tau R -40998698.2547
Zeros 122640
9
L -40007254.785
decorr 9.97651193298
L + tau R -41004905.9783
Zeros 122712
Wall time: 18 s


In [703]:
%%time
new_phi3, new_theta3 = launch_em(
    origin_freq_matrix,
    new_phi2, new_theta2,
    create_decorr_logger(1e5),
    create_reg_decorr_unbiased(0),
    10,
    e_selection_level=0.01
)

0
L -39877649.0347
decorr 9.97779748017
L + tau R -40875428.7827
Zeros 122712
1
L -39835963.6742
decorr 9.97824209252
L + tau R -40833787.8835
Zeros 122714
2
L -39816123.7925
decorr 9.97848223471
L + tau R -40813972.0159
Zeros 122716
3
L -39804879.7327
decorr 9.97864002743
L + tau R -40802743.7354
Zeros 122717
4
L -39797778.9475
decorr 9.97875373843
L + tau R -40795654.3213
Zeros 122722
5
L -39792951.4623
decorr 9.97883983538
L + tau R -40790835.4458
Zeros 122731
6
L -39789497.7243
decorr 9.97890687345
L + tau R -40787388.4116
Zeros 122739
7
L -39786937.9894
decorr 9.97896008242
L + tau R -40784833.9976
Zeros 122752
8
L -39784990.3619
decorr 9.97900298772
L + tau R -40782890.6606
Zeros 122758
9
L -39783476.8991
decorr 9.97903810876
L + tau R -40781380.71
Zeros 122770
Wall time: 18.5 s


In [712]:
m = np.transpose(new_phi3)
m /= np.sum(m, axis=1)[:, np.newaxis]
print np.sum(m > 1e-1)
print np.sum(m >= 0)

43489
188310


In [672]:
%%time
launch_em(origin_freq_matrix, phi_matrix, theta_matrix, create_decorr_logger(1e5), create_reg_decorr_unbiased(1e5), 50)

0
L -42532902.8965
decorr 9.98291994257
L + tau R -43531194.8907
1
L -42811428.1155
decorr 9.97958949002
L + tau R -43809387.0645
2
L -42929201.834
decorr 9.97849571803
L + tau R -43927051.4058
3
L -42951047.1659
decorr 9.97826220945
L + tau R -43948873.3869
4
L -42945877.2078
decorr 9.97822483339
L + tau R -43943699.6912
5
L -42932852.2481
decorr 9.97820330449
L + tau R -43930672.5786
6
L -42917261.2069
decorr 9.97816588762
L + tau R -43915077.7956
7
L -42900735.3102
decorr 9.97812686409
L + tau R -43898547.9966
8
L -42884528.5585
decorr 9.97809804751
L + tau R -43882338.3633
9
L -42868913.6814
decorr 9.97807569655
L + tau R -43866721.251
10
L -42854135.7527
decorr 9.97806385384
L + tau R -43851942.1381
11
L -42840821.992
decorr 9.9780576592
L + tau R -43838627.7579
12
L -42829272.0524
decorr 9.97804167507
L + tau R -43827076.2199
13
L -42819254.794
decorr 9.97801139673
L + tau R -43817055.9337
14
L -42810423.1762
decorr 9.97797166149
L + tau R -43808220.3424
15
L -42802740.9306
decor

In [663]:
%%time
launch_em(origin_freq_matrix, phi_matrix, theta_matrix, create_decorr_logger(1e5), create_reg_decorr_naive(1e5), 50)

0
L -44703155.621
decorr 9.98477366634
L + tau R -45701632.9877
1
L -44392873.5262
decorr 9.9877519217
L + tau R -45391648.7184
2
L -44371442.8485
decorr 9.98719441523
L + tau R -45370162.2901
3
L -44352187.8109
decorr 9.98741005976
L + tau R -45350928.8169
4
L -44327840.773
decorr 9.98737881288
L + tau R -45326578.6543
5
L -44293214.5751
decorr 9.98733562654
L + tau R -45291948.1377
6
L -44239956.5305
decorr 9.98718424113
L + tau R -45238674.9546
7
L -44154766.0403
decorr 9.98686773175
L + tau R -45153452.8135
8
L -44020845.5723
decorr 9.98625785406
L + tau R -45019471.3577
9
L -43827203.0448
decorr 9.9852095901
L + tau R -44825724.0038
10
L -43584495.9021
decorr 9.98370117895
L + tau R -44582866.02
11
L -43327048.129
decorr 9.98196394365
L + tau R -44325244.5233
12
L -43090945.0468
decorr 9.98033560952
L + tau R -44088978.6078
13
L -42895831.9808
decorr 9.97905289336
L + tau R -43893737.2701
14
L -42744546.6119
decorr 9.97818881533
L + tau R -43742365.4934
15
L -42630341.8494
decorr 

In [665]:
%%time
launch_em(origin_freq_matrix, phi_matrix, theta_matrix, create_decorr_logger(1e5), create_reg_decorr_unbiased(1e5), 50)

0
L -42319446.8746
decorr 9.98810687418
L + tau R -43318257.5621
1
L -42301186.169
decorr 9.98821443901
L + tau R -43300007.6129
2
L -42283310.9197
decorr 9.98828966701
L + tau R -43282139.8864
3
L -42263512.1913
decorr 9.98833579793
L + tau R -43262345.7711
4
L -42239068.6731
decorr 9.98835143128
L + tau R -43237903.8162
5
L -42206051.8284
decorr 9.9883290578
L + tau R -43204884.7342
6
L -42158277.5061
decorr 9.98825109464
L + tau R -43157102.6156
7
L -42086105.7353
decorr 9.98808223309
L + tau R -43084913.9586
8
L -41976286.4217
decorr 9.98775884382
L + tau R -42975062.306
9
L -41815847.9308
decorr 9.98718728166
L + tau R -42814566.659
10
L -41601783.907
decorr 9.98628463234
L + tau R -42600412.3703
11
L -41350052.4847
decorr 9.98507507356
L + tau R -42348559.992
12
L -41091100.9597
decorr 9.98373382117
L + tau R -42089474.3419
13
L -40853500.4684
decorr 9.98248473047
L + tau R -41851748.9414
14
L -40653236.2392
decorr 9.9814851235
L + tau R -41651384.7516
15
L -40493767.0027
decorr 