In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy.special import digamma
from scipy.special import gamma
from scipy.special import gammaln
import time

import pickle

In [2]:
class LDA:
    
    def __init__(self, fn, K):
        data = pd.read_csv(fn,sep='\t',index_col = 0)
        np.random.seed(2)
        data = data.iloc[np.random.choice(range(len(data)),5000)]
        words_in_each_gene = data.groupby(data.index).sum()

        words_in_txt = [np.repeat(np.array(words_in_each_gene.columns), t.astype('int'),axis=0) for t in np.array(np.ceil(words_in_each_gene))]
        allwords = [a for b in words_in_txt for a in b]
        vocabulary = np.unique(allwords)
        vocabulary_in_int = dict(zip(vocabulary, range(len(vocabulary))))
        self.vocabulary_in_int = vocabulary_in_int

        self.K = K
        self.M = len(words_in_each_gene)
        self.V = len(vocabulary)
        print 'M = %d, V = %d, total number of words = %d' % (self.M, self.V, len(allwords))

        W = [[vocabulary_in_int[t] for t in wd] for wd in words_in_txt]
        self.Nd = [len(wd) for wd in W]
        self.alphii = 1/float(self.K)
        self.etaj = 1/float(self.V)
        
        ### generate W matrix: D x [Nd, V]
        self.Wn = [np.zeros([self.Nd[t],self.V]) for t in range(self.M)]
        for d in range(self.M):
            for token in xrange(self.Nd[d]):
                self.Wn[d][token, W[d][token]] = 1 
          
        
        #### initialization -- set the strucutres, values don't matter
        self.phi = [np.ones([t, self.K]) for t in self.Nd]
        self.gammma = np.random.sample([self.M, self.K])
        self.lambbda = np.random.sample([self.K, self.V])

        
    def update_stochastic(self, max_iter, local_max_iter):
        
        self.T = []
        self.ELBO = []
        episron = 1e-3
        iterations = 1
        old_ELBO = 0

        while iterations < max_iter: 
    
            START = time.time()
            
            ##### update local parameters
            
            ## shuffle the samples
            shutffled_idx = range(self.M)
            np.random.shuffle(shutffled_idx)
            for rd_d in shutffled_idx:
                print rd_d
                nd_d = self.Nd[rd_d]
            
                # randomly initialize gamma for this gene
                # & give phi_local a strucutre
                gammma_local = np.random.sample(self.K)
                phi_local = np.zeros([rd_d, self.K])
            
                # optimize the local phi and gamma for this document
                local_iterations = 0
                while local_iterations < local_max_iter:
                
                    old_phi_local = self.phi[rd_d].copy()
                    old_gammma_local = self.gammma[rd_d].copy()
                
                    # update local phi
                    A = np.transpose(np.reshape(np.repeat(digamma(gammma_local), nd_d), [self.K, nd_d]))
                    B = np.dot(self.Wn[rd_d], digamma(np.transpose(self.lambbda)))
                    C = np.transpose(digamma(np.reshape(np.repeat(np.sum(np.transpose(self.lambbda), axis=0), nd_d), [self.K, nd_d]))) 
                
                    phi_local = A+B-C
                    phi_local = np.exp([x - max(x) for x in phi_local])
                    phi_local = np.array([x+1e-10 / sum(x+1e-10) for x in phi_local])
                              
                    # update local gamma 
                    gammma_local = np.sum(phi_local, axis=0) + self.alphii
                    if np.sum(np.linalg.norm(old_phi_local - phi_local) + np.linalg.norm(old_gammma_local - gammma_local)) < episron:
                        print 'Gene converged after %d iterations' % local_iterations
                        break
                    else:
                        self.phi[rd_d] = phi_local
                        self.gammma[rd_d] = gammma_local
                        local_iterations += 1
                                            
                assert ~np.isnan(np.sum(self.phi[rd_d]))
                assert np.sum(self.phi[rd_d] == 0) == 0
                assert ~np.isnan(np.sum(self.gammma[rd_d]))
                                
                    
            ##### update global parameters
            
            kappa = 0.55
            rhot = np.power(iterations + 1, kappa)
            
            ## update intermediate global parameter
            lambbda_med = np.zeros([self.K, self.V])
            for i in xrange(self.K):
                for j in xrange(self.V):
                    lambbda_med[i,j] = self.M * np.dot(self.Wn[rd_d][:,j], phi_local[:,i]) + self.etaj

            self.lambbda = (1-rhot) * self.lambbda  + rhot * lambbda_med
            assert ~np.isnan(np.sum(self.lambbda))


            ##### compute ELBO
            gammma_term = np.sum(np.array([gammaln(np.sum(t)) for t in self.gammma]) - np.sum([gammaln(t) for t in self.gammma],axis=1))
            lambda_term = np.sum(np.array([gammaln(np.sum(t)) for t in self.lambbda]) - np.sum([gammaln(t) for t in self.lambbda],axis=1))
            phi_term = np.sum([np.sum(t * np.log(t)) for t in self.phi])
            ELBO = -gammma_term - lambda_term - phi_term
            self.ELBO.append(ELBO)
            self.T.append(time.time() - START)
    
            print time.time() - START
            print gammma_term, lambda_term, phi_term, ELBO
    
            if np.abs(ELBO - old_ELBO) < episron:
                print 'Converged after %d iterations\n' % iterations
                break
            else:
                iterations += 1
                old_ELBO = ELBO.copy()
                
                
            

In [3]:
genes = LDA("Features.csv",4)
genes.update_stochastic(20,1000)

M = 190, V = 124, total number of words = 502401
165
Gene converged after 24 iterations
74
Gene converged after 44 iterations
22
Gene converged after 53 iterations
82
Gene converged after 32 iterations
54
Gene converged after 22 iterations
112
Gene converged after 44 iterations
139
Gene converged after 55 iterations
35
Gene converged after 27 iterations
129
Gene converged after 72 iterations
71
Gene converged after 35 iterations
119
Gene converged after 65 iterations
157
Gene converged after 24 iterations
130
Gene converged after 44 iterations
93
Gene converged after 46 iterations
49
Gene converged after 34 iterations
169
Gene converged after 29 iterations
60
Gene converged after 50 iterations
92
Gene converged after 49 iterations
164
Gene converged after 37 iterations
76
Gene converged after 35 iterations
156
Gene converged after 38 iterations
127
Gene converged after 38 iterations
88
Gene converged after 54 iterations
75
Gene converged after 59 iterations
30
Gene converged after 42 i

Gene converged after 22 iterations
96
Gene converged after 25 iterations
26
Gene converged after 21 iterations
45
Gene converged after 35 iterations
127
Gene converged after 34 iterations
50
Gene converged after 23 iterations
35
Gene converged after 29 iterations
182
Gene converged after 26 iterations
123
Gene converged after 26 iterations
34
Gene converged after 51 iterations
47
Gene converged after 33 iterations
72
Gene converged after 43 iterations
135
Gene converged after 33 iterations
145
Gene converged after 29 iterations
54
Gene converged after 28 iterations
76
Gene converged after 26 iterations
133
Gene converged after 36 iterations
139
Gene converged after 31 iterations
154
Gene converged after 29 iterations
6
Gene converged after 30 iterations
169
Gene converged after 23 iterations
95
Gene converged after 31 iterations
162
Gene converged after 76 iterations
82
Gene converged after 31 iterations
33
Gene converged after 40 iterations
159
Gene converged after 29 iterations
0
Gen

Gene converged after 10 iterations
57
Gene converged after 7 iterations
96
Gene converged after 6 iterations
175
Gene converged after 9 iterations
108
Gene converged after 9 iterations
68
Gene converged after 9 iterations
173
Gene converged after 8 iterations
41
Gene converged after 8 iterations
31
Gene converged after 7 iterations
154
Gene converged after 9 iterations
155
Gene converged after 6 iterations
59
Gene converged after 7 iterations
93
Gene converged after 7 iterations
15
Gene converged after 10 iterations
77
Gene converged after 8 iterations
75
Gene converged after 9 iterations
105
Gene converged after 13 iterations
78
Gene converged after 8 iterations
5
Gene converged after 7 iterations
10
Gene converged after 8 iterations
16
Gene converged after 6 iterations
181
Gene converged after 10 iterations
104
Gene converged after 7 iterations
149
Gene converged after 7 iterations
148
Gene converged after 10 iterations
118
Gene converged after 9 iterations
26
Gene converged after 6 

Gene converged after 18 iterations
10
Gene converged after 20 iterations
27
Gene converged after 18 iterations
159
Gene converged after 21 iterations
78
Gene converged after 17 iterations
160
Gene converged after 22 iterations
125
Gene converged after 18 iterations
33
Gene converged after 19 iterations
173
Gene converged after 27 iterations
15
Gene converged after 17 iterations
19
Gene converged after 18 iterations
178
Gene converged after 18 iterations
156
Gene converged after 20 iterations
151
Gene converged after 22 iterations
50
Gene converged after 18 iterations
88
Gene converged after 22 iterations
76
Gene converged after 17 iterations
116
Gene converged after 21 iterations
35
Gene converged after 14 iterations
189
Gene converged after 24 iterations
77
Gene converged after 20 iterations
122
Gene converged after 17 iterations
90
Gene converged after 19 iterations
13
Gene converged after 18 iterations
37
Gene converged after 23 iterations
187
Gene converged after 28 iterations
162


Gene converged after 16 iterations
39
Gene converged after 31 iterations
167
Gene converged after 16 iterations
132
Gene converged after 19 iterations
57
Gene converged after 12 iterations
12
Gene converged after 15 iterations
87
Gene converged after 14 iterations
1
Gene converged after 18 iterations
59
Gene converged after 10 iterations
97
Gene converged after 17 iterations
7
Gene converged after 10 iterations
174
Gene converged after 13 iterations
184
Gene converged after 16 iterations
26
Gene converged after 15 iterations
24
Gene converged after 11 iterations
14
Gene converged after 14 iterations
44
Gene converged after 13 iterations
41
Gene converged after 13 iterations
166
Gene converged after 12 iterations
5
Gene converged after 11 iterations
13
Gene converged after 21 iterations
81
Gene converged after 14 iterations
181
Gene converged after 18 iterations
106
Gene converged after 13 iterations
88
Gene converged after 13 iterations
144
Gene converged after 14 iterations
56
Gene co

Gene converged after 13 iterations
39
Gene converged after 16 iterations
138
Gene converged after 14 iterations
106
Gene converged after 19 iterations
56
Gene converged after 16 iterations
89
Gene converged after 15 iterations
86
Gene converged after 13 iterations
19
Gene converged after 20 iterations
173
Gene converged after 30 iterations
160
Gene converged after 14 iterations
94
Gene converged after 15 iterations
148
Gene converged after 19 iterations
128
Gene converged after 17 iterations
143
Gene converged after 13 iterations
184
Gene converged after 15 iterations
124
Gene converged after 14 iterations
85
Gene converged after 18 iterations
7
Gene converged after 13 iterations
157
Gene converged after 20 iterations
121
Gene converged after 20 iterations
17
Gene converged after 12 iterations
83
Gene converged after 17 iterations
2
Gene converged after 15 iterations
104
Gene converged after 16 iterations
167
Gene converged after 20 iterations
78
Gene converged after 14 iterations
76
G

Gene converged after 20 iterations
40
Gene converged after 18 iterations
61
Gene converged after 24 iterations
20
Gene converged after 27 iterations
27
Gene converged after 22 iterations
62
Gene converged after 24 iterations
0
Gene converged after 14 iterations
142
Gene converged after 22 iterations
15
Gene converged after 27 iterations
44
Gene converged after 19 iterations
35
Gene converged after 20 iterations
93
Gene converged after 29 iterations
123
Gene converged after 25 iterations
157
Gene converged after 21 iterations
51
Gene converged after 23 iterations
46
Gene converged after 32 iterations
76
Gene converged after 20 iterations
70
Gene converged after 21 iterations
184
Gene converged after 18 iterations
14
Gene converged after 18 iterations
113
Gene converged after 19 iterations
108
Gene converged after 36 iterations
148
Gene converged after 24 iterations
109
Gene converged after 20 iterations
37
Gene converged after 25 iterations
180
Gene converged after 17 iterations
23
Gene

Gene converged after 5 iterations
13
Gene converged after 7 iterations
69
Gene converged after 6 iterations
89
Gene converged after 6 iterations
74
Gene converged after 6 iterations
183
Gene converged after 7 iterations
31
Gene converged after 6 iterations
20
Gene converged after 6 iterations
135
Gene converged after 14 iterations
134
Gene converged after 6 iterations
93
Gene converged after 6 iterations
155
Gene converged after 7 iterations
44
Gene converged after 5 iterations
149
Gene converged after 6 iterations
154
Gene converged after 6 iterations
159
Gene converged after 8 iterations
171
Gene converged after 6 iterations
119
Gene converged after 8 iterations
19
Gene converged after 6 iterations
146
Gene converged after 9 iterations
142
Gene converged after 5 iterations
161
Gene converged after 7 iterations
164
Gene converged after 6 iterations
137
Gene converged after 8 iterations
0
Gene converged after 9 iterations
62
Gene converged after 6 iterations
67
Gene converged after 7 i

Gene converged after 27 iterations
50
Gene converged after 18 iterations
131
Gene converged after 17 iterations
108
Gene converged after 25 iterations
65
Gene converged after 25 iterations
14
Gene converged after 14 iterations
144.652472973
797652.717397 -45594489.4331 -161486.878064 44958323.5937
132
Gene converged after 21 iterations
174
Gene converged after 12 iterations
74
Gene converged after 16 iterations
18
Gene converged after 15 iterations
17
Gene converged after 20 iterations
102
Gene converged after 21 iterations
6
Gene converged after 17 iterations
125
Gene converged after 15 iterations
37
Gene converged after 16 iterations
187
Gene converged after 12 iterations
64
Gene converged after 18 iterations
15
Gene converged after 10 iterations
23
Gene converged after 17 iterations
87
Gene converged after 14 iterations
96
Gene converged after 16 iterations
101
Gene converged after 16 iterations
57
Gene converged after 19 iterations
2
Gene converged after 14 iterations
155
Gene conv

Gene converged after 24 iterations
75
Gene converged after 36 iterations
89
Gene converged after 20 iterations
24
Gene converged after 32 iterations
62
Gene converged after 20 iterations
183
Gene converged after 36 iterations
10
Gene converged after 26 iterations
132
Gene converged after 20 iterations
61
Gene converged after 26 iterations
64
Gene converged after 32 iterations
189
Gene converged after 29 iterations
88
Gene converged after 34 iterations
134
Gene converged after 22 iterations
78
Gene converged after 31 iterations
35
Gene converged after 19 iterations
14
Gene converged after 16 iterations
182
Gene converged after 26 iterations
165
Gene converged after 23 iterations
44
Gene converged after 22 iterations
188
Gene converged after 31 iterations
4
Gene converged after 32 iterations
168
Gene converged after 17 iterations
59
Gene converged after 40 iterations
40
Gene converged after 22 iterations
109
Gene converged after 33 iterations
9
Gene converged after 23 iterations
153
Gene

Gene converged after 12 iterations
9
Gene converged after 12 iterations
25
Gene converged after 13 iterations
76
Gene converged after 13 iterations
118
Gene converged after 15 iterations
4
Gene converged after 10 iterations
144
Gene converged after 13 iterations
100
Gene converged after 13 iterations
185
Gene converged after 11 iterations
28
Gene converged after 14 iterations
85
Gene converged after 13 iterations
164
Gene converged after 15 iterations
44
Gene converged after 12 iterations
51
Gene converged after 14 iterations
23
Gene converged after 14 iterations
2
Gene converged after 10 iterations
50
Gene converged after 12 iterations
52
Gene converged after 12 iterations
189
Gene converged after 14 iterations
72
Gene converged after 13 iterations
57
Gene converged after 14 iterations
27
Gene converged after 11 iterations
139
Gene converged after 11 iterations
48
Gene converged after 11 iterations
187
Gene converged after 11 iterations
121
Gene converged after 14 iterations
123
Gene 

Gene converged after 29 iterations
70
Gene converged after 20 iterations
102
Gene converged after 23 iterations
174
Gene converged after 21 iterations
38
Gene converged after 24 iterations
59
Gene converged after 29 iterations
71
Gene converged after 21 iterations
65
Gene converged after 25 iterations
42
Gene converged after 23 iterations
117
Gene converged after 21 iterations
178
Gene converged after 29 iterations
0
Gene converged after 20 iterations
28
Gene converged after 21 iterations
79
Gene converged after 23 iterations
25
Gene converged after 16 iterations
148
Gene converged after 22 iterations
17
Gene converged after 16 iterations
22
Gene converged after 14 iterations
81
Gene converged after 12 iterations
153
Gene converged after 25 iterations
131
Gene converged after 22 iterations
110
Gene converged after 34 iterations
155
Gene converged after 21 iterations
56
Gene converged after 25 iterations
179
Gene converged after 24 iterations
170
Gene converged after 21 iterations
152
G

Gene converged after 13 iterations
136
Gene converged after 11 iterations
163
Gene converged after 8 iterations
184
Gene converged after 12 iterations
112
Gene converged after 15 iterations
129
Gene converged after 12 iterations
40
Gene converged after 12 iterations
57
Gene converged after 14 iterations
24
Gene converged after 12 iterations
71
Gene converged after 12 iterations
10
Gene converged after 17 iterations
138
Gene converged after 15 iterations
45
Gene converged after 18 iterations
127
Gene converged after 12 iterations
92
Gene converged after 13 iterations
66
Gene converged after 11 iterations
77
Gene converged after 12 iterations
158
Gene converged after 17 iterations
52
Gene converged after 16 iterations
172
Gene converged after 12 iterations
98
Gene converged after 12 iterations
177
Gene converged after 11 iterations
115
Gene converged after 13 iterations
96
Gene converged after 11 iterations
47
Gene converged after 13 iterations
64
Gene converged after 14 iterations
91
Ge

Gene converged after 20 iterations
121
Gene converged after 18 iterations
82
Gene converged after 17 iterations
172
Gene converged after 25 iterations
109
Gene converged after 39 iterations
60
Gene converged after 21 iterations
149
Gene converged after 16 iterations
20
Gene converged after 20 iterations
4
Gene converged after 22 iterations
99
Gene converged after 16 iterations
39
Gene converged after 22 iterations
15
Gene converged after 22 iterations
156
Gene converged after 12 iterations
140
Gene converged after 15 iterations
178
Gene converged after 25 iterations
107
Gene converged after 21 iterations
184
Gene converged after 30 iterations
46
Gene converged after 25 iterations
27
Gene converged after 20 iterations
74
Gene converged after 19 iterations
83
Gene converged after 24 iterations
11
Gene converged after 17 iterations
61
Gene converged after 19 iterations
110
Gene converged after 23 iterations
181
Gene converged after 23 iterations
77
Gene converged after 17 iterations
5
Gen

Gene converged after 11 iterations
39
Gene converged after 12 iterations
143
Gene converged after 13 iterations
129
Gene converged after 11 iterations
156
Gene converged after 14 iterations
9
Gene converged after 12 iterations
91
Gene converged after 15 iterations
105
Gene converged after 11 iterations
118
Gene converged after 14 iterations
178
Gene converged after 11 iterations
7
Gene converged after 12 iterations
187
Gene converged after 12 iterations
59
Gene converged after 13 iterations
172
Gene converged after 13 iterations
38
Gene converged after 16 iterations
60
Gene converged after 12 iterations
179
Gene converged after 13 iterations
43
Gene converged after 11 iterations
56
Gene converged after 13 iterations
104
Gene converged after 10 iterations
8
Gene converged after 10 iterations
61
Gene converged after 11 iterations
90
Gene converged after 14 iterations
122
Gene converged after 11 iterations
54
Gene converged after 14 iterations
25
Gene converged after 11 iterations
169
Gen

Gene converged after 25 iterations
61
Gene converged after 16 iterations
63
Gene converged after 21 iterations
24
Gene converged after 16 iterations
28
Gene converged after 16 iterations
80
Gene converged after 17 iterations
90
Gene converged after 28 iterations
85
Gene converged after 22 iterations
52
Gene converged after 15 iterations
53
Gene converged after 18 iterations
42
Gene converged after 24 iterations
3
Gene converged after 21 iterations
100
Gene converged after 17 iterations
26
Gene converged after 18 iterations
15
Gene converged after 22 iterations
14
Gene converged after 12 iterations
118
Gene converged after 18 iterations
29
Gene converged after 15 iterations
104
Gene converged after 19 iterations
168
Gene converged after 13 iterations
18
Gene converged after 19 iterations
182
Gene converged after 15 iterations
68
Gene converged after 20 iterations
94
Gene converged after 22 iterations
146
Gene converged after 18 iterations
87
Gene converged after 15 iterations
45
Gene co

Gene converged after 12 iterations
156
Gene converged after 11 iterations
8
Gene converged after 18 iterations
39
Gene converged after 15 iterations
1
Gene converged after 11 iterations
71
Gene converged after 12 iterations
132
Gene converged after 10 iterations
159
Gene converged after 16 iterations
97
Gene converged after 12 iterations
91
Gene converged after 5 iterations
61
Gene converged after 16 iterations
145
Gene converged after 15 iterations
136
Gene converged after 13 iterations
187
Gene converged after 14 iterations
92
Gene converged after 14 iterations
138
Gene converged after 10 iterations
186
Gene converged after 14 iterations
181
Gene converged after 14 iterations
64
Gene converged after 15 iterations
189
Gene converged after 17 iterations
74
Gene converged after 10 iterations
88.9058029652
618957.636857 2.95925020423e+12 -106363.390176 -2.95925071682e+12
94
Gene converged after 23 iterations
188
Gene converged after 59 iterations
3
Gene converged after 54 iterations
137


In [None]:
output = "VI/VI_stochatic_K4"

np.savez('%s-ELBO' %output, genes.ELBO)
np.savez('%s-t' % output, genes.T)
np.savez('%s-gamma' % output, genes.gammma)
np.savez('%s-lambda' % output, genes.lambbda)
np.savez('%s-phi' % output, genes.phi)
np.savez('%s-data-used' % output, genes.Wn)
np.savez('%s-vocabulary-to-int' % output, genes.vocabulary_in_int)



with open("%s.pickle" % output, 'wb') as handle:
    pickle.dump(genes, handle)

