In [1]:
###### Compute empirical covariance matrix for each cluster

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from scipy.special import digamma
from scipy.stats import gamma
from scipy.special import gammaln
from scipy.special import multigammaln
from sklearn.mixture import GaussianMixture
import time

#from matplotlib import pyplot as plt
import dill

## for simulation
from numpy.random import dirichlet
from numpy.random import multinomial
from numpy.random import multivariate_normal

import sys

from sklearn.metrics.cluster import adjusted_rand_score

In [15]:
class LDA_GMM:

    def __init__(self, fn, K):

        #### store the data
        data = pd.read_csv(fn,sep='\t',index_col = 0)
        temp_df = data.T.copy()
        temp_df.columns = range(len(data))
        temp_df = temp_df.drop_duplicates().T
        temp_df.index = data.index
        print data.shape, temp_df.shape
        data = temp_df.copy()

        words_in_each_gene = data.groupby(data.index)
        self.genes = np.sort(words_in_each_gene.indices.keys())
        self.test_genes = np.random.choice(self.genes, int(len(self.genes) * 0.2), replace=False)

        self.K = K
        self.M = len(self.genes)
        self.vocabulary = np.array(data.columns)
        self.V = len(self.vocabulary)
        print 'M = %d, V = %d, K = %d' % (self.M, self.V, self.K)

        # data for each gene
        self.Nd = []
        self.Wn = []
        self.test_data = pd.DataFrame()
        for g in self.genes:
            temp = np.array(words_in_each_gene.get_group(g))
            if g in self.test_genes:
                # test
                test_idx = np.random.choice(range(len(temp)), int(len(temp)*0.5), replace=False)
                test_words = words_in_each_gene.get_group(g).iloc[test_idx]
                self.test_data = self.test_data.append(test_words)
                # train
                train_idx = np.array([a for a in range(len(temp)) if a not in test_idx])
                train_words = words_in_each_gene.get_group(g).iloc[train_idx]
                self.Wn.append(np.array(train_words))
                self.Nd.append(len(train_words))
            else:
                self.Wn.append(np.array(temp))
                self.Nd.append(len(temp))


        #### super-parameters
        self.alphai = 1 / float(self.K)
        self.m0 = np.mean(np.array([np.mean(self.Wn[d], axis=0) for d in xrange(self.M)]), axis=0)
        self.beta0 = 1.0
        self.W0 = np.identity(self.V) * 1.0
        self.nu0 = self.V + 5.0
 
        
        
    def initialize(self):        
        
        self.W0_inv = np.linalg.inv(self.W0)
        
        #### randomly initialize phi 
        self.phi = []
        for d in xrange(self.M):
            self.phi.append(vector_sum_to_1(self.Nd[d], self.K))
            self.phi[d] = np.array(map(lambda x: (x+1e-50) / np.sum(x+1e-50), self.phi[d]))
            
        ## Define some statistics for convinence
        # Nk
        self.Nk = np.sum([np.sum(self.phi[d], axis=0) for d in xrange(self.M)], axis=0)
        # weighted mean value in each module
        self.wk_bar = np.array([sum(l) for l in zip(*[np.dot(np.transpose(self.phi[t]), self.Wn[t]) for t in xrange(self.M)])])
        self.wk_bar = np.array([self.wk_bar[k,:] / self.Nk[k] for k in xrange(self.K)])
        # weighted covariance
        self.Sk = []
        for k in xrange(self.K):
            centered_Wnd = np.concatenate(self.Wn) - self.wk_bar[k,:][np.newaxis]
            self.Sk.append(np.dot(np.concatenate(self.phi)[:,k][np.newaxis] * np.transpose(centered_Wnd), centered_Wnd) / self.Nk[k])

            
        #### Initialization the other parameters
        # Lambda
        self.betak = self.beta0 + self.Nk
        self.nuk   = self.nu0 + self.Nk
        self.mk    = [(self.beta0 * self.m0 + self.Nk[k] * self.wk_bar[k,:]) / (self.beta0 + self.Nk[k]) for k in xrange(self.K)]
        
        self.Wk    = []
        for k in xrange(self.K):
            self.Wk.append(np.linalg.inv(self.Sk[k]))
            if np.linalg.cond(self.Wk[k]) > 1/sys.float_info.epsilon:
                    print 'Wk_%d is not psd at initialization ' % (k)
                    return
            else:
                    continue
        assert sum([np.linalg.slogdet(self.Wk[k])[0] < 1 for k in xrange(self.K)]) == 0
        
        self.Eq_lnDetLambda = np.zeros(self.K)
        for k in xrange(self.K):
            self.Eq_lnDetLambda[k] = np.sum([digamma((self.nuk[k] + 1 - j)/2) for j in xrange(self.V)]) + \
                                     self.V * np.log(2) + np.linalg.slogdet(self.Wk[k])[1]        
        
        # gammma
        self.gammma = [np.sum(self.phi[d], axis=0) + self.alphai for d in xrange(self.M)]
        self.gammma = np.array(self.gammma)
                
        #### storage
        self.ELBO = []
        self.T = []
        self.updated = []        
        self.compute_ELBO('Initial')
        self.old_ELBO = self.ELBO[-1].copy()
        
        


    def update(self, max_iter):
        
        episron = 1e-3
        iterations = 1

        while iterations < max_iter: 
            
            START = time.time()
            
            ### update phi
            
            S = time.time()
            for d in xrange(self.M):
                
                ## [Nd, K], every row is the same for each matrix.
                Eq_mu_Lambda_mean = []
                for k in xrange(self.K):
                    wk_minus_mk = self.Wn[d] - self.mk[k][np.newaxis]
                    Eq_mu_Lambda_mean.append(np.diag(self.V / self.betak[k] + \
                                              self.nuk[k] * np.dot(np.dot(wk_minus_mk, self.Wk[k]), np.transpose(wk_minus_mk))))
                Eq_mu_Lambda_mean = np.transpose(Eq_mu_Lambda_mean)
                self.Eq_mu_Lambda_mean = Eq_mu_Lambda_mean
                
                self.phi[d] = np.exp(digamma(self.gammma[d,:])[np.newaxis] + self.Eq_lnDetLambda[np.newaxis]/2 - Eq_mu_Lambda_mean/2)
                self.phi[d] = np.array(map(lambda x: (x+1e-50) / np.sum(x+1e-50), self.phi[d]))
                 
            ### Define some statistics for convinence
            # Nk
            self.Nk = np.sum([np.sum(self.phi[d], axis=0) for d in xrange(self.M)], axis=0)
            # weighted mean value in each module
            self.wk_bar = np.array([sum(l) for l in zip(*[np.dot(np.transpose(self.phi[t]), self.Wn[t]) for t in xrange(self.M)])])
            self.wk_bar = np.array([self.wk_bar[k,:] / self.Nk[k] for k in xrange(self.K)])
            # weighted covariance
            self.Sk = []
            for k in xrange(self.K):
                centered_Wnd = np.concatenate(self.Wn) - self.wk_bar[k,:][np.newaxis]
                self.Sk.append(np.dot(np.concatenate(self.phi)[:,k][np.newaxis] * np.transpose(centered_Wnd), centered_Wnd) / self.Nk[k])
            #print np.mean([np.linalg.det(x) for x in self.Sk])
                
            print 'Update phis:', time.time() - S
            ### compute ELBO
            self.compute_ELBO('phi')
                

            
            ### update gammma
            self.gammma = [np.sum(self.phi[d], axis=0) + self.alphai for d in xrange(self.M)]
            self.gammma = np.array(self.gammma)
            self.compute_ELBO('gamma')
            
            S = time.time()
            ### update lambda
            self.betak = self.beta0 + self.Nk
            self.mk = [(self.beta0 * self.m0 + self.Nk[k] * self.wk_bar[k,:]) / (self.beta0 + self.Nk[k]) for k in xrange(self.K)]
            self.nuk = self.nu0 + self.Nk           
            for k in xrange(self.K):
                self.Wk[k] = self.Wk.append(np.linalg.inv(self.Sk[k]))
                if np.linalg.cond(self.Wk[k]) > 1/sys.float_info.epsilon:
                    print 'Wk_%d is not psd at iteration %d ' % (k,iterations)
                    return
                else:
                    continue
            for k in xrange(self.K):
                self.Eq_lnDetLambda[k] = np.sum([digamma((self.nuk[k] + 1 - j)/2) for j in xrange(self.V)]) + \
                                         self.V * np.log(2) + np.linalg.slogdet(self.Wk[k])[1]
            print 'update Lambdas:', time.time()-S
                    
            self.compute_ELBO('Lambdas')
            
            assert sum([np.linalg.slogdet(self.Wk[k])[0] < 1 for k in xrange(self.K)]) == 0
            assert ~np.isnan(sum([np.sum(self.phi[t]) for t in xrange(self.M)]))
            assert ~np.isnan(np.sum(self.gammma))
            assert ~np.isnan(np.sum(self.Wk))
        
            #print Counter([a for b in [np.argmax(self.phi[d], axis=1) for d in xrange(self.M)] for a in b])
    
            if np.abs(self.ELBO[-1] - self.old_ELBO) < episron:
                print 'Converged after %d iterations\n' % iterations
                break
            else:
                iterations += 1
                self.old_ELBO = self.ELBO[-1].copy()
                self.T.append(time.time() - START)                

                
    def compute_ELBO(self, qi_updated):
                        
        gammma_term = np.sum(gammaln(self.gammma))
        phi_term = -np.sum([np.sum(t * np.log(t)) for t in self.phi])

        mu_lambda_term = []
        for k in xrange(self.K):
            temp1 = (self.Nk[k] + self.nu0 - self.V - 2) * self.Eq_lnDetLambda[k]
            temp2 = self.V * (self.Nk[k] + self.beta0) / self.betak[k]
            temp4 = self.nuk[k] * (self.Nk[k]  *  np.dot(np.dot(self.wk_bar[k] - self.mk[k], self.Wk[k])[np.newaxis], \
                                                         (self.wk_bar[k] - self.mk[k])[np.newaxis].T) + \
                                        self.beta0 * np.dot(np.dot(self.mk[k] - self.m0, self.Wk[k])[np.newaxis], \
                                                        (self.mk[k] - self.m0)[np.newaxis].T))
            temp4 = temp4[0,0]
            temp5 = self.nuk[k] * (self.Nk[k] * np.matrix.trace(np.dot(self.Sk[k], self.Wk[k])) + \
                                    np.matrix.trace(np.dot(np.linalg.inv(self.W0), self.Wk[k])))
            temp6 = self.V * self.Nk[k] * np.log(2*np.math.pi) + self.V * np.log(self.betak[k]/(2*np.math.pi)) 
            H_q_Lambdak = (self.V+1)/2 * np.linalg.slogdet(self.Wk[k])[1] + \
                            multigammaln(self.nuk[k]/2, self.V) - \
                            (self.nuk[k]-self.V-1)/2 * np.sum([digamma((self.nuk[k]-j+1)/2) for j in xrange(self.V)]) + \
                            self.nuk[k] * self.V / 2
            mu_lambda_term.append(temp1 - temp2 - temp4 - temp5 - temp6 + 2*H_q_Lambdak)
            
        ELBO = gammma_term + phi_term + np.sum(mu_lambda_term) / 2
        self.ELBO.append(ELBO)
        self.updated.append(qi_updated)

        #print qi_updated, ELBO
        
        
        
def vector_sum_to_1(rowN, colN):
    ## row sum up to 1
    np.random.seed(10)
    vc = np.random.sample([rowN, colN])
    vc = map(lambda x: x / np.sum(x), vc)
    return np.array(vc)

In [16]:
with open('VI/real_data_empirical_Wk/Features_normalized_Least150.csv_K_20_initialized.p', 'rb') as fn:
    ZZZ = dill.load(fn)
    
    
ZZZ.initialize()

In [18]:
ZZZ.update(10)

Determinant of Sk in each module: [1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06, 1.0667435710194016e-06]
Converged after 1 iterations


In [6]:
ZZZ = LDA_GMM('simulated_data/M_500_K_4_V_150_W0_scale_0.1', 4)
ZZZ.initialize()
iterations = 2
ZZZ.update(iterations)

IOError: File simulated_data/M_500_K_4_V_150_W0_scale_0.1 does not exist

In [19]:
fn = "Simulated_Lambbda_median_2_W0_median_2_data_std"

Z = LDA_GMM('%s.npz' % fn, 4)
Z.initialize()


iterations = 10
Z.update(iterations)

plt.figure()
plt.plot(range(len(Z.ELBO)), Z.ELBO)
plt.savefig('VI/%s_ELBO.png' % fn)

M = 20, V = 124, K = 4
2.95470486568e+45
1245.61064699
-971455.48716
-971530.105155
-971539.987434
-971526.346528
-971541.9162
-971647.406103
-971644.151413
-971702.564773
-971696.832893
-971698.606704
-971681.243912
-971763.172661
-971868.848403
-971936.650516
-971930.816248
-971967.730835
-971968.3694
-971985.492096
-971945.574825
-971934.714481
-971930.583459
-971907.215745
-971907.071232
-971845.326482
-969485.121714
-969380.553014
-969207.281642
-968795.91498
-968553.028671
-968459.166151
-968078.90763
-967996.337876
-967573.388345
-967005.545309
-966716.899704
-966664.998378
-966567.480872
-966496.891466
-966236.15521
-966095.640124
-965870.393245
-965662.707278
-965271.181298
-964929.736828
-964728.8424
-964254.836817
-964251.827767
-963717.581272
-948732.194457
-948145.232963
-946744.362743
-943985.205374
-942384.558881
-941631.193285
-938900.603711
-937669.687211
-934843.809162
-931425.406746
-929453.494653
-928885.780713
-928267.74949
-927068.650084
-925522.573768
-924846.434

In [22]:
Z = ZZZ

allsamples = np.array(np.concatenate(Z.Wn))
GMM_model = GaussianMixture(n_components=Z.K)
GMM_model.fit(allsamples)
GMM_pred = GMM_model.predict_proba(allsamples)

GMM_assignment = [np.argmax(x) for x in GMM_pred]
resulting_assignment = [np.argmax(x) for x in np.concatenate(Z.phi)]
true_assignment = [np.argmax(x) for x in np.concatenate(Z.word_assginment_z)]
print adjusted_rand_score(resulting_assignment, true_assignment)
print adjusted_rand_score(GMM_assignment, true_assignment)

-5.60339802678e-13
1.0


In [8]:
gene_assignment = [np.argmax(x) for x in Z.gammma]
true_gene_assignment = [np.argmax(x) for x in Z.theta]
print adjusted_rand_score(gene_assignment, true_gene_assignment)

0.685993664784


In [434]:
phiiii = []
gammmaaaa = [] 
betakkkk = []
mkkkkk = []
Wkkkkk = []
nukkkkk = []
a = Z.M + 8
b = Z.M
increase_ELBO = [j-i for i, j in zip(Z.ELBO[:-1], Z.ELBO[1:])]
for ttt in xrange(8-1):
    phiiii.append(increase_ELBO[ttt*a:ttt*a+b])
    gammmaaaa.append(increase_ELBO[ttt*a+b])
    betakkkk.append(increase_ELBO[ttt*a+b+1])
    mkkkkk.append(increase_ELBO[ttt*a+b+2])
    Wkkkkk.append(increase_ELBO[ttt*a+b+3:ttt*a+b+7])
    nukkkkk.append(increase_ELBO[ttt*a+b+7])
    

IndexError: list index out of range