In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from pprint import pprint

In [2]:
SEED = 42
np.random.seed(SEED)
np.set_printoptions(precision=20)
np.set_printoptions(suppress=True)

n = 2 # total topic
epoch = 50 # total matrix W & H updated

Set example document, then convert it to bag of words with `sklearn`.

In [3]:
doc = [
    'french fries',
    'french fries with cheese',
    'a computer with keyboard'
]

vectorizer = CountVectorizer()
V = vectorizer.fit_transform(doc).toarray()
print(V)
pprint(vectorizer.vocabulary_)

[[0 0 1 1 0 0]
 [1 0 1 1 0 1]
 [0 1 0 0 1 1]]
{'cheese': 0, 'computer': 1, 'french': 2, 'fries': 3, 'keyboard': 4, 'with': 5}


Perform random weight initialization on matrix W & H

In [4]:
W = np.random.rand(len(doc), n) # topic * doc
H = np.random.rand(n, len(vectorizer.vocabulary_)) # word * topic

print('Total topic:', n)
print('W (topic weight for each doc) initial weight:')
print(W)
print('H (word weight for each topic) initial weight:')
print(H)

Total topic: 2
W (topic weight for each doc) initial weight:
[[0.3745401188473625  0.9507143064099162 ]
 [0.7319939418114051  0.5986584841970366 ]
 [0.15601864044243652 0.15599452033620265]]
H (word weight for each topic) initial weight:
[[0.05808361216819946  0.8661761457749352   0.6011150117432088
  0.7080725777960455   0.020584494295802447 0.9699098521619943  ]
 [0.8324426408004217   0.21233911067827616  0.18182496720710062
  0.18340450985343382  0.3042422429595377   0.5247564316322378  ]]


Function to update weight of matrix W & H.

In [5]:
def update(V, W, H):
    new_W = np.zeros((len(doc), n)) # topic * doc
    new_H = np.zeros((n, len(vectorizer.vocabulary_))) # word * topic

    # Update H
    a = np.dot(W.T, V)
    b = np.dot(np.dot(W.T, W), H)
    for i in range(new_H.shape[0]):
        for j in range(new_H.shape[1]):
            new_H[i,j] = H[i,j] * (a[i,j] / b[i,j])

    # Update W
    a = np.dot(V, new_H.T)
    b = np.dot(np.dot(W, new_H), new_H.T)
    for i in range(new_W.shape[0]):
        for j in range(new_W.shape[1]):
            new_W[i,j] = W[i,j] * (a[i,j] / b[i,j])

    return new_W, new_H

Update weight based on determined epoch.

In [6]:
for _ in range(epoch):
    W, H = update(V, W, H)

Show updated weight of matrix W & H

In [7]:
print('W (topic weight for each doc) updated weight:')
print(W)
print('H (word weight for each topic) updated weight:')
print(H)

W (topic weight for each doc) updated weight:
[[0.5742947697302693     0.                    ]
 [0.8693104744002939     0.2112327640392642    ]
 [0.00000093963888003401 1.4423222617451956    ]]
H (word weight for each topic) updated weight:
[[0.7956362188354963     0.                     1.329860674845827
  1.3298608423320668     0.                     0.6791071944518956    ]
 [0.03067520984730096    0.6787656219155906     0.000122722449313596
  0.00012186816057678491 0.6787656219155906     0.7194910106575703    ]]


Show weight of words which represent each topic.

In [8]:
# 8. Show word which represent a topic
for idx in range(n):
    print('Word that represent topic', idx)
    topic_word = vectorizer.vocabulary_
    for i, key in enumerate(topic_word):
        topic_word[key] = H[idx, i]
    topic_word = sorted(topic_word.items(), key=lambda x:x[1], reverse=True)
    pprint(topic_word)

Word that represent topic 0
[('cheese', 1.3298608423320668),
 ('with', 1.329860674845827),
 ('french', 0.7956362188354963),
 ('keyboard', 0.6791071944518956),
 ('fries', 3.755199861885117e-161),
 ('computer', 5.322000537272556e-162)]
Word that represent topic 1
[('keyboard', 0.7194910106575703),
 ('fries', 0.6787656219155906),
 ('computer', 0.6787656219155906),
 ('french', 0.03067520984730096),
 ('with', 0.000122722449313596),
 ('cheese', 0.00012186816057678491)]


Show topic score of each document. As expected, document #0 and #1 belong to same topic.

In [9]:
for idx, doc_vector in enumerate(V):
    print(f'Document #{idx} - {doc[idx]}')
    for i, word_weight in enumerate(H):
        score = np.sum(doc_vector * word_weight)
        print(f'Topic {i}: {score}')

Document #0 - french fries
Topic 0: 2.6597215171778936
Topic 1: 0.0002445906098903809
Document #1 - french fries with cheese
Topic 0: 4.134464930465286
Topic 1: 0.7504108111147616
Document #2 - a computer with keyboard
Topic 0: 0.6791071944518956
Topic 1: 2.0770222544887513
