In [1]:
import numpy as np
from collections import Counter
from operator import itemgetter
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# functions to extract positive and negative part
def pos(vec):
  return np.array(list(map(lambda x: max(x, 0), vec)))

def neg(vec):
  return np.array(list(map(lambda x: max(-x, 0), vec)))

def normalize(vec):
  return vec / np.linalg.norm(vec)

def nndsvd(A, k):
  # here we will assume k < rank(A)
  # Get shape of A
  m, n = A.shape

  # conduct SVD
  U, S, V = np.linalg.svd(A, full_matrices = False)

  # store resulting matrix
  W, H = np.zeros((m, k)), np.zeros((k, n))

  # update rows and columns of W and H
  for i in range(k):
    if i == 0:
      W[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0])
      H[0, :] = np.sqrt(S[0]) * np.abs(V[0, :])
    else:
      x = U[:, i]
      y = V[i, :]
      xp, xn = pos(x), neg(x)
      yp, yn = pos(y), neg(y)
      if np.linalg.norm(xp) * np.linalg.norm(yp) > np.linalg.norm(xn) * np.linalg.norm(yn):
        u = normalize(xp)
        v = normalize(yp)
        sigma = np.linalg.norm(xp) * np.linalg.norm(yp)
      else:
        u = normalize(xn)
        v = normalize(yn)
        sigma = np.linalg.norm(xn) * np.linalg.norm(yn)
      W[:, i] = np.sqrt(S[i] * sigma) * u
      H[i, :] = np.sqrt(S[i] * sigma) * v
  return W, H

In [3]:
test = np.array([[1, 2, 3, 4, 5], [4, 5, 6, 7, 8], [5, 6, 7, 8, 9], [12, 43, 21, 43, 62]])
testW, testH = nndsvd(test, 2)
print(np.linalg.norm(test - np.dot(testW, testH), "fro"))

3.3148991013588183


In [4]:
# function for nmf
def NMF(A, k, max_iter = 10000, epsilon = 0.0001):
  # using Lee and Seung method
  W, H = nndsvd(A, k)
  for i in range(max_iter):
    temp1 = np.dot(W.T, A) / (np.dot(np.dot(W.T, W), H) + 1e-10)  # Add a small constant to avoid division by zero
    H_new = H * temp1
    temp2 = np.dot(A, H.T) / (np.dot(np.dot(W, H), H.T) + 1e-10)
    W_new = W * temp2
    W, H = W_new, H_new
    if np.linalg.norm(A - np.dot(W, H), "fro") < epsilon:
      break
  error = np.linalg.norm(A - np.dot(W, H), 'fro')
  print(f"final error is {error}")
  print(f"final mean error is {error / (A.shape[0] * A.shape[1])}")
  return W, H

In [5]:
testW, testH = NMF(test, 2)

final error is 1.6632952104592984
final mean error is 0.08316476052296493


In [6]:
print(testW)
print(testH)

[[0.68004432 0.58727249]
 [1.17823535 1.75597028]
 [1.34429903 2.14553622]
 [9.42707949 0.        ]]
[[1.27700524 4.57009214 2.2404377  4.58621862 6.62028475]
 [1.44410677 0.         1.92560433 0.92407755 0.        ]]


In [7]:
# our test data
testNMF = ['The quick brown fox jumps over the lazy dog',
           'In a galaxy far far away there is an epic adventure unfolding',
           'Scientific progress is fueled by curiosity and a relentless pursuit of knowledge',
           'The beauty of nature lies in its diversity from towering mountains to serene oceans',
           'Jazz music with its improvisational flair captures the essence of spontaneity and creativity',
           'As technology advances society grapples with ethical dilemmas and questions of privacy',
           'A healthy lifestyle encompasses regular exercise balanced nutrition and sufficient rest',
           "Shakespeare's timeless plays continue to resonate with audiences exploring the complexities of human nature",
           'The pursuit of happiness is a fundamental aspect of the human experience',
           'Climate change poses significant challenges requiring global cooperation for sustainable solutions',
           'Artistic expression knows no bounds from classical paintings to avant-garde installations',
           'Democracy thrives when citizens actively engage in civic discourse and exercise their right to vote',
           'The internet has revolutionized communication connecting people across continents in an instant',
           'Quantum physics delves into the mysterious and mind-bending nature of the subatomic world',
           'Cultural diversity enriches our global tapestry fostering understanding and appreciation'
           ]
testNMF = list(map(lambda x: x.lower(), testNMF))

In [8]:
# get list of unique words without stopwords
stop_words = set(stopwords.words('english'))
def get_unique_words(doc_lst):
  unique_words = []
  for doc in doc_lst:
    temp = [d for d in doc.split(" ") if d not in stop_words]
    unique_words.extend(temp)
  unique_words = list(set(unique_words))
  return unique_words

# create term document matrix (using frequency instead of count)
def ftdm(doc_lst, unique_words):
  # make the resulting matrix
  res = np.zeros((len(unique_words), len(doc_lst)))

  # now we fill each column
  for i, doc in enumerate(doc_lst):
    counter = dict(Counter(doc.split(" ")))
    for word in counter:
      if word in unique_words:
        res[unique_words.index(word), i] = counter[word]

  # get frequency and unique words list to use later
  return res / np.sum(res, axis = 0)

In [9]:
# now we examine on test data

# first get all unique words
testNMF_unique_words = get_unique_words(testNMF)

# then get frequency matrix
testNMF_ftdm = ftdm(testNMF, testNMF_unique_words)

# now we do NMF, test with 3 topics return distribution and prevalence
testNMF_td, testNMF_tp = NMF(testNMF_ftdm, 4, max_iter = 1000000)

final error is 1.1226031191976538
final mean error is 0.0006623027251903563


In [10]:
# now we look at our results, first we look at proportion of topics
print(testNMF_tp)

# we can also look at what toppics does each documents in
print("")
print("What topic does each documents in:")
print(np.argmax(testNMF_tp, axis = 0))

[[ 0.          0.          0.33046155  0.          0.          0.
   0.          0.08788119  0.55995095  0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.6548092   0.          0.          0.          0.
  -0.         -0.         -0.         -0.         -0.         -0.
  -0.         -0.         -0.        ]
 [ 0.6391206   0.          0.          0.          0.          0.
  -0.         -0.         -0.         -0.         -0.         -0.
  -0.         -0.         -0.        ]
 [ 0.          0.          0.          0.42481064  0.          0.
   0.          0.15836164  0.          0.06762553 -0.          0.
   0.          0.32157052  0.27989066]]

What topic does each documents in:
[2 1 0 3 0 0 0 3 0 3 0 0 0 3 3]


In [11]:
# we can look at documents with similar topics
for i in range(4):
  print(f"documents in topic {i+1}")
  for inx, topic in enumerate(np.argmax(testNMF_tp, axis = 0)):
    if topic == i:
      print(testNMF[inx])
  print("")

documents in topic 1
scientific progress is fueled by curiosity and a relentless pursuit of knowledge
jazz music with its improvisational flair captures the essence of spontaneity and creativity
as technology advances society grapples with ethical dilemmas and questions of privacy
a healthy lifestyle encompasses regular exercise balanced nutrition and sufficient rest
the pursuit of happiness is a fundamental aspect of the human experience
artistic expression knows no bounds from classical paintings to avant-garde installations
democracy thrives when citizens actively engage in civic discourse and exercise their right to vote
the internet has revolutionized communication connecting people across continents in an instant

documents in topic 2
in a galaxy far far away there is an epic adventure unfolding

documents in topic 3
the quick brown fox jumps over the lazy dog

documents in topic 4
the beauty of nature lies in its diversity from towering mountains to serene oceans
shakespeare's t

In [12]:
# we can look at top 5 words of each topics
for i in range(4):
  print(f"top words in topic {i+1} with probability")
  temp_lst = list(testNMF_td[:, i])
  indices = list(sorted(enumerate(temp_lst), key = itemgetter(1)))[-5:][::-1]
  top_words = [(testNMF_unique_words[ind[0]]) for ind in indices]
  print(top_words)

top words in topic 1 with probability
['pursuit', 'human', 'aspect', 'experience', 'fundamental']
top words in topic 2 with probability
['far', 'galaxy', 'epic', 'unfolding', 'adventure']
top words in topic 3 with probability
['fox', 'lazy', 'quick', 'jumps', 'brown']
top words in topic 4 with probability
['nature', 'diversity', 'beauty', 'serene', 'oceans']
