In [2]:
import numpy as np
from numpy.linalg import svd

In [4]:
documents = [
    "This is the first document",
    "This document is the second document",
    "And this is the third one",
    "Is this the first document?"
]
documents

['This is the first document',
 'This document is the second document',
 'And this is the third one',
 'Is this the first document?']

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

'''
Count the occurences of the word in documents
Create the term-document matrix (each column represents a word, each row represents a document)
'''
term_document_matrix = vectorizer.fit_transform(documents)
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [19]:
# Convert the term-document matrix to numpy array
term_document_array = term_document_matrix.toarray()
term_document_array

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [20]:
U, S, V = svd(term_document_array)

In [50]:
# Determine keywords based on the principal components (columns of matrix V)
keyword_threshold = 0.75
keywords = []

for i, word in enumerate(vectorizer.get_feature_names_out()):
    keyword_vector = V[i, :]
    print(np.abs(keyword_vector) > keyword_threshold)
    if np.any(np.abs(keyword_vector) > keyword_threshold):
        keywords.append(word)

[False False False False False False False False False]
[False False False False False False False False False]
[False False False False False False False False False]
[False False False False False False False False False]
[False False False False  True False False False False]
[False False False  True False False False False False]
[False False False False False False  True False False]
[False False False False False False False  True False]
[False False False False False False False False  True]


In [42]:
print("Keywords:")
print(keywords)

Keywords:
['one', 'second', 'the', 'third', 'this']
