In [1]:
from itertools import combinations

import nltk
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Toy corpus: three short sentences that the rest of the notebook analyses.
documents = [
    "Latent semantic analysis is a technique in natural language processing.",
    "It is used to discover hidden patterns in a set of documents.",
    (
        "LSA can be applied to various text analysis tasks, "
        "such as information retrieval and document classification."
    ),
]

In [3]:
# Make sure the 'punkt' NLTK resource is present before tokenizing.
nltk.download('punkt')
# Break every document into a list of word-level tokens (one list per document).
tokenized_documents = list(map(nltk.word_tokenize, documents))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Yuva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Build the TF-IDF representation of the corpus: learn the vocabulary and IDF
# weights first, then project the same documents onto that vocabulary.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(documents)
tfidf_matrix = tfidf_vectorizer.transform(documents)
# Bare expression so the notebook displays the sparse matrix summary.
tfidf_matrix

<3x32 sparse matrix of type '<class 'numpy.float64'>'
	with 36 stored elements in Compressed Sparse Row format>

In [5]:
# Perform Truncated Singular Value Decomposition (SVD) for LSA.
# TruncatedSVD's default solver ("randomized") is stochastic, so the seed is
# pinned to keep the topic loadings reproducible across kernel restarts.
num_topics = 2  # You can adjust the number of topics as needed
random_seed = 42  # Any fixed integer works; it only needs to stay constant
lsa = TruncatedSVD(n_components=num_topics, random_state=random_seed)
# Fit on the TF-IDF matrix and keep the reduced document-by-topic coordinates.
lsa_matrix = lsa.fit_transform(tfidf_matrix)
lsa_matrix

array([[ 0.70391338, -0.25193762],
       [ 0.69552698, -0.30305683],
       [ 0.43025184,  0.90209134]])

In [6]:
# Report, for every LSA component, the five vocabulary terms with the largest
# component weights (highest weight first).
print("Topics:")
terms = tfidf_vectorizer.get_feature_names_out()
for topic_number, weights in enumerate(lsa.components_, start=1):
    # argsort is ascending, so reverse it and keep the leading five indices.
    strongest = weights.argsort()[::-1][:5]
    listing = ' '.join(terms[idx] for idx in strongest)
    print(f"Topic {topic_number}: {listing}")

Topics:
Topic 1: in is analysis to semantic
Topic 2: applied various lsa and text


In [7]:
# Show each document's coordinates in topic space, one row per document.
print("Document-Topic Matrix:")
for doc_number, topic_weights in enumerate(lsa_matrix, start=1):
    print(f"Document {doc_number}: {topic_weights}")

Document-Topic Matrix:
Document 1: [ 0.70391338 -0.25193762]
Document 2: [ 0.69552698 -0.30305683]
Document 3: [0.43025184 0.90209134]


In [8]:
# Calculate the similarity between documents.
# One cosine_similarity call on the whole document-topic matrix yields every
# pairwise score at once; entry [i, j] is the similarity of documents i and j.
# This avoids a separate scikit-learn call (and two reshapes) per pair.
similarity_matrix = cosine_similarity(lsa_matrix)

print("Document Similarity:")
# combinations(..., 2) yields each unordered pair (i < j) exactly once, in the
# same order as a nested "for i / for j > i" loop.
for i, j in combinations(range(len(documents)), 2):
    print(f"Similarity between Document {i + 1} and Document {j + 1}: {similarity_matrix[i, j]}")

Document Similarity:
Similarity between Document 1 and Document 2: 0.9977420219483246
Similarity between Document 1 and Document 3: 0.10116014908584053
Similarity between Document 2 and Document 3: 0.03411334846122621
