In [12]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.datasets import fetch_20newsgroups

# Step 1: Data Collection
newsgroups_data = fetch_20newsgroups(subset='all')
documents = newsgroups_data.data


# Step 2: Text Preprocessing
# Tokenization, lowercase conversion, stop word removal, and stemming
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    tokens = text.split()
    tokens = [token.lower() for token in tokens]
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Step 3: Create Term-Document Matrix
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(preprocessed_documents)

# Step 4: SVD Decomposition
num_topics = 100  # Number of topics to reduce the matrix to
svd = TruncatedSVD(n_components=num_topics)
lsa_matrix = svd.fit_transform(tfidf_matrix)

# Step 5: Topic Exploration
# Singular vectors and their corresponding terms
singular_vectors = svd.components_

# Print the top terms for each topic
for topic_idx, topic in enumerate(singular_vectors):
    top_terms_idx = topic.argsort()[::-1][:10]  # Get the indices of top terms
    top_terms = [vectorizer.get_feature_names_out()[idx] for idx in top_terms_idx]
    top_terms_str = [str(term) for term in top_terms]  # Convert numpy arrays to strings
    print(f"Topic {topic_idx + 1}: {', '.join(top_terms_str)}")

# Additional analysis or visualization can be performed based on the topics extracted.


Number of documents: 18846
Topic 1: com, use, like, peopl, know, articl, think, god, univers, say
Topic 2: god, christian, jesus, peopl, believ, say, religion, faith, christ, sin
Topic 3: god, window, christian, jesus, file, scsi, dos, drive, card, use
Topic 4: key, clipper, chip, encrypt, com, govern, escrow, netcom, secur, law
Topic 5: armenian, israel, isra, turkish, arab, muslim, jew, state, armenia, serdar
Topic 6: key, game, chip, encrypt, clipper, team, escrow, god, use, secur
Topic 7: scsi, drive, ide, chip, hard, disk, ohio, card, bus, key
Topic 8: nasa, ohio, space, gov, cleveland, state, magnus, cwru, digex, acs
Topic 9: ohio, cleveland, israel, state, cwru, magnus, freenet, acs, isra, window
Topic 10: armenian, turkish, god, com, serdar, key, argic, armenia, ohio, muslim
Topic 11: sgi, uk, moral, livesey, scsi, israel, keith, caltech, wpd, solntze
Topic 12: uk, ac, car, cramer, optilink, homosexu, gay, clayton, bike, pitt
Topic 13: keith, sgi, livesey, moral, caltech, armen

In [25]:
# Step 6: Load Query from a Text Document
with open("query.txt", "r") as query_file:
    query_text = query_file.read()

# Preprocess the query in the same way as the dataset
preprocessed_query = preprocess_text(query_text)

# Step 7: Project the Query into LSI Space
query_vector = vectorizer.transform([preprocessed_query])  # Transform the query into TF-IDF space
query_lsi = svd.transform(query_vector)  # Project the query into the LSI space

# Step 8: Compute Cosine Similarity

from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity between the query and LSI-transformed documents
similarities = cosine_similarity(query_lsi, lsa_matrix)

# Find the most relevant documents
top_n = 3  # You can change this to the number of top relevant documents you want
top_document_indices = similarities.argsort()[0][::-1][:top_n]

# Print the most relevant documents
print("Top Relevant Documents:")
for i, doc_idx in enumerate(top_document_indices):
    print(f"{i + 1}. Document {doc_idx + 1}: {documents[doc_idx]}")

Top Relevant Documents:
1. Document 12949: From: u95_dgold@vaxc.stevens-tech.edu
Subject: EMI filter, What's in it?
Lines: 8
Organization: Stevens Institute Of Technology

Could someone tell me what's in a Cornell-Dubilier EMI Filter
FIL 3363-001?

It is rated at 13A 115/250VAC 50/60HZ.  Is it just MOV's and ferrite?

Dave  /  n2mxx
Stevens Institute of Technology
Hoboken, New Jersey

2. Document 13717: From: grahamt@phantom.gatech.edu (Graham E. Thomas)
Subject: Re: BLAST to the past!
Organization: Georgia Institute of Technology
Lines: 17
NNTP-Posting-Host: oit.gatech.edu

amh2@ns1.cc.lehigh.edu (ALOIS M. HIMSL) writes:
>be worthwhile?  Or how about something like the old MGB with new technology?
>Just think about it - the old style with upgraded safety features and perhaps a
>natural gas operated engine for less than 10K. I think it would go over well.
>What is your opinion??????
>Al H

Well, the MGB is currently in production for the English market, built
by Rover. It now has a V8,

In [33]:
from sklearn.datasets import fetch_20newsgroups

# Ground truth labels are stored in newsgroups_data.target
ground_truth_labels = newsgroups_data.target

# If you want to see the unique categories (topics) in the dataset, you can use:
unique_categories = list(newsgroups_data.target_names)
print("Unique Categories (Topics):", unique_categories)

Unique Categories (Topics): ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [34]:
from sklearn.cluster import KMeans

num_clusters = 20  # You can adjust the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
cluster_labels = kmeans.fit_predict(lsa_matrix)

  super()._check_params_vs_input(X, default_n_init=10)


In [35]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Compute the confusion matrix between ground truth labels and cluster labels
confusion = confusion_matrix(ground_truth_labels, cluster_labels)
print(confusion)

# Calculate purity
purity = np.sum(np.max(confusion, axis=0)) / np.sum(confusion)
print(f"purity: {purity}")

[[282   0   0   2   1 158   0   0   0   0   0 132   0 169  38   0  16   1
    0   0]
 [628   1   0   1   0   3   0   2   0   2 330   0   3   1   0   0   2   0
    0   0]
 [273   9   0   0   0   3   0   2   0  13 678   0   2   0   0   0   5   0
    0   0]
 [385   2   0   0   1   0   3   3   0   5 575   0   2   0   0   0   6   0
    0   0]
 [559   1   0   0   0   1   0  13   0   1 368   0   2   0   0   0  18   0
    0   0]
 [532   2   0   0   6   0   0   2   0   1 422   0   2   0   0   0  21   0
    0   0]
 [707  13   0   5   0   3  17  33   0   7 167   0   8   0   0   0  15   0
    0   0]
 [893   2   0   3   0  37   0  30   0   1   5   0   0   0   0   0  19   0
    0   0]
 [954   0   0   0   0  14   0   6   0   1   1   0   9   0   0   0  11   0
    0   0]
 [484  15   1   4   0   4 469   7   0   1   0   0   1   0   0   0   8   0
    0   0]
 [182  25   0   0   0   4 702   6   0   0   1   0  28   0   0   0  51   0
    0   0]
 [321   0   0  16 503  69   0   8   0  34  30   0   0   0   0   0

In [36]:
from sklearn.metrics import normalized_mutual_info_score

nmi = normalized_mutual_info_score(ground_truth_labels, cluster_labels)
print(f"NMI score: {nmi}")

NMI score: 0.3581136714869967


In [37]:
from sklearn.metrics import silhouette_score

silhouette_avg = silhouette_score(lsa_matrix, cluster_labels)
print(f"Silhouette Score: {silhouette_avg}")

Silhouette Score: 0.0878960135399179
