In [13]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: Load the dataset
# Only three newsgroups are requested; headers, footers and quoted replies
# are stripped so that just the body text of each post is kept.
categories = [
    'rec.sport.baseball',
    'sci.space',
    'talk.politics.misc',
]
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Report the corpus size and preview the first 500 characters of the first post
print(f"Number of documents: {len(dataset.data)}")
print("\nSample document:")
print(dataset.data[0][0:500])

Number of documents: 2756

Sample document:


:So we try to ensure that the process of deciding whether to introduce
:third parties isn't random.  As Steve said above, there are examples
:where third parties *are* less ignorant or corrupt than the two
:primary parties; should this knowledge not be able to help?

Of course it helps,  but only if the decision to involve third parties
is the primary partis' to make.  A corrupt and ignorant third party
isn't going to say,  "we're corrupt and ignorant,  we'll stay out of this".
Pointing out th


In [15]:
# Turn the raw posts into TF-IDF vectors: English stop words are dropped and
# the vocabulary is capped at 1000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)
tfidf_matrix = vectorizer.fit_transform(dataset.data)

In [21]:
# Sanity check: show the dimensions of the TF-IDF matrix built from dataset.data
# (expected to be one row per document and one column per vocabulary term).
print(f"\nTF-IDF Matrix Shape: {tfidf_matrix.shape}")


TF-IDF Matrix Shape: (2756, 1000)


In [25]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
# With the second argument omitted, scikit-learn compares the matrix with itself.
cosine_sim = cosine_similarity(tfidf_matrix)

In [27]:
# Sanity check: the similarity matrix compares tfidf_matrix with itself, so it
# should be square (one row and one column per document).
print(f"Cosine Similarity Matrix Shape: {cosine_sim.shape}")

Cosine Similarity Matrix Shape: (2756, 2756)


In [29]:
def recommend_similar_documents(doc_index, cosine_sim=None, top_n=5, documents=None):
    """Return the ``top_n`` documents most similar to the one at ``doc_index``.

    Parameters
    ----------
    doc_index : int
        Row of ``cosine_sim`` belonging to the query document. Negative
        values count from the end, as with ordinary sequence indexing.
    cosine_sim : 2-D array-like, optional
        Similarity scores indexed as ``cosine_sim[query][candidate]``.
        When omitted, the notebook-level ``cosine_sim`` is looked up at call
        time, so re-running the cell that builds the matrix is picked up
        without having to redefine this function.
    top_n : int, default 5
        Maximum number of documents to return. Values ``<= 0`` give ``[]``.
    documents : sequence, optional
        Items to return, in the same order as the rows of ``cosine_sim``.
        Defaults to ``dataset.data``.

    Returns
    -------
    list
        At most ``top_n`` entries of ``documents``, most similar first.
        Candidates with equal scores keep ascending index order. The query
        document itself is never part of the result.

    Raises
    ------
    IndexError
        If ``doc_index`` is out of range for ``cosine_sim``.
    """
    if cosine_sim is None:
        # The parameter shadows the notebook variable of the same name, so the
        # current global value has to be fetched explicitly.
        cosine_sim = globals()["cosine_sim"]
    if documents is None:
        documents = dataset.data

    scores = np.asarray(cosine_sim[doc_index], dtype=float)
    if top_n <= 0 or scores.size == 0:
        return []

    # Normalise a negative index so it can be compared with candidate positions.
    self_pos = doc_index % scores.size

    # A stable sort on the negated scores ranks from highest to lowest while
    # keeping tied candidates in ascending index order.
    ranked = np.argsort(-scores, kind="stable")

    # Exclude the query document by index rather than by assuming it is ranked
    # first: exact duplicates tie with it, and an all-zero TF-IDF row (e.g. an
    # empty post) has a self-similarity of 0, so it is not guaranteed to lead.
    neighbours = [int(i) for i in ranked if i != self_pos][:top_n]

    return [documents[i] for i in neighbours]

In [39]:
# Position (within dataset.data) of the post to find neighbours for
doc_index = 0
recommended_docs = recommend_similar_documents(
    doc_index,
    cosine_sim,
    top_n=5,
)

In [37]:
# Show a numbered preview (first 500 characters) of each recommended post
print("\nRecommended Documents:")
for i, doc in enumerate(recommended_docs, start=1):
    print(f"\nDocument {i}:")
    print(doc[0:500])


Recommended Documents:

Document 1:

cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck
cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck cubs scuk
cubs suck cubs suck cubs suck cubs cuck cubs suck cubs suck cubs suck
cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck
cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck cubs suck

oh yeah, he aqlso added that harry is a drunken idiot who shoulda
stayed in st louis where his heart is, but also added that fair weathered
fans all

Document 2:
I've noticed that is has become fashionable lately in rsb to predict
the Marlines to finish ahead of the Cubs....how?

First Base:

Grace vs Destrade...Could Destrade be the second coming of Cecil
Fielder? I doubt it. If Destrade performs to the height of expectations,
then even, otherwise, edge to Cubs

Second Base:

Sandberg vs Barberie...No contest. Sandberg will be back May 1. Edge
to Cubs...a big edge.

Shortstop:

Vizcaino vs Weiss...Vizca