In [53]:
import feedparser
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [54]:
# Initializing TF-IDF vectorizer object for vectorizing data
vectorizer = TfidfVectorizer()

In [55]:
# Choose a Sample Title
sample_title = 'War'

In [56]:
# Collecting feeds for various News Outlets
nyt_feed = feedparser.parse('https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml')
bbc_feed = feedparser.parse('http://feeds.bbci.co.uk/news/world/rss.xml')

In [57]:
# Create list of cleaned titles
cleaned_titles = []

# Cleaning NYT Feed Entries
for entry in nyt_feed.entries:
    # Remove punctuation from the title using str.translate() and string.punctuation
    title = entry.title.translate(str.maketrans('', '', string.punctuation))
    
    # Convert the title to lowercase using str.lower()
    title = title.lower()

    # Append cleaned title to list
    cleaned_titles.append(title)

In [58]:
# Fit the vectorizer to the cleaned titles
vectorizer.fit(cleaned_titles)

# Transform the cleaned titles into a tf-idf matrix
tfidf_matrix = vectorizer.transform(cleaned_titles)

# Clean and transform the sample title into a tf-idf vector
sample_vector = vectorizer.transform([sample_title.translate(str.maketrans('', '', string.punctuation)).lower()])

similarity = cosine_similarity(sample_vector, tfidf_matrix)

similar_indices = similarity.argsort()[0][::-1]

for i in range(5):
    print(cleaned_titles[similar_indices[i]])

few wordle players use consistent starting words but when they do it’s adieu
pajaro flood is the latest sign of river levee risks during storms
the political fingerpointing behind bank collapses and train derailments
us says russian fighter jet hit american drone over black sea
true liberation eludes kherson as russian shelling intensifies in ukraine


In [59]:
print(tfidf_matrix.shape)

(23, 185)
