### News Search Engine

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
# Load the dataset, keep four categories, and draw an equally sized sample from each.

# NOTE(review): absolute, machine-specific path -- point this at your local copy
# of the dataset before running on another machine.
DATA_PATH = r"C:\Users\bbuser\Downloads\News_Category_Dataset_v3.json"
SAMPLES_PER_CATEGORY = 1000
RANDOM_STATE = 42  # fixed seed so the same rows are drawn on every run

# lines=True: the file is parsed as one JSON object per line
df = pd.read_json(DATA_PATH, lines=True)
# Only the category label and the headline text are used downstream
df = df[['category', 'headline']]
# Filter categories
categories = ["POLITICS", "TRAVEL", "SPORTS", "HOME & LIVING"]
df = df[df['category'].isin(categories)]

# Balance dataset (SAMPLES_PER_CATEGORY rows per category).
# Sampling each group explicitly instead of groupby(...).apply(...) keeps the
# 'category' column in the result regardless of how the installed pandas
# version treats grouping columns inside apply (that behaviour is deprecated
# and emits a warning). Groups are visited in sorted key order and each draw
# is seeded with RANDOM_STATE, exactly as before.
df_balanced = pd.concat(
    [
        group.sample(SAMPLES_PER_CATEGORY, random_state=RANDOM_STATE)
        for _, group in df.groupby('category')
    ]
).reset_index(drop=True)

print("Dataset shape:", df_balanced.shape)


Dataset shape: (4000, 2)


  df_balanced = df.groupby('category').apply(lambda x: x.sample(1000, random_state=42)).reset_index(drop=True)


In [10]:
# TF-IDF Vectorization
# Build the vectorizer with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words="english")
# Learn the vocabulary from the sampled headlines and encode them in one pass;
# tfidf_matrix is reused by the search function as the corpus to match against.
tfidf_matrix = vectorizer.fit_transform(df_balanced['headline'])
# (The former bare get_feature_names_out() call was removed: its result was
# discarded and, not being the last expression of the cell, never displayed.)
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

Vocabulary size: 8302


In [8]:
# Search Function

def search_articles(query, top_k=10):
    """Return the headlines most similar to ``query`` by TF-IDF cosine similarity.

    Relies on the notebook-level objects ``vectorizer``, ``tfidf_matrix`` and
    ``df_balanced``; the cells that create them must have been run first.

    Args:
        query: Search text; it is vectorized with the already fitted
            ``vectorizer``.
        top_k: Maximum number of results to return. Values <= 0 yield an
            empty list.

    Returns:
        A list of dicts with the keys ``"headline"``, ``"category"`` and
        ``"similarity"`` (a float), ordered from most to least similar.
        Headlines whose similarity to the query is 0 are not returned, so the
        list may hold fewer than ``top_k`` entries -- or none at all when
        nothing in the corpus matches the query.
    """
    # A negative top_k would otherwise slice "all but the last |top_k|" rows.
    if top_k <= 0:
        return []

    # Transform query into TF-IDF vector
    query_vec = vectorizer.transform([query])

    # Compute cosine similarity of the query against every headline
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Indices of the top_k highest scores, best first
    top_indices = similarity.argsort()[::-1][:top_k]

    # Prepare results
    results = []
    for idx in top_indices:
        score = float(similarity[idx])
        if score <= 0.0:
            # Scores are in descending order, so every remaining candidate is
            # a non-match too; do not pad the results with unrelated headlines.
            break
        row = df_balanced.iloc[idx]  # single positional lookup per hit
        results.append({
            "headline": row['headline'],
            "category": row['category'],
            "similarity": score
        })
    return results
# Quick demo: fetch the five best matches for a sample query, one line per hit
query = "election results"
results = search_articles(query, top_k=5)
report_lines = [
    f"Headline: {res['headline']}, Category: {res['category']}, Similarity: {res['similarity']:.4f}"
    for res in results
]
for line in report_lines:
    print(line)


Headline: U.S. Open Results: Novak Djokovic Defeats Julien Benneteau In Third Round, Category: SPORTS, Similarity: 0.3163
Headline: We’re Still, Somehow, A Year Away From The Presidential Election, Category: POLITICS, Similarity: 0.3026
Headline: Obama Has Some Issues With How The Media Are Covering The Election, Category: POLITICS, Similarity: 0.2376
Headline: 8 Problems You May Encounter Going To Vote In The Election, Category: HOME & LIVING, Similarity: 0.2321
Headline: Barack Obama Sanctions Russia Over Election Meddling, Category: POLITICS, Similarity: 0.2153


In [11]:
# Example search: a three-term query, shown as a numbered ranking
query = "president election campaign"
results = search_articles(query, top_k=10)

# Header first, then one numbered line per hit (ranks start at 1)
print(f"\n Search results for: '{query}'\n")
rank = 0
for res in results:
    rank += 1
    print(f"{rank}. {res['headline']}  [{res['category']}]  (score: {res['similarity']:.3f})")


 Search results for: 'president election campaign'

1. We’re Still, Somehow, A Year Away From The Presidential Election  [POLITICS]  (score: 0.306)
2. Protecting America From Its President  [POLITICS]  (score: 0.270)
3. Lying To The Press Is Nothing New For The President  [POLITICS]  (score: 0.247)
4. Obama Has Some Issues With How The Media Are Covering The Election  [POLITICS]  (score: 0.240)
5. Hillary Clinton Is On Her Way To A $1 Billion Campaign. Donald Trump Is Not.  [POLITICS]  (score: 0.239)
6. President Obama Hawaii: What To Do On Oahu (PHOTOS)  [TRAVEL]  (score: 0.238)
7. 8 Problems You May Encounter Going To Vote In The Election  [HOME & LIVING]  (score: 0.235)
8. This Is What It's Like To Spend A Week On A Presidential Campaign Bus  [POLITICS]  (score: 0.230)
9. Bernie Sanders’ Campaign Is In Big Trouble With The DNC  [POLITICS]  (score: 0.226)
10. Obama To Visit A Mosque For The First Time As President  [POLITICS]  (score: 0.225)
