# Import the Library

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# 1. Data Preprocessing

1. Load dataset and filter categories.

In [10]:
df = pd.read_json("News_Category_Dataset_v3.json", lines=True)
df


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


2. Balance dataset (1000 per category).
3. Keep only headline and category.

In [4]:
# Keep only headline and category columns
df = df[['headline', 'category']]

# Filter required categories
categories = ["POLITICS", "TRAVEL", "SPORTS", "HOME & LIVING"]
df = df[df['category'].isin(categories)]

# Sample 1000 articles per category
df = df.groupby('category').head(1000)

# Final dataset
df = df.reset_index(drop=True)
print(df.shape)
df.head(10)

(4000, 2)


Unnamed: 0,headline,category
0,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS
1,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS
2,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS
3,"Las Vegas Aces Win First WNBA Title, Chelsea G...",SPORTS
4,Biden Says Queen's Death Left 'Giant Hole' For...,POLITICS
5,Bill To Help Afghans Who Escaped Taliban Faces...,POLITICS
6,Mark Meadows Complies With Justice Dept. Subpo...,POLITICS
7,Democrats Nominate Seth Magaziner In Key Rhode...,POLITICS
8,Joe Biden Urges National Unity In Speech On Re...,POLITICS
9,Sen. Tim Scott Downplays Electability Concerns...,POLITICS


# 2. Search Index Creation

In [5]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['headline'])


# 3. Search Functionality

In [6]:
def search_articles(query, top_k=10):
    # Transform query into TF-IDF space
    query_vec = vectorizer.transform([query])
    
    # Compute cosine similarity
    similarities = cosine_similarity(query_vec, X).flatten()
    
    # Get top k indices
    top_indices = similarities.argsort()[-top_k:][::-1]
    
    # Prepare results
    results = df.iloc[top_indices].copy()
    results['similarity'] = similarities[top_indices]
    return results[['headline', 'category', 'similarity']]


In [7]:
query = "travel tips for Europe"
results = search_articles(query)
print(results)

                                               headline  category  similarity
3106                   Three Meditation Tips for Travel    TRAVEL    0.486844
2456                           How Do I Travel So Much?    TRAVEL    0.474351
2097  Are People Scared To Travel To Europe And Brit...    TRAVEL    0.457047
2471           8 Travel Tips Made Easy For The Holidays    TRAVEL    0.442132
3122               My Five Tips for Sweet Summer Travel    TRAVEL    0.431171
2455  Tips On How To Plan The Perfect Budget Travel ...    TRAVEL    0.370459
316   Biden To Travel To Europe For NATO Summit On R...  POLITICS    0.358396
1783  The Best (Cheap) Places To Go In Europe This W...    TRAVEL    0.348087
2482                   The 5 Best Ski Resorts In Europe    TRAVEL    0.341066
2558            Backpacking Through Europe in the 1990s    TRAVEL    0.332314


In [8]:
for i, row in results.iterrows():
    print(f"{row['similarity']:.3f} | {row['category']} | {row['headline']}")

0.487 | TRAVEL | Three Meditation Tips for Travel
0.474 | TRAVEL | How Do I Travel So Much?
0.457 | TRAVEL | Are People Scared To Travel To Europe And Britain?
0.442 | TRAVEL | 8 Travel Tips Made Easy For The Holidays
0.431 | TRAVEL | My Five Tips for Sweet Summer Travel
0.370 | TRAVEL | Tips On How To Plan The Perfect Budget Travel Vacation
0.358 | POLITICS | Biden To Travel To Europe For NATO Summit On Russia’s War On Ukraine
0.348 | TRAVEL | The Best (Cheap) Places To Go In Europe This Winter
0.341 | TRAVEL | The 5 Best Ski Resorts In Europe
0.332 | TRAVEL | Backpacking Through Europe in the 1990s


# 4. Output Format

In [9]:
while True:
    query = input("Enter search query (or 'exit' to stop): ")
    if query.lower() == 'exit':
        break
    print(search_articles(query))

Enter search query (or 'exit' to stop):  how do 


                                             headline  category  similarity
16  US, Trump Team Propose Names For Arbiter In Ma...  POLITICS         0.0
17  Politician's DNA Connected To Las Vegas Journa...  POLITICS         0.0
18  Michigan Supreme Court Revives Abortion Rights...  POLITICS         0.0
19  Portland Residents With Disabilities Sue Over ...  POLITICS         0.0
20  Baseball Players Union Joins AFL-CIO In Show O...    SPORTS         0.0
21  The Unemployment Insurance System Is Not Ready...  POLITICS         0.0
22  Kody Clemens Strikes Out MVP Shohei Ohtani, Tr...    SPORTS         0.0
23  Michigan Secretary of State Worried About ‘Vio...  POLITICS         0.0
24  Uvalde Fourth Graders Waited An Hour With Woun...  POLITICS         0.0
25  Trump-Endorsed Wisconsin Gubernatorial Candida...  POLITICS         0.0


Enter search query (or 'exit' to stop):  exit
