
# TF‑IDF News Headline Search (Kaggle News Category Dataset)

This notebook builds a simple search engine over **4,000** news headlines from the Kaggle News Category Dataset, using **TF‑IDF** and **cosine similarity**.


In [1]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path

In [3]:
DATA_PATH = Path("News_Category_Dataset_v3.json")  
CATEGORIES = ["POLITICS", "TRAVEL", "SPORTS", "HOME & LIVING"]
PER_CATEGORY = 1000
TOP_K = 10
RANDOM_STATE = 42  

In [4]:
df = pd.read_json(DATA_PATH, lines=True)
print("Loaded rows:", len(df))
print("Columns:", df.columns.tolist())
if not {'headline', 'category'}.issubset(df.columns):
    raise ValueError("Expected columns 'headline' and 'category' not found in dataset.")


Loaded rows: 209527
Columns: ['link', 'headline', 'category', 'short_description', 'authors', 'date']


In [5]:
df_filtered = df[df['category'].isin(CATEGORIES)].copy()

# Show counts before balancing
print("Counts before balancing:")
print(df_filtered['category'].value_counts())

Counts before balancing:
category
POLITICS         35602
TRAVEL            9900
SPORTS            5077
HOME & LIVING     4320
Name: count, dtype: int64


In [6]:
balanced_parts = []
shortages = {}
for cat in CATEGORIES:
    part = df_filtered[df_filtered['category'] == cat].head(PER_CATEGORY).copy()
    if len(part) < PER_CATEGORY:
        shortages[cat] = PER_CATEGORY - len(part)
    balanced_parts.append(part)

balanced_df = pd.concat(balanced_parts, ignore_index=True)

In [7]:
balanced_df = balanced_df[['headline', 'category']].reset_index(drop=True)

print("\nFinal per-category counts:")
print(balanced_df['category'].value_counts())


Final per-category counts:
category
POLITICS         1000
TRAVEL           1000
SPORTS           1000
HOME & LIVING    1000
Name: count, dtype: int64


In [8]:
if shortages:
    print("\nWARNING: Some categories had fewer than", PER_CATEGORY, "rows available:")
    for k, v in shortages.items():
        print(f" - {k}: shortage of {v}")
        
print("\nTotal rows kept:", len(balanced_df))
balanced_df.head(3)



Total rows kept: 4000


Unnamed: 0,headline,category
0,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS
1,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS
2,Biden Says Queen's Death Left 'Giant Hole' For...,POLITICS


In [9]:
counts = balanced_df['category'].value_counts()
display(counts.to_frame('count'))

balanced_csv_path = Path("balanced_headlines_4cats.csv")
balanced_df.to_csv(balanced_csv_path, index=False)
print(f"Balanced dataset saved to: {balanced_csv_path.resolve()}")


Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
POLITICS,1000
TRAVEL,1000
SPORTS,1000
HOME & LIVING,1000


Balanced dataset saved to: C:\Users\DuaaHilal\Downloads\balanced_headlines_4cats.csv


In [10]:
# Vectorize headlines with TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(balanced_df['headline'])
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (4000, 8075)


In [11]:
# Search utility
def search_articles(query: str, top_n: int = TOP_K):
    if not query or not isinstance(query, str):
        raise ValueError("Please provide a non-empty text query.")
    query_vec = vectorizer.transform([query])
    sims = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_idx = np.argsort(sims)[-top_n:][::-1]
    results = pd.DataFrame({
        'headline': balanced_df.iloc[top_idx]['headline'].values,
        'category': balanced_df.iloc[top_idx]['category'].values,
        'similarity': sims[top_idx]
    })
    results = results.sort_values(by='similarity', ascending=False).reset_index(drop=True)
    return results


In [15]:
example_queries = [
    "cat",
    "dog",
    "home decor ideas",
    "best summer beaches travel tips"
]

for q in example_queries:
    print(f"\nQuery: {q}")
    display(search_articles(q, top_n=10))


Query: cat


Unnamed: 0,headline,category,similarity
0,Olympic Snowboarder Roars Back From Injury Wit...,SPORTS,0.41443
1,Cat Nap: 14 Beds For Your Favorite Feline,HOME & LIVING,0.40442
2,Rogue Cat Rescued After Hiding Out In New York...,TRAVEL,0.394688
3,United Bans Many Popular Dog And Cat Breeds Fr...,TRAVEL,0.33261
4,FBI Probing Russian-Speaking Fake Heiress Who ...,POLITICS,0.0
5,Kash Patel Says His Life's In Danger Because H...,POLITICS,0.0
6,Lindsey Graham Warns Of 'Riots In Streets' If ...,POLITICS,0.0
7,House Members Are Using A Controversial Tool T...,POLITICS,0.0
8,Secret Service Official And Ex-Trump Aide Tony...,POLITICS,0.0
9,Stocks Dive For Truth Social SPAC Amid Merger ...,POLITICS,0.0



Query: dog


Unnamed: 0,headline,category,similarity
0,These Are The Most Popular Dog Breeds In Every...,HOME & LIVING,0.503162
1,How To Help Your Dog Be A Good Neighbor,HOME & LIVING,0.494602
2,This Adorable Dog Has A Very Important Hospita...,TRAVEL,0.388231
3,Instead Of Returning The Dog You Adopted Durin...,HOME & LIVING,0.370583
4,The Best Comeback Story At Westminster Dog Sho...,SPORTS,0.333824
5,Just Take Our Money And Give Us This Star Wars...,HOME & LIVING,0.333554
6,Chloe Kim Poses With Her Adorable Dog On The C...,SPORTS,0.327954
7,Dog Gone Crazy? 5 Tips for Managing Fido Durin...,HOME & LIVING,0.316581
8,Dog Dies On United Flight After Passenger Forc...,TRAVEL,0.306186
9,United Airlines Mistakenly Flies Family's Dog ...,TRAVEL,0.304873



Query: home decor ideas


Unnamed: 0,headline,category,similarity
0,Genius Decor Ideas For Small Spaces,HOME & LIVING,0.519753
1,5 Spring Home Decor Updates Under $100,HOME & LIVING,0.414501
2,Hot Spring Trends in Home Decor,HOME & LIVING,0.412124
3,Fix Up Your Home In A Weekend With These 6 DIY...,HOME & LIVING,0.391869
4,What Your Christmas Decor Says About You (PHOTOS),HOME & LIVING,0.386833
5,Designer Home Decor: Save & Splurge,HOME & LIVING,0.386068
6,11 Decor Ideas That Will Change Your Rental Ap...,HOME & LIVING,0.370299
7,Memorial Day Furniture And Home Decor Sales 2019,HOME & LIVING,0.333805
8,Stylish Home Decor That Hardly Costs a Penny,HOME & LIVING,0.328773
9,Easy Decorating Ideas Under $100 to Get Your H...,HOME & LIVING,0.322825



Query: best summer beaches travel tips


Unnamed: 0,headline,category,similarity
0,My Five Tips for Sweet Summer Travel,TRAVEL,0.581589
1,The Best Beaches In Africa,TRAVEL,0.478693
2,Here's Yet Another Way To Get Paid To Travel T...,TRAVEL,0.397838
3,Three Meditation Tips for Travel,TRAVEL,0.378216
4,How Do I Travel So Much?,TRAVEL,0.368511
5,The World's Best (?) Beaches: Beach-Bumming on...,TRAVEL,0.358222
6,8 Travel Tips Made Easy For The Holidays,TRAVEL,0.343481
7,Secret Maine Beaches,TRAVEL,0.33516
8,15 Popular Travel Destinations You Should Avoi...,TRAVEL,0.320354
9,Get These Last-Minute Summer Travel Deals Befo...,TRAVEL,0.317037


In [14]:
query = " Online Travel"
search_articles(query, top_n=10)


Unnamed: 0,headline,category,similarity
0,How Do I Travel So Much?,TRAVEL,0.553774
1,How to Buy Vintage Furniture Online,HOME & LIVING,0.456116
2,How Online Travel Agencies Manipulate Your Sea...,TRAVEL,0.455339
3,"The Weekend's Best Online Sales For Furniture,...",HOME & LIVING,0.353091
4,The Best Stores To Buy Industrial Furniture An...,HOME & LIVING,0.324095
5,Travel Is The New 'Me Time',TRAVEL,0.318172
6,Where To Buy Art Deco-Inspired Furniture And D...,HOME & LIVING,0.298542
7,"The Best Online Sales This Weekend, From Resto...",HOME & LIVING,0.294285
8,Christmas Just Came Early! Shop This Week's Be...,HOME & LIVING,0.287194
9,8 Reasons To Travel This Year,TRAVEL,0.281692
