# News Search Engine

#### imports

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib as plt 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from collections import Counter
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
import random
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

#### Data preprocessing

- Load dataset and filter categories.

In [3]:
# lines=True makes pandas parse the file as one JSON object per line.
DATASET_PATH = "data/News_Category_Dataset_v3.json"
df = pd.read_json(DATASET_PATH, lines=True)
df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


- Filter to the four target categories, then balance the dataset (1000 headlines per category).

In [4]:
# Restrict the corpus to the four categories of interest, then show how many
# headlines each of them contributes.
categories = ["POLITICS", "TRAVEL", "SPORTS", "HOME & LIVING"]
in_selected_category = df["category"].isin(categories)
df_category = df.loc[in_selected_category]
df_category["category"].value_counts()


category
POLITICS         35602
TRAVEL            9900
SPORTS            5077
HOME & LIVING     4320
Name: count, dtype: int64

In [5]:
# Draw the same number of headlines from every category so that each one is
# equally represented. Every group is sampled with the same seed and the groups
# are concatenated in sorted category order, so the result is reproducible.
#
# The groups are iterated explicitly instead of going through
# DataFrameGroupBy.apply: apply() handing the grouping column to the callback
# is deprecated in recent pandas releases, and the "category" column must
# survive this step because later cells select it.
SAMPLES_PER_CATEGORY = 1000
df_balanced = pd.concat(
    [
        group.sample(n=SAMPLES_PER_CATEGORY, random_state=42)
        for _, group in df_category.groupby("category")
    ],
    ignore_index=True,  # fresh 0..N-1 index, as positional lookups are used later
)

  .apply(lambda x: x.sample(n=1000, random_state=42))


- Keep only headline and category.

In [6]:
# Retain just the headline text and its category label, discarding any row
# where either of the two is missing.
df_balanced = df_balanced.loc[:, ["headline", "category"]].dropna(how="any")

In [8]:
# Sanity check on the balanced frame: (rows, columns).
df_balanced.shape

(4000, 2)

#### Vectorization

In [9]:
# Fit a TF-IDF model on the headlines (English stop words removed, vocabulary
# capped at 5000 terms) and encode every headline as one row of X.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
headline_text = df_balanced["headline"]
X = vectorizer.fit_transform(headline_text)

# One row per headline, one column per vocabulary term.
print("TF-IDF matrix shape:", X.shape)

TF-IDF matrix shape: (4000, 5000)


#### Search Implementation

In [14]:
def search(query, top_n=10):
    """Rank the indexed headlines by TF-IDF cosine similarity to ``query``.

    Relies on three notebook-level objects: ``vectorizer`` (used to encode the
    query), ``X`` (the matrix the query vector is compared against) and
    ``df_balanced`` (looked up positionally with the row indices of ``X``).

    Parameters
    ----------
    query : str
        Free-text search query.
    top_n : int, default 10
        Maximum number of results to return. Values <= 0 yield no results.

    Returns
    -------
    pandas.DataFrame
        Columns ``Rank``, ``Headline``, ``Category`` and ``Similarity Score``
        (rounded to 3 decimals), best match first. Headlines with a similarity
        of zero are never returned, so the frame may contain fewer than
        ``top_n`` rows; it is empty (but keeps the same columns) when nothing
        matches the query.
    """
    columns = ["Rank", "Headline", "Category", "Similarity Score"]
    # A negative top_n would otherwise slice "[:-k]" and return nearly everything.
    top_n = max(top_n, 0)

    # Transform query into TF-IDF space
    q_vec = vectorizer.transform([query])
    # Compute cosine similarity with all headline vectors
    sims = cosine_similarity(q_vec, X).flatten()

    # Best-scoring row positions first, limited to top_n candidates.
    top_idx = np.argsort(sims)[::-1][:top_n]
    # A score of 0 means the headline has nothing in common with the query;
    # returning such rows would present arbitrary headlines as matches.
    top_idx = top_idx[sims[top_idx] > 0]

    # Single positional selection instead of one .iloc lookup per field per row.
    hits = df_balanced.iloc[top_idx]
    return pd.DataFrame(
        {
            "Rank": range(1, len(top_idx) + 1),
            "Headline": hits["headline"].to_numpy(),
            "Category": hits["category"].to_numpy(),
            "Similarity Score": np.round(sims[top_idx], 3),
        },
        columns=columns,
    )

# Demo: fetch the ten best matches for a sample query and display them.
query = "president election"
results_df = search(query=query, top_n=10)
results_df


Unnamed: 0,Rank,Headline,Category,Similarity Score
0,1,This President's Tweeting Is Squandering Our Time,POLITICS,0.49
1,2,Protecting America From Its President,POLITICS,0.487
2,3,"We’re Still, Somehow, A Year Away From The Pre...",POLITICS,0.377
3,4,Obama Has Some Issues With How The Media Are C...,POLITICS,0.354
4,5,8 Problems You May Encounter Going To Vote In ...,HOME & LIVING,0.342
5,6,True Patriotism Requires Supporting Every Pres...,POLITICS,0.315
6,7,Shonda Rhimes Says 2016 Election Is Mirroring ...,POLITICS,0.309
7,8,Lying To The Press Is Nothing New For The Pres...,POLITICS,0.305
8,9,Are President Trump's Supporters Embarrassed T...,POLITICS,0.302
9,10,President Obama Hawaii: What To Do On Oahu (PH...,TRAVEL,0.294


#### User experience

In [15]:
# Same lookup, relying on the default number of results.
search(query="president election")

Unnamed: 0,Rank,Headline,Category,Similarity Score
0,1,This President's Tweeting Is Squandering Our Time,POLITICS,0.49
1,2,Protecting America From Its President,POLITICS,0.487
2,3,"We’re Still, Somehow, A Year Away From The Pre...",POLITICS,0.377
3,4,Obama Has Some Issues With How The Media Are C...,POLITICS,0.354
4,5,8 Problems You May Encounter Going To Vote In ...,HOME & LIVING,0.342
5,6,True Patriotism Requires Supporting Every Pres...,POLITICS,0.315
6,7,Shonda Rhimes Says 2016 Election Is Mirroring ...,POLITICS,0.309
7,8,Lying To The Press Is Nothing New For The Pres...,POLITICS,0.305
8,9,Are President Trump's Supporters Embarrassed T...,POLITICS,0.302
9,10,President Obama Hawaii: What To Do On Oahu (PH...,TRAVEL,0.294
