## News Search Engine - Task

In [1]:
## Step 1: Import Required Libraries
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
## Step 2: Load Dataset
# Location of the JSON-lines dataset. Set the NEWS_DATASET_PATH environment
# variable to point at another copy of the file; when it is unset, the
# original local path is used so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "NEWS_DATASET_PATH",
    "C:/Users/bbuser/Desktop/python_sprint 8/News_Category_Dataset_v3.json",
)

# Load JSON lines file (lines=True: one JSON record per line)
data = pd.read_json(DATA_PATH, lines=True)

# Keep only required columns
data = data[['category', 'headline']]
data.head()

Unnamed: 0,category,headline
0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...
1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li..."
2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...
3,PARENTING,The Funniest Tweets From Parents This Week (Se...
4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...


In [3]:
 ## Step 3: Filter Categories and Balance Dataset
# Categories the search index is restricted to.
TARGET_CATEGORIES = ["POLITICS", "TRAVEL", "SPORTS", "HOME & LIVING"]

# Work on an independent copy so later steps never touch `data`.
df = data[data['category'].isin(TARGET_CATEGORIES)].copy()

# Balance: 1000 per category.
# Each group is sampled explicitly instead of through groupby(...).apply(...):
# apply operating on the grouping column is deprecated in pandas (it emits a
# DeprecationWarning) and a future version excludes that column from the frame
# handed to the function, which would drop 'category' from the result.
# Re-seeding every group with the same random_state keeps the drawn rows
# identical to the apply-based version; ignore_index=True plays the role of
# reset_index(drop=True).
df_bal = pd.concat(
    [group.sample(n=1000, random_state=42) for _, group in df.groupby("category")],
    ignore_index=True,
)

df_bal['category'].value_counts()

  df_bal = df.groupby("category").apply(


category
HOME & LIVING    1000
POLITICS         1000
SPORTS           1000
TRAVEL           1000
Name: count, dtype: int64

In [4]:
## Step 4: TF-IDF Vectorization
# Fit a TF-IDF model (English stop words removed) on every headline of the
# balanced frame. Both the fitted vectorizer and the resulting matrix are kept
# at module level because the search step reads them.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(df_bal["headline"])

# Report (number of headlines, vocabulary size).
tfidf_shape = X.shape
print("Shape of TF-IDF matrix:", tfidf_shape)

Shape of TF-IDF matrix: (4000, 8302)


In [5]:
## Step 5: Search Function
def search_articles(query, top_k=10):
    """Return the headlines most similar to ``query`` by TF-IDF cosine similarity.

    The function relies on three notebook-level objects created in earlier
    cells: the fitted ``vectorizer``, the TF-IDF matrix ``X`` and the balanced
    frame ``df_bal`` whose rows line up with the rows of ``X``.

    Steps:
    - Convert the query into a TF-IDF vector with ``vectorizer``.
    - Compute cosine similarity against every row of ``X``.
    - Keep the ``top_k`` best-scoring rows, best first, and drop any row whose
      similarity is 0 (a headline sharing no weighted term with the query is
      not a match, e.g. for an empty or out-of-vocabulary query).

    Parameters
    ----------
    query : str
        Free-text search query.
    top_k : int or None, default 10
        Maximum number of results to return. ``None`` means no limit.

    Returns
    -------
    pandas.DataFrame
        Columns ``headline``, ``category`` and ``similarity`` (rounded to three
        decimals), ordered from most to least similar. The frame has these
        columns even when there are no matches.

    Raises
    ------
    ValueError
        If ``top_k`` is negative (a negative slice bound would otherwise
        silently return the wrong rows).
    """
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # Transform query into vector
    query_vec = vectorizer.transform([query])

    # Compute cosine similarity with all headlines (one score per row of X)
    sims = cosine_similarity(query_vec, X).ravel()

    # Get indices of top matches, best first, then discard zero-score rows
    top_idx = sims.argsort()[::-1][:top_k]
    top_idx = top_idx[sims[top_idx] > 0]

    # Pull all selected rows in one positional lookup instead of row by row.
    hits = df_bal.iloc[top_idx]
    return pd.DataFrame({
        "headline": hits["headline"].to_numpy(),
        "category": hits["category"].to_numpy(),
        "similarity": np.round(sims[top_idx], 3),
    })

In [6]:
## Step 6: Example Search
# Run a politics-flavoured query, keep the result frame in `results`,
# and show it as the cell output.
query = "president election policy"
results = search_articles(query=query, top_k=10)
results

Unnamed: 0,headline,category,similarity
0,NCAA’s New Sexual Violence Policy Underwhelmin...,SPORTS,0.296
1,"We’re Still, Somehow, A Year Away From The Pre...",POLITICS,0.28
2,"According To His Tweets, Trump's North Korea P...",POLITICS,0.256
3,Protecting America From Its President,POLITICS,0.248
4,John Kerry: Dick Cheney 'Completely Wrong' On ...,POLITICS,0.242
5,Carly Fiorina Backs Maternity Leave Policy Wor...,POLITICS,0.236
6,Lying To The Press Is Nothing New For The Pres...,POLITICS,0.227
7,Obama Has Some Issues With How The Media Are C...,POLITICS,0.22
8,President Obama Hawaii: What To Do On Oahu (PH...,TRAVEL,0.219
9,8 Problems You May Encounter Going To Vote In ...,HOME & LIVING,0.215


In [7]:
## Step 7: Another Example
# A sports-flavoured query; the returned frame is displayed directly.
query = "world cup football"
search_articles(query=query, top_k=10)

Unnamed: 0,headline,category,similarity
0,LIVE: World Cup Final,SPORTS,0.523
1,Top 5 Reasons to Watch the 2014 World Cup,SPORTS,0.474
2,Here's How To Watch The 2014 World Cup Online,SPORTS,0.454
3,Argentina Back To World Cup Semifinals,SPORTS,0.446
4,The World Cup Winners Selfie Is The Best Ever,SPORTS,0.425
5,Conservative Rage at Soccer and World Cup Is N...,SPORTS,0.394
6,WATCH: Last-Minute Goal Wins World Cup Match,SPORTS,0.363
7,WATCH: This Wonder Goal Won The 2014 World Cup,SPORTS,0.361
8,Huge Fight Broke Out In Stands At World Cup Match,SPORTS,0.338
9,"Hipster Football: Who, How and Why",SPORTS,0.334


## Task Is Done 