In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Loading

In [6]:
df = pd.read_json('Data/News_Category_Dataset_v3.json', lines=True)

# Filter Categories

In [7]:
df = df[df['category'].isin(
    ['POLITICS', 'TRAVEL', 'SPORTS', 'HOME & LIVING']
)]

# Take 1000 Row from Each Category

In [8]:
df = df.groupby('category').apply(
    lambda x: x.sample(n=1000, random_state=42)
).reset_index(drop=True)

  df = df.groupby('category').apply(


# Keep Only (Headlines & Category)

In [9]:
df = df[['headline', 'category']]

In [10]:
df

Unnamed: 0,headline,category
0,"Busiest Shipping Day Of The Year Is Today, Ann...",HOME & LIVING
1,What To Watch On Netflix That’s New This Week ...,HOME & LIVING
2,Repurposing Idea Shows You How To Organize Hai...,HOME & LIVING
3,Company Buys $8000 Horse Lamp By Front Design ...,HOME & LIVING
4,Renovate for Rent,HOME & LIVING
...,...,...
3995,The 7 Most Mysterious Stone-Carved Faces That ...,TRAVEL
3996,Tips for a Stress-Free Family Summer Vacation,TRAVEL
3997,These Are The Busiest Flight Routes In The World,TRAVEL
3998,"This Is The Best, Most Underrated Travel Resource",TRAVEL


# Text Preprocessing

### Lowercasing:

In [11]:
df['headline'] = df['headline'].str.lower()
df['category'] = df['category'].str.lower()

### Remove punctuation: 

In [12]:
df['headline'] = df['headline'].apply(lambda x: re.sub(r'[^a-z\s]', ' ', x))
df['category'] = df['category'].apply(lambda x: re.sub(r'[^a-z\s]', ' ', x))

### Remove stopwords: 

In [13]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Split → remove stopwords → join back

df['headline'] = df['headline'].apply(
    lambda x: " ".join([word for word in x.split() if word not in stop_words])
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\osamasaid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# TF-IDF Vectorizer:

In [14]:
vectorizer = TfidfVectorizer(max_features=1000)

### Vectorize the headline column

In [15]:
headline_col_vectorize = vectorizer.fit_transform(df['headline'])

### Store vectors for all articles

In [16]:
tfidf_df = pd.DataFrame(headline_col_vectorize.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df

Unnamed: 0,abandoned,abortion,abroad,abuse,access,according,across,action,actually,ad,...,worthy,would,wrong,year,years,yet,york,young,youth,zen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.396991,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


# Search Implementation:

In [None]:
# hl = df["headline"].astype(str).tolist()
# cat = df["category"].astype(str).tolist()


# def search(query, top_n=10):
#     query_vec = vectorizer.transform([query])
#     similarities = cosine_similarity(query_vec, headline_col_vectorize).flatten()
#     top_indices = similarities.argsort()[-top_n:][::-1]

#     results_list = []
#     for i in top_indices:
#         results_list.append({
#             "headline": hl[i],
#             "category": cat[i],
#             "similarity_score": round(similarities[i], 4)
#         })
    
#     return results_list

In [None]:
headlines = df["headline"].astype(str).tolist()
categories = df["category"].astype(str).tolist()

def search(query, top_k=10):
    
    q_vec = vectorizer.transform([query])
    scores = cosine_similarity(q_vec, headline_col_vectorize).flatten()
    best_idx = scores.argsort()[-top_k:][::-1]
    results = [
        {
            "headline": headlines[i],
            "category": categories[i],
            "similarity": round(scores[i], 4)
        }
        for i in best_idx
    ]
    return results

In [24]:
search('sport')

[{'headline': 'dangerous sport world',
  'category': 'sports',
  'similarity': np.float64(0.833)},
 {'headline': 'sport society arete nfl women sport',
  'category': 'sports',
  'similarity': np.float64(0.7238)},
 {'headline': 'sport society arete fearsome foursome',
  'category': 'sports',
  'similarity': np.float64(0.5481)},
 {'headline': 'sport society arete cubs quest',
  'category': 'sports',
  'similarity': np.float64(0.5481)},
 {'headline': 'transcending game human side sport',
  'category': 'sports',
  'similarity': np.float64(0.5119)},
 {'headline': 'landon donovan proves seriously good sport world cup snub',
  'category': 'sports',
  'similarity': np.float64(0.4981)},
 {'headline': 'watch massive paddleboard made surfing team sport',
  'category': 'sports',
  'similarity': np.float64(0.4937)},
 {'headline': 'sport society arete baseball',
  'category': 'sports',
  'similarity': np.float64(0.488)},
 {'headline': 'sport society arete world series musings',
  'category': 'sports