In [35]:
import pandas as pd

# Load the dataset; lines=True tells pandas the file holds one JSON object per
# line rather than a single JSON document.
df = pd.read_json(
    'News_Category_Dataset_v3.json',
    lines=True,
)

# Preview the first five rows (5 is also the default for head()).
df.head(n=5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [21]:
# Report the dimensions of the loaded frame as a (rows, columns) tuple.
print(f"Dataset shape: {df.shape}")

Dataset shape: (209527, 6)


In [22]:
# List the column labels; the leading "\n" emits a blank line before the text.
print(f"\nColumns: {list(df.columns)}")


Columns: ['link', 'headline', 'category', 'short_description', 'authors', 'date']


In [23]:
# Show the dtype pandas assigned to each column.
# The header has no placeholders, so it is a plain string literal rather than
# an f-string; the printed text is unchanged.
print("\nData types:")
print(df.dtypes)


Data types:
link                         object
headline                     object
category                     object
short_description            object
authors                      object
date                 datetime64[ns]
dtype: object


In [24]:
# Count the null entries in every column.
# The header has no placeholders, so it is a plain string literal rather than
# an f-string; the printed text is unchanged.
print("\nMissing values:")
print(df.isnull().sum())


Missing values:
link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64


In [25]:
# Print a concise summary of the frame: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [26]:
# Descriptive statistics with default arguments; as the rendered output below
# shows, only the datetime `date` column is summarised here.
df.describe()

Unnamed: 0,date
count,209527
mean,2015-04-30 00:44:14.344308736
min,2012-01-28 00:00:00
25%,2013-08-10 00:00:00
50%,2015-03-16 00:00:00
75%,2016-11-01 00:00:00
max,2022-09-23 00:00:00


In [27]:
# Tally how many articles carry each category label, most frequent first
# (sort=True / ascending=False are value_counts' defaults, spelled out here).
category_counts = df['category'].value_counts(sort=True, ascending=False)
print("Top 10 categories:")
# Positional slice of the first ten entries, i.e. the ten largest categories.
print(category_counts.iloc[:10])

Top 10 categories:
category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
Name: count, dtype: int64


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
# Category labels the search corpus is restricted to.
categories = [
    'POLITICS',
    'TRAVEL',
    'SPORTS',
    'HOME & LIVING',
]
# Keep only the rows whose category is one of the labels above.
filtered_df = df.loc[df['category'].isin(categories)]

In [30]:
# Draw an equally sized sample from every selected category so that each
# category contributes the same number of headlines to the search corpus.
data = []
for cat in categories:
    rows_in_cat = filtered_df.loc[filtered_df['category'] == cat]
    # Fixed random_state => the same 1000 rows are drawn on every run.
    sample = rows_in_cat.sample(n=1000, random_state=42)
    data.append(sample)
# Stack the per-category samples with a fresh 0..N-1 index and keep only the
# two columns the search results report.
search_df = pd.concat(data, ignore_index=True)[['headline', 'category']]

In [31]:
# Build the TF-IDF index over the sampled headlines. stop_words='english'
# selects scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')
# Fit the vocabulary on the headlines and vectorise them in one step; row i of
# the resulting matrix corresponds to row i of search_df.
tfidf_matrix = vectorizer.fit_transform(search_df['headline'])

In [32]:
def search(query, top_k=10):
    """Rank the indexed headlines against a free-text query.

    The query is projected into the TF-IDF space of the already-fitted,
    module-level ``vectorizer`` and compared by cosine similarity with every
    row of ``tfidf_matrix``. Row ``i`` of that matrix corresponds to row ``i``
    of ``search_df``, which supplies the headline and category of each hit.

    Parameters
    ----------
    query : str
        Free-text search string.
    top_k : int, default 10
        Maximum number of hits to return. Must not be negative.

    Returns
    -------
    pandas.DataFrame
        One row per hit, best match first, with columns ``rank`` (1-based),
        ``headline``, ``category`` and ``score`` (cosine similarity rounded to
        three decimals). Headlines with a similarity of 0 are not hits and are
        omitted, so fewer than ``top_k`` rows (possibly none) may be returned;
        the columns are present even when the frame is empty.

    Raises
    ------
    ValueError
        If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, tfidf_matrix)[0]

    # Reverse first, then truncate. Slicing the tail with [-top_k:] would
    # return *every* row for top_k == 0, because -0 == 0.
    ranked = scores.argsort()[::-1][:top_k]

    results = []
    for position, idx in enumerate(ranked, start=1):
        # Scores are sorted in descending order, so once one is 0 every
        # following one is too: nothing else matches the query.
        if scores[idx] <= 0:
            break
        row = search_df.iloc[idx]  # single positional lookup per hit
        results.append({
            'rank': position,
            'headline': row['headline'],
            'category': row['category'],
            'score': round(float(scores[idx]), 3)
        })

    # Explicit columns keep the result schema stable when there are no hits.
    return pd.DataFrame(results, columns=['rank', 'headline', 'category', 'score'])

In [34]:
print("Search engine ready!")

# Smoke test: run one sample query and print the ranked hits as plain text.
results = search('vacation travel')
print(results)

Search engine ready!
   rank                                           headline category  score
0     1                                       Why I Travel   TRAVEL  0.611
1     2       The Top 7 Destinations For A Family Vacation   TRAVEL  0.445
2     3              10 Places To Have A 'Frozen' Vacation   TRAVEL  0.394
3     4                       Keep a Lid on Vacation Costs   TRAVEL  0.378
4     5         7 Smart Tips For Working While On Vacation   TRAVEL  0.349
5     6  Way Too Many Americans Didn't Take Enough Vaca...   TRAVEL  0.326
6     7      Tips for a Stress-Free Family Summer Vacation   TRAVEL  0.323
7     8  Why I'm Vowing To Use All Of My Vacation Days ...   TRAVEL  0.320
8     9  Best B&Bs For A Romantic Vacation This Winter ...   TRAVEL  0.318
9    10         Marco Island, Where Miami Goes On Vacation   TRAVEL  0.318
