## 1) Loading Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 2) Data Loading

In [2]:
df = pd.read_json("News Search Engine_Dataset_v3.json", lines=True)
df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28


## 3) Create Balanced Dataset by Category

In [13]:
# Filter to specified categories only
allowed_categories = {"POLITICS", "TRAVEL", "SPORTS", "HOME & LIVING"}
df = df.loc[df["category"].isin(allowed_categories)]

# Create balanced dataset with 1000 samples per group
df_balanced = (df.groupby("category", group_keys=False)
               .apply(lambda group: group.sample(n=1000, random_state=42))
               .reset_index(drop=True))

# Select only required columns
df_balanced = df_balanced.loc[:, ["headline", "category"]]

  .apply(lambda group: group.sample(n=1000, random_state=42))


## 4) Finalize and Preview Dataset

In [14]:
# Final dataset
df = df.reset_index(drop=True)
print(df.shape)
df.head(10)

(54899, 6)


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/dodgers-basebal...,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS,"Maury Wills, who helped the Los Angeles Dodger...","Beth Harris, AP",2022-09-20
1,https://www.huffpost.com/entry/biden-us-forces...,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,President issues vow as tensions with China rise.,,2022-09-19
2,https://www.huffpost.com/entry/ukraine-festiva...,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS,An annual celebration took on a different feel...,Jonathan Nicholson,2022-09-19
3,https://www.huffpost.com/entry/2022-wnba-final...,"Las Vegas Aces Win First WNBA Title, Chelsea G...",SPORTS,Las Vegas never had a professional sports cham...,"Pat Eaton-Robb, AP",2022-09-19
4,https://www.huffpost.com/entry/europe-britain-...,Biden Says Queen's Death Left 'Giant Hole' For...,POLITICS,"U.S. President Joe Biden, in London for the fu...","Darlene Superville, AP",2022-09-18
5,https://www.huffpost.com/entry/afghan-adjustme...,Bill To Help Afghans Who Escaped Taliban Faces...,POLITICS,Republican outrage over the shoddy U.S. withdr...,Hamed Ahmadi and Arthur Delaney,2022-09-16
6,https://www.huffpost.com/entry/capitol-riot-in...,Mark Meadows Complies With Justice Dept. Subpo...,POLITICS,The former White House chief of staff has turn...,"ERIC TUCKER, AP",2022-09-15
7,https://www.huffpost.com/entry/seth-magaziner-...,Democrats Nominate Seth Magaziner In Key Rhode...,POLITICS,The state's general treasurer is slated to fac...,Daniel Marans,2022-09-14
8,https://www.huffpost.com/entry/biden-cancer-mo...,Joe Biden Urges National Unity In Speech On Re...,POLITICS,"""Cancer does not discriminate red and blue,"" t...",Nick Visser,2022-09-13
9,https://www.huffpost.com/entry/tim-scott-senat...,Sen. Tim Scott Downplays Electability Concerns...,POLITICS,"""Who we have on the field is who we’re gonna p...",Marita Vlachou,2022-09-12


## 5) Transform Headlines into TF-IDF Features

In [15]:
# Configure text vectorization parameters
text_vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=5000,
    lowercase=True
)

# Learn vocabulary and transform text data
X = text_vectorizer.fit_transform(df_balanced["headline"].values)

## 6) Retrieve Most Relevant Articles for a Query

In [16]:
def find_relevant_articles(search_query, result_count=10):
    """Retrieve articles most relevant to the search query"""
    # Convert query to feature vector
    query_vector = vectorizer.transform([search_query])
    
    # Calculate similarity scores
    similarity_scores = cosine_similarity(query_vector, X).ravel()
    
    # Identify top matching indices
    top_matches = similarity_scores.argsort()[-result_count:][::-1]
    
    # Compile results with similarity metrics
    search_results = df.iloc[top_matches].copy()
    search_results['relevance_score'] = similarity_scores[top_matches]
    
    return search_results[['headline', 'category', 'relevance_score']]

## 7) Execute Search Query and Display Results

In [19]:
search_terms = "Europe travel"
matching_results = find_relevant_articles(search_terms)
print(matching_results)

                                               headline  category  \
3604  Ohio Voters Have A Chance To Do Something Abou...  POLITICS   
3572  U.S. Olympic Committee Ignored Sexual Abuse Co...    SPORTS   
2792  Pete Buttigieg, 37-Year-Old Mayor Of City In I...  POLITICS   
3976  What The Southwest Flight Can Teach Us About O...    TRAVEL   
3148  Trump Is Reportedly Sending New Pal Kim Jong U...  POLITICS   
3205  Publix Suspends Contributions To NRA-Backed Po...  POLITICS   
3573        Eric Schneiderman Has Always Been A Con Man  POLITICS   
3550         Trump's Iran Deal Exit Is A Win For Russia  POLITICS   
3800  White House Releases Photos Of Mike Pompeo Wit...  POLITICS   
3065  The Kremlin Hates America's 'Malignant Feminis...  POLITICS   

      relevance_score  
3604         0.566451  
3572         0.564726  
2792         0.538501  
3976         0.533820  
3148         0.487850  
3205         0.399826  
3573         0.380509  
3550         0.370808  
3800         0.342019  


## 8) Display Search Results in Readable Format

In [20]:
for i, row in results.iterrows():
    print(f"{row['similarity']:.3f} | {row['category']} | {row['headline']}")

0.566 | POLITICS | Ohio Voters Have A Chance To Do Something About Gerrymandering
0.565 | SPORTS | U.S. Olympic Committee Ignored Sexual Abuse Complaints Against Taekwondo Stars: Lawsuit
0.539 | POLITICS | Pete Buttigieg, 37-Year-Old Mayor Of City In Indiana, Joins Presidential Race
0.534 | TRAVEL | What The Southwest Flight Can Teach Us About Oxygen Masks
0.488 | POLITICS | Trump Is Reportedly Sending New Pal Kim Jong Un An Awkward Gift
0.400 | POLITICS | Publix Suspends Contributions To NRA-Backed Politician Amid Protests
0.381 | POLITICS | Eric Schneiderman Has Always Been A Con Man
0.371 | POLITICS | Trump's Iran Deal Exit Is A Win For Russia
0.342 | POLITICS | White House Releases Photos Of Mike Pompeo With Kim Jong Un To Praise Confirmation
0.332 | POLITICS | The Kremlin Hates America's 'Malignant Feminism,' Loves Brett Kavanaugh


## 9) Run Interactive Search Loop

In [21]:
search_active = True
while search_active:
    user_query = input("Enter search query (or 'exit' to stop): ")
    if user_query.lower() == 'exit':
        search_active = False
    else:
        print(find_relevant_articles(user_query))

Enter search query (or 'exit' to stop):  2


                                             headline  category  \
16  US, Trump Team Propose Names For Arbiter In Ma...  POLITICS   
17  Politician's DNA Connected To Las Vegas Journa...  POLITICS   
18  Michigan Supreme Court Revives Abortion Rights...  POLITICS   
19  Portland Residents With Disabilities Sue Over ...  POLITICS   
20  Baseball Players Union Joins AFL-CIO In Show O...    SPORTS   
21  The Unemployment Insurance System Is Not Ready...  POLITICS   
22  Kody Clemens Strikes Out MVP Shohei Ohtani, Tr...    SPORTS   
23  Michigan Secretary of State Worried About ‘Vio...  POLITICS   
24  Uvalde Fourth Graders Waited An Hour With Woun...  POLITICS   
25  Trump-Endorsed Wisconsin Gubernatorial Candida...  POLITICS   

    relevance_score  
16              0.0  
17              0.0  
18              0.0  
19              0.0  
20              0.0  
21              0.0  
22              0.0  
23              0.0  
24              0.0  
25              0.0  


Enter search query (or 'exit' to stop):  exit
