In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz, load_npz
import joblib

In [2]:
data_path  = r"C:\Users\bbuser\Desktop\News_Category_Dataset_v3.json" 
output_dir = r"C:\Users\bbuser\Desktop\news_keyword_baseline"
os.makedirs(output_dir, exist_ok=True)

In [3]:
csv_path   = os.path.join(output_dir, "news_balanced_headline_category.csv")
vec_path   = os.path.join(output_dir, "tfidf_vectorizer.pkl")
mat_path   = os.path.join(output_dir, "tfidf_matrix.npz")

In [4]:
target_categories = ["POLITICS", "TRAVEL", "SPORTS", "HOME & LIVING"]
n_per_class = 1000
random_state = 42

In [5]:
df = pd.read_json(data_path, lines=True)

df = df[df["category"].isin(target_categories)].copy()
df = df[["headline", "category"]].dropna(subset=["headline", "category"])

balanced = (
    df.groupby("category", group_keys=False)
      .sample(n=n_per_class, random_state=random_state)
      .reset_index(drop=True)
)

In [6]:
balanced.to_csv(csv_path, index=False, encoding="utf-8")

print("prepared dataset saved:", csv_path)
print(balanced["category"].value_counts())

prepared dataset saved: C:\Users\bbuser\Desktop\news_keyword_baseline\news_balanced_headline_category.csv
category
HOME & LIVING    1000
POLITICS         1000
SPORTS           1000
TRAVEL           1000
Name: count, dtype: int64


In [7]:
texts = balanced["headline"].astype(str).tolist()
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(texts)  

In [8]:
save_npz(mat_path, x)
joblib.dump(vectorizer, vec_path)

['C:\\Users\\bbuser\\Desktop\\news_keyword_baseline\\tfidf_vectorizer.pkl']

In [9]:
print("tf-idf matrix saved to:", mat_path, "| shape:", x.shape)
print("vectorizer saved to:", vec_path)

tf-idf matrix saved to: C:\Users\bbuser\Desktop\news_keyword_baseline\tfidf_matrix.npz | shape: (4000, 8631)
vectorizer saved to: C:\Users\bbuser\Desktop\news_keyword_baseline\tfidf_vectorizer.pkl


In [10]:
vectorizer = joblib.load(vec_path)
x = load_npz(mat_path)
df_results = pd.read_csv(csv_path) 

In [11]:
def search(query: str, top_k: int = 10):
    query = (query or "").strip()
    if not query:
        print("empty query. please type something.")
        return

    q_vec = vectorizer.transform([query])
    scores = (x @ q_vec.T).toarray().ravel()
    top_idx = np.argsort(-scores)[:top_k]
    print(f"\nTop {top_k} results for: {query!r}\n" + "-"*70)
    for rank, i in enumerate(top_idx, start=1):
        cat = df_results.loc[i, "category"]
        head = df_results.loc[i, "headline"]
        sc = scores[i]
        print(f"{rank:>2}. [{cat}] {head}  (score={sc:.4f})")
    print("-"*70)

In [12]:
search("president election")
search("travel to europe")
search("football game")
search("home decoration ideas")


Top 10 results for: 'president election'
----------------------------------------------------------------------
 1. [POLITICS] President Trump's War On Children  (score=0.3419)
 2. [POLITICS] Donald Trump Is The 'Air Guitar' President  (score=0.3013)
 3. [TRAVEL] Inauguration Travel: President Obama's Favorite Hotels  (score=0.2892)
 4. [TRAVEL] Hotel Suites Good Enough For The President (PHOTOS)  (score=0.2804)
 5. [POLITICS] What If A President Is Too Impaired To Lead?  (score=0.2687)
 6. [POLITICS] President Trump Escapes D.C. (And Reality) In Florida…Again  (score=0.2593)
 7. [POLITICS] A Response: My Election Blame List  (score=0.2515)
 8. [POLITICS] It's Bloomberg-For-President O'Clock Again, Apparently  (score=0.2513)
 9. [SPORTS] PGA President Fired Over 'Insensitive' Comments  (score=0.2509)
10. [SPORTS] Here Is President Obama's 2016 March Madness Bracket  (score=0.2494)
----------------------------------------------------------------------

Top 10 results for: 'travel to eu

In [13]:
q = input("Enter your search query: ")
search(q, top_k=10)

Enter your search query:  HOME



Top 10 results for: 'HOME'
----------------------------------------------------------------------
 1. [HOME & LIVING] Heart & Home  (score=0.4813)
 2. [HOME & LIVING] Donna David, Professional Organizer, Says A Decluttered Home Is A Stress-Free Home  (score=0.3412)
 3. [HOME & LIVING] 10 Ways To Love Your Home More  (score=0.3197)
 4. [HOME & LIVING] Shelter Island, New York Home Has Eat-In Wine Cellar, Home Theater And Nightclub (PHOTOS)  (score=0.3197)
 5. [HOME & LIVING] How To Dry Clean At Home  (score=0.3049)
 6. [HOME & LIVING] 5 Steps to a More Masculine Home  (score=0.2914)
 7. [SPORTS] Why LeBron Wanted to Go Home  (score=0.2858)
 8. [HOME & LIVING] Memorial Day Sales 2012: Home Decor And Outdoor Furniture Sales At Macy's, Sears, Home Depot And More (PHOTOS)  (score=0.2737)
 9. [HOME & LIVING] Weekly Roundup of eBay Vintage Home Finds  (score=0.2694)
10. [HOME & LIVING] Weekly Roundup of eBay Vintage Home Finds  (score=0.2694)
-------------------------------------------------