#### pandas → load and preprocess the dataset.

#### numpy → array operations: sorting indices and handling similarity scores.

#### TfidfVectorizer (sklearn) → convert headlines and queries into numeric vectors.

#### scipy.sparse → efficiently store the TF-IDF vectors.

#### linear_kernel (sklearn) → compute the dot product between the query and headline vectors; because the TF-IDF vectors are L2-normalized, this equals cosine similarity.

#### joblib → save/load the vectorizer and TF-IDF matrix for future searches.

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy import sparse
import joblib

## News Search Engine

## Data Preprocessing

Filters the dataset down to the four target categories (POLITICS, TRAVEL, SPORTS, HOME & LIVING).

Keeps only headline and category.

Caps each category at 1,000 headlines (randomly sampled with a fixed seed); a category with fewer headlines keeps all of them.

## TF-IDF Vectorization

Trains TF-IDF on headlines.

Uses unigrams and bigrams for better matching.

## Search

Converts a free-text query into a TF-IDF vector using the fitted vectorizer.

Computes cosine similarity with all headlines.

Returns top 10 results with headline, category, similarity_score.

In [4]:
# --- Parameters ---
TARGET_CATEGORIES = ["POLITICS", "TRAVEL", "SPORTS", "HOME & LIVING"]
TARGET_PER_CAT = 1000

# --- 1. Load and preprocess dataset ---
def load_news_dataset(file_path, categories=None, per_category=None, random_state=42):
    """Load a JSON-lines news file and return a per-category capped headline table.

    The file is read with ``pd.read_json(..., lines=True)``, reduced to the
    ``category`` and ``headline`` columns, filtered to the requested
    categories, and stripped of rows with a missing headline and of repeated
    headlines. Each category is then shuffled with a fixed seed and truncated
    to at most ``per_category`` rows; a category with fewer rows keeps all of
    them (no upsampling).

    Args:
        file_path: Path to the JSON-lines file; it must contain ``category``
            and ``headline`` fields.
        categories: Category names to keep, in output order. A single string
            is treated as one category. Defaults to ``TARGET_CATEGORIES``.
        per_category: Maximum number of rows kept per category. Defaults to
            ``TARGET_PER_CAT``.
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.

    Returns:
        DataFrame with columns ``category`` and ``headline`` and a fresh
        0..n-1 index, grouped by category in the order of ``categories``.

    Raises:
        ValueError: If ``per_category`` is negative.
    """
    # Resolve defaults at call time so edits to the module constants are honoured.
    if categories is None:
        categories = TARGET_CATEGORIES
    if per_category is None:
        per_category = TARGET_PER_CAT
    if per_category < 0:
        raise ValueError(f"per_category must be >= 0, got {per_category!r}")

    # A bare string would otherwise be iterated character by character.
    if isinstance(categories, str):
        categories = [categories]
    # Drop repeated names (keeping first-seen order) so no row is emitted twice.
    categories = list(dict.fromkeys(categories))

    df = pd.read_json(file_path, lines=True)
    df = df[['category', 'headline']]
    df = df[df['category'].isin(categories)]
    df = df.dropna(subset=['headline']).drop_duplicates(subset=['headline'])

    if not categories:
        # pd.concat raises on an empty list; df is already empty here.
        return df.reset_index(drop=True)

    # head() already returns the whole subset when it is shorter than the cap.
    capped_frames = [
        df[df['category'] == cat]
        .sample(frac=1, random_state=random_state)
        .head(per_category)
        for cat in categories
    ]
    return pd.concat(capped_frames, ignore_index=True)



# --- 2. TF-IDF Vectorization ---
def build_tfidf_index(df):
    """Fit a TF-IDF model on ``df['headline']`` and vectorize those headlines.

    The vectorizer lowercases text, removes English stop words, uses
    unigrams and bigrams, and L2-normalizes each row.

    Args:
        df: DataFrame with a ``headline`` column of text.

    Returns:
        Tuple ``(vectorizer, matrix)``: the fitted ``TfidfVectorizer`` and the
        matrix returned by ``fit_transform`` for the headlines.
    """
    tfidf = TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(1, 2),
        norm='l2',
    )
    headline_matrix = tfidf.fit_transform(df['headline'])
    return tfidf, headline_matrix


# --- 3. Search Function ---
def search(query, vectorizer, X, df, top_k=10):
    """Rank the indexed headlines against a free-text query.

    The query is vectorized with ``vectorizer.transform`` and scored against
    every row of ``X`` with ``linear_kernel`` (a dot product, which equals
    cosine similarity when both sides are L2-normalized, as the vectorizer
    built by ``build_tfidf_index`` is configured to do).

    Args:
        query: Free-text search string.
        vectorizer: Fitted vectorizer exposing ``transform``.
        X: Document matrix with one row per row of ``df``, in the same order.
        df: DataFrame with at least ``headline`` and ``category`` columns.
        top_k: Maximum number of results. ``None`` returns every row.

    Returns:
        DataFrame with columns ``headline``, ``category`` and
        ``similarity_score``, best match first, with a fresh 0..k-1 index.
        Rows with equal scores keep their original relative order.

    Raises:
        ValueError: If ``top_k`` is negative, or if ``X`` and ``df`` do not
            have the same number of rows.
    """
    # A negative slice bound would silently return "all but the last k" rows.
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be >= 0, got {top_k!r}")

    q_vec = vectorizer.transform([query])
    sims = linear_kernel(q_vec, X).ravel()

    # Results are looked up positionally, so the matrix and table must align.
    if sims.shape[0] != len(df):
        raise ValueError(
            f"X has {sims.shape[0]} rows but df has {len(df)}; they must match"
        )

    # Stable sort makes the order of tied scores (common at 0.0) deterministic.
    top_idx = np.argsort(-sims, kind="stable")[:top_k]
    results = df.iloc[top_idx].copy()
    results['similarity_score'] = sims[top_idx]
    return results[['headline', 'category', 'similarity_score']].reset_index(drop=True)

# --- 4. Save / load artifacts ---
def save_artifacts(vectorizer, X, df, prefix="news"):
    """Write the vectorizer, the TF-IDF matrix and the headline table to disk.

    Three files are created, each named from ``prefix``:
    ``<prefix>_tfidf_vectorizer.joblib``, ``<prefix>_tfidf_matrix.npz`` and
    ``<prefix>_index.csv`` (written without the DataFrame index).

    Args:
        vectorizer: Object to serialize with ``joblib.dump``.
        X: Sparse matrix to store with ``scipy.sparse.save_npz``.
        df: DataFrame to store as CSV.
        prefix: Filename prefix (may include a directory part).
    """
    vectorizer_path = f"{prefix}_tfidf_vectorizer.joblib"
    matrix_path = f"{prefix}_tfidf_matrix.npz"
    index_path = f"{prefix}_index.csv"

    joblib.dump(vectorizer, vectorizer_path)
    sparse.save_npz(matrix_path, X)
    df.to_csv(index_path, index=False)

def load_artifacts(prefix="news"):
    """Load the vectorizer, TF-IDF matrix and headline table saved under ``prefix``.

    Reads ``<prefix>_tfidf_vectorizer.joblib``, ``<prefix>_tfidf_matrix.npz``
    and ``<prefix>_index.csv``.

    Args:
        prefix: Filename prefix the artifacts were saved with.

    Returns:
        Tuple ``(vectorizer, X, df)``.

    Raises:
        ValueError: If the matrix and the table have different row counts.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
    vectorizer = joblib.load(f"{prefix}_tfidf_vectorizer.joblib")
    X = sparse.load_npz(f"{prefix}_tfidf_matrix.npz")
    # keep_default_na=False: without it read_csv turns text such as "", "NA",
    # "None" or "null" into NaN, so the reloaded headlines would differ from
    # the ones that were saved.
    df = pd.read_csv(f"{prefix}_index.csv", keep_default_na=False)

    # Search looks rows up by position, so a mismatch means corrupt artifacts.
    if X.shape[0] != len(df):
        raise ValueError(
            f"Artifacts for prefix {prefix!r} are misaligned: "
            f"matrix has {X.shape[0]} rows, index has {len(df)}"
        )
    return vectorizer, X, df

# --- Example Usage : ---
# End-to-end demo: build the index from the raw dataset, persist it, and run
# one sample query. Runs only when executed as a script / notebook cell.
if __name__ == "__main__":
    # Load & preprocess dataset (expects the JSON-lines file in the working directory)
    df = load_news_dataset("News_Category_Dataset_v3.json")
    
    # Build TF-IDF index: X has one row per row of df
    vectorizer, X = build_tfidf_index(df)
    
    # Optional: save artifacts for later; with the default prefix this writes
    # news_tfidf_vectorizer.joblib, news_tfidf_matrix.npz and news_index.csv
    save_artifacts(vectorizer, X, df)
    
    # Perform a search and print the ten best-scoring headlines
    query = "election results and government policy"
    results = search(query, vectorizer, X, df, top_k=10)
    print(results)


                                            headline  category  \
0  Donald Trump Still Won't Say If He'll Accept E...  POLITICS   
1  Doug Lamborn Midterm Election Results: Republi...  POLITICS   
2  Colorado Senate Election Results: Cory Gardner...  POLITICS   
3  Rick Larsen Midterm Election Results: Larsen D...  POLITICS   
4  Cathy McMorris Rodgers Midterm Election Result...  POLITICS   
5               Trump’s Foreign Policy Is A Disaster  POLITICS   
6    What the Government Shutdown Means to Travelers    TRAVEL   
7    Americans Can't Wait Until The Election Is Over  POLITICS   
8  Top U.S. Diplomat In China Quits Over Donald T...  POLITICS   
9  NCAA’s New Sexual Violence Policy Underwhelmin...    SPORTS   

   similarity_score  
0          0.336693  
1          0.280195  
2          0.276106  
3          0.259444  
4          0.256840  
5          0.177884  
6          0.154296  
7          0.140693  
8          0.133345  
9          0.130793  
