Install required packages

In [2]:
!pip install --quiet kagglehub pandas scikit-learn ipywidgets


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.6 MB[0m [31m11.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m27.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25h

Imports + Download dataset using kagglehub

In [3]:
import os
import glob
import kagglehub
import pandas as pd

# Download dataset (kagglehub handles the Kaggle API steps for you)
path = kagglehub.dataset_download("rmisra/news-category-dataset")
print("Downloaded dataset path:", path)

# Locate the JSON file: look at the top level of the download directory first,
# then fall back to a recursive search. glob returns matches in arbitrary
# order, so the matches are sorted to make the choice of json_files[0]
# reproducible from run to run.
json_files = sorted(glob.glob(os.path.join(path, "*.json")))
if not json_files:
    json_files = sorted(glob.glob(os.path.join(path, "**", "*.json"), recursive=True))

if not json_files:
    raise FileNotFoundError(f"No .json found in {path}; list directory to check contents: {os.listdir(path)}")

json_path = json_files[0]
print("Using JSON file:", json_path)

# Load dataset (lines=True: the file is parsed as one JSON object per line)
df = pd.read_json(json_path, lines=True)
print("Rows loaded:", len(df))
df.head()


Using Colab cache for faster access to the 'news-category-dataset' dataset.
Downloaded dataset path: /kaggle/input/news-category-dataset
Using JSON file: /kaggle/input/news-category-dataset/News_Category_Dataset_v3.json
Rows loaded: 209527


Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


Preprocess: combine text fields, remove empties/dupes, add global id

In [4]:
# Normalise the two text columns (missing values become empty strings) and
# join them into one stripped 'content' column used for vectorisation.
for text_col in ('headline', 'short_description'):
    df[text_col] = df[text_col].fillna('').astype(str)
df['content'] = (df['headline'] + ' ' + df['short_description']).str.strip()

# Keep only rows that actually have some text
has_text = df['content'].str.len() > 0
df = df[has_text].reset_index(drop=True)

# Remove rows whose content is an exact repeat of an earlier row
before = len(df)
df = df.drop_duplicates(subset=['content']).reset_index(drop=True)
print(f"Dropped {before - len(df)} duplicate rows. Remaining rows: {len(df)}")

# global_id is the row position in this cleaned frame; category subsets carry
# it along so a row can be traced back to the cleaned frame.
df['global_id'] = df.index

# Quick preview
df[['global_id','category','headline','short_description']].head()


Dropped 485 duplicate rows. Remaining rows: 209037


Unnamed: 0,global_id,category,headline,short_description
0,0,U.S. NEWS,Over 4 Million Americans Roll Up Sleeves For O...,Health experts said it is too early to predict...
1,1,U.S. NEWS,"American Airlines Flyer Charged, Banned For Li...",He was subdued by passengers and crew when he ...
2,2,COMEDY,23 Of The Funniest Tweets About Cats And Dogs ...,"""Until you have a dog you don't understand wha..."
3,3,PARENTING,The Funniest Tweets From Parents This Week (Se...,"""Accidentally put grown-up toothpaste on my to..."
4,4,U.S. NEWS,Woman Who Called Cops On Black Bird-Watcher Lo...,Amy Cooper accused investment firm Franklin Te...


Show available categories (so the user can pick)

In [5]:
# Alphabetically ordered list of distinct category labels; later cells index
# into this list using the 1-based numbers printed below.
categories = df['category'].unique().tolist()
categories.sort()
print("Available categories ({}):".format(len(categories)))
for position, label in enumerate(categories, start=1):
    print(f"{position:2d}. {label}")

Available categories (42):
 1. ARTS
 2. ARTS & CULTURE
 3. BLACK VOICES
 4. BUSINESS
 5. COLLEGE
 6. COMEDY
 7. CRIME
 8. CULTURE & ARTS
 9. DIVORCE
10. EDUCATION
11. ENTERTAINMENT
12. ENVIRONMENT
13. FIFTY
14. FOOD & DRINK
15. GOOD NEWS
16. GREEN
17. HEALTHY LIVING
18. HOME & LIVING
19. IMPACT
20. LATINO VOICES
21. MEDIA
22. MONEY
23. PARENTING
24. PARENTS
25. POLITICS
26. QUEER VOICES
27. RELIGION
28. SCIENCE
29. SPORTS
30. STYLE
31. STYLE & BEAUTY
32. TASTE
33. TECH
34. THE WORLDPOST
35. TRAVEL
36. U.S. NEWS
37. WEDDINGS
38. WEIRD NEWS
39. WELLNESS
40. WOMEN
41. WORLD NEWS
42. WORLDPOST


Helper: build TF-IDF + NearestNeighbors for a DataFrame subset

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

def build_tfidf_nn(df_subset, max_features=3000, ngram_range=(1,2)):
    """
    Fit a TF-IDF vectorizer and a cosine NearestNeighbors index on df_subset['content'].

    Parameters:
        df_subset: DataFrame with a 'content' text column.
        max_features: vocabulary size cap passed to TfidfVectorizer; capped at
            1000 when the subset has fewer than 50 rows.
        ngram_range: n-gram range passed to TfidfVectorizer.

    Returns:
        (vectorizer, tfidf_matrix, nn_model) - the fitted vectorizer, the matrix
        returned by fit_transform, and the NearestNeighbors model fitted on it.
    """
    # Small subsets get a smaller vocabulary cap
    is_tiny = df_subset.shape[0] < 50
    vocab_cap = min(max_features, 1000) if is_tiny else max_features

    vectorizer = TfidfVectorizer(stop_words='english', max_features=vocab_cap, ngram_range=ngram_range)
    tfidf_matrix = vectorizer.fit_transform(df_subset['content'])

    # No n_neighbors is fixed here; callers pass it per query to kneighbors()
    nn_model = NearestNeighbors(metric='cosine', algorithm='brute').fit(tfidf_matrix)

    return vectorizer, tfidf_matrix, nn_model


Recommendation functions (index-based & text-query-based)

In [7]:
import numpy as np
import pandas as pd

def recommend_by_index_in_subset(df_subset, tfidf_matrix, nn_model, index_in_subset, top_k=5):
    """
    Recommend up to top_k articles in the given subset similar to the row at index_in_subset.

    Parameters:
        df_subset: DataFrame whose row positions match the rows of tfidf_matrix;
            must have 'headline', 'short_description' and 'global_id' columns
            ('link' is optional and defaults to '').
        tfidf_matrix: matrix the nn_model was fitted on (one row per article).
        nn_model: fitted model exposing kneighbors(X, n_neighbors=...).
        index_in_subset: row position of the query article.
        top_k: maximum number of recommendations to return.

    Returns:
        DataFrame with columns headline, short_description, link, similarity
        (1 - distance) and global_id, ordered as returned by kneighbors. The
        query article itself is never included. The frame is empty when there
        is nothing to recommend.
    """
    columns = ['headline', 'short_description', 'link', 'similarity', 'global_id']

    n_samples = tfidf_matrix.shape[0]
    # Ask for one extra neighbour: the query article is normally among its own
    # nearest neighbours and is removed below.
    k = min(top_k + 1, n_samples)
    distances, indices = nn_model.kneighbors(tfidf_matrix[index_in_subset], n_neighbors=k)
    distances = distances.flatten()
    indices = indices.flatten()

    # Remove the query article by position rather than assuming it is ranked
    # first: when another article is at the same distance, the query can sit
    # anywhere in the result (or be missing from it), and blindly dropping the
    # first entry would discard a real neighbour and recommend the query itself.
    keep = indices != index_in_subset
    indices = indices[keep][:top_k]
    similarities = (1 - distances[keep])[:top_k]  # convert distances -> similarity

    chosen = []
    for idx, sim in zip(indices, similarities):
        row = df_subset.iloc[idx]
        chosen.append({
            'headline': row['headline'],
            'short_description': row['short_description'],
            'link': row.get('link', ''),
            'similarity': float(sim),
            'global_id': int(row['global_id'])
        })
    return pd.DataFrame(chosen, columns=columns)

def recommend_by_text_in_subset(query_text, df_subset, vectorizer, tfidf_matrix, nn_model, top_k=5):
    """
    Vectorize a free-text query with the subset's vectorizer and return up to top_k similar articles.

    Parameters:
        query_text: the user's query string.
        df_subset: DataFrame whose row positions match the rows of tfidf_matrix;
            must have 'headline', 'short_description' and 'global_id' columns
            ('link' is optional and defaults to '').
        vectorizer: fitted vectorizer exposing transform([...]).
        tfidf_matrix: matrix the nn_model was fitted on (one row per article).
        nn_model: fitted model exposing kneighbors(X, n_neighbors=...).
        top_k: maximum number of matches to return.

    Returns:
        DataFrame with columns headline, short_description, link, similarity
        (1 - distance) and global_id. Articles with no similarity to the query
        are left out, so the frame is empty when nothing matches.
    """
    columns = ['headline', 'short_description', 'link', 'similarity', 'global_id']

    qv = vectorizer.transform([query_text])
    # A query vector with no non-zero entries matches nothing: every article
    # would come back with similarity 0, so skip the search and report no results.
    if qv.nnz == 0:
        return pd.DataFrame(columns=columns)

    n_samples = tfidf_matrix.shape[0]
    k = min(top_k, n_samples)
    distances, indices = nn_model.kneighbors(qv, n_neighbors=k)
    distances = distances.flatten()
    indices = indices.flatten()
    similarities = 1 - distances

    chosen = []
    for idx, sim in zip(indices, similarities):
        # Neighbours are padded up to k even when fewer articles share a term
        # with the query; zero-similarity rows are not real matches.
        if sim <= 0:
            continue
        row = df_subset.iloc[idx]
        chosen.append({
            'headline': row['headline'],
            'short_description': row['short_description'],
            'link': row.get('link', ''),
            'similarity': float(sim),
            'global_id': int(row['global_id'])
        })
    return pd.DataFrame(chosen, columns=columns)


Console interactive flow (pick category → build model → choose article or type a query)

In [8]:
import random
import math

def _read_int(prompt):
    """Show prompt and return the typed integer, or None when the text is not a valid integer."""
    try:
        return int(input(prompt).strip())
    except ValueError:
        # Only a failed int() conversion counts as "invalid input"; other errors
        # raised by input() (e.g. EOFError) are left to the caller.
        return None


def _print_recommendations(recs, heading):
    """Print heading followed by one entry per row of recs, or a notice when recs is empty."""
    if recs.empty:
        print("No recommendations found.")
        return
    print(heading)
    for _, rec in recs.iterrows():
        print(f"- [{rec['similarity']:.3f}] {rec['headline']}")
        print(f"    {rec['short_description']}")
        print(f"    Link: {rec['link']}")
        print()


def _print_sample_headlines(df_sub, limit=20):
    """Print the first `limit` headlines of df_sub with their row index."""
    print("Sample articles (index : headline):")
    for pos in range(min(limit, len(df_sub))):
        headline = df_sub.loc[pos, 'headline']
        print(f"{pos:3d}. {headline[:140]}")


def _category_session(cat):
    """
    Build the model for category `cat` and serve index/text queries against it.

    Returns True when the user wants to pick another category (or the category
    has no articles), False when the user asked to quit.
    """
    df_sub = df[df['category'] == cat].reset_index(drop=True)
    print(f"\nCategory '{cat}' selected — {len(df_sub)} articles found.\n")
    if len(df_sub) == 0:
        return True

    # Build TF-IDF + NN for this subset
    print("Building TF-IDF & NearestNeighbors for this category (this may take a few seconds)...")
    vectorizer, tfidf_matrix, nn_model = build_tfidf_nn(df_sub, max_features=3000)
    print("Done.\n")

    # Show the first (up to) 20 articles so the user has indices to choose from
    _print_sample_headlines(df_sub)

    while True:
        print("\nOptions:")
        print("  i -> pick an article index to get similar articles")
        print("  t -> type a text query to find similar articles")
        print("  c -> choose another category")
        print("  q -> quit")
        cmd = input("Enter option (i/t/c/q): ").strip().lower()
        if cmd == 'q':
            print("Exiting.")
            return False
        if cmd == 'c':
            return True  # go back to category selection
        if cmd == 'i':
            idx = _read_int("Enter article index from the sample above (or any index 0..{}): ".format(len(df_sub)-1))
            if idx is None:
                print("Invalid index. Try again.")
                continue
            if not (0 <= idx < len(df_sub)):
                print("Index out of range.")
                continue
            print("\nSelected article:")
            print(df_sub.loc[idx, 'headline'])
            print(df_sub.loc[idx, 'short_description'])
            print()
            recs = recommend_by_index_in_subset(df_sub, tfidf_matrix, nn_model, idx, top_k=5)
            _print_recommendations(recs, "\nTop similar articles:")
        elif cmd == 't':
            q = input("Type a short query (e.g., 'spaceX rocket launch'): ").strip()
            if not q:
                print("Empty query.")
                continue
            recs = recommend_by_text_in_subset(q, df_sub, vectorizer, tfidf_matrix, nn_model, top_k=5)
            _print_recommendations(recs, "\nTop matches for your query:")
        else:
            print("Unknown command. Choose i/t/c/q.")


def interactive_console():
    """
    Text-mode flow: list the categories, let the user pick one by number, build
    a model for it, then answer index-based or free-text similarity queries.

    Entering 0 at the category prompt or 'q' at the options prompt ends the
    session. The session also ends (instead of re-prompting forever) when
    input() signals that no more input is available.
    """
    print("Choose a category by number (0 to exit):")
    for number, label in enumerate(categories, 1):
        print(f"{number:2d}. {label}")
    try:
        while True:
            choice = _read_int("\nEnter category number (0 to exit): ")
            if choice is None:
                print("Invalid input. Please enter a number.")
                continue
            if choice == 0:
                print("Exiting.")
                return
            if not (1 <= choice <= len(categories)):
                print(f"Please enter a number between 1 and {len(categories)} (or 0 to exit).")
                continue
            if not _category_session(categories[choice - 1]):
                return
    except EOFError:
        # input() raises EOFError once stdin is closed/exhausted; retrying would
        # raise again immediately, so stop instead of looping.
        print("\nNo more input available. Exiting.")

# Run the console interactive flow. The call prompts via input() and returns
# when the user enters 0 at the category prompt or 'q' at the options prompt.
interactive_console()


Choose a category by number (0 to exit):
 1. ARTS
 2. ARTS & CULTURE
 3. BLACK VOICES
 4. BUSINESS
 5. COLLEGE
 6. COMEDY
 7. CRIME
 8. CULTURE & ARTS
 9. DIVORCE
10. EDUCATION
11. ENTERTAINMENT
12. ENVIRONMENT
13. FIFTY
14. FOOD & DRINK
15. GOOD NEWS
16. GREEN
17. HEALTHY LIVING
18. HOME & LIVING
19. IMPACT
20. LATINO VOICES
21. MEDIA
22. MONEY
23. PARENTING
24. PARENTS
25. POLITICS
26. QUEER VOICES
27. RELIGION
28. SCIENCE
29. SPORTS
30. STYLE
31. STYLE & BEAUTY
32. TASTE
33. TECH
34. THE WORLDPOST
35. TRAVEL
36. U.S. NEWS
37. WEDDINGS
38. WEIRD NEWS
39. WELLNESS
40. WOMEN
41. WORLD NEWS
42. WORLDPOST

Enter category number (0 to exit): 40

Category 'WOMEN' selected — 3486 articles found.

Building TF-IDF & NearestNeighbors for this category (this may take a few seconds)...
Done.

Sample articles (index : headline):
  0. The Funniest Tweets From Women This Week (June 25-July 1)
  1. The 20 Funniest Tweets From Women This Week (June 11-17)
  2. The 20 Funniest Tweets From Women This W

Widget-based UI for Colab: dropdowns + button

In [9]:
# Run this cell in Colab to get a simple clickable UI
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets

# Category picker and the button that builds a model for the picked category
cat_dd = widgets.Dropdown(
    options=categories,
    description='Category:',
    layout=widgets.Layout(width='60%'),
)
build_btn = widgets.Button(
    description='Build model for category',
    button_style='primary',
)

# Search inputs: an article picker (filled after a build), a free-text query
# box, and the button that runs the search
article_dd = widgets.Dropdown(options=[], description='Article:')
query_box = widgets.Text(
    placeholder='Type a short query (or leave blank)',
    description='Query:',
)
search_btn = widgets.Button(description='Find similar articles')

# Area where the button callbacks write their messages and results
out = widgets.Output(layout=widgets.Layout(border='1px solid lightgray'))

# Model objects for the most recently built category; every slot starts as None
ui_state = dict.fromkeys(['current_category', 'df_sub', 'vectorizer', 'tfidf', 'nn'])

def on_build_clicked(b):
    """
    Button callback: build the TF-IDF / nearest-neighbour model for the category
    currently selected in cat_dd.

    Stores the category, its subset frame and the fitted objects in ui_state and
    fills article_dd with the first (up to) 200 headlines. Messages are printed
    into the `out` widget. `b` is the clicked button and is not used.
    """
    with out:
        clear_output()
        chosen_category = cat_dd.value
        print("Building model for category:", chosen_category)

        category_rows = df[df['category'] == chosen_category].reset_index(drop=True)
        if category_rows.empty:
            print("No articles in this category.")
            return

        fitted_vectorizer, matrix, neighbors = build_tfidf_nn(category_rows, max_features=2500)
        ui_state.update(
            current_category=chosen_category,
            df_sub=category_rows,
            vectorizer=fitted_vectorizer,
            tfidf=matrix,
            nn=neighbors,
        )

        # Only the first 200 headlines go into the dropdown to keep it manageable;
        # each option's value is the article's row position in the subset.
        listed_headlines = category_rows['headline'].head(200)
        article_dd.options = [
            (f"{position} - {headline[:80]}", position)
            for position, headline in enumerate(listed_headlines)
        ]
        print(f"Model built for category '{chosen_category}' ({len(category_rows)} articles). Select an article or type a query and click 'Find similar articles'.")

def on_search_clicked(b):
    """
    Button callback: show recommendations for the current category model.

    A non-blank query in query_box takes precedence and is matched as free text;
    otherwise the article selected in article_dd is used. Results are rendered
    as HTML inside the `out` widget. `b` is the clicked button and is not used.
    """
    # Function-scope import so this cell does not depend on a new top-level import
    from html import escape

    with out:
        clear_output()
        if ui_state['df_sub'] is None:
            print("Build a model for a category first.")
            return
        df_sub = ui_state['df_sub']
        vec = ui_state['vectorizer']
        tfidf = ui_state['tfidf']
        nn = ui_state['nn']

        query = query_box.value.strip()
        if query:
            recs = recommend_by_text_in_subset(query, df_sub, vec, tfidf, nn, top_k=5)
        else:
            # use selected article index from dropdown
            if article_dd.value is None:
                print("Pick an article (or provide a query).")
                return
            idx = int(article_dd.value)
            recs = recommend_by_index_in_subset(df_sub, tfidf, nn, idx, top_k=5)
        if recs.empty:
            print("No recommendations found.")
            return

        # Render results as HTML. All article text comes from the dataset, so it
        # is escaped before being placed in markup; otherwise characters such as
        # '<', '&' or a quote would corrupt the output or inject markup.
        parts = ["<h3>Recommendations</h3>"]
        for _, r in recs.iterrows():
            headline = escape(str(r['headline']))
            description = escape(str(r['short_description']))
            raw_link = str(r['link'])
            link = escape(raw_link, quote=True)
            if raw_link.lower().startswith(('http://', 'https://')):
                # Only web URLs become clickable; rel= keeps the opened tab from
                # reaching back into this page through window.opener.
                link_markup = f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{link}</a>"
            else:
                link_markup = link
            parts.append(f"<h4>[{r['similarity']:.3f}] {headline}</h4>")
            parts.append(f"<p>{description}<br>{link_markup}</p><hr>")
        display(HTML("".join(parts)))

# Connect each button to its callback
build_btn.on_click(on_build_clicked)
search_btn.on_click(on_search_clicked)

# Layout, top to bottom: category picker, build button, the search controls
# side by side, then the output area
search_row = widgets.HBox([article_dd, query_box, search_btn])
display(widgets.VBox([cat_dd, build_btn, search_row, out]))


VBox(children=(Dropdown(description='Category:', layout=Layout(width='60%'), options=('ARTS', 'ARTS & CULTURE'…