<!-- Loaded the Kaggle News Category Dataset from a local path:
C:\Users\bbuser\Desktop\News_Category_Dataset_v3.json

The dataset is in JSON Lines format (a .json file in which each line is one record), so it is read with:

pd.read_json(DATA_PATH, lines=True) -->

##### 1) Loading the dataset

##### I used the Kaggle News Category Dataset from my local machine (News_Category_Dataset_v3.json).

##### Since the file is in JSON Lines format, I loaded it with `lines=True` so that each line is read as a separate record.

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Input/output locations. The defaults are the original machine-specific paths;
# set the NEWS_DATA_PATH / NEWS_OUTPUT_DIR environment variables to run the
# notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "NEWS_DATA_PATH", r"C:\Users\bbuser\Desktop\News_Category_Dataset_v3.json"
)
OUTPUT_DIR = os.environ.get(
    "NEWS_OUTPUT_DIR", r"C:\Users\bbuser\Desktop\news_keyword_baseline"
)
os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok: re-running the cell is harmless

# Experiment settings.
TARGET_CATEGORIES = ["POLITICS", "TRAVEL", "SPORTS", "HOME & LIVING"]  # labels kept from `category`
SAMPLES_PER_CLASS = 1000  # rows drawn per category when balancing
TEST_SIZE = 0.20          # fraction of the balanced data held out for testing
RANDOM_STATE = 42         # seed used for both the per-class sampling and the split

In [2]:
# lines=True parses the file as JSON Lines: one JSON record per line.
df = pd.read_json(DATA_PATH, lines=True)
df.head()  # last expression of the cell, so the first rows are displayed

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


##### The dataset originally contains multiple fields such as headline, authors, and link.

##### For this project, I only need the text description and the label, so I kept only the columns:

##### short_description (the input text)

##### category (the target label)

In [3]:
# Keep only the text and the label, then discard rows whose description is
# missing or empty once surrounding whitespace has been stripped.
df = (
    df.loc[:, ["short_description", "category"]]
      .dropna(subset=["short_description"])
      .assign(
          short_description=lambda frame: frame["short_description"].astype(str).str.strip()
      )
      .loc[lambda frame: frame["short_description"] != ""]
)

df.head()

Unnamed: 0,short_description,category
0,Health experts said it is too early to predict...,U.S. NEWS
1,He was subdued by passengers and crew when he ...,U.S. NEWS
2,"""Until you have a dog you don't understand wha...",COMEDY
3,"""Accidentally put grown-up toothpaste on my to...",PARENTING
4,Amy Cooper accused investment firm Franklin Te...,U.S. NEWS


##### I removed rows with missing or empty short descriptions.

##### I also ensured the descriptions were stored as strings with leading and trailing whitespace removed.

##### Filtering categories

##### The dataset covers more than 40 categories.

##### I restricted it to the four categories required for the task: POLITICS, TRAVEL, SPORTS, and HOME & LIVING.

In [4]:
# Restrict the data to rows labelled with one of the target categories.
is_target = df["category"].isin(TARGET_CATEGORIES)
df = df.loc[is_target].copy()

print("Counts per category BEFORE balancing:")
print(df["category"].value_counts())

Counts per category BEFORE balancing:
category
POLITICS         32441
TRAVEL            9421
SPORTS            4414
HOME & LIVING     4317
Name: count, dtype: int64


##### Balancing the dataset

##### To keep the data balanced, I sampled 1000 articles from each category, resulting in a total of 4000 records.

##### This ensures that each class is equally represented and prevents bias toward the larger categories.

In [5]:
# Fail fast with a readable message if balancing is impossible. Without this
# check, a target category that is absent from `df` (e.g. a typo in
# TARGET_CATEGORIES) is silently left out of the "balanced" data, and a category
# with fewer than SAMPLES_PER_CLASS rows only surfaces as an error raised from
# inside `sample` that does not name the offending category.
class_counts = df["category"].value_counts()
missing_cats = [cat for cat in TARGET_CATEGORIES if cat not in class_counts.index]
undersized = {
    cat: int(count) for cat, count in class_counts.items() if count < SAMPLES_PER_CLASS
}
if missing_cats or undersized:
    raise ValueError(
        f"Cannot draw {SAMPLES_PER_CLASS} rows per class: "
        f"missing categories={missing_cats}, undersized categories={undersized}"
    )

# Draw SAMPLES_PER_CLASS rows from every category. Each group's draw is seeded
# with the same RANDOM_STATE; reset_index(drop=True) then renumbers the rows
# of the combined frame from 0.
df_balanced = (
    df.groupby("category", group_keys=False)[["short_description", "category"]]
      .apply(lambda x: x.sample(n=SAMPLES_PER_CLASS, random_state=RANDOM_STATE))
      .reset_index(drop=True)
)

print("Counts per category AFTER balancing:")
print(df_balanced["category"].value_counts())

Counts per category AFTER balancing:
category
HOME & LIVING    1000
POLITICS         1000
SPORTS           1000
TRAVEL           1000
Name: count, dtype: int64


##### Splitting into train and test sets

##### I performed an 80/20 stratified split:

##### Training set: 3200 records

##### Test set: 800 records

In [6]:
# Stratify on the label so both splits keep the same class proportions;
# RANDOM_STATE makes the split repeatable.
train_df, test_df = train_test_split(
    df_balanced,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df_balanced["category"],
)

print("Shapes:")
print("Train:", train_df.shape)
print("Test :", test_df.shape)

# Report the per-class counts of each split, train first.
for heading, split in (
    ("\nCategory distribution in Train:", train_df),
    ("\nCategory distribution in Test:", test_df),
):
    print(heading)
    print(split["category"].value_counts())

Shapes:
Train: (3200, 2)
Test : (800, 2)

Category distribution in Train:
category
TRAVEL           800
SPORTS           800
HOME & LIVING    800
POLITICS         800
Name: count, dtype: int64

Category distribution in Test:
category
HOME & LIVING    200
POLITICS         200
TRAVEL           200
SPORTS           200
Name: count, dtype: int64


##### I saved the balanced dataset and the train/test splits into CSV files, so I can reuse them in later steps without reprocessing.

In [7]:
# Build the three output file locations inside OUTPUT_DIR.
train_path, test_path, balanced_path = (
    os.path.join(OUTPUT_DIR, file_name)
    for file_name in (
        "train_shortdesc_4cats.csv",
        "test_shortdesc_4cats.csv",
        "balanced_shortdesc_4cats.csv",
    )
)

# Write each frame as UTF-8 CSV without the index column.
exports = [
    (train_df, train_path),
    (test_df, test_path),
    (df_balanced, balanced_path),
]
for frame, destination in exports:
    frame.to_csv(destination, index=False, encoding="utf-8")

print("Files saved:")
for _, destination in exports:
    print(destination)

Files saved:
C:\Users\bbuser\Desktop\news_keyword_baseline\train_shortdesc_4cats.csv
C:\Users\bbuser\Desktop\news_keyword_baseline\test_shortdesc_4cats.csv
C:\Users\bbuser\Desktop\news_keyword_baseline\balanced_shortdesc_4cats.csv


In [9]:
from collections import Counter
import re

# -----------------------------
# 1. Tokenizer
# -----------------------------
# Runs of a-z delimited by word boundaries, applied to lowercased text.
_WORD_PATTERN = re.compile(r'\b[a-z]+\b')


def tokenize(text):
    """Return the lowercase a-z word tokens of ``text`` in order of appearance.

    The text is lowercased first. Punctuation (including apostrophes) splits
    tokens, so "don't" yields "don" and "t". A letter run that directly
    touches a digit or underscore is not at a word boundary and is skipped.
    """
    lowered = text.lower()
    return _WORD_PATTERN.findall(lowered)

# -----------------------------
# 2. Word frequency per category
# -----------------------------
# Token counts per category over the training descriptions. Categories are
# inserted in the order returned by unique(), and tokens are counted in
# reading order, so the Counter's insertion order matches the text order.
category_word_freq = {}
for cat in train_df["category"].unique():
    in_category = train_df["category"] == cat
    counts = Counter()
    for description in train_df.loc[in_category, "short_description"]:
        counts.update(tokenize(description))
    category_word_freq[cat] = counts

# -----------------------------
# 3. Classification function
# -----------------------------
def classify(text, word_sets):
    """Assign ``text`` to the category whose keywords it shares the most of.

    Parameters
    ----------
    text : str
        Text to classify; it is split into tokens with ``tokenize``.
    word_sets : mapping of category -> iterable of keywords
        Keywords per category. Any iterable is accepted (sets, lists, ...).

    Returns
    -------
    The category with the largest number of distinct tokens in common with
    ``text``. Ties go to the category that comes first in ``word_sets``.
    Returns "UNKNOWN" when no category shares any token with ``text``,
    including when ``word_sets`` is empty.
    """
    tokens = set(tokenize(text))

    best_cat, best_overlap = "UNKNOWN", 0
    for cat, keywords in word_sets.items():
        overlap = len(tokens.intersection(keywords))
        # Strict '>' keeps the earliest category on ties and leaves the
        # "UNKNOWN" default in place while every overlap is zero.
        if overlap > best_overlap:
            best_cat, best_overlap = cat, overlap
    return best_cat

# -----------------------------
# 4. Evaluation
# -----------------------------
def evaluate(word_sets, data=None):
    """Return the accuracy of ``classify`` with ``word_sets`` on labelled data.

    Parameters
    ----------
    word_sets : mapping of category -> keywords
        Passed unchanged to ``classify`` for every row.
    data : DataFrame, optional
        Frame with ``short_description`` and ``category`` columns. Defaults
        to the notebook's ``test_df`` so existing calls behave as before.

    Returns
    -------
    float
        Fraction of rows whose predicted category equals ``category``.

    Raises
    ------
    ValueError
        If ``data`` has no rows, since accuracy is undefined in that case.
    """
    if data is None:
        data = test_df

    total = len(data)
    if total == 0:
        raise ValueError("Cannot compute accuracy on an empty evaluation set.")

    # Iterate the two columns directly instead of building a Series per row
    # with iterrows(); the comparisons are the same.
    correct = sum(
        classify(text, word_sets) == label
        for text, label in zip(data["short_description"], data["category"])
    )
    return correct / total

# -----------------------------
# 5. Run experiments (with frequency printing)
# -----------------------------
# Accuracy of the keyword classifier for several keyword-list sizes,
# stored as {n: accuracy}.
results = {}
for n in [10, 20, 30, 50]:
    # Rank each category's words once; the ranked lists feed both the
    # keyword sets used for classification and the printout below.
    top_by_category = {
        cat: [word for word, _ in counter.most_common(n)]
        for cat, counter in category_word_freq.items()
    }
    word_sets = {cat: set(ranked) for cat, ranked in top_by_category.items()}

    acc = evaluate(word_sets)
    results[n] = acc

    print("="*50)
    print(f"Top {n} words → Accuracy = {acc:.3f}")

    # Show the n most frequent words of every category.
    for cat, top_words in top_by_category.items():
        print(f"{cat}: {top_words}")


Top 10 words → Accuracy = 0.275
TRAVEL: ['the', 'a', 'of', 'to', 'and', 'in', 's', 'you', 'is', 'for']
SPORTS: ['the', 'a', 'to', 'of', 'and', 's', 'in', 'it', 'for', 'is']
HOME & LIVING: ['the', 'to', 'a', 'and', 'of', 'you', 's', 'in', 'your', 'for']
POLITICS: ['the', 'to', 'a', 'of', 'in', 'and', 's', 'is', 'for', 'it']
Top 20 words → Accuracy = 0.333
TRAVEL: ['the', 'a', 'of', 'to', 'and', 'in', 's', 'you', 'is', 'for', 'that', 'i', 'it', 'on', 'with', 'are', 'as', 'we', 'but', 't']
SPORTS: ['the', 'a', 'to', 'of', 'and', 's', 'in', 'it', 'for', 'is', 'was', 'he', 'on', 'that', 'with', 'at', 'be', 'his', 't', 'i']
HOME & LIVING: ['the', 'to', 'a', 'and', 'of', 'you', 's', 'in', 'your', 'for', 'it', 'is', 'we', 'home', 'that', 'this', 'on', 'have', 'with', 'be']
POLITICS: ['the', 'to', 'a', 'of', 'in', 'and', 's', 'is', 'for', 'it', 'that', 'on', 'trump', 'he', 'are', 'with', 'his', 'we', 'as', 'this']
Top 30 words → Accuracy = 0.388
TRAVEL: ['the', 'a', 'of', 'to', 'and', 'in', 's'