In [2]:
import pandas as pd
from datasets import load_dataset

# --- 1. Configuration ---

# The 5 labels we chose for our thesis
LABELS_TO_KEEP = ['POLITICS', 'ENTERTAINMENT', 'TRAVEL', 'BUSINESS', 'SPORTS']
DATASET_NAME = 'heegyu/news-category-dataset'

# --- 2. Load the Data ---

print(f"Loading dataset: '{DATASET_NAME}'...")
# Load the full training split
full_dataset = load_dataset(DATASET_NAME, split='train')

# Convert to a pandas DataFrame for easy filtering and inspection
df_full = full_dataset.to_pandas()

print(f"Original dataset size: {len(df_full)} rows")
print("Original category counts (Top 10):")
print(df_full['category'].value_counts().head(10))
print("\n" + "="*40 + "\n")

# --- 3. Filter the Data ---

print(f"Filtering for our {len(LABELS_TO_KEEP)} target labels...")

# Use the pandas .isin() method to keep only rows where the category is in our list
df_filtered = df_full[df_full['category'].isin(LABELS_TO_KEEP)].copy()

# Reset the index for a clean DataFrame
df_filtered.reset_index(drop=True, inplace=True)

# --- 4. Show Results ---

print(f"Successfully filtered!")
print(f"New dataset size: {len(df_filtered)} rows")

print("\nFinal category counts in our new dataset:")
print(df_filtered['category'].value_counts())

print("\n--- Sample of Filtered Data ---")
print(df_filtered.head())

  from .autonotebook import tqdm as notebook_tqdm


Loading dataset: 'heegyu/news-category-dataset'...
Original dataset size: 209527 rows
Original category counts (Top 10):
category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
Name: count, dtype: int64


Filtering for our 5 target labels...
Successfully filtered!
New dataset size: 73933 rows

Final category counts in our new dataset:
category
POLITICS         35602
ENTERTAINMENT    17362
TRAVEL            9900
BUSINESS          5992
SPORTS            5077
Name: count, dtype: int64

--- Sample of Filtered Data ---
                                                link  \
0  https://www.huffpost.com/entry/dodgers-basebal...   
1  https://www.huffpost.com/entry/golden-globes-r...   
2  https://www.huffpost.com/entry/biden-us-forces...   
3  https://www.huffpost.com/entry/ukraine-festiva...   
4  http

In [3]:
# Convert date column to datetime if it's not already
df_filtered['date'] = pd.to_datetime(df_filtered['date'])

# Extract year from the date column
df_filtered['year'] = df_filtered['date'].dt.year

print("Year column added successfully!")
print(f"\nYear distribution:")
print(df_filtered['year'].value_counts().sort_index())

print("\n--- Sample with Year Column ---")
print(df_filtered[['headline', 'category', 'date', 'year']].head(10))

Year column added successfully!

Year distribution:
year
2012     4741
2013     5514
2014     9522
2015    13345
2016    16209
2017    14865
2018     5809
2019      733
2020     1235
2021     1236
2022      724
Name: count, dtype: int64

--- Sample with Year Column ---
                                            headline       category  \
0  Maury Wills, Base-Stealing Shortstop For Dodge...         SPORTS   
1  Golden Globes Returning To NBC In January Afte...  ENTERTAINMENT   
2  Biden Says U.S. Forces Would Defend Taiwan If ...       POLITICS   
3  ‘Beautiful And Sad At The Same Time’: Ukrainia...       POLITICS   
4  Las Vegas Aces Win First WNBA Title, Chelsea G...         SPORTS   
5  James Cameron Says He 'Clashed' With Studio Be...  ENTERTAINMENT   
6  Biden Says Queen's Death Left 'Giant Hole' For...       POLITICS   
7  Amazon Greenlights 'Blade Runner 2099' Limited...  ENTERTAINMENT   
8  Bill To Help Afghans Who Escaped Taliban Faces...       POLITICS   
9  'The Phantom Of T

In [11]:
# df_filtered.columns


df_2021_data = df_filtered[df_filtered['year'] == 2016]
df_2018_data = df_filtered[df_filtered['year'] == 2018]

# get the distribution of categories in 2022
print("\nCategory distribution for 2022:")
print(df_2021_data['category'].value_counts(normalize=True))
# get the distribution of categories in 2012
print("\nCategory distribution for 2018:")
print(df_2018_data['category'].value_counts(normalize=True))


Category distribution for 2022:
category
POLITICS         0.616324
ENTERTAINMENT    0.217533
SPORTS           0.081683
BUSINESS         0.057746
TRAVEL           0.026714
Name: proportion, dtype: float64

Category distribution for 2018:
category
POLITICS         0.585127
ENTERTAINMENT    0.320537
SPORTS           0.066104
BUSINESS         0.015837
TRAVEL           0.012395
Name: proportion, dtype: float64


In [12]:
# train BERT model on 2022 data, split 2022 data, but validate on 2018 data

print(df_2021_data['category'].value_counts())
# get the distribution of categories in 2012
print("\nCategory distribution for 2018:")
print(df_2018_data['category'].value_counts())

# 

category
POLITICS         9990
ENTERTAINMENT    3526
SPORTS           1324
BUSINESS          936
TRAVEL            433
Name: count, dtype: int64

Category distribution for 2018:
category
POLITICS         3399
ENTERTAINMENT    1862
SPORTS            384
BUSINESS           92
TRAVEL             72
Name: count, dtype: int64


In [6]:
df_filtered["year"].value_counts()

year
2016    16209
2017    14865
2015    13345
2014     9522
2018     5809
2013     5514
2012     4741
2021     1236
2020     1235
2019      733
2022      724
Name: count, dtype: int64

In [7]:
df_filtered[df_filtered['year'] == 2016]["category"].value_counts()

category
POLITICS         9990
ENTERTAINMENT    3526
SPORTS           1324
BUSINESS          936
TRAVEL            433
Name: count, dtype: int64

In [9]:
df_filtered.columns

Index(['link', 'headline', 'category', 'short_description', 'authors', 'date',
       'year'],
      dtype='object')