## Trimming the original "All the News" dataset

In [1]:
import polars as pl

In [3]:
# Per-publication quota: the number of articles to keep from each source.
target_counts = {
    "The New York Times": 75000,
    "The Hill": 50000,
    "People": 35000,
    "CNN": 30000,
    "Vice": 25000,
    "Politico": 20000,
    "Washington Post": 20000,
    "Buzzfeed News": 20000,
    "Economist": 20000,
    "Fox News": 20000,
}

# Read the raw CSV, keep only rows whose publication has a quota above,
# and attach each article's character count as "article_length".
df = (
    pl.read_csv("all-the-news-2-1.csv")
    .filter(pl.col("publication").is_in(list(target_counts)))
    .with_columns(pl.col("article").str.len_chars().alias("article_length"))
)


In [4]:
# Downsample each publication to its quota, keeping its shortest articles.
# NOTE(review): polars sorts nulls first by default, so rows whose "article"
# is missing (null article_length) would be selected before any real article
# -- confirm whether such rows should be dropped beforehand.
filtered_parts = []

for pub, target_size in target_counts.items():
    pub_df = (
        df.filter(pl.col("publication") == pub)
          .sort("article_length")  # ascending: shortest articles first
          .head(target_size)       # returns fewer rows if the source has fewer
    )
    filtered_parts.append(pub_df)

# Stack the per-publication slices, in the order given by target_counts.
trimmed_df = pl.concat(filtered_parts)

# Sanity check: number of rows kept per publication, largest first.
# GroupBy.count() is deprecated in polars; pl.len() is its replacement and
# is aliased to "count" so the resulting column name stays the same.
final_counts = (
    trimmed_df.group_by("publication")
              .agg(pl.len().alias("count"))
              .sort("count", descending=True)
)

# Print results
print(final_counts)

shape: (10, 2)
┌────────────────────┬───────┐
│ publication        ┆ count │
│ ---                ┆ ---   │
│ str                ┆ u32   │
╞════════════════════╪═══════╡
│ The New York Times ┆ 75000 │
│ The Hill           ┆ 50000 │
│ People             ┆ 35000 │
│ CNN                ┆ 30000 │
│ Vice               ┆ 25000 │
│ Fox News           ┆ 20000 │
│ Buzzfeed News      ┆ 20000 │
│ Politico           ┆ 20000 │
│ Economist          ┆ 20000 │
│ Washington Post    ┆ 20000 │
└────────────────────┴───────┘


  .count()


In [5]:
# Persist the trimmed dataset to disk. The helper column "article_length"
# is still part of trimmed_df at this point, so it is written out as well.
trimmed_df.write_csv(file="trimmed_articles.csv")