## Trimming the original All the News 2.1 dataset

In [8]:
import polars as pl

In [9]:
# Read the full, untrimmed dataset from the CSV in the working directory.
df = pl.read_csv(source="all-the-news-2-1-LARGE.csv")

In [12]:
# Publications to keep, mapped to the number of articles wanted from each.
target_counts = {
    "The New York Times": 15000,
    "The Hill": 15000,
    "Reuters": 15000,
    "People": 15000,
    "CNN": 15000,
    "Vice": 15000,
    "Politico": 15000,
    "Buzzfeed News": 15000,
    "Economist": 15000,
    "Fox News": 15000,
}

# Keep only rows that belong to one of the target publications.
df = df.filter(pl.col("publication").is_in(list(target_counts)))

# Measure every article once (in characters) so the cleaning, percentile and
# trimming steps below reuse the same column instead of recomputing it.
df = df.with_columns(pl.col("article").str.len_chars().alias("article_length"))

# Remove articles that are null or empty. This must happen before the
# percentiles are computed; otherwise empty articles would take part in the
# quantile calculation and pull the lower bound down.
df = df.filter(
    pl.col("article").is_not_null() & (pl.col("article_length") > 0)
)

# Compute the 5th and 95th percentiles of article length.
article_lengths = df.select(pl.col("article_length").alias("length"))

lower_bound = article_lengths.select(pl.col("length").quantile(0.05)).item()
upper_bound = article_lengths.select(pl.col("length").quantile(0.95)).item()

# Keep only articles strictly inside the percentile bounds, then drop the
# helper column so the resulting schema matches the input.
df = df.filter(
    (pl.col("article_length") > lower_bound)
    & (pl.col("article_length") < upper_bound)
).drop("article_length")

In [13]:
# Drop every article whose title contains at least one character in the
# U+4E00-U+9FFF range (CJK Unified Ideographs).
# NOTE: for a null title the predicate evaluates to null, and `filter` only
# keeps rows where the predicate is true, so rows with a null title are
# dropped here as well.
title_has_cjk = pl.col("title").str.contains(r"[\u4e00-\u9fff]")
df = df.filter(~title_has_cjk)

In [14]:
# Downsample each publication to its target size by keeping the first
# `target_size` rows in their current order.
# NOTE: `head` does not rank rows, so this is NOT a "longest articles"
# selection; sort by article length first if that is what is wanted.
filtered_parts = [
    df.filter(pl.col("publication") == pub).head(target_size)
    for pub, target_size in target_counts.items()
]

# Combine the per-publication parts into a single frame.
trimmed_df = pl.concat(filtered_parts)

# Count the articles kept per publication. `GroupBy.count()` is deprecated in
# polars, so aggregate with `pl.len()` and keep the column name "count".
final_counts = (
    trimmed_df.group_by("publication")
              .agg(pl.len().alias("count"))
              .sort("count", descending=True)
)

# Print results
print(final_counts)

shape: (10, 2)
┌────────────────────┬───────┐
│ publication        ┆ count │
│ ---                ┆ ---   │
│ str                ┆ u32   │
╞════════════════════╪═══════╡
│ Reuters            ┆ 15000 │
│ People             ┆ 15000 │
│ Economist          ┆ 15000 │
│ Buzzfeed News      ┆ 15000 │
│ Politico           ┆ 15000 │
│ CNN                ┆ 15000 │
│ Fox News           ┆ 15000 │
│ The New York Times ┆ 15000 │
│ Vice               ┆ 15000 │
│ The Hill           ┆ 15000 │
└────────────────────┴───────┘


  .count()


In [None]:
# Save the downsampled dataset next to the original CSV.
trimmed_df.write_csv(file="all-the-news-2-1-SMALL.csv")