In [1]:
%pip -q install arxiv pandas tqdm python-dateutil

[33m  DEPRECATION: Building 'sgmllib3k' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'sgmllib3k'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [10]:
import random
import json
from pathlib import Path
from datetime import datetime, timezone
from dateutil import parser
import pandas as pd
from tqdm import tqdm
import arxiv

CATEGORY = "cs.CL"
YEAR_FROM = 2020
YEAR_TO = 2025
MAX_PER_YEAR = 500
SEED = 42

OUT_DIR = Path("data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

RAW_JSONL = OUT_DIR / f"arxiv_{CATEGORY.replace('.', '')}_{YEAR_FROM}_{YEAR_TO}_monthly_sampled_seed{SEED}.jsonl"
SAMPLED_CSV = OUT_DIR / f"arxiv_{CATEGORY.replace('.', '')}_{YEAR_FROM}_{YEAR_TO}_sampled_{MAX_PER_YEAR}peryear_seed{SEED}.csv"

In [11]:
def append_jsonl(path: Path, records):
    with path.open("a", encoding="utf-8") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

def read_jsonl_ids(path: Path, key: str):
    if not path.exists():
        return set()
    seen = set()
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            v = obj.get(key)
            if v:
                seen.add(v)
    return seen

def to_record(result: arxiv.Result):
    published = result.published
    if published.tzinfo is None:
        published = published.replace(tzinfo=timezone.utc)
    year = published.year
    month = published.month
    return {
        "arxiv_id": result.get_short_id(),
        "title": (result.title or "").strip(),
        "abstract": (result.summary or "").strip(),
        "published": published.isoformat(),
        "year": int(year),
        "month": int(month),
        "authors": [a.name for a in (result.authors or [])],
        "categories": list(result.categories or []),
        "primary_category": result.primary_category,
        "pdf_url": result.pdf_url,
        "entry_id": result.entry_id,
    }

In [None]:
client = arxiv.Client(page_size=200, delay_seconds=3.0, num_retries=5)

In [12]:
import random
import calendar
import arxiv
from tqdm import tqdm

CATEGORY = "cs.CL"

random.seed(SEED)

seen = read_jsonl_ids(RAW_JSONL, "arxiv_id")
total_added = 0

def month_query(category: str, year: int, month: int) -> str:
    last_day = calendar.monthrange(year, month)[1]
    start = f"{year}{month:02d}010000"
    end = f"{year}{month:02d}{last_day:02d}2359"
    return f"cat:{category} AND submittedDate:[{start} TO {end}]"

for y in range(YEAR_FROM, YEAR_TO + 1):
    year_pool = []
    scanned = 0
    kept_primary = 0
    kept_unique = 0

    for m in range(1, 13):
        q = month_query(CATEGORY, y, m)
        search = arxiv.Search(
            query=q,
            max_results=10000,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Ascending,
        )

        for r in tqdm(client.results(search), desc=f"Fetching {y}-{m:02d}", unit="paper"):
            scanned += 1
            rec = to_record(r)
            if rec["primary_category"] != CATEGORY:
                continue
            kept_primary += 1
            if rec["arxiv_id"] in seen:
                continue
            year_pool.append(rec)
            seen.add(rec["arxiv_id"])
            kept_unique += 1

    if len(year_pool) > MAX_PER_YEAR:
        year_pool = random.sample(year_pool, MAX_PER_YEAR)

    append_jsonl(RAW_JSONL, year_pool)
    total_added += len(year_pool)

    print(f"{y}: scanned={scanned}, primary_ok={kept_primary}, unique_pool={kept_unique}, saved_after_sampling={len(year_pool)}")

print(f"Total added: {total_added}")
print(f"Raw saved to: {RAW_JSONL}")

Fetching 2020-01: 293paper [00:03, 93.49paper/s]
Fetching 2020-02: 362paper [00:06, 59.27paper/s]
Fetching 2020-03: 358paper [00:06, 58.89paper/s]
Fetching 2020-04: 889paper [00:15, 58.71paper/s]
Fetching 2020-05: 853paper [00:15, 56.37paper/s]
Fetching 2020-06: 491paper [00:09, 53.73paper/s]
Fetching 2020-07: 405paper [00:09, 44.88paper/s]
Fetching 2020-08: 384paper [00:06, 62.05paper/s]
Fetching 2020-09: 575paper [00:09, 63.18paper/s]
Fetching 2020-10: 1309paper [00:21, 61.51paper/s]
Fetching 2020-11: 596paper [00:09, 64.89paper/s]
Fetching 2020-12: 610paper [00:11, 50.87paper/s]


2020: scanned=7125, primary_ok=5582, unique_pool=5582, saved_after_sampling=500


Fetching 2021-01: 483paper [00:09, 52.85paper/s]
Fetching 2021-02: 435paper [00:09, 47.81paper/s]
Fetching 2021-03: 539paper [00:09, 58.76paper/s]
Fetching 2021-04: 945paper [00:15, 62.15paper/s]
Fetching 2021-05: 683paper [00:12, 56.40paper/s]
Fetching 2021-06: 903paper [00:15, 59.54paper/s]
Fetching 2021-07: 462paper [00:09, 51.04paper/s]
Fetching 2021-08: 519paper [00:09, 56.79paper/s]
Fetching 2021-09: 1130paper [00:18, 61.93paper/s]
Fetching 2021-10: 872paper [00:15, 57.73paper/s]
Fetching 2021-11: 489paper [00:09, 53.52paper/s]
Fetching 2021-12: 623paper [00:12, 51.69paper/s]


2021: scanned=8083, primary_ok=6578, unique_pool=6578, saved_after_sampling=500


Fetching 2022-01: 471paper [00:09, 51.83paper/s]
Fetching 2022-02: 472paper [00:09, 51.61paper/s]
Fetching 2022-03: 882paper [00:15, 58.23paper/s]
Fetching 2022-04: 887paper [00:15, 58.47paper/s]
Fetching 2022-05: 1009paper [00:19, 52.80paper/s]
Fetching 2022-06: 568paper [00:12, 46.72paper/s]
Fetching 2022-07: 442paper [00:11, 39.65paper/s]
Fetching 2022-08: 445paper [00:14, 30.17paper/s]
Fetching 2022-09: 626paper [00:16, 38.71paper/s]
Fetching 2022-10: 1427paper [00:49, 28.88paper/s]
Fetching 2022-11: 896paper [00:33, 26.64paper/s]
Fetching 2022-12: 846paper [00:28, 29.42paper/s]


2022: scanned=8971, primary_ok=7133, unique_pool=7133, saved_after_sampling=500


Fetching 2023-01: 520paper [00:19, 27.08paper/s]
Fetching 2023-02: 715paper [00:14, 47.82paper/s]
Fetching 2023-03: 747paper [00:19, 37.97paper/s]
Fetching 2023-04: 729paper [00:17, 42.49paper/s]
Fetching 2023-05: 2371paper [00:48, 48.58paper/s]
Fetching 2023-06: 1231paper [00:27, 45.00paper/s]
Fetching 2023-07: 895paper [00:28, 31.06paper/s]
Fetching 2023-08: 883paper [00:19, 44.43paper/s]
Fetching 2023-09: 1096paper [00:23, 46.98paper/s]
Fetching 2023-10: 1966paper [00:42, 46.79paper/s]
Fetching 2023-11: 1445paper [00:31, 46.20paper/s]
Fetching 2023-12: 1002paper [00:23, 42.10paper/s]


2023: scanned=13600, primary_ok=10482, unique_pool=10482, saved_after_sampling=500


Fetching 2024-01: 1204paper [00:34, 34.80paper/s]
Fetching 2024-02: 2112paper [00:44, 47.96paper/s]
Fetching 2024-03: 1698paper [00:40, 41.92paper/s]
Fetching 2024-04: 1583paper [00:35, 44.55paper/s]
Fetching 2024-05: 1618paper [00:35, 45.59paper/s]
Fetching 2024-06: 2454paper [00:52, 46.61paper/s]
Fetching 2024-07: 1620paper [00:38, 42.23paper/s]
Fetching 2024-08: 1295paper [00:35, 36.26paper/s]
Fetching 2024-09: 1502paper [00:35, 42.02paper/s]
Fetching 2024-10: 2631paper [01:10, 37.42paper/s]
Fetching 2024-11: 1307paper [00:35, 37.07paper/s]
Fetching 2024-12: 1665paper [00:37, 44.34paper/s]


2024: scanned=20689, primary_ok=14999, unique_pool=14999, saved_after_sampling=500


Fetching 2025-01: 1313paper [00:46, 28.18paper/s]
Fetching 2025-02: 2476paper [01:24, 29.32paper/s]
Fetching 2025-03: 1878paper [01:39, 18.97paper/s]
Fetching 2025-04: 1606paper [00:39, 40.18paper/s]
Fetching 2025-05: 2953paper [00:58, 50.05paper/s]
Fetching 2025-06: 2324paper [00:48, 47.51paper/s]
Fetching 2025-07: 1639paper [00:34, 47.28paper/s]
Fetching 2025-08: 1830paper [00:44, 40.91paper/s]
Fetching 2025-09: 2233paper [00:49, 45.24paper/s]
Fetching 2025-10: 2591paper [00:49, 52.68paper/s]
Fetching 2025-11: 1580paper [00:30, 51.50paper/s]
Fetching 2025-12: 1316paper [00:28, 45.44paper/s]

2025: scanned=23739, primary_ok=17085, unique_pool=17085, saved_after_sampling=500
Total added: 3000
Raw saved to: data/arxiv_csCL_2020_2025_monthly_sampled_seed42.jsonl





In [14]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

CSV_PATH = "data/arxiv_csCL_2020_2025_sampled_500peryear_seed42.csv"

df_initial = pd.read_csv(CSV_PATH)
df =df_initial.copy()
df["title"] = df["title"].fillna("").astype(str)
df["abstract"] = df["abstract"].fillna("").astype(str)

df["text"] = (df["title"].str.strip() + " " + df["abstract"].str.strip()).str.strip()

df["text"] = (
    df["text"]
    .str.replace(r"\\[a-zA-Z]+\{.*?\}", " ", regex=True)  # LaTeX commands
    .str.replace(r"http\S+", " ", regex=True)             # URLs
    .str.replace(r"\\[a-zA-Z]+", " ", regex=True)         # other LaTeX leftovers
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

df = df[df["text"].str.len() >= 50].reset_index(drop=True)

vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=5,
    ngram_range=(1, 2),
)

X = vectorizer.fit_transform(df["text"])

df.shape, X.shape, len(vectorizer.get_feature_names_out())

len(df_initial), len(df)


(3000, 3000)

In [17]:
print("df shape:", df.shape)
print("X shape:", X.shape)
print("feature count:", len(vectorizer.get_feature_names_out()))

df shape: (3000, 10)
X shape: (3000, 9854)
feature count: 9854


In [18]:
import numpy as np
from sklearn.preprocessing import normalize

Xn = normalize(X, norm="l2", axis=1)

centroid = np.asarray(Xn.mean(axis=0)).ravel()
centroid_norm = np.linalg.norm(centroid)
centroid_unit = centroid / centroid_norm if centroid_norm > 0 else centroid

sims = Xn @ centroid_unit
sims = np.asarray(sims).ravel()

coherence_mean = float(sims.mean())
coherence_median = float(np.median(sims))
coherence_std = float(sims.std())

coherence_mean, coherence_median, coherence_std

(0.1697407222924281, 0.16982073057257852, 0.037722868972918655)