In [37]:
import arxiv

client = arxiv.Client(
  page_size=1,
)

search = arxiv.Search(
    query="",
    max_results=1,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
    id_list=["2301.12345v1"]
)

result = next(client.results(search))
for attr, value in vars(result).items():
  print(attr,":",value)
    
# print("ID:", result.entry_id)
# print("標題:", result.title)
# print("摘要:", result.summary)
# print("作者:", ", ".join(str(a) for a in result.authors))
# print("主分類:", result.primary_category)
# print("其他分類:", ", ".join(result.categories))
# print("發表日期:", result.published)
# print("更新日期:", result.updated)
# print("期刊/會議資訊:", result.journal_ref)
# print("DOI:", result.doi)
# print("PDF_url:", result.pdf_url)


entry_id : http://arxiv.org/abs/2301.12345v1
updated : 2023-01-29 03:59:33+00:00
published : 2023-01-29 03:59:33+00:00
title : Chemotactic motility-induced phase separation
authors : [arxiv.Result.Author('Hongbo Zhao'), arxiv.Result.Author('Andrej Košmrlj'), arxiv.Result.Author('Sujit S. Datta')]
summary : Collectives of actively-moving particles can spontaneously separate into
dilute and dense phases -- a fascinating phenomenon known as motility-induced
phase separation (MIPS). MIPS is well-studied for randomly-moving particles
with no directional bias. However, many forms of active matter exhibit
collective chemotaxis, directed motion along a chemical gradient that the
constituent particles can generate themselves. Here, using theory and
simulations, we demonstrate that collective chemotaxis strongly competes with
MIPS -- in some cases, arresting or completely suppressing phase separation, or
in other cases, generating fundamentally new dynamic instabilities. We
establish quantitativ

In [None]:
import arxiv
import json
from pathlib import Path

client = arxiv.Client(
    page_size=10, # limit 1000
    delay_seconds=3,
    num_retries=3
)

search = arxiv.Search(
    query='cat:cs.LG',
    max_results=50,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending
)


BATCH_SIZE = 10 
output_dir = Path("arxiv_data")
output_dir.mkdir(exist_ok=True)


batch = []
batch_count = 1

for idx, result in enumerate(client.results(search), start=1):
    paper = {
        "entry_id": result.entry_id,
        "title": result.title,
        "authors": [a.name for a in result.authors],
        "summary": result.summary,
        "primary_category": result.primary_category,
        "categories": result.categories,
        "published": result.published.isoformat(),
        "updated": result.updated.isoformat(),
        "journal_ref": result.journal_ref,
        "doi": result.doi
    }
    batch.append(paper)

    if len(batch) >= BATCH_SIZE:
        filename = output_dir / f"arxiv_batch_{batch_count}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(batch, f, ensure_ascii=False, indent=2)
        print(f"Saved ({len(batch)} papers) to {filename}")
        batch_count += 1
        batch = []

if batch:
    filename = output_dir / f"arxiv_batch_{batch_count}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(batch, f, ensure_ascii=False, indent=2)
    print(f"Saved final ({len(batch)} papers) to {filename}")


Saved batch 1 (10 papers) to arxiv_data/arxiv_batch_1.json
Saved batch 2 (10 papers) to arxiv_data/arxiv_batch_2.json
Saved batch 3 (10 papers) to arxiv_data/arxiv_batch_3.json
Saved batch 4 (10 papers) to arxiv_data/arxiv_batch_4.json
Saved batch 5 (10 papers) to arxiv_data/arxiv_batch_5.json


In [None]:
import json
from pathlib import Path
from datetime import datetime

input_file = Path("arxiv_data/arxiv_batch_1.json")
output_file = Path("arxiv_data/arxiv_batch_cleaned.json")

with open(input_file, "r", encoding="utf-8") as f:
    papers = json.load(f)

# 去重
unique_papers = {paper["id"]:paper for paper in papers}
print(unique_papers)

# 補缺值 (id、title、summary、authors、categories、subject、submitted_date、updated_date、affiliations 缺一不可)
required_fields = [
    "id", "title", "summary", "authors", 
    "categories", "published", "updated"
]
cleaned_papers = [
    paper for paper in unique_papers.values()
    if all(paper.get(field) for field in required_fields)
]
k = [paper.get(field) for field in required_fields]
print(k)


{'http://arxiv.org/abs/2510.09599v1': {'id': 'http://arxiv.org/abs/2510.09599v1', 'title': 'Prompting Test-Time Scaling Is A Strong LLM Reasoning Data Augmentation', 'authors': ['Sondos Mahmoud Bsharat', 'Zhiqiang Shen'], 'summary': "Large language models (LLMs) have demonstrated impressive reasoning\ncapabilities when provided with chain-of-thought exemplars, but curating large\nreasoning datasets remains laborious and resource-intensive. In this work, we\nintroduce Prompting Test-Time Scaling (P-TTS), a simple yet effective\ninference-time data augmentation strategy for enhancing LLM reasoning through\nfinetuning. Rather than collecting thousands or even millions of examples,\nP-TTS leverages a small pool of only 90 manually selected reasoning instances\nand systematically varies exemplar augmentation through principled instruction\nprompting intensities at test time to synthesize diverse reasoning trajectory\ncontexts. Then we finetune the various sizes of Qwen-2.5 models on P-TTS d