In [37]:
import arxiv

# Client configured to request one result per API page.
client = arxiv.Client(
    page_size=1,
)

# Look up one specific paper by its versioned arXiv ID.
paper_ids = ["2301.12345v1"]
search = arxiv.Search(
    query="",
    max_results=1,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
    id_list=paper_ids,
)

# next() with a default avoids a bare, message-less StopIteration when the
# search yields nothing; fail with an explicit error instead.
result = next(client.results(search), None)
if result is None:
    raise LookupError(f"arXiv returned no result for id_list={paper_ids}")

# Dump every attribute of the result object to inspect which fields are available.
for attr, value in vars(result).items():
    print(attr, ":", value)
    
# print("ID:", result.entry_id)
# print("標題:", result.title)
# print("摘要:", result.summary)
# print("作者:", ", ".join(str(a) for a in result.authors))
# print("主分類:", result.primary_category)
# print("其他分類:", ", ".join(result.categories))
# print("發表日期:", result.published)
# print("更新日期:", result.updated)
# print("期刊/會議資訊:", result.journal_ref)
# print("DOI:", result.doi)
# print("PDF_url:", result.pdf_url)


entry_id : http://arxiv.org/abs/2301.12345v1
updated : 2023-01-29 03:59:33+00:00
published : 2023-01-29 03:59:33+00:00
title : Chemotactic motility-induced phase separation
authors : [arxiv.Result.Author('Hongbo Zhao'), arxiv.Result.Author('Andrej Košmrlj'), arxiv.Result.Author('Sujit S. Datta')]
summary : Collectives of actively-moving particles can spontaneously separate into
dilute and dense phases -- a fascinating phenomenon known as motility-induced
phase separation (MIPS). MIPS is well-studied for randomly-moving particles
with no directional bias. However, many forms of active matter exhibit
collective chemotaxis, directed motion along a chemical gradient that the
constituent particles can generate themselves. Here, using theory and
simulations, we demonstrate that collective chemotaxis strongly competes with
MIPS -- in some cases, arresting or completely suppressing phase separation, or
in other cases, generating fundamentally new dynamic instabilities. We
establish quantitativ

In [38]:
import arxiv
import json
from pathlib import Path

client = arxiv.Client(
    page_size=10,  # results requested per API page (original note: "limit 1000" - TODO confirm)
    delay_seconds=3,
    num_retries=3,
)

# Newest submissions first, restricted to the cs.LG category.
search = arxiv.Search(
    query='cat:cs.LG',
    max_results=50,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)


BATCH_SIZE = 10
output_dir = Path("arxiv_data")
output_dir.mkdir(exist_ok=True)


def save_batch(papers, batch_number, label="Saved"):
    """Write one batch of paper dicts to ``arxiv_batch_<batch_number>.json``.

    Args:
        papers: List of JSON-serialisable paper dicts.
        batch_number: Number used in the output file name.
        label: Prefix of the progress message printed after writing.

    Returns:
        The path of the file that was written.
    """
    filename = output_dir / f"arxiv_batch_{batch_number}.json"
    with open(filename, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f"{label} ({len(papers)} papers) to {filename}")
    return filename


batch = []
batch_count = 1

for result in client.results(search):
    # Keep only JSON-serialisable fields; datetimes are stored as ISO-8601 strings.
    batch.append({
        "entry_id": result.entry_id,
        "title": result.title,
        "authors": [a.name for a in result.authors],
        "summary": result.summary,
        "primary_category": result.primary_category,
        "categories": result.categories,
        "published": result.published.isoformat(),
        "updated": result.updated.isoformat(),
        "journal_ref": result.journal_ref,
        "doi": result.doi,
    })

    # Flush a full batch to disk and start collecting the next one.
    if len(batch) >= BATCH_SIZE:
        save_batch(batch, batch_count)
        batch_count += 1
        batch = []

# Write whatever is left when the result count is not a multiple of BATCH_SIZE.
if batch:
    save_batch(batch, batch_count, label="Saved final")


Saved (10 papers) to arxiv_data/arxiv_batch_1.json
Saved (10 papers) to arxiv_data/arxiv_batch_2.json
Saved (10 papers) to arxiv_data/arxiv_batch_3.json
Saved (10 papers) to arxiv_data/arxiv_batch_4.json
Saved (10 papers) to arxiv_data/arxiv_batch_5.json


In [None]:
import json
from pathlib import Path
from datetime import datetime, timezone

# Raw batch to read, and the cleaned file that sits next to it in the same directory.
input_file = Path("arxiv_data") / "arxiv_batch_1.json"
output_file = input_file.with_name("arxiv_batch_cleaned.json")

def transform_datetime2date(dt_str):
    """Convert an ISO-8601 datetime string to a ``YYYY-MM-DD`` date string.

    A ``Z`` suffix is rewritten to ``+00:00`` before parsing so the value is
    accepted by ``datetime.fromisoformat``. The date is taken in the
    timestamp's own UTC offset; no timezone conversion is applied.

    Args:
        dt_str: ISO-8601 datetime string, e.g. ``"2025-10-13T17:59:59+00:00"``.

    Returns:
        The date part as ``"YYYY-MM-DD"``, or ``None`` when ``dt_str`` is not
        a string or cannot be parsed.
    """
    try:
        dt = datetime.fromisoformat(dt_str.replace("Z", "+00:00"))
        return dt.strftime("%Y-%m-%d")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: input is not a str (e.g. None);
        # ValueError: the string is not a valid ISO-8601 datetime.
        return None

with open(input_file, "r", encoding="utf-8") as f:
    papers = json.load(f)

# De-duplicate by entry_id (the last record seen for an ID wins).
# .get() keeps a record that lacks "entry_id" from raising KeyError here;
# such a record is rejected by the required-field check below instead.
unique_papers = {paper.get("entry_id"): paper for paper in papers}


required_fields = [
    "entry_id", "title", "summary", "authors",
    "primary_category", "published", "updated"
]

# Stamp the whole run once so every record of this pass carries the same
# etl_datetime (UTC) instead of a slightly different value per record.
etl_datetime = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

cleaned_papers = []
for paper in unique_papers.values():
    # Drop records with a missing/empty required field or a blank author name.
    if all(paper.get(field) for field in required_fields) and all(a.strip() for a in paper["authors"]):
        paper["published_date"] = transform_datetime2date(paper["published"])
        paper["updated_date"] = transform_datetime2date(paper["updated"])
        paper["etl_datetime"] = etl_datetime
        cleaned_papers.append(paper)

for paper in cleaned_papers:
    print(paper)

# with open(output_file, "w", encoding="utf-8") as f:
#     json.dump(cleaned_papers, f, ensure_ascii=False, indent=2)

# print(f"清理完成，共 {len(cleaned_papers)} 筆，已儲存到 {output_file}")


{'entry_id': 'http://arxiv.org/abs/2510.11713v1', 'title': 'Are Large Reasoning Models Interruptible?', 'authors': ['Tsung-Han Wu', 'Mihran Miroyan', 'David M. Chan', 'Trevor Darrell', 'Narges Norouzi', 'Joseph E. Gonzalez'], 'summary': 'Large Reasoning Models (LRMs) excel at complex reasoning but are\ntraditionally evaluated in static, "frozen world" settings: model responses are\nassumed to be instantaneous, and the context of a request is presumed to be\nimmutable over the duration of the response. While generally true for\nshort-term tasks, the "frozen world" assumption breaks down in modern reasoning\ntasks such as assistive programming, where models may take hours to think\nthrough problems and code may change dramatically from the time the model\nstarts thinking to the model\'s final output. In this work, we challenge the\nfrozen world assumption and evaluate LRM robustness under two realistic dynamic\nscenarios: interruptions, which test the quality of the model\'s partial outp

In [None]:
import boto3
from botocore.exceptions import ClientError

# Create the DynamoDB connection.
# No access keys are passed on purpose: boto3 then resolves credentials from
# its default chain (environment variables, shared credentials/config files,
# ...), which keeps secrets out of the notebook. The previous empty-string
# placeholders were not None, so they would have been used as the credentials.
dynamodb = boto3.resource(
    'dynamodb',
    region_name='ap-southeast-2'
)

# Select the table by name
table = dynamodb.Table('download_paper_entry_id')


In [70]:
from datetime import datetime, timezone

entry_id = "http://arxiv.org/abs/2510.11683v1"

# UTC time of this attempt, formatted as "YYYY-MM-DD HH:MM:SS".
attempted_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
item = dict(
    entry_id=entry_id,
    status="uploaded",  # alternative value: "failed"
    last_attempt=attempted_at,
    error_msg="",
)

# Conditional write: only succeeds when no item with this entry_id exists yet.
try:
    table.put_item(Item=item, ConditionExpression='attribute_not_exists(entry_id)')
except ClientError as err:
    # Any error other than the failed condition is unexpected, so re-raise it.
    if err.response['Error']['Code'] != 'ConditionalCheckFailedException':
        raise
    print("這篇 paper 已存在")
else:
    print("已新增")


已新增


In [71]:
# Look the paper up by its key; the response has no 'Item' entry when nothing matches.
# NOTE(review): this read uses default consistency - confirm whether a
# strongly consistent read is needed right after a write.
response = table.get_item(Key={'entry_id': entry_id})
item = response.get('Item')

if not item:
    print("不存在")
else:
    print("已存在", item)


已存在 {'entry_id': 'http://arxiv.org/abs/2510.11683v1', 'last_attempt': '2025-10-14 15:15:24', 'error_msg': '', 'status': 'uploaded'}


In [66]:
# Show the entry_id value currently bound in the kernel.
print(entry_id)

http://arxiv.org/abs/2510.11683v1


In [None]:
from moto import mock_aws
import boto3
import json

# -----------------------------
# Mock S3 with moto (usable as a decorator or a context manager)
# -----------------------------
@mock_aws  # generic moto mock covering all AWS services
def test_s3_operations():
    """Exercise basic S3 operations against moto's in-memory mock.

    Creates a bucket, adds a placeholder object under a partition-style
    prefix, uploads a JSON document, lists the prefix, and asserts that both
    objects are listed and that the JSON round-trips unchanged.

    Raises:
        AssertionError: If the listing or the stored JSON differs from what
            was uploaded.
    """
    # Create the S3 client
    s3 = boto3.client("s3", region_name="us-east-1")

    # 1. Create the bucket
    bucket_name = "my-test-bucket"
    s3.create_bucket(Bucket=bucket_name)
    print(f"Bucket '{bucket_name}' created.")

    # 2. Create the prefix ("folder") by writing an empty placeholder object
    domain = "cs.LG"
    prefix = f"raw/domain={domain}/"
    keep_key = prefix + ".keep"
    s3.put_object(Bucket=bucket_name, Key=keep_key, Body=b'')
    print(f"Prefix '{prefix}' created.")

    # 3. Upload a JSON document
    sample_data = {"title": "Example arXiv Paper", "id": "1234.5678"}
    key = prefix + "example.json"
    s3.put_object(
        Bucket=bucket_name,
        Key=key,
        Body=json.dumps(sample_data),
        ContentType="application/json"
    )
    print(f"Uploaded JSON to {key}")

    # 4. List the bucket contents under the prefix
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    listed_keys = [obj["Key"] for obj in response.get("Contents", [])]
    print("Bucket contents:")
    for listed_key in listed_keys:
        print(" -", listed_key)

    # 5. Verify the outcome instead of relying on eyeballing the printed output
    assert sorted(listed_keys) == sorted([keep_key, key])
    stored = s3.get_object(Bucket=bucket_name, Key=key)
    assert json.loads(stored["Body"].read()) == sample_data

# -----------------------------
# Run the test
# -----------------------------
# Calls the mocked S3 test when this code is executed as the main module.
if __name__ == "__main__":
    test_s3_operations()

INFO:botocore.credentials:Found credentials in environment variables.


Bucket 'my-test-bucket' created.
Prefix 'raw/domain=cs.LG/' created.
Uploaded JSON to raw/domain=cs.LG/example.json
Bucket contents:
 - raw/domain=cs.LG/.keep
 - raw/domain=cs.LG/example.json
