In [None]:
import arxiv

# Fetch one specific paper by its versioned arXiv ID and print every instance
# attribute of the returned result object, to see which fields are available.
search = arxiv.Search(
    id_list=["2301.12345v1"],
    query="",
    max_results=1,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)
client = arxiv.Client(page_size=1)

# Only the first item yielded by the client is inspected.
result = next(client.results(search))
for field_name, field_value in vars(result).items():
    print(field_name, ":", field_value)
    
# print("ID:", result.entry_id)
# print("標題:", result.title)
# print("摘要:", result.summary)
# print("作者:", ", ".join(str(a) for a in result.authors))
# print("主分類:", result.primary_category)
# print("其他分類:", ", ".join(result.categories))
# print("發表日期:", result.published)
# print("更新日期:", result.updated)
# print("期刊/會議資訊:", result.journal_ref)
# print("DOI:", result.doi)
# print("PDF_url:", result.pdf_url)


In [None]:
import arxiv
import json
from pathlib import Path
import time
import logging
from dotenv import load_dotenv
import os
import boto3
from datetime import datetime, timezone

# --- Environment and logging -------------------------------------------------
# AWS settings are read from the .env file at the relative path ../.env.
load_dotenv("../.env")
# Silence everything below WARNING from the "arxiv" logger.
logging.getLogger("arxiv").setLevel(logging.WARNING)

# --- Crawl settings -----------------------------------------------------------
category_list = [
    "cs.DS",
    "cs.AI",
    "cs.LG",
    "cs.CV",
    "cs.CL",
    "stat.ML",
    "math.ST",
]
MAX_RESULTS_GOAL = 1000  # used as both max_results per search and the client page size
BATCH_SIZE = 100         # number of papers collected before a batch is uploaded

# --- Storage targets ----------------------------------------------------------
S3_BUCKET = "hackmd-paper-bucket"
TABLE_NAME = "download_paper_entry_id"

# --- Upload retry policy ------------------------------------------------------
MAX_ATTEMPTS = 3
INITIAL_DELAY_SECONDS = 5  # first back-off delay; doubled on each further attempt

last_exception = None

client = arxiv.Client(page_size=MAX_RESULTS_GOAL, delay_seconds=3, num_retries=3)

# Both AWS handles are built from the same region/credential settings.
_aws_settings = {
    "region_name": os.getenv("AWS_REGION"),
    "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
    "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
}
s3 = boto3.client("s3", **_aws_settings)
dynamodb = boto3.resource("dynamodb", **_aws_settings)

table = dynamodb.Table(TABLE_NAME)

def paper_exists(category, entry_id):
    """Report whether the tracking table already holds an item for a paper.

    Args:
        category: Value used for the "category" part of the lookup key.
        entry_id: Value used for the "entry_id" part of the lookup key.

    Returns:
        True when the ``get_item`` response contains an "Item" entry.
        False when it does not, and also when the lookup raises: the error
        is printed and the paper is treated as not present.
    """
    lookup_key = {"category": category, "entry_id": entry_id}
    try:
        found = "Item" in table.get_item(Key=lookup_key)
    except Exception as exc:
        print(f"DynamoDB query error: {exc}")
        found = False
    return found

def record_to_dynamo(category, entry_id, status, error_msg=""):
    """Write the processing status of one paper to the tracking table.

    The initial "pending" write is conditional, so it never overwrites a
    record that already exists. Every other status is written
    unconditionally: a record that was first stored as "pending" must be
    replaceable by its final "uploaded"/"failed" state. With the condition
    applied to every write, those follow-up writes were always rejected and
    the status stayed "pending" forever.

    Args:
        category: Stored as the item's "category" attribute.
        entry_id: Stored as the item's "entry_id" attribute.
        status: "pending", "uploaded" or "failed".
        error_msg: Optional error text stored alongside the status.

    Returns:
        None. A failed write is printed, not raised (best effort).
    """
    item = {
        "category": category,
        "entry_id": entry_id,
        "status": status,  # "pending", "uploaded" or "failed"
        "last_attempt": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
        "error_msg": error_msg
    }
    put_kwargs = {"Item": item}
    if status == "pending":
        # First sighting of the paper: do not clobber an existing record.
        put_kwargs["ConditionExpression"] = "attribute_not_exists(entry_id)"
    try:
        table.put_item(**put_kwargs)
    except Exception as e:
        print(f"⚠️ 寫入 DynamoDB 失敗 (entry_id={entry_id}): {e}")
        
def upload_batch_to_s3(s3_prefix, batch_data, batch_num, category=None):
    """Serialize a batch of papers as JSON Lines and upload it to S3.

    The object key has the form
    ``{s3_prefix}{YYYY-MM-DD}/{category_tag}_batch_{batch_num}_{unix_ts}.jsonl``
    with the date and timestamp taken from the current UTC time.

    Args:
        s3_prefix: Key prefix of the object, e.g. "raw/cs_LG/".
        batch_data: List of JSON-serializable paper dicts. An empty batch is
            a no-op.
        batch_num: Batch index embedded in the object key.
        category: Optional category such as "cs.LG"; dots are replaced by
            underscores to form the file-name tag. When omitted, the tag is
            the last path component of ``s3_prefix``. Previously the tag was
            read from a notebook global named ``category``, which raised
            NameError whenever the function was called outside the crawl loop.

    Returns:
        None.

    Raises:
        Exception: Whatever ``put_object`` raised on the final attempt, once
            MAX_ATTEMPTS uploads have failed.
    """
    if not batch_data:
        return

    if category is None:
        # e.g. "raw/cs_LG/" -> "cs_LG"
        category_tag = s3_prefix.rstrip("/").rsplit("/", 1)[-1]
    else:
        category_tag = category.replace(".", "_")

    jsonl_content = "\n".join(json.dumps(paper, ensure_ascii=False) for paper in batch_data)
    body = jsonl_content.encode("utf-8")

    utc_now = datetime.now(timezone.utc)
    today_str = utc_now.strftime("%Y-%m-%d")
    utc_timestamp = int(utc_now.timestamp())
    s3_key = f"{s3_prefix}{today_str}/{category_tag}_batch_{batch_num}_{utc_timestamp}.jsonl"

    for attempt in range(MAX_ATTEMPTS):
        try:
            s3.put_object(
                Bucket=S3_BUCKET,
                Key=s3_key,
                Body=body,
                ContentType='application/jsonl'
            )
        except Exception as e:
            print(f"Attempt {attempt + 1}/{MAX_ATTEMPTS} failed for batch {batch_num}. Error: {e}")
            if attempt >= MAX_ATTEMPTS - 1:
                print(f"All {MAX_ATTEMPTS} attempts failed for batch {batch_num}. Could not upload.")
                raise
            # Exponential back-off: the delay doubles after every failed attempt.
            delay = INITIAL_DELAY_SECONDS * (2 ** attempt)
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)
        else:
            print(f"Successfully uploaded batch {batch_num} ({len(batch_data)} papers) to s3://{S3_BUCKET}/{s3_key}")
            return

def _flush_batch(category, s3_prefix, batch, batch_num):
    """Upload one batch and record the outcome for every paper in it.

    A failed upload is not re-raised: each paper of the batch is recorded as
    "failed" together with the error text, so the crawl can carry on with the
    next batch or category.
    """
    try:
        upload_batch_to_s3(s3_prefix, batch, batch_num)
    except Exception as e:
        for paper in batch:
            record_to_dynamo(category, paper["entry_id"], status="failed", error_msg=str(e))
    else:
        for paper in batch:
            record_to_dynamo(category, paper["entry_id"], status="uploaded")


# Crawl the newest submissions of every category, skip papers that already
# have a tracking record, and send the rest to S3 in batches of BATCH_SIZE.
# The loop intentionally stays at module level so that `category` and
# `S3_PREFIX` remain notebook globals, exactly as before.
try:
    for category in category_list:
        S3_PREFIX = f"raw/{category.replace('.','_')}/"

        search = arxiv.Search(
            query=f'cat:{category}',
            max_results=MAX_RESULTS_GOAL,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending
        )

        results_generator = client.results(search)

        batch = []
        batch_count = 0

        for paper_result in results_generator:
            entry_id = paper_result.entry_id
            if paper_exists(category, entry_id):
                continue
            paper_data = {
                "entry_id": entry_id,
                "title": paper_result.title,
                "authors": [a.name for a in paper_result.authors],
                "summary": paper_result.summary,
                "primary_category": paper_result.primary_category,
                "categories": paper_result.categories,
                "published": paper_result.published.isoformat(),
                "updated": paper_result.updated.isoformat(),
                "journal_ref": paper_result.journal_ref,
                "doi": paper_result.doi
            }
            batch.append(paper_data)

            # Mark the paper as seen before its batch is uploaded.
            record_to_dynamo(category, entry_id, status="pending")

            if len(batch) >= BATCH_SIZE:
                _flush_batch(category, S3_PREFIX, batch, batch_count)
                batch = []
                batch_count += 1

        # The trailing partial batch gets the same treatment as full batches.
        # Previously it was uploaded without any status update, and an upload
        # error here aborted every remaining category.
        if batch:
            _flush_batch(category, S3_PREFIX, batch, batch_count)

except Exception as e:
    print(f"\nAn unexpected error occurred during the process: {e}")


In [None]:
import json
from pathlib import Path
from datetime import datetime, timezone

# Raw batch that the cleaning step reads, and the target file name for the
# cleaned records (the save step that uses output_file is currently commented out).
_data_dir = Path("arxiv_data")
input_file = _data_dir / "arxiv_batch_1.json"
output_file = _data_dir / "arxiv_batch_cleaned.json"

def transform_datetime2date(dt_str):
    """Reduce an ISO-8601 datetime string to its ``YYYY-MM-DD`` date part.

    A trailing "Z" is rewritten to "+00:00" before parsing. The date is taken
    from the parsed value as written; no timezone conversion is applied.

    Args:
        dt_str: ISO-8601 datetime string, e.g. "2023-01-29T12:34:56Z".

    Returns:
        The date as a "YYYY-MM-DD" string, or None when the input cannot be
        processed (non-string input, unparseable text, ...).
    """
    try:
        iso_text = dt_str.replace("Z", "+00:00")
        parsed = datetime.fromisoformat(iso_text)
        return f"{parsed:%Y-%m-%d}"
    except Exception:
        return None

# Load the raw batch: a JSON array of paper dicts.
with open(input_file, "r", encoding="utf-8") as f:
    papers = json.load(f)

# De-duplicate on entry_id; for repeated IDs the last record wins.
# .get() is used so that a record without an entry_id does not raise KeyError
# here; such a record is dropped by the required-field check below.
unique_papers = {paper.get("entry_id"): paper for paper in papers}


required_fields = [
    "entry_id", "title", "summary", "authors", 
    "primary_category", "published", "updated"
]

cleaned_papers = []
for paper in unique_papers.values():
    # Drop records with a missing/empty required field or a blank author name.
    if all(paper.get(field) for field in required_fields) and all(a.strip() for a in paper["authors"]):
        paper["published_date"] = transform_datetime2date(paper["published"])
        paper["updated_date"] = transform_datetime2date(paper["updated"])
        paper["etl_datetime"] = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S") # use UTC timezone
        cleaned_papers.append(paper)

for paper in cleaned_papers:
    print(paper)

# with open(output_file, "w", encoding="utf-8") as f:
#     json.dump(cleaned_papers, f, ensure_ascii=False, indent=2)

# print(f"清理完成，共 {len(cleaned_papers)} 筆，已儲存到 {output_file}")


In [None]:
import boto3
from botocore.exceptions import ClientError

# Create the DynamoDB connection.
# Credentials and region are read from the environment / ../.env, like the
# other cells of this notebook, instead of inline literals that invite pasting
# secrets into the notebook. The previous hard-coded region is kept as the
# fallback when AWS_REGION is not set.
import os
from dotenv import load_dotenv

load_dotenv("../.env")

dynamodb = boto3.resource(
    'dynamodb',
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name=os.getenv("AWS_REGION") or 'ap-southeast-2'
)

# Select the table by name.
table = dynamodb.Table('download_paper_entry_id')


In [None]:
# Insert a single record, unless one with this entry_id already exists.
from datetime import datetime, timezone

entry_id = "http://arxiv.org/abs/2510.11683v1"
item = dict(
    category="cs.LG",
    entry_id=entry_id,
    status="uploaded",  # or "failed"
    last_attempt=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
    error_msg="",
)

try:
    table.put_item(Item=item, ConditionExpression='attribute_not_exists(entry_id)')
except ClientError as e:
    # Only the failed-condition case is expected here; anything else propagates.
    if e.response['Error']['Code'] != 'ConditionalCheckFailedException':
        raise
    print("這篇 paper 已存在")
else:
    print("已新增")


已新增


In [None]:
# Look the record up by its key and report whether it was found.
response = table.get_item(
    Key={
        "category": "cs.LG",
        'entry_id': entry_id,
    }
)
item = response.get('Item')

if not item:
    print("不存在")
else:
    print("已存在", item)


已存在 {'category': 'cs.LG', 'last_attempt': '2025-10-16 09:34:28', 'error_msg': '', 'status': 'uploaded', 'entry_id': 'http://arxiv.org/abs/2510.11683v1'}


In [8]:
# Delete the record stored under this key and show the raw service response.
delete_key = {"category": "cs.LG", "entry_id": entry_id}
response = table.delete_item(Key=delete_key)

print("刪除成功:", response)


刪除成功: {'ResponseMetadata': {'RequestId': '7TTA1C0VHABSMDGRPOCFBDJNCBVV4KQNSO5AEMVJF66Q9ASUAAJG', 'HTTPStatusCode': 200, 'HTTPHeaders': {'server': 'Server', 'date': 'Thu, 16 Oct 2025 09:36:05 GMT', 'content-type': 'application/x-amz-json-1.0', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': '7TTA1C0VHABSMDGRPOCFBDJNCBVV4KQNSO5AEMVJF66Q9ASUAAJG', 'x-amz-crc32': '2745614147'}, 'RetryAttempts': 0}}


In [None]:
import boto3
from dotenv import load_dotenv
import os

def create_s3_bucket_and_prefix(bucket_name: str, domain: str):
    """Create an S3 bucket and a ``raw/domain=<domain>/`` placeholder in it.

    AWS settings are loaded from a ``.env`` file one directory above this
    file, or one directory above the current working directory when
    ``__file__`` is not defined (as inside a notebook cell).

    Args:
        bucket_name: Name of the bucket to create.
        domain: Value used in the ``raw/domain=<domain>/`` prefix.

    Raises:
        FileNotFoundError: If the ``.env`` file does not exist.
    """
    # __file__ does not exist in a notebook cell; fall back to the working
    # directory, which is what the other cells' load_dotenv("../.env") relies on.
    try:
        base_dir = os.path.dirname(__file__)
    except NameError:
        base_dir = os.getcwd()

    env_path = os.path.join(base_dir, "../.env")
    if not os.path.exists(env_path):
        raise FileNotFoundError(f".env not found at {env_path}")
    
    load_dotenv(env_path)

    s3 = boto3.client(
        "s3",
        region_name=os.getenv("AWS_REGION"),
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
    )

    # Outside us-east-1, S3 rejects create_bucket unless the region is passed
    # as the LocationConstraint; us-east-1 must not be given one.
    region = s3.meta.region_name
    if region and region != "us-east-1":
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={"LocationConstraint": region},
        )
    else:
        s3.create_bucket(Bucket=bucket_name)

    # S3 has no real folders; an empty ".keep" object makes the prefix visible.
    prefix = f"raw/domain={domain}/"
    s3.put_object(Bucket=bucket_name, Key=(prefix + ".keep"))

# Entry point: create the test bucket together with the cs.LG prefix.
if __name__ == "__main__":
    create_s3_bucket_and_prefix(bucket_name="my-test-bucket", domain="cs.LG")


In [77]:
# 查看你有哪個 Bucket

import boto3
from dotenv import load_dotenv
import os

# Load AWS settings from ../.env and build an S3 client from them.
load_dotenv("../.env")

s3 = boto3.client(
    "s3",
    **{
        "region_name": os.getenv("AWS_REGION"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    },
)

# Print the name of every bucket returned for these credentials, one per line.
response = s3.list_buckets()
for name in (entry["Name"] for entry in response["Buckets"]):
    print(name)


hackmd-paper-bucket


In [84]:
import boto3
from dotenv import load_dotenv
import os

# Load AWS settings from ../.env and build an S3 client from them.
load_dotenv("../.env")

s3 = boto3.client(
    "s3",
    **{
        "region_name": os.getenv("AWS_REGION"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    },
)

bucket_name = "hackmd-paper-bucket"

# With Delimiter="/", keys below raw/ are grouped by their next path segment
# and reported under "CommonPrefixes".
response = s3.list_objects_v2(
    Bucket=bucket_name,
    Prefix="raw/",   # only look under raw/
    Delimiter="/",
)

if "CommonPrefixes" not in response:
    print("沒有找到任何 prefix")
else:
    print("Prefixes:")
    for entry in response["CommonPrefixes"]:
        print(entry["Prefix"])


Prefixes:
raw/domain=cs.LG/


In [86]:
import boto3
from dotenv import load_dotenv
import os

# Load AWS settings from ../.env and build an S3 client from them.
load_dotenv("../.env")

s3 = boto3.client(
    "s3",
    **{
        "region_name": os.getenv("AWS_REGION"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    },
)

bucket_name = "hackmd-paper-bucket"
prefix = "raw/domain=cs.LG/"
local_file = "/home/hank/hackmd-data-pipeline/tests/arxiv_data/arxiv_batch_2.json"
# The object keeps the local file name, placed under the prefix.
key = f"{prefix}{os.path.basename(local_file)}"

# Stream the file to S3 straight from the open binary handle.
with open(local_file, "rb") as source:
    upload_args = {
        "Bucket": bucket_name,
        "Key": key,
        "Body": source,
        "ContentType": "application/json",
    }
    s3.put_object(**upload_args)

print(f"已上傳 {local_file} 到 S3: {key}")


已上傳 /home/hank/hackmd-data-pipeline/tests/arxiv_data/arxiv_batch_2.json 到 S3: raw/domain=cs.LG/arxiv_batch_2.json


In [88]:
import boto3
from dotenv import load_dotenv
import os

# Load AWS settings from ../.env and build an S3 client from them.
load_dotenv("../.env")

s3 = boto3.client(
    "s3",
    **{
        "region_name": os.getenv("AWS_REGION"),
        "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
    },
)

bucket_name = "hackmd-paper-bucket"
prefix = "raw/domain=cs.LG/"

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter="/")

if "Contents" not in response:
    print("此 prefix 下沒有檔案")
else:
    print("檔案列表：")
    # Hide the ".keep" placeholder objects from the listing.
    files = [
        object_key
        for object_key in (obj["Key"] for obj in response["Contents"])
        if not object_key.endswith(".keep")
    ]
    for object_key in files:
        print(object_key)


檔案列表：
raw/domain=cs.LG/arxiv_batch_2.json
