In [1]:
# Make the project importable from this notebook: the parent of the current
# working directory is treated as the project root and appended to sys.path,
# which is what lets later cells do `from src.core... import ...`.
import sys
from pathlib import Path

# Path() is the current working directory; its parent is taken as the project root.
PROJECT_ROOT = Path().resolve().parent
sys.path.append(str(PROJECT_ROOT))

In [None]:
import arxiv

# Fetch one specific paper by its arXiv id and dump every attribute of the
# returned result object, to see which fields are available.
client = arxiv.Client(page_size=1)

search = arxiv.Search(
    id_list=["2301.12345v1"],
    query="",
    max_results=1,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)

# Only the first (and only expected) result is inspected.
result = next(client.results(search))
for attr, value in result.__dict__.items():
    print(attr, ":", value)
    
# print("ID:", result.entry_id)
# print("標題:", result.title)
# print("摘要:", result.summary)
# print("作者:", ", ".join(str(a) for a in result.authors))
# print("主分類:", result.primary_category)
# print("其他分類:", ", ".join(result.categories))
# print("發表日期:", result.published)
# print("更新日期:", result.updated)
# print("期刊/會議資訊:", result.journal_ref)
# print("DOI:", result.doi)
# print("PDF_url:", result.pdf_url)


In [4]:
import arxiv
import json
import time
import time
import logging
from dotenv import load_dotenv
import os
import gzip
import io
import boto3
from datetime import datetime, timezone
from src.core.db import get_pg
from src.core.pg_engine import PsqlEngine

# Project helper returning the Postgres handle used by the functions below.
pg = get_pg()
# Load variables from the project's .env file; the AWS client below reads them via os.getenv.
load_dotenv("../.env")
# Only show warnings and above from the arxiv library's logger.
logging.getLogger("arxiv").setLevel(logging.WARNING)

# Full set of arXiv category codes this pipeline can ingest, grouped by field.
category_list = [
    # Computer Science
    "cs.AI", "cs.LG", "cs.CV", "cs.CL", "cs.NE", "cs.RO", "cs.HC", "cs.SE", "cs.DS",
    "cs.DB", "cs.SY", "cs.OS", "cs.PF", "cs.PL", "cs.MS", "cs.CC", "cs.CG", "cs.LO",
    "cs.GT", "cs.MM", "cs.IT",

    # Mathematics
    "math.CO", "math.AG", "math.GT", "math.QA", "math.RA", "math.NT", "math.KT", "math.DG",
    "math.DS", "math.FA", "math.AP", "math.SP", "math.ST", "math.PR",

    # Statistics
    "stat.ML", "stat.TH", "stat.CO", "stat.AP", "stat.OT", "stat.ME",

    # Physics
    "physics.optics", "physics.bio-ph", "physics.gen-ph", "physics.acc-ph", "physics.chem-ph",
    "physics.class-ph", "physics.comp-ph", "physics.data-an", "physics.ed-ph", "physics.ins-det",
    "physics.med-ph", "physics.plasm-ph", "physics.space-ph",

    # Quantitative Biology
    "q-bio.BM", "q-bio.CB", "q-bio.GN", "q-bio.MN", "q-bio.NC", "q-bio.PE", "q-bio.QM", "q-bio.SC",

    # Quantitative Finance
    "q-fin.CP", "q-fin.EC", "q-fin.GN", "q-fin.MF", "q-fin.PM", "q-fin.RM", "q-fin.ST", "q-fin.TR",

    # Electrical Engineering and Systems Science
    "eess.AS", "eess.IV", "eess.SP", "eess.SY",

    # Astrophysics
    "astro-ph.CO", "astro-ph.GA", "astro-ph.HE", "astro-ph.IM", "astro-ph.SR",

    # Condensed Matter Physics
    "cond-mat.mtrl-sci", "cond-mat.str-el", "cond-mat.supr-con", "cond-mat.quant-gas",
    "cond-mat.dis-nn", "cond-mat.soft", "cond-mat.stat-mech",

    # High Energy Physics
    "hep-ex", "hep-lat", "hep-ph", "hep-th"
]

# Override: this rebinding discards the full list above, so only cs.AI is processed.
# NOTE(review): looks like a debugging restriction — remove it to ingest every category.
category_list = ["cs.AI"]

# Upper bound on results requested per category (used as both max_results and page_size).
MAX_RESULTS_GOAL = 1000
# Number of papers accumulated before a batch is uploaded to S3 and flushed to Postgres.
BATCH_SIZE = 100
# Destination bucket for the gzipped JSONL batches.
S3_BUCKET = "hackmd-paper-bucket"
# S3 upload retry policy: at most MAX_ATTEMPTS tries, sleeping
# INITIAL_DELAY_SECONDS * 2**attempt between failed tries.
MAX_ATTEMPTS = 3
INITIAL_DELAY_SECONDS = 5

# arXiv API client; page_size, delay_seconds and num_retries are passed straight to arxiv.Client.
client = arxiv.Client(
    page_size=MAX_RESULTS_GOAL, 
    delay_seconds=3,
    num_retries=3
)

# S3 client configured from environment variables (loaded from .env earlier in this cell).
s3 = boto3.client(
    "s3",
    region_name=os.getenv("AWS_REGION"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
)

def load_existing_ids():
    """Return the set of entry_id values already stored in papers.downloaded_papers.

    Uses the module-level ``pg`` handle; the first column of every returned
    row is collected into a set.
    """
    query = """
        SELECT entry_id
        FROM papers.downloaded_papers
    """
    return {record[0] for record in pg.execute_query(query)}

def add_to_pg_batch(entry_id, category, status, etl_status, etl_batch_id=None, error_msg=""):
    """Queue one row on the module-level ``pg_batch`` list.

    The row layout is
    ``(entry_id, category, status, timestamp, error_msg, etl_status, etl_batch_id)``,
    where ``timestamp`` is the current UTC time taken when the row is queued.
    """
    row = (
        entry_id,
        category,
        status,
        datetime.now(timezone.utc),
        error_msg,
        etl_status,
        etl_batch_id,
    )
    pg_batch.append(row)


def flush_pg_batch():
    """Write the queued rows to papers.downloaded_papers and empty the queue.

    Does nothing when the module-level ``pg_batch`` is empty. An insert
    failure is only reported via ``print``; the queue is reset to a new empty
    list whether or not the insert succeeded.
    """
    global pg_batch
    queued_rows = pg_batch
    if not queued_rows:
        return
    try:
        pg.insert_mogrify("papers.downloaded_papers", queued_rows)
    except Exception as err:
        print(f"Failed to insert batch into Postgres: {err}")
    finally:
        # The queue is always cleared, even if the insert failed.
        pg_batch = []
        
def add_raw_batches_to_pg(batch_id, category, s3_path, record_count):
    """Register one uploaded batch in etl.raw_batches.

    An existing row with the same ``batch_id`` is left untouched
    (``ON CONFLICT ... DO NOTHING``). Database errors are printed, not raised.
    """
    insert_sql = """
        INSERT INTO etl.raw_batches (batch_id, category, s3_path, record_count)
        VALUES (%s, %s, %s, %s)
        ON CONFLICT (batch_id) DO NOTHING;
    """
    values = (batch_id, category, s3_path, record_count)
    try:
        pg.execute_cmd(insert_sql, values)
    except Exception as err:
        print(f"Failed to insert into raw_batches: {err}")

def upload_batch_to_s3(s3_prefix, batch_data, batch_num, category):
    """Upload a batch of paper dicts to S3 as one gzipped JSONL object.

    Args:
        s3_prefix: Key prefix the object is stored under.
        batch_data: List of JSON-serialisable dicts, one output line each.
        batch_num: Batch counter embedded in the object name.
        category: arXiv category; dots become underscores in the object name.

    Returns:
        The S3 key that was written, or None when ``batch_data`` is empty.

    Raises:
        Exception: The error from the last attempt once MAX_ATTEMPTS uploads
            have failed.
    """
    if not batch_data:
        return

    # One JSON document per line, non-ASCII characters kept as-is.
    serialized_lines = [json.dumps(record, ensure_ascii=False) for record in batch_data]
    raw_stream = io.BytesIO()
    with gzip.GzipFile(fileobj=raw_stream, mode='wb') as gz:
        gz.write("\n".join(serialized_lines).encode('utf-8'))
    gzip_bytes = raw_stream.getvalue()

    # Key layout: <prefix><YYYY-MM-DD>/<category>_batch_<n>_<unix seconds>.jsonl.gz (UTC).
    stamp = datetime.now(timezone.utc)
    day_folder = stamp.strftime("%Y-%m-%d")
    epoch_seconds = int(stamp.timestamp())
    s3_key = f"{s3_prefix}{day_folder}/{category.replace('.','_')}_batch_{batch_num}_{epoch_seconds}.jsonl.gz"

    final_attempt = MAX_ATTEMPTS - 1
    for attempt in range(MAX_ATTEMPTS):
        try:
            s3.put_object(
                Bucket=S3_BUCKET,
                Key=s3_key,
                Body=gzip_bytes,
                ContentType='application/jsonl'
            )
        except Exception:
            if attempt >= final_attempt:
                # Out of attempts: surface the last failure to the caller.
                raise
            # Exponential backoff before the next try.
            time.sleep(INITIAL_DELAY_SECONDS * (2 ** attempt))
        else:
            break
    return s3_key

def _mark_rows_uploaded(rows):
    """Return copies of the queued rows with status (index 2) set to "uploaded".

    Every other column is kept, so the rows still match the 7-column layout
    produced by add_to_pg_batch.
    """
    return [row[:2] + ("uploaded",) + row[3:] for row in rows]


def _ship_batch(s3_prefix, category, batch, batch_count, etl_batch_id):
    """Upload one batch to S3, register it in etl.raw_batches and flush its rows.

    Returns the number of papers in the batch. An upload failure propagates to
    the caller before anything is written to Postgres.
    """
    global pg_batch
    s3_key = upload_batch_to_s3(s3_prefix, batch, batch_count, category)
    pg_batch = _mark_rows_uploaded(pg_batch)
    # Record the batch in the ETL raw_batches table.
    add_raw_batches_to_pg(etl_batch_id, category, s3_key, len(batch))
    # Push the queued rows to Postgres; flush_pg_batch also empties the queue.
    flush_pg_batch()
    return len(batch)


# Ids already recorded in Postgres; extended in memory as new papers are seen
# so that the same paper is not ingested twice within this run.
existing_ids = load_existing_ids()
category_stats = {}
for category in category_list:
    start_time = time.time()
    s3_count = 0
    pg_count = 0
    try:
        S3_PREFIX = f"raw/{category.replace('.','_')}/"
        search = arxiv.Search(
            query=f'cat:{category}',
            max_results=MAX_RESULTS_GOAL,
            sort_by=arxiv.SortCriterion.SubmittedDate,
            sort_order=arxiv.SortOrder.Descending
        )
        print(f"{category}: Started")
        batch = []
        batch_ids = set()
        pg_batch = []
        batch_count = 0
        total_count = 0
        etl_batch_id = None
        empty_page_streak = 0
        while True:
            try:
                for paper_result in client.results(search, offset=total_count):
                    # A result arrived, so the empty-page streak is over.
                    empty_page_streak = 0
                    entry_id = paper_result.entry_id
                    if entry_id in existing_ids or entry_id in batch_ids:
                        print(f"Skipping duplicate: {entry_id}")
                        continue
                    existing_ids.add(entry_id)
                    batch_ids.add(entry_id)
                    paper_data = {
                        "entry_id": entry_id,
                        "title": paper_result.title,
                        "authors": [a.name for a in paper_result.authors],
                        "summary": paper_result.summary,
                        "primary_category": paper_result.primary_category,
                        "categories": paper_result.categories,
                        "published": paper_result.published.isoformat(),
                        "updated": paper_result.updated.isoformat(),
                        "journal_ref": paper_result.journal_ref,
                        "doi": paper_result.doi
                    }
                    if not batch:
                        # Fix the batch id once per batch so every row of the batch
                        # shares it even if the UTC date changes mid-batch.
                        # NOTE(review): the id has no run component, so a second run on
                        # the same UTC day reuses ids and raw_batches keeps the first row
                        # (ON CONFLICT DO NOTHING) — confirm this is acceptable.
                        today_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")
                        etl_batch_id = f"{category.replace('.','_')}_{today_str}_batch_{batch_count}"
                    batch.append(paper_data)
                    total_count += 1

                    # Queued as "pending"; switched to "uploaded" once the S3 upload succeeds.
                    add_to_pg_batch(entry_id, category, "pending", "pending", etl_batch_id)
                    pg_count += 1
                    if len(batch) >= BATCH_SIZE:
                        s3_count += _ship_batch(S3_PREFIX, category, batch, batch_count, etl_batch_id)
                        print(f"{category}: uploaded {etl_batch_id} ({pg_count} papers so far)")
                        batch = []
                        batch_count += 1
                break
            except arxiv.UnexpectedEmptyPageError:
                empty_page_streak += 1
                if empty_page_streak >= MAX_ATTEMPTS:
                    # Stop instead of retrying forever; whatever was collected is still shipped below.
                    print(f"{category}: giving up after {empty_page_streak} consecutive empty pages")
                    break
                # Nudge the offset forward and restart the result stream.
                total_count += 1
                continue
        if batch:
            # Final partial batch: same path as a full batch, including the raw_batches record.
            s3_count += _ship_batch(S3_PREFIX, category, batch, batch_count, etl_batch_id)
        elapsed = time.time() - start_time
        category_stats[category] = {"time_sec": elapsed, "s3_count": s3_count, "pg_count": pg_count}
        print(f"{category}: Finished")
    except Exception as e:
        print(f"Error during category {category}: {e}")

for cat, stats in category_stats.items():
    print(f"{cat} -> Time: {stats['time_sec']:.2f}s, S3: {stats['s3_count']}, PostgreSQL: {stats['pg_count']}")


cs.AI: Started


KeyboardInterrupt: 

In [None]:
import json
from pathlib import Path
from datetime import datetime, timezone

# Source JSON batch to clean, and the path for the cleaned result; both are
# relative to the working directory. output_file is currently only referenced
# by the commented-out save step at the end of this cell.
input_file = Path("arxiv_data/arxiv_batch_1.json")
output_file = Path("arxiv_data/arxiv_batch_cleaned.json")

def transform_datetime2date(dt_str):
    """Convert an ISO-8601 datetime string to ``YYYY-MM-DD``.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing. Returns None
    for anything that cannot be parsed, including non-string input.
    """
    try:
        normalized = dt_str.replace("Z", "+00:00")
        return datetime.fromisoformat(normalized).strftime("%Y-%m-%d")
    except Exception:
        return None

with input_file.open("r", encoding="utf-8") as f:
    papers = json.load(f)

# De-duplicate by entry_id; a later record with the same id replaces the earlier one.
unique_papers = {}
for paper in papers:
    unique_papers[paper["entry_id"]] = paper


required_fields = [
    "entry_id", "title", "summary", "authors",
    "primary_category", "published", "updated"
]

cleaned_papers = []
for paper in unique_papers.values():
    # Drop records with a missing or empty required field.
    if not all(paper.get(field) for field in required_fields):
        continue
    # Drop records containing a blank author name.
    if not all(a.strip() for a in paper["authors"]):
        continue
    paper.update(
        published_date=transform_datetime2date(paper["published"]),
        updated_date=transform_datetime2date(paper["updated"]),
        etl_datetime=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),  # UTC
    )
    cleaned_papers.append(paper)

for paper in cleaned_papers:
    print(paper)

# with open(output_file, "w", encoding="utf-8") as f:
#     json.dump(cleaned_papers, f, ensure_ascii=False, indent=2)

# print(f"清理完成，共 {len(cleaned_papers)} 筆，已儲存到 {output_file}")


In [None]:
import boto3
from botocore.exceptions import ClientError

# Imported here so this cell also runs on a fresh kernel; before, `os` and
# `load_dotenv` were only defined if the ingestion cell had been executed first.
import os
from dotenv import load_dotenv

load_dotenv("../.env")

# Create the DynamoDB resource from the AWS settings in the environment.
dynamodb = boto3.resource(
    "dynamodb",
    region_name=os.getenv("AWS_REGION"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
)

# Table handle used by the put/get/delete cells that follow.
table = dynamodb.Table('download_paper_entry_id')


In [None]:
# 新增一筆資料
from datetime import datetime, timezone

entry_id = "http://arxiv.org/abs/2510.11683v1"
item = dict(
    category="cs.LG",
    entry_id=entry_id,
    status="uploaded",  # or "failed"
    last_attempt=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
    error_msg="",
)

# Conditional write: only succeeds when no item with this entry_id exists yet.
try:
    table.put_item(Item=item, ConditionExpression='attribute_not_exists(entry_id)')
except ClientError as err:
    # Any failure other than the condition check is unexpected and re-raised.
    if err.response['Error']['Code'] != 'ConditionalCheckFailedException':
        raise
    print("這篇 paper 已存在")
else:
    print("已新增")


In [None]:
# Look the item up by key (category + entry_id).
response = table.get_item(Key={"category": "cs.LG", "entry_id": entry_id})
item = response.get('Item')

if not item:
    print("不存在")
else:
    print("已存在", item)


In [None]:
# Delete the item identified by category + entry_id and show the raw response.
response = table.delete_item(Key=dict(category="cs.LG", entry_id=entry_id))

print("刪除成功:", response)


In [None]:
import boto3
from dotenv import load_dotenv
import os

def create_s3_bucket_and_prefix(bucket_name: str, domain: str):
    """Create an S3 bucket and a ``raw/domain=<domain>/.keep`` placeholder object in it.

    AWS settings are loaded from the ``.env`` file one level above the script
    directory, or above the current working directory when ``__file__`` is
    unavailable (as in a notebook cell).

    Args:
        bucket_name: Name of the bucket to create.
        domain: Value used in the ``raw/domain=<domain>/`` key prefix.

    Raises:
        FileNotFoundError: If the ``.env`` file does not exist.
    """
    # __file__ is not defined inside a notebook cell; fall back to the working directory.
    try:
        base_dir = os.path.dirname(__file__)
    except NameError:
        base_dir = os.getcwd()
    env_path = os.path.join(base_dir, "../.env")
    if not os.path.exists(env_path):
        raise FileNotFoundError(f".env not found at {env_path}")

    load_dotenv(env_path)

    region = os.getenv("AWS_REGION")
    s3 = boto3.client(
        "s3",
        region_name=region,
        aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
    )

    # Outside us-east-1 the bucket region must be passed as a LocationConstraint;
    # for us-east-1 (or an unset region) the parameter must be omitted.
    if region and region != "us-east-1":
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={"LocationConstraint": region},
        )
    else:
        s3.create_bucket(Bucket=bucket_name)

    # An empty ".keep" object makes the prefix visible in listings.
    prefix = f"raw/domain={domain}/"
    s3.put_object(Bucket=bucket_name, Key=(prefix + ".keep"))

# Script entry point: create a test bucket with a cs.LG prefix.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so this
# presumably runs every time the cell is executed — confirm that is intended.
if __name__ == "__main__":
    create_s3_bucket_and_prefix("my-test-bucket", "cs.LG")


In [None]:
# 查看你有哪個 Bucket

import boto3
from dotenv import load_dotenv
import os

load_dotenv("../.env")

# S3 client built from the AWS settings in the environment.
s3 = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name=os.getenv("AWS_REGION"),
)

# Print the name of every bucket visible to these credentials.
response = s3.list_buckets()
for bucket in response["Buckets"]:
    bucket_label = bucket["Name"]
    print(bucket_label)


In [None]:
import boto3
from dotenv import load_dotenv
import os

load_dotenv("../.env")

# S3 client built from the AWS settings in the environment.
s3 = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name=os.getenv("AWS_REGION"),
)

bucket_name = "hackmd-paper-bucket"

# Delimiter="/" groups keys, so only the first-level "folders" under raw/ are returned.
response = s3.list_objects_v2(Bucket=bucket_name, Prefix="raw/", Delimiter="/")

if "CommonPrefixes" not in response:
    print("沒有找到任何 prefix")
else:
    print("Prefixes:")
    for prefix in response["CommonPrefixes"]:
        print(prefix["Prefix"])


In [None]:
import boto3
from dotenv import load_dotenv
import os

load_dotenv("../.env")

# S3 client built from the AWS settings in the environment.
s3 = boto3.client(
    "s3",
    region_name=os.getenv("AWS_REGION"),
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
)

bucket_name = "hackmd-paper-bucket"
prefix = "raw/domain=cs.LG/"
# Resolved against the working directory (like the arxiv_data/ path used by the
# cleaning cell) instead of a machine-specific absolute path, so the cell also
# works on other checkouts of the project.
local_file = os.path.abspath(os.path.join("arxiv_data", "arxiv_batch_2.json"))
# The object is stored under the prefix using the file's base name.
key = prefix + os.path.basename(local_file)

with open(local_file, "rb") as f:
    s3.put_object(
        Bucket=bucket_name,
        Key=key,
        Body=f,
        ContentType="application/json"
    )

print(f"已上傳 {local_file} 到 S3: {key}")


In [None]:
import boto3
from dotenv import load_dotenv
import os

load_dotenv("../.env")

# S3 client built from the AWS settings in the environment.
s3 = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
    region_name=os.getenv("AWS_REGION"),
)

bucket_name = "hackmd-paper-bucket"
prefix = "raw/domain=cs.LG/"

response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter="/")

if "Contents" not in response:
    print("此 prefix 下沒有檔案")
else:
    print("檔案列表：")
    # Hide the ".keep" placeholder objects from the listing.
    files = []
    for obj in response["Contents"]:
        if obj["Key"].endswith(".keep"):
            continue
        files.append(obj["Key"])
    for f in files:
        print(f)


In [None]:
from src.core.db import get_pg
from src.core.pg_engine import PsqlEngine
# Obtain the Postgres handle from the project helper (its configuration is not visible in this notebook).
pg = get_pg()

def paper_exists(pg: PsqlEngine, category: str, entry_id: str) -> bool:
    """Return True if papers.downloaded_papers has a row for this category and entry_id.

    Args:
        pg: Engine exposing ``execute_query(stmt)``.
        category: Category value to match.
        entry_id: Entry id value to match.

    Returns:
        True when the query yields at least one row, otherwise False.
    """
    # Double embedded single quotes so a value cannot terminate its SQL string
    # literal (prevents injection and syntax errors on ids containing a quote).
    # NOTE(review): this relies on PostgreSQL's default standard_conforming_strings=on;
    # if execute_query accepts bound parameters, prefer %s placeholders instead — confirm.
    safe_category = str(category).replace("'", "''")
    safe_entry_id = str(entry_id).replace("'", "''")
    stmt = f"""
        SELECT 1
        FROM papers.downloaded_papers
        WHERE category = '{safe_category}' AND entry_id = '{safe_entry_id}'
        LIMIT 1;
    """
    result = pg.execute_query(stmt)
    return bool(result)


# Ad-hoc check with placeholder values; as the last expression of the cell its result is presumably shown as the cell output.
paper_exists(pg, "cs_LG",'dsfd')