In [None]:
import aiohttp
from typing import Dict, Any
import asyncio
from tqdm.auto import tqdm
import random
import json

# User-Agent header value sent with every request issued by get_papers_in_page.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/26.0.1 Safari/605.1.15"

# Maps a volume label to the ID that is sent as the `scope` query parameter.
# IDs were collected from https://diglib.eg.org/communities/80fa7e37-2d2d-4ebf-af88-3979808dc118
# NOTE(review): labels look like "<volume number>(<year>)" — confirm against the source page.
volumes_to_id = {
    "34(2015)": "aefebe7c-3a9c-40ce-832b-34b752a270aa",
    "35(2016)": "a3d41902-05d0-4c06-bd25-ac0d0deaeba0",
    "36(2017)": "881bb0c6-cec0-48fd-a237-03a99d410e44",
    "37(2018)": "43de9d14-da8a-41ed-9b96-08a2923eeb8e",
    "38(2019)": "d37e8b96-edab-4c53-beef-a70089e3171b",
    "39(2020)": "b682cad8-1600-498b-9d54-c37192b8165d",
    "40(2021)": "420f6880-6b1f-407e-a334-a4322df51631",
    "41(2022)": "3d8889d0-b367-437b-b97f-57215b92b85d",
    "42(2023)": "bb4843ec-17fe-4dd8-b391-e57b0f34181e",
    "43(2024)": "1429bb38-796b-4d48-b8de-268d21c973a1",
    "44(2025)": "da03e00f-81f6-449d-83cb-b687c9744c8d",
}
# Endpoint queried by get_papers_in_page (browse-by-title item listing).
url = "https://diglib.eg.org/server/api/discover/browses/title/items"

In [None]:
async def get_papers_in_page(session: aiohttp.ClientSession, volume_id: str, page: int):
    """Fetch one page of the title-sorted item listing for a volume.

    Args:
        session: Open aiohttp session used to issue the GET request.
        volume_id: ID sent as the ``scope`` query parameter.
        page: Page index sent as the ``page`` query parameter (the caller in
            this notebook starts at 0).

    Returns:
        The decoded JSON body of the response.

    Raises:
        ValueError: If the response does not declare a JSON content type
            (including when the Content-Type header is missing); the message
            carries the response body.
        aiohttp.ClientResponseError: If the server answers with an HTTP error
            status (>= 400) and a JSON body.
    """
    query = {
        "scope": volume_id,
        "sort": "dc.title,ASC",
        "page": page,
        "size": 20,
    }
    async with session.get(
        url, headers={"User-Agent": USER_AGENT}, params=query
    ) as response:
        # .get() so a missing header is reported through the ValueError below
        # (with the body attached) instead of surfacing as a bare KeyError.
        response_type = response.headers.get("Content-Type", "")
        if "json" not in response_type:
            raise ValueError(
                f"Unexpected response type: {response_type}, content: {await response.text()}"
            )
        # A JSON error payload (4xx/5xx) must not be mistaken for a result page.
        response.raise_for_status()
        return await response.json()

In [None]:
def _first_metadata_value(metadata: Dict[str, Any], key: str, default: Any = None) -> Any:
    """Return the ``value`` of the first entry under ``key``, or ``default`` if the key is absent or empty."""
    entries = metadata.get(key)
    if not entries:
        return default
    return entries[0]["value"]


def parse_paper_data(response: Dict[str, Any]):
    """Extract a flat record for every item embedded in a listing response.

    Args:
        response: Decoded JSON page; items are read from
            ``response["_embedded"]["items"]``. A response without
            ``_embedded`` or without ``items`` yields an empty list.

    Returns:
        A list of dicts with the keys ``id``, ``doi``, ``volume``,
        ``issue_number``, ``title`` and ``abstract``. ``doi``, ``volume`` and
        ``issue_number`` are ``None`` when the metadata field is missing;
        ``abstract`` falls back to ``""``.

    Raises:
        KeyError: If an item lacks ``id``, ``metadata`` or ``dc.title``.
    """
    paper_items = response.get("_embedded", {}).get("items", [])
    papers = []
    for paper_item in paper_items:
        paper_metadata = paper_item["metadata"]
        papers.append(
            {
                "id": paper_item["id"],
                # DOI is treated as optional like the other descriptive fields,
                # so a single record without one does not abort the whole scrape.
                "doi": _first_metadata_value(paper_metadata, "dc.identifier.doi"),
                "volume": _first_metadata_value(paper_metadata, "dc.description.volume"),
                "issue_number": _first_metadata_value(
                    paper_metadata, "dc.description.number"
                ),
                # Title stays mandatory: a record without one is considered malformed.
                "title": paper_metadata["dc.title"][0]["value"],
                "abstract": _first_metadata_value(
                    paper_metadata, "dc.description.abstract", ""
                ),
            }
        )

    return papers


async def get_papers_in_volume(
    session: aiohttp.ClientSession, volume_id: str, sleep: float = 0.5
):
    """Collect the parsed papers from every listing page of one volume.

    Page 0 is fetched first to learn the page and item totals; the remaining
    pages are then requested one after another, with a random pause drawn
    uniformly from ``[sleep - 0.1, sleep + 0.1]`` seconds after each of them.

    Args:
        session: Open aiohttp session forwarded to ``get_papers_in_page``.
        volume_id: ID of the volume whose pages are walked.
        sleep: Centre of the random pause window, in seconds.

    Returns:
        The list of paper records produced by ``parse_paper_data`` for all pages.

    Raises:
        AssertionError: If the number of collected records differs from the
            ``totalElements`` value reported by the first page.
    """
    first_page = await get_papers_in_page(session, volume_id, 0)
    paging = first_page["page"]
    page_count, expected_count = paging["totalPages"], paging["totalElements"]
    collected = parse_paper_data(first_page)

    if page_count > 1:
        for page_index in tqdm(range(1, page_count), leave=False):
            next_page = await get_papers_in_page(session, volume_id, page_index)
            collected.extend(parse_paper_data(next_page))
            # Jittered delay between consecutive page requests.
            await asyncio.sleep(random.uniform(sleep - 0.1, sleep + 0.1))

    # Sanity check against the total the server reported on the first page.
    assert len(collected) == expected_count, (
        f"total_item_num: {expected_count}, len(all_papers): {len(collected)}, volume_id: {volume_id}"
    )
    return collected


In [None]:
# Crawl every volume in volumes_to_id, one after another, over a single shared
# HTTP session, and pool the parsed records into all_papers.
# NOTE: the top-level `async with` / `await` needs a kernel that supports
# top-level await; it is not valid in a plain Python script.
all_papers = []
async with aiohttp.ClientSession() as session:
    for volume_id in tqdm(volumes_to_id.values()):
        papers = await get_papers_in_volume(session, volume_id)
        all_papers.extend(papers)
        # Random 0.1–1.1 s pause between volumes (presumably to go easy on the server).
        sleep_time = random.uniform(0.1, 1.1)
        await asyncio.sleep(sleep_time)

In [None]:
# Wrap the pooled records under a single top-level key for serialization.
paper_data = {"eurovis_2015_2025": all_papers}

In [None]:
# Write UTF-8 explicitly: ensure_ascii=False emits non-ASCII characters as-is,
# and the platform's default text encoding may be unable to represent them.
with open("eurovis_2015_2025.json", "w", encoding="utf-8") as f:
    json.dump(paper_data, f, indent=4, ensure_ascii=False)