In [None]:
import aiohttp
import pandas as pd
from typing import List, Dict, Any
import asyncio
from tqdm.auto import tqdm
import random
import json

# IEEE Xplore publication number of the journal being scraped; it is
# interpolated into the REST URLs and Referer headers built by the request
# helpers in this notebook.
# NOTE(review): "TVCG" presumably stands for IEEE Transactions on Visualization
# and Computer Graphics — confirm.
TVCG_ID = 2945
# Browser-like User-Agent string sent with every request made by this notebook.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/26.0.1 Safari/605.1.15"

In [None]:
async def get_issues(session: aiohttp.ClientSession):
    """Fetch the regular-issue listing of the publication ``TVCG_ID``.

    Sends a GET request to the IEEE Xplore ``regular-issues`` REST endpoint,
    using the module-level ``USER_AGENT`` and a ``Referer`` header that points
    at the publication's issues page.

    Args:
        session: Open aiohttp client session used to issue the request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the response has no ``Content-Type`` header, or one that
            does not start with ``application/json``. The message contains
            the reported content type and the response text.
    """
    get_issue_url = (
        f"https://ieeexplore.ieee.org/rest/publication/{TVCG_ID}/regular-issues"
    )
    referer = f"https://ieeexplore.ieee.org/xpl/issues?punumber={TVCG_ID}"
    async with session.get(
        get_issue_url, headers={"User-Agent": USER_AGENT, "Referer": referer}
    ) as response:
        # The header can be absent, in which case .get() returns None. Treat
        # that like any other non-JSON answer instead of crashing with an
        # AttributeError on None.startswith().
        response_type = response.headers.get("Content-Type")
        if response_type is not None and response_type.startswith("application/json"):
            return await response.json()
        raise Exception(f"Unexpected content type: {response_type}, content: {await response.text()}")

def parse_issue_result(result: Dict[str, Any]):
    """Flatten the nested issue listing into one record per issue.

    The payload is walked as ``result["issuelist"]`` -> ``"years"`` ->
    ``"issues"``. Only per-issue fields are read; the grouping containers are
    used purely for traversal, so they need no keys besides ``"years"`` and
    ``"issues"``.

    Args:
        result: Decoded JSON payload holding an ``"issuelist"`` entry.

    Returns:
        A list of dicts with the keys ``issue_num``, ``volume``, ``issue_id``,
        ``year`` and ``month``, holding the raw (unconverted) values of the
        issue's ``issue``, ``volume``, ``issueNumber``, ``year`` and ``month``
        fields, in payload order.

    Raises:
        KeyError: If ``"issuelist"``, ``"years"``, ``"issues"`` or one of the
            per-issue fields is missing.
    """
    return [
        {
            "issue_num": issue["issue"],
            "volume": issue["volume"],
            "issue_id": issue["issueNumber"],
            # The year comes from the issue itself, not from the enclosing
            # year group.
            "year": issue["year"],
            "month": issue["month"],
        }
        for decade_list in result["issuelist"]
        for year_list in decade_list["years"]
        for issue in year_list["issues"]
    ]

def parse_month(month_str):
    """Convert a month name or abbreviation to its month number (1-12).

    Matching ignores surrounding whitespace, letter case and a trailing
    period, so ``"Jan."``, ``"jan"``, ``"JANUARY"`` and ``"May."`` are all
    recognised. Both ``"Sep."`` and ``"Sept."`` map to 9.

    Args:
        month_str: Month label to convert. Non-string values (for example
            ``None`` or NaN coming from a DataFrame column) are treated as
            unknown.

    Returns:
        The month number, or ``None`` when the label is not recognised.
    """
    # Missing values reach this function through Series.apply; they have no
    # .strip(), so report them as "unknown" rather than raising.
    if not isinstance(month_str, str):
        return None

    # Keys are lower-case and carry no trailing period; see normalisation below.
    month_map = {
        "jan": 1,
        "january": 1,
        "feb": 2,
        "february": 2,
        "mar": 3,
        "march": 3,
        "apr": 4,
        "april": 4,
        "may": 5,
        "jun": 6,
        "june": 6,
        "jul": 7,
        "july": 7,
        "aug": 8,
        "august": 8,
        "sep": 9,
        "sept": 9,
        "september": 9,
        "oct": 10,
        "october": 10,
        "nov": 11,
        "november": 11,
        "dec": 12,
        "december": 12,
    }
    normalized = month_str.strip().rstrip(".").strip().lower()
    return month_map.get(normalized)


def to_df(all_issues: List[Dict[str, Any]]):
    """Build a typed DataFrame from the parsed issue records.

    ``issue_num``, ``volume``, ``issue_id`` and ``year`` are converted with
    ``pd.to_numeric(errors="coerce")``, ``month`` is converted with
    ``parse_month``, and all five columns are cast to the nullable ``Int64``
    dtype. For every column that ends up containing non-numeric entries
    (including values that could not be converted), the original offending
    values are printed.

    Args:
        all_issues: Records with the keys ``issue_num``, ``volume``,
            ``issue_id``, ``year`` and ``month``.

    Returns:
        The converted DataFrame. An empty input yields an empty frame that
        still has the five ``Int64`` columns.
    """
    int_columns = ["issue_num", "volume", "issue_id", "year", "month"]

    # pd.DataFrame([]) has no columns at all, so the column lookups below
    # would fail; hand back an empty, correctly typed frame instead.
    if len(all_issues) == 0:
        return pd.DataFrame({col: pd.Series(dtype="Int64") for col in int_columns})

    # Keep the untouched values so the report below can show what failed.
    raw = pd.DataFrame(all_issues)
    df = raw.copy()

    for col in ("issue_num", "volume", "issue_id", "year"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df["month"] = df["month"].apply(parse_month)

    df = df.astype({col: "Int64" for col in int_columns})

    for col in df.columns:
        # pd.NA is not a number, so failed conversions are flagged here too.
        not_number = ~df[col].apply(pd.api.types.is_number).astype(bool)
        if not_number.any():
            print(f"Non-numeric values found in column '{col}':")
            # Show the source values; after coercion they are all <NA>.
            print(raw.loc[not_number, col].unique())

    return df

In [None]:
async with aiohttp.ClientSession() as session:
    result = await get_issues(session)
    all_issues = parse_issue_result(result)

In [None]:
def parse_article_data(result: Dict[str, Any]):
    """Extract the fields of interest from one table-of-contents page.

    Args:
        result: Decoded JSON payload holding a ``"records"`` list.

    Returns:
        One dict per record with the keys ``title``, ``abstract``,
        ``publication_date``, ``issue`` and ``volume``, in payload order.

    Raises:
        KeyError: If ``"records"`` is missing, or a record lacks
            ``articleTitle``, ``publicationDate``, ``issue`` or ``volume``.
    """
    return [
        {
            "title": entry["articleTitle"],
            # A missing abstract is tolerated and stored as None.
            "abstract": entry.get("abstract"),
            "publication_date": entry["publicationDate"],
            "issue": entry["issue"],
            "volume": entry["volume"],
        }
        for entry in result["records"]
    ]


async def get_issue_data(session: aiohttp.ClientSession, issue_id: int, sleep: float = 0.2):
    """Collect the parsed article records of every page of one issue.

    The first page is fetched to learn ``totalPages`` and ``totalRecords``;
    any remaining pages are then fetched one after another, each followed by
    a random pause.

    Args:
        session: Open aiohttp client session used for all requests.
        issue_id: Identifier of the issue, passed through to ``get_issue_page``.
        sleep: Upper bound, in seconds, of the random pause after each
            additional page (the lower bound is 0.001).

    Returns:
        The list of article dicts produced by ``parse_article_data`` for all
        pages, in page order.

    Raises:
        AssertionError: If the number of collected records differs from the
            ``totalRecords`` value reported on the first page.
    """
    first_page = await get_issue_page(session, issue_id, 1)
    page_count = first_page["totalPages"]
    expected_count = first_page["totalRecords"]
    collected = parse_article_data(first_page)

    if page_count > 1:
        for page_number in tqdm(range(2, page_count + 1), leave=False):
            page_result = await get_issue_page(session, issue_id, page_number)
            collected.extend(parse_article_data(page_result))
            # Randomised pause before moving on to the next page.
            await asyncio.sleep(random.uniform(0.001, sleep))

    assert len(collected) == expected_count, f"Total records mismatch: {len(collected)} != {expected_count}"
    return collected


async def get_issue_page(session: aiohttp.ClientSession, issue_id: int, page: int):
    """Fetch one table-of-contents page of an issue.

    Sends a POST request to the IEEE Xplore ``toc`` REST endpoint for the
    publication ``TVCG_ID``. The JSON payload carries ``isnumber``,
    ``punumber`` and ``sortType``; for pages after the first it also carries
    ``pageNumber``, and the ``Referer`` header gets the matching
    ``pageNumber`` query parameter.

    Args:
        session: Open aiohttp client session used to issue the request.
        issue_id: Identifier of the issue, used in the URL and as ``isnumber``.
        page: 1-based page number.

    Returns:
        The decoded JSON body of the response.

    Raises:
        AssertionError: If ``page`` is not greater than 0.
        Exception: If the response has no ``Content-Type`` header, or one that
            does not start with ``application/json``. The message contains
            the reported content type and the response text.
    """
    assert page > 0, "Page number must be greater than 0"
    url = f"https://ieeexplore.ieee.org/rest/search/pub/{TVCG_ID}/issue/{issue_id}/toc"
    json_data = {
        "isnumber": issue_id,
        "punumber": TVCG_ID,
        "sortType": "vol-only-seq",
    }
    if page == 1:
        referer = f"https://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber={issue_id}&punumber={TVCG_ID}&sortType=vol-only-seq"
    else:
        referer = f"https://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber={issue_id}&punumber={TVCG_ID}&sortType=vol-only-seq&pageNumber={page}"
        json_data["pageNumber"] = page

    async with session.post(
        url, headers={"User-Agent": USER_AGENT, "Referer": referer}, json=json_data
    ) as response:
        # The header can be absent, in which case .get() returns None. Treat
        # that like any other non-JSON answer instead of crashing with an
        # AttributeError on None.startswith().
        response_type = response.headers.get("Content-Type")
        if response_type is not None and response_type.startswith("application/json"):
            return await response.json()
        raise Exception(f"Unexpected content type: {response_type}, content: {await response.text()}")


In [None]:
# Keep only the issues whose year, cast with int(), is 2015 or later.
issues_2015 = list(filter(lambda issue: int(issue["year"]) >= 2015, all_issues))

In [None]:
async with aiohttp.ClientSession() as session:
    all_papers = []
    for issue in tqdm(issues_2015):
        papers = await get_issue_data(session, issue["issue_id"])
        all_papers.extend(papers)
        sleep_time = random.uniform(0.001, 2.0)
        await asyncio.sleep(sleep_time)

In [None]:
# Wrap the collected records under a single top-level key for serialisation.
papers_2015_2025 = {"papers-2015-2025": all_papers}

In [None]:
# Write the wrapped records to papers_2015_2025.json in the current working
# directory, pretty-printed with a 4-space indent. Opening in "w" mode replaces
# any existing file of that name.
with open("papers_2015_2025.json", "w") as f:
    json.dump(papers_2015_2025, f, indent=4)