In [4]:
import csv
import time
from pathlib import Path
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup

# Years to request from the listing, newest first (1982 down to 1971 inclusive).
# The identifier's spelling is kept as-is because the main cell refers to it by
# this exact name.
avaliable_year = list(range(1982, 1970, -1))


def _parse_publication(container, page_url: str) -> dict[str, str]:
    """Extract one publication's fields from its listing container.

    Args:
        container: Parsed element for a single publication (one of the
            ``div.bg-fafafa`` nodes found on the listing page).
        page_url (str): URL of the page the container came from; relative
            links are resolved against it.

    Returns:
        dict[str, str]: Any of the keys ``id``, ``title``, ``abstract_url``,
        ``toc_pdf``, ``pdf_url``, ``metadata`` and ``image_url`` that could be
        found. Keys whose source element is missing are simply absent.

    """
    pub: dict[str, str] = {}

    # The first two "Headline3" divs carry the publication id and its title.
    headlines = container.find_all("div", class_="Headline3")
    if len(headlines) >= 2:
        pub["id"] = headlines[0].get_text(strip=True)
        pub["title"] = headlines[1].get_text(strip=True)

    for button in container.find_all("a"):
        href = button.get("href", "")
        if not href:
            # An anchor without a target has nothing worth recording.
            continue
        text = button.get_text(strip=True)
        # urljoin leaves absolute links untouched and resolves every kind of
        # relative link ("/x", "x/y", "//host/x") correctly, unlike plain
        # string concatenation with the site root.
        link = urljoin(page_url, href)
        if "Abstract" in text:
            pub["abstract_url"] = link
        elif "Table of Contents" in text and href.endswith(".pdf"):
            pub["toc_pdf"] = link
        elif "PDF" in text and href.endswith(".pdf"):
            pub["pdf_url"] = link

    info_div = container.find("div", class_="Body2")
    if info_div:
        pub["metadata"] = info_div.get_text(separator=" ", strip=True)

    img = container.find("img", class_="publicationsImg")
    if img:
        src = img.get("src", "")
        # Keep an empty string when the <img> has no src, as before.
        pub["image_url"] = urljoin(page_url, src) if src else ""

    return pub


def scrape_publications(
    year_list: list[int],
    delay_seconds: float = 5.0,
) -> list[dict[str, str]]:
    """Scrape publication details for each requested year.

    One GET request is issued per year against the listing URL with
    ``current_page=1`` and ``filterYear=<year>``; every ``div.bg-fafafa``
    element in the response is parsed into a publication dictionary.

    NOTE(review): only page 1 of each year is requested. If a year's listing
    spans several pages, the later pages are not fetched -- confirm against
    the site whether pagination needs to be followed.

    Args:
        year_list (list[int]): List of years to crawl
        delay_seconds (float): Pause inserted between consecutive requests.
            It is applied whether or not the previous request succeeded, and
            is not applied after the final year.

    Returns:
        list[dict[str, str]]: Publication dictionaries in crawl order. All
        link fields (``abstract_url``, ``toc_pdf``, ``pdf_url``,
        ``image_url``) are absolute URLs. A year whose request or parsing
        raises is reported on stdout and skipped; publications collected
        before the failure are kept.

    """
    publications: list[dict[str, str]] = []
    base_url = (
        "https://www.hkiaps.cuhk.edu.hk/src-publications/?current_page=1&filterYear="
    )

    for index, item in enumerate(year_list):
        # Pace requests *between* years so that a failed request does not
        # cause the next one to be fired immediately.
        if index and delay_seconds > 0:
            time.sleep(delay_seconds)

        url = f"{base_url}{item}"
        print(f"Scraping publications for year: {item}")
        try:
            response = httpx.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            containers = soup.find_all("div", class_="bg-fafafa")

            for container in containers:
                pub = _parse_publication(container, url)
                publications.append(pub)
                print(f"  - {pub.get('id', 'N/A')}: {pub.get('title', 'N/A')[:50]}...")

            print(f"Found {len(containers)} publications on year {item}.\n")

        except Exception as e:
            # Deliberately best-effort: one bad year must not abort the crawl.
            print(f"Error on year {item}: {e}")

    return publications


# Main execution: run the scrape and keep the result in the module-level
# `publications` list, which the later save/summary cell reads.
if __name__ == "__main__":
    print("Starting scraper...\n")
    publications = scrape_publications(avaliable_year)  # one request per year in avaliable_year

Starting scraper...

Scraping publications for year: 1982
  - OP98: Social Change, Bureaucratic Rule, and Emergent Pol...
Found 1 publications on year 1982.

Scraping publications for year: 1981
  - OP97: Local Administrative Reform in Hong Kong: Promises...
  - OP96: The Impact of Interviewer Language and Ethnicity o...
  - OP95: Aggressive Behavior in Chinese Society: The Proble...
Found 3 publications on year 1981.

Scraping publications for year: 1980
  - OP94: A Pilot Study on the Victims of Motor-cycle Accide...
  - OP93: The Government, Intermediate Organizations and Gra...
  - OP92: Evaluating the Effectiveness of the Activity Appro...
  - OP91: An Analysis of the Relationship between Language A...
  - OP90: Development, Colonial Rule, and Intergroup Politic...
  - OP89: Social Accommodation of Politics: The Case of the ...
  - OP88: Planned Development and Political Adaptability in ...
Found 7 publications on year 1980.

Scraping publications for year: 1979
  - OP87: A Techniq

In [None]:
def save_to_csv(
    publications: list[dict[str, str]],
    dir_name: str,
    filename: str = "hkiaps_publications.csv",
) -> None:
    """Write publication records to a UTF-8 CSV file with a fixed column set.

    Args:
        publications (list[dict[str, str]]): Publication dictionaries to write.
            Keys outside the fixed column set are ignored; columns a record
            lacks are left blank (the ``csv.DictWriter`` default).
        dir_name (str): Directory to write into. It, and any missing parents
            of the output file, are created if needed.
        filename (str): Output CSV filename, joined onto ``dir_name``.

    Returns:
        None: The file is written (overwriting any existing one) and a short
        confirmation is printed. When ``publications`` is empty, nothing is
        written and a notice is printed instead.

    """
    if publications:
        columns = [
            "id",
            "title",
            "metadata",
            "pdf_url",
            "toc_pdf",
            "abstract_url",
            "image_url",
        ]
        target = Path(dir_name) / filename
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns, extrasaction="ignore")
            writer.writeheader()
            for record in publications:
                writer.writerow(record)
        # The message reports the bare filename only, not the directory.
        print(f"\nSaved {len(publications)} publications to {filename}")
    else:
        print("No publications to save")


# Write the scraped records to src-publications/hkiaps_src_publications.csv and
# print the total count. NOTE: this relies on `publications` having been
# assigned by the scraping cell, so that cell must be run first in the same
# kernel.
save_to_csv(
    publications, dir_name="src-publications", filename="hkiaps_src_publications.csv"
)
print("\n=== Summary ===")
print(f"Total publications: {len(publications)}")


Saved 98 publications to hkiaps_src_publications.csv

=== Summary ===
Total publications: 98
