In [None]:
import asyncio
import os

from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright

async def fetch_page_content(url, headless=False):
    """Render ``url`` in Chromium, scroll through it, and return the final HTML.

    The page is opened, scrolled to the bottom twice in small increments (so
    that content loaded on scroll gets a chance to appear), scrolled back to
    the top, and then serialized.

    Args:
        url: Address of the page to load.
        headless: Passed to ``chromium.launch``; defaults to ``False`` so a
            visible browser window is opened.

    Returns:
        The page's HTML source as a string, or ``None`` if loading or
        scrolling failed (the error is printed, not raised).
    """
    # JavaScript executed inside the page. Every 300 ms it scrolls down by
    # 100 px. Whenever document.body.scrollHeight grows the stable counter is
    # reset; when it did not grow and the viewport bottom is within 50 px of
    # the page bottom the counter is incremented. The promise resolves once
    # the counter reaches 5.
    auto_scroll_script = """
                    () => {
                        return new Promise((resolve) => {
                            let lastHeight = 0;
                            let stableRounds = 0;       // how many times in a row we saw NO new content
                            const maxStableRounds = 5;  // stop after 5 consecutive stable rounds
                            const distance = 100;

                            const scrollInterval = setInterval(() => {
                                // Current scroll height
                                const scrollHeight = document.body.scrollHeight;
                                const currentPosition = window.scrollY + window.innerHeight;
                                
                                // Scroll by a bit
                                window.scrollBy(0, distance);
                                
                                // Check if we got more content
                                if (scrollHeight > lastHeight) {
                                    // Content grew, reset stableRounds
                                    lastHeight = scrollHeight;
                                    stableRounds = 0;
                                } else {
                                    // No growth in scrollHeight and we're at the bottom
                                    if (currentPosition >= scrollHeight - 50) {  // 50px threshold
                                        stableRounds++;
                                    }
                                }

                                // If we haven't seen an increase for X rounds and we're at the bottom, assume we're done
                                if (stableRounds >= maxStableRounds) {
                                    clearInterval(scrollInterval);
                                    resolve();
                                }
                            }, 300);
                        });
                    }
                """

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)

        try:
            # Created inside the try so the browser is closed even if this fails
            page = await browser.new_page()
            await page.goto(url, wait_until='domcontentloaded')

            # Two scroll passes, each followed by a short wait; the second
            # pass picks up items that loaded after the first one finished.
            for _ in range(2):
                await page.evaluate(auto_scroll_script)
                await asyncio.sleep(2)

            # Now that we've reached the bottom, scroll back to top
            await page.evaluate("window.scrollTo(0, 0)")

            # Wait for any final loading or animations
            await asyncio.sleep(3)

            # Give outstanding requests up to 5 s to settle. A page that never
            # goes idle has still been fully scrolled, so a timeout here must
            # not throw the content away.
            try:
                await page.wait_for_load_state('networkidle', timeout=5000)
            except PlaywrightTimeoutError:
                print("Network did not become idle within 5 s; returning the page as it is.")

            return await page.content()
        except Exception as e:
            print(f"An error occurred: {str(e)}")
            return None
        finally:
            await browser.close()


maxpage = 19  # hard-coded for quick implementation (1883 = 19 pages)

for i in range(maxpage):
    #url = f"https://finna.fi/Search/Results?sort=title,id+asc&limit=100&filter%5B%5D=~language:%22zho%22&filter%5B%5D=~format:%221/Book/Book/%22&filter%5B%5D=~building:%222/Helmet/h/h01l/%22&filter%5B%5D=~building:%222/Helmet/h/h00l/%22&filter%5B%5D=~building:%222/Helmet/h/h90l/%22&filter%5B%5D=~building:%222/Helmet/h/h94l/%22&filter%5B%5D=~building:%222/Helmet/h/h72l/%22&filter%5B%5D=~building:%222/Helmet/h/h80l/%22&filter%5B%5D=~building:%222/Helmet/h/h98l/%22&filter%5B%5D=~building:%222/Helmet/h/h82l/%22&filter%5B%5D=~building:%222/Helmet/h/h02l/%22&filter%5B%5D=~building:%222/Helmet/h/h18l/%22&filter%5B%5D=~building:%222/Helmet/h/h67l/%22&filter%5B%5D=~building:%222/Helmet/h/h73l/%22&filter%5B%5D=~building:%222/Helmet/h/h61l/%22&filter%5B%5D=~building:%222/Helmet/h/h67/%22&filter%5B%5D=~building:%222/Helmet/h/h03l/%22&filter%5B%5D=~building:%222/Helmet/h/h70l/%22&filter%5B%5D=~building:%222/Helmet/h/h74l/%22&filter%5B%5D=~building:%222/Helmet/h/h41l/%22&filter%5B%5D=~building:%222/Helmet/h/h42l/%22&filter%5B%5D=~building:%222/Helmet/h/h92l/%22&join=AND&bool0%5B%5D=AND&lookfor0%5B%5D=&type0%5B%5D=AllFields&page={i+1}"
    url = f"https://finna.fi/Search/Results?sort=title,id+asc&limit=100&filter%5B%5D=~language:%22zho%22&filter%5B%5D=~format:%221/Book/Book/%22&filter%5B%5D=~building:%222/Helmet/h/h01l/%22&filter%5B%5D=~building:%222/Helmet/h/h00l/%22&filter%5B%5D=~building:%222/Helmet/h/h90l/%22&filter%5B%5D=~building:%222/Helmet/h/h94l/%22&filter%5B%5D=~building:%222/Helmet/h/h72l/%22&filter%5B%5D=~building:%222/Helmet/h/h80l/%22&filter%5B%5D=~building:%222/Helmet/h/h98l/%22&filter%5B%5D=~building:%222/Helmet/h/h82l/%22&filter%5B%5D=~building:%222/Helmet/h/h02l/%22&filter%5B%5D=~building:%222/Helmet/h/h18l/%22&filter%5B%5D=~building:%222/Helmet/h/h67l/%22&filter%5B%5D=~building:%222/Helmet/h/h73l/%22&filter%5B%5D=~building:%222/Helmet/h/h61l/%22&filter%5B%5D=~building:%222/Helmet/h/h67/%22&filter%5B%5D=~building:%222/Helmet/h/h03l/%22&filter%5B%5D=~building:%222/Helmet/h/h70l/%22&filter%5B%5D=~building:%222/Helmet/h/h74l/%22&filter%5B%5D=~building:%222/Helmet/h/h41l/%22&filter%5B%5D=~building:%222/Helmet/h/h42l/%22&filter%5B%5D=~building:%222/Helmet/h/h92l/%22&filter%5B%5D=~building:%222/Helmet/e/e01l/%22&filter%5B%5D=~building:%222/Helmet/e/e23l/%22&filter%5B%5D=~building:%222/Helmet/e/e76l/%22&filter%5B%5D=~building:%222/Helmet/e/e10l/%22&filter%5B%5D=~building:%222/Helmet/e/e32l/%22&filter%5B%5D=~building:%222/Helmet/e/e25l/%22&filter%5B%5D=~building:%222/Helmet/e/e78l/%22&filter%5B%5D=~building:%222/Helmet/e/e73l/%22&filter%5B%5D=~building:%222/Helmet/v/v30l/%22&filter%5B%5D=~building:%222/Helmet/v/v60l/%22&filter%5B%5D=~building:%222/Helmet/v/v40l/%22&filter%5B%5D=~building:%222/Helmet/v/v20l/%22&filter%5B%5D=~building:%222/Helmet/v/v28l/%22&filter%5B%5D=~building:%222/Helmet/v/v62l/%22&filter%5B%5D=~building:%222/Helmet/v/v51l/%22&filter%5B%5D=~building:%222/Helmet/v/v45l/%22&filter%5B%5D=~building:%222/Helmet/v/v70l/%22&join=AND&bool0%5B%5D=AND&lookfor0%5B%5D=&type0%5B%5D=AllFields&page={i+1}"
    file_name = f"output/chinese-{i+1:02}.htm"

  
    # Run the async function
    html_text = await fetch_page_content(url)

    if html_text:
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(html_text)
        print(f"HTML content saved to {file_name}")
    else:
        print(f"Failed to fetch page number {i+1}")
    await asyncio.sleep(3)

HTML content saved to output/chinese-01.htm
HTML content saved to output/chinese-02.htm
HTML content saved to output/chinese-03.htm
HTML content saved to output/chinese-04.htm
HTML content saved to output/chinese-05.htm
HTML content saved to output/chinese-06.htm
HTML content saved to output/chinese-07.htm
HTML content saved to output/chinese-08.htm
HTML content saved to output/chinese-09.htm
HTML content saved to output/chinese-10.htm
HTML content saved to output/chinese-11.htm
HTML content saved to output/chinese-12.htm
HTML content saved to output/chinese-13.htm
HTML content saved to output/chinese-14.htm
HTML content saved to output/chinese-15.htm
HTML content saved to output/chinese-16.htm
HTML content saved to output/chinese-17.htm
HTML content saved to output/chinese-18.htm
HTML content saved to output/chinese-19.htm


In [2]:
import pandas as pd
from bs4 import BeautifulSoup

# Initialize a list to store book details
book_details = []
processed = 0
show_details = False  # set to True to print each parsed record while debugging


def parse_book_entry(book):
    """Extract the fields of interest from one ``div.result-body`` element.

    Args:
        book: BeautifulSoup element for a single search result.

    Returns:
        A dict with the keys ``id``, ``title``, ``author``, ``holdings``
        (list of strings) and ``availability`` (branch name -> bool).
        Missing id/title/author fall back to an "Unknown ..." placeholder.
    """
    # The title link carries both the record id (in its href) and the title text
    title_tag = book.find("a", class_="title")
    if title_tag:
        # The id is taken from the third "/"-separated href segment, minus any
        # query string; presumably hrefs look like "/Record/<id>?..." — TODO confirm
        href_parts = title_tag.get("href", "").split("/")
        book_id = href_parts[2].split("?")[0] if len(href_parts) > 2 else "Unknown ID"
        # The anchor text also contains the "Näytä tarkat tiedot" label; drop it
        title = title_tag.text.replace("Näytä tarkat tiedot", "").strip()
    else:
        book_id = "Unknown ID"
        title = "Unknown Title"

    # Extract the first author; the label span may be absent even when the
    # surrounding field exists, so guard both lookups.
    author_tag = book.find("div", class_="inline-linked-field")
    author_label = author_tag.find("span", class_="field-label") if author_tag else None
    first_author = author_label.get_text(strip=True) if author_label else "Unknown Author"

    # Check for "Saatavuustiedot" (availability information)
    holdings = []
    holdings_dropdown = book.find("select", class_="dedup-select")
    if holdings_dropdown:
        holdings = [option.text.strip() for option in holdings_dropdown.find_all("option")]

    # Fallback to "Helmet-kirjastot" if "Saatavuustiedot" is not found
    if not holdings:
        dedup_info = book.find("div", class_="dedupInformation")
        if dedup_info:
            holdings.append(dedup_info.text.strip())

    # Extract library availability: a branch counts as available when its
    # status icon carries the "fi-circle-filled" class.
    availability = {}
    for row in book.find_all("div", class_="groupLocation"):
        library_name_tag = row.find("span", class_="branch")
        status_tag = row.find("span", class_="status-icon")
        if library_name_tag and status_tag:
            availability[library_name_tag.text.strip()] = "fi-circle-filled" in status_tag["class"]

    return {
        "id": book_id,
        "title": title,
        "author": first_author,
        "holdings": holdings,
        "availability": availability,
    }


for page_number in range(1, maxpage + 1):

    # Load the HTML file saved by the download step
    with open(f"output/chinese-{page_number:02}.htm", "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Each search result lives in its own div.result-body
    for book in soup.find_all("div", class_="result-body"):
        details = parse_book_entry(book)
        book_details.append(details)
        processed += 1

        # Optional debug dump of the record that was just parsed
        if show_details:
            print(f"Book {processed}:")
            print(f"  ID: {details['id']}")
            print(f"  Title: {details['title']}")
            print(f"  Author: {details['author']}")
            print(f"  Holdings: {', '.join(details['holdings']) if details['holdings'] else 'No'}")
            print(f"  Availability:")
            for library, status in details['availability'].items():
                print(f"    - {library}: {status}")
            print()

    print(f"Processed {processed} books...")

print("Done!")

Processed 100 books...
Processed 200 books...
Processed 300 books...
Processed 400 books...
Processed 500 books...
Processed 600 books...
Processed 700 books...
Processed 800 books...
Processed 900 books...
Processed 1000 books...
Processed 1100 books...
Processed 1200 books...
Processed 1300 books...
Processed 1400 books...
Processed 1500 books...
Processed 1600 books...
Processed 1700 books...
Processed 1800 books...
Processed 1820 books...
Done!


In [3]:
# Turn the list of parsed records into a DataFrame for further processing
df = pd.DataFrame(data=book_details)
df

Unnamed: 0,id,title,author,holdings,availability
0,helmet.2154902,365 ye gushi,"Wang, Dan",[Helmet-kirjastot],{'Myyrmäki las': True}
1,helmet.2301669,365 ye tonghua,"Lü, Luo",[Helmet-kirjastot],{'Hakunila las': True}
2,helmet.2455859,A'erfa R xing de lanse haiyang,"Lu, Yang","[Helmet-kirjastot, OUTI-kirjastot, PIKI-kirjas...","{'Oodi las mon': True, 'Pasila las mon': False}"
3,helmet.1941650,"A’meng, guilai ba","Bei, Beilong","[Helle-kirjastot, Helmet-kirjastot, Vaski-kirj...",{'Pasila las mon': True}
4,helmet.2462692,A pei li he hei hei bao,"Saarnio, Antti",[Helmet-kirjastot],{'Oodi las': False}
...,...,...,...,...,...
1815,helmet.2041871,Zuixin shiyong miyu da quan,"Liu, Zhi",[Helmet-kirjastot],"{'Iso Omena las': True, 'Pasila las mon': True}"
1816,helmet.2004786,"Zuo moxing, xue ke xue : Feiji he feixing gongju","Hawkes, Nigel",[Helmet-kirjastot],{'Iso Omena las': True}
1817,helmet.2448919,Zǐ zǐ cūn tónghuà,"Xiao, Shan","[Helmet-kirjastot, OUTI-kirjastot]","{'Oodi las mon': True, 'Pasila las mon': False}"
1818,helmet.2552174,Āi luō ěr de huāyuán = Errol's garden,"Hibbs, Gillian",[Helmet-kirjastot],{'Pasila las mon': True}


In [4]:
def available_at(availability, place):
    """Return True if any branch whose name contains ``place`` is marked available.

    Args:
        availability: Mapping of branch name -> availability flag.
        place: Substring to look for in the branch names.
    """
    return any(place in branch and is_available for branch, is_available in availability.items())


# Add 'Vaski' column: True when 'Vaski-kirjastot' is among the record's holdings
df['Vaski'] = df['holdings'].apply(lambda holdings: 'Vaski-kirjastot' in holdings)

# Add Important Places columns, one boolean column per place
for place in ('Oodi', 'Pasila'):
    df[place] = df['availability'].apply(available_at, place=place)

In [5]:
# Write the whole table, without the row index, to an Excel workbook
df.to_excel(
    "output/chinese_details.xlsx",
    engine='openpyxl',
    index=False,
)

In [6]:
# Write just the title column to CSV (the row index is written too, as no index option is passed)
df.loc[:, 'title'].to_csv("output/chinese_title.csv")