In [1]:
from io import BytesIO
from pathlib import Path
from time import sleep

from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoAlertPresentException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
import pytesseract, csv

In [2]:
# --- crawler configuration ---
allLabLinks = list()        # detail-page URLs get appended here while paging
url = 'https://ma.mohw.gov.tw/masearch/'
targetType = "牙體技術所"     # visible text of the drop-down option to select
linkTxt = "詳細資料"          # link text of the per-row detail link

# Element IDs on the search / result page (looked up with By.ID).
_idPrefix = 'ctl00_ContentPlaceHolder1_'
captchaImgID = _idPrefix + 'ImageCheck'
typeID = _idPrefix + 'ddlBAS_KIND'
captchaBoxID = _idPrefix + 'TextBox1'
submitID = _idPrefix + 'btnSearch'
pageID = _idPrefix + 'NetPager1_lblCurrentIndex'
totalID = _idPrefix + 'NetPager1_lblPageCount'
nextButtonID = _idPrefix + 'NetPager1_lnkbtnNext'

In [3]:
# Start a headless Chrome session; ChromeDriverManager().install() supplies
# the driver path handed to the Chrome service.
options = Options()
for flag in ('--headless', '--disable-gpu'):
    options.add_argument(flag)

driver = webdriver.Chrome(
    options=options,
    service=ChromeService(ChromeDriverManager().install()),
)


In [None]:
# Load the search page, then choose the organisation type in the drop-down
# by its visible label.
driver.get(url)

typeDropdown = driver.find_element(By.ID, typeID)
orgType = Select(typeDropdown)
orgType.select_by_visible_text(targetType)

In [6]:
def tryCaptcha(retryDelay=2, maxAttempts=None):
    '''Solve the search-form captcha with OCR and submit, retrying until accepted.

    Each attempt screenshots the captcha image, runs it through Tesseract,
    types the recognised text into the captcha box and clicks the search
    button.  An alert being present right after the click is taken to mean
    the attempt was rejected: the alert is accepted and, after a pause,
    another attempt is made.  The function returns as soon as no alert is
    present after submitting.

    Args:
        retryDelay: seconds to sleep after dismissing a rejection alert.
        maxAttempts: give up after this many attempts; None (the default)
            keeps retrying indefinitely.

    Raises:
        RuntimeError: if maxAttempts is set and every attempt was rejected.
        selenium.common.exceptions.WebDriverException: any browser failure
            other than "no alert present" now propagates instead of being
            silently swallowed.
    '''
    attempt = 0
    # Iterative rather than recursive so a long run of failed OCR attempts
    # cannot exhaust the recursion limit.
    while maxAttempts is None or attempt < maxAttempts:
        attempt += 1
        captchaPngData = driver.find_element(By.ID, captchaImgID).screenshot_as_png
        imgObj = Image.open(BytesIO(captchaPngData))
        captchaTxt = pytesseract.image_to_string(imgObj).strip()

        captchaBox = driver.find_element(By.ID, captchaBoxID)
        # send_keys appends, so drop whatever a previous attempt left behind.
        captchaBox.clear()
        captchaBox.send_keys(captchaTxt)
        driver.find_element(By.ID, submitID).click()

        try:
            driver.switch_to.alert.accept()
        except NoAlertPresentException:
            # No alert after submitting: treat the captcha as accepted.
            return
        sleep(retryDelay)

    raise RuntimeError(f'captcha still rejected after {maxAttempts} attempts')

In [5]:
# Run the OCR captcha solver and submit the search form
# (tryCaptcha handles its own retries).
tryCaptcha()

In [7]:
# Total number of result pages: the pager label text with its "共 " prefix
# and " 頁" suffix stripped, parsed as an integer.
totalPage = int(
    driver.find_element(By.ID, totalID)
    .text
    .removeprefix("共 ")
    .removesuffix(" 頁")
)

In [None]:
# Walk the result pager and collect the detail-page URL of every listed lab.
maxStalls = 10      # extra 1-second waits allowed for the pager to advance
lastPage = None     # last page number whose links were collected
stalls = 0

print('processed:')
while True:
    try:
        page = int(
            driver.find_element(By.ID, pageID).text.removeprefix("第 ").removesuffix(" 頁")
        )
        if page == lastPage:
            # The "next" click has not taken effect yet; wait instead of
            # collecting the same page's links a second time.
            stalls += 1
            if stalls > maxStalls:
                print(f'\npager stuck on page {page}; stopping')
                break
            sleep(1)
            continue
        links = driver.find_elements(By.LINK_TEXT, linkTxt)
        hrefs = [l.get_attribute("href") for l in links]
    except (WebDriverException, ValueError) as exc:
        # Best effort (e.g. network problem): keep what was collected so far,
        # but report why the crawl stopped.
        print(f'\nstopped early: {exc!r}')
        break
    stalls = 0
    lastPage = page
    # get_attribute returns None for a missing attribute; skip those so the
    # later join / driver.get calls only ever see real URLs.
    allLabLinks += [h for h in hrefs if h]
    print(page, end=', ')
    if page >= totalPage:
        print('done!')
        break
    driver.find_element(By.ID, nextButtonID).click()
    sleep(1)

# The driver is intentionally left open: the detail crawl below reuses it.

In [None]:
# Save the collected detail-page URLs to labs.txt, one URL per line.
labsFile = Path("labs.txt")
labsFile.write_text('\n'.join(allLabLinks))

## crawl lab details

In [15]:
# Element IDs (used with By.ID) of the labels read from each detail page,
# in the same order as the CSV columns.
_labelPrefix = "ctl00_ContentPlaceHolder1_lblBAS_"
selectors = [
    _labelPrefix + field
    for field in ("ID", "NAME", "AREA", "STATUS", "ADDRESS", "TEL")
]

In [None]:
# Visit every collected detail page and write one CSV row per lab.
try:
    # newline='' lets the csv module control line endings itself; utf-8 so
    # non-ASCII field text is written the same way regardless of locale.
    with open('lab_details.csv', 'w', newline='', encoding='utf-8') as f:
        w = csv.writer(f, dialect='unix')
        w.writerow('id name area status address tel'.split())
        for link in allLabLinks:
            try:
                driver.get(link)
                row = [driver.find_element(By.ID, s).text for s in selectors]
            except WebDriverException as exc:
                # Best effort (e.g. network problem): keep the rows written
                # so far, but report where and why the crawl stopped.
                print(f'stopped at {link}: {exc!r}')
                break
            print(row[0])   # id
            w.writerow(row)
finally:
    # quit() ends the whole WebDriver session, not just the current window,
    # and runs even if the crawl above raised.
    driver.quit()