In [1]:
import os
import sys
import pandas as pd
import requests
from dotenv import load_dotenv

from search_para import search_para
from search_parser import extract_doc_urls
from doc_parser import extract_profile_info

load_dotenv()

True

### Preparation

In [2]:
# Bypass token verification: the verification cookie is sent with every
# request, its value being read from the environment variable of the same name.
TOKEN_COOKIE_NAME = '__RequestVerificationToken_L1B1YmxpYw2'
cookies = requests.cookies.RequestsCookieJar()
cookies.set(TOKEN_COOKIE_NAME, os.environ.get(TOKEN_COOKIE_NAME))

# Request headers shared by the search POST and the profile GETs
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Content-Type': 'application/x-www-form-urlencoded',
    'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'Upgrade-Insecure-Requests': '1',
}

# Create directories for file exchange (no error if they already exist)
for exchange_dir in ("temp", "input_output"):
    os.makedirs(exchange_dir, exist_ok=True)

### Get matching search results

In [3]:
def search_doc(doc_name):
    """Search the directory for a doctor by name and return the profile URLs.

    Posts the advanced-search form with ``doc_name`` in its ``Name`` field,
    writes the response body to ``temp/search_response.html`` and then calls
    ``extract_doc_urls()`` to parse the result.

    Args:
        doc_name: Name to search for.

    Returns:
        The value returned by ``extract_doc_urls()`` (the caller treats it as
        a list of profile URLs), or an empty list when the search request did
        not answer with HTTP 200.

    Raises:
        requests.RequestException: If the request cannot be completed,
            including when the server does not answer within the timeout.
    """
    print("Doc to search:", doc_name)

    # Work on a copy of the imported search parameters so the shared
    # search_para template is not modified by each search.
    request_para = dict(search_para)
    request_para['Name'] = doc_name
    request_para['__RequestVerificationToken'] = os.environ.get(
        "__RequestVerificationToken")

    # Send the POST request with the shared headers and cookies. The timeout
    # keeps an unresponsive server from blocking the notebook indefinitely.
    search_url = 'https://apps.pcdirectory.gov.hk/Public/TC/AdvancedSearch'
    search_response = requests.post(search_url, headers=headers,
                                    cookies=cookies, data=request_para,
                                    timeout=30)

    if search_response.status_code != 200:
        # Stop here: nothing was written for this search, so the file in temp/
        # still holds the previous search and must not be parsed for this name.
        # NOTE(review): assumes extract_doc_urls() reads
        # temp/search_response.html - confirm in search_parser.
        print("Failed to retrieve search results.")
        return []

    # Save the search result content as an HTML file
    with open('temp/search_response.html', 'wb') as f:
        f.write(search_response.content)
    print("Search results saved as HTML file.")

    # Extract doc profile urls
    print("\nParsing search results...")
    doc_url_lst = extract_doc_urls()

    return doc_url_lst

### Extract doc profile details

In [4]:
# ======================
# Extract Doc Profile
# ======================

def extract_doc_detail(doc_url_lst):
    """Download every profile page and collect the fields parsed from it.

    URLs are fetched in order and numbered from 1. A page answered with
    HTTP 200 is written to ``temp/doc_profile_<n>.html`` and its number is
    passed to ``extract_profile_info``; any other status is reported and the
    URL is skipped.

    Args:
        doc_url_lst: Iterable of profile page URLs.

    Returns:
        List holding one ``extract_profile_info`` result per successfully
        downloaded profile, in input order.
    """
    separator = "\n" + "-" * 30 + "\n"
    collected_profiles = []

    print("\nExtracting doc info from profile...")
    for position, profile_url in enumerate(doc_url_lst, start=1):
        page = requests.get(profile_url, headers=headers, cookies=cookies)

        print(separator)
        print(f"Processing doc profile {position}")

        # Anything but HTTP 200 is reported and skipped
        if page.status_code != 200:
            print(f"Failed to access doc profile {profile_url}")
            continue

        # Save doc profile as HTML under its running number
        with open(f'temp/doc_profile_{position}.html', 'wb') as html_file:
            html_file.write(page.content)
        print(f"Doc profile ({position}) saved as HTML file.")

        profile_fields = extract_profile_info(position)
        collected_profiles.append(profile_fields)

        # Info summary check (for debug)
        print("Required fields in doc profile extracted:\n")
        for field_name, field_value in profile_fields.items():
            print(field_name, ':', field_value)

    return collected_profiles

## Driver Code

In [5]:
# Workbook with the names to look up, expected inside input_output/
DOC_NAMES_EXCEL = "doc_names.xlsx"
input_file_path = "input_output/" + DOC_NAMES_EXCEL

# Stop right away when the workbook has not been provided
input_file_found = os.path.exists(input_file_path)
if not input_file_found:
    sys.exit(f"Error: Input file '{input_file_path}' does not exist!")

input_df = pd.read_excel(input_file_path)
input_df

Unnamed: 0,1,Local Data Provider ID ▼,Name,First Name,Last Nam,Individual Specialty,License Number
0,114,PHK 60000005833170,CHOR LUP CHU,CHOR LUP,CHU,Cardiology,M03269
1,115,PHK 60000005963540,LAP WING RONNIE WONG,LAP WING RONNIE,WONG,General Practitioner,M05995
2,116,PHK 60000016859821,WAI LING WONG,WAI LING,WONG,General Practitioner,M10074
3,117,PHK 60000004093445,SEE YUI Lam,SEE YUI,Lam,General Practitioner,M07365
4,118,PHK 9990000010061258,CHUN HON CHAN,CHUN HON,CHAN,General Practitioner,M02612
5,119,PHK 60000004093395,YAU SHING LAI,YAU SHING,LAI,General Practitioner,M07124
6,120,PHK 60000005834031,HOI CHUEN MICHAEL LEUNG,HOI CHUEN MICHAEL,LEUNG,General Practitioner,M06352
7,121,PHK 60000005833403,SAI FAI VICTOR HO,SAI FAI VICTOR,HO,Paediatrics,M02517
8,122,PHK 60000005963122,ON SANG TUET,ON SANG,TUET,General Practitioner,M11797
9,123,PHK 60000004096675,MING HO EDMOND WONG,MING HO EDMOND,WONG,Urology,M14674


In [6]:
# Names to search, taken from the 'Name' column of the input sheet.
# Blank cells are read by pandas as NaN; they are dropped so that no search is
# issued for a missing name.
doc_names = input_df['Name'].dropna().tolist()

In [7]:
# Profiles collected over all searched names
result_profiles = []

for doc_name in doc_names:
    print("\n" + "=" * 30 + "\n")
    doc_url_lst = search_doc(doc_name)

    if len(doc_url_lst) > 0:
        # Extract doc details from each profile page and tag every profile
        # with the name that was searched for
        doc_profile_lst = extract_doc_detail(doc_url_lst)
        for doc_profile in doc_profile_lst:
            doc_profile["Search Name"] = doc_name

        result_profiles.extend(doc_profile_lst)
    else:
        # Handle the case where no result is found for the name
        print(f"Result not found for: {doc_name}")



Doc to search: CHOR LUP CHU
Search results saved as HTML file.

Parsing search results...
(1) Found doc name: 朱初立, link: https://apps.pcdirectory.gov.hk/Public/TC/SearchResult/ToViewDetails?DPID=00936072&ProfID=RMP&PracticeID=1&SelectedResultRowID=1

Extracting doc info from profile...

------------------------------

Processing doc profile 1
Doc profile (1) saved as HTML file.
Required fields in doc profile extracted:

姓名 : 朱初立
性別 : 没有提供
電郵 : 没有提供
基層醫療服務提供者類別 : 西醫
香港醫務委員會註冊號碼 : M03269
執業處所 : 朱初立醫生
地址 : 香港灣仔軒尼詩道38號新基大廈2樓A室
執業類別 : 私營
電話 : 25299228
政府基層醫療促進計劃 : 長者醫療券計劃 (HCVS), 疫苗資助計劃 (VSS)


Doc to search: LAP WING RONNIE WONG
Search results saved as HTML file.

Parsing search results...
(1) Found doc name: 黃立榮, link: https://apps.pcdirectory.gov.hk/Public/TC/SearchResult/ToViewDetails?DPID=00158739&ProfID=RMP&PracticeID=1&SelectedResultRowID=1

Extracting doc info from profile...

------------------------------

Processing doc profile 1
Doc profile (1) saved as HTML file.
Required fie

In [8]:
# Columns of the result table, in display/export order
RESULT_COLUMNS = ["Search Name", "姓名", "性別", "電郵", "基層醫療服務提供者類別", "科別", "香港醫務委員會註冊號碼", "執業處所", "地址", "執業類別", "電話", "應診時間", "政府基層醫療促進計劃"]

# reindex is used instead of result_df[[...]]: a field that is missing from
# every collected profile (or an empty result_profiles list) then gives an
# empty column rather than a KeyError. When all columns are present the
# resulting frame is the same as with plain column selection.
result_df = pd.DataFrame(result_profiles).reindex(columns=RESULT_COLUMNS)
result_df

Unnamed: 0,Search Name,姓名,性別,電郵,基層醫療服務提供者類別,科別,香港醫務委員會註冊號碼,執業處所,地址,執業類別,電話,應診時間,政府基層醫療促進計劃
0,CHOR LUP CHU,朱初立,没有提供,没有提供,西醫,,M03269,朱初立醫生,香港灣仔軒尼詩道38號新基大廈2樓A室,私營,25299228,,"長者醫療券計劃 (HCVS), 疫苗資助計劃 (VSS)"
1,LAP WING RONNIE WONG,黃立榮,没有提供,没有提供,西醫,,M05995,海港中心醫務所,香港灣仔港灣道25號海港中心21樓2109B室,私營,28271331,,"大腸癌篩查計劃 (CRCSP), 長者醫療券計劃 (HCVS), 疫苗資助計劃 (VSS),..."
2,WAI LING WONG,黃惠玲,女性,没有提供,西醫,普通科,M10074,賽馬會善寧之家,新界沙田亞公角山路18號,非政府機構,23317000,星期一(09:00 至 17:30)\n星期二(09:00 至 17:30)\n星期三(09...,"長者醫療券計劃 (HCVS), 院舍防疫注射計劃 (RVP)"
3,SEE YUI Lam,林思睿,男性,没有提供,西醫,,M07365,林思睿醫生,香港灣仔軒尼詩道302-308號集成中心24樓2409室,私營,25988673,星期一(08:30 至 14:30) (15:30 至 18:00)\n星期二(08:30 ...,"大腸癌篩查計劃 (CRCSP), 長者醫療券計劃 (HCVS), 疫苗資助計劃 (VSS),..."
4,CHUN HON CHAN,陳振漢,男性,没有提供,西醫,,M02612,,香港灣仔軒尼詩道 302-308 號集成中心709 室,私營,25739191,,長者醫療券計劃 (HCVS)
5,YAU SHING LAI,賴友成,没有提供,没有提供,西醫,,M07124,賴友成醫生,香港柴灣小西灣道18號富景花園4號舖,私營,28987529,,"大腸癌篩查計劃 (CRCSP), 長者醫療券計劃 (HCVS), 疫苗資助計劃 (VSS)"
6,ON SANG TUET,脫安生,男性,没有提供,西醫,,M11797,安健醫務中心,香港筲箕灣南安街5-7號嘉裕大廈地下A9鋪,私營,28859333,星期一(08:30 至 14:00) (15:30 至 20:00)\n星期二(08:30 ...,"大腸癌篩查計劃 (CRCSP), 長者醫療券計劃 (HCVS), 疫苗資助計劃 (VSS),..."
7,MING HO EDMOND WONG,王明晧,没有提供,没有提供,西醫,,M14674,香港港安醫院-司徒拔道,香港灣仔司徒拔道40號,私營,28350503,,
8,KA YUE DAVID Lam,林家裕,男性,没有提供,西醫,,M13774,宏健醫務中心,香港北角英皇道257-273號南方大廈地下6號舖,私營,25666620,星期一(09:00 至 22:30)\n星期二(09:00 至 22:30)\n星期三(09...,"長者醫療券計劃 (HCVS), 疫苗資助計劃 (VSS)"
9,CHAP FAI JOHNNIE CHIU,趙集輝,男性,没有提供,西醫,,M05419,趙集輝醫生,香港筲箕灣筲箕灣道68號西灣河中心地下6號舖,私營,28850865,,"大腸癌篩查計劃 (CRCSP), 長者醫療券計劃 (HCVS), 疫苗資助計劃 (VSS)"


In [9]:
# Write the result table to the exchange directory, without the index column
output_file_path = 'input_output/doc_profile_result.xlsx'
result_df.to_excel(output_file_path, index=False)