In [1]:
import os
import sys
import pandas as pd
import requests
from dotenv import load_dotenv

from search_para import search_para
from search_parser import extract_doc_urls
from doc_parser import extract_profile_info

load_dotenv()

True

### Preparation

In [2]:
# Define IO paths and files
FILE_EXCHANGE_DIR = "input_output"
TEMP_DIR = "temp"
DOC_NAMES_EXCEL = "doc_names.xlsx"
OUTPUT_EXCEL = "doc_profile_result.xlsx"

input_file_path = f"{FILE_EXCHANGE_DIR}/{DOC_NAMES_EXCEL}"
output_file_path = f"{FILE_EXCHANGE_DIR}/{OUTPUT_EXCEL}"

# Create directories for file exchange
os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(FILE_EXCHANGE_DIR, exist_ok=True)

# Bypass token verification
cookies = requests.cookies.RequestsCookieJar()
cookies['__RequestVerificationToken_L1B1YmxpYw2'] = os.environ.get(
    "__RequestVerificationToken_L1B1YmxpYw2")

headers = {
    'User-Agent': 'Mozilla/5.0',
    'Content-Type': 'application/x-www-form-urlencoded',
    'sec-ch-ua': '"Chromium";v="122", "Not(A:Brand";v="24", "Google Chrome";v="122"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'Upgrade-Insecure-Requests': '1'
}

### Get match search results

In [3]:
def search_doc(doc_name):
    print("Doc to search:", doc_name)

    # Import search parameters
    request_para = search_para
    request_para['Name'] = doc_name
    request_para['__RequestVerificationToken'] = os.environ.get(
    "__RequestVerificationToken")

    # Sending POST request with the specified headers and data
    search_url = 'https://apps.pcdirectory.gov.hk/Public/TC/AdvancedSearch'
    search_response = requests.post(search_url, headers=headers,
                            cookies=cookies, data=request_para)

    # Save if successfully get search reseults
    if search_response.status_code == 200:
        # Save search result conrent as HTML file
        with open('temp/search_response.html', 'wb') as f:
            f.write(search_response.content)
        print("Search results saved as HTML file.")
    else:
        print("Failed to retrieve search results.")

    # Extract doc profile urls
    print("\nParsing search results...")
    doc_url_lst = extract_doc_urls()

    return doc_url_lst

### Extract doc profile details

In [4]:
def extract_doc_detail(doc_url_lst):
    doc_profile_lst = []


    print("\nExtracting doc info from profile...")
    doc_counter = 0
    for doc_url in doc_url_lst:
        doc_counter += 1
        doc_profile_response = requests.get(doc_url, headers=headers, cookies=cookies)

        print("\n" + "-" * 30 + "\n")
        print(f"Processing doc profile {doc_counter}")

        # Save doc profile as HTML
        if doc_profile_response.status_code == 200:
            with open(f'temp/doc_profile_{doc_counter}.html', 'wb') as f:
                f.write(doc_profile_response.content)
            print(f"Doc profile ({doc_counter}) saved as HTML file.")
        else:
            print(f"Failed to access doc profile {doc_url}")
            continue

        doc_profile_info = extract_profile_info(doc_counter)
        doc_profile_lst.append(doc_profile_info)
        
        # Info summary check (for debug)
        print("Required fields in doc profile extracted:\n")
        for key, value in doc_profile_info.items():
            print(key, ':', value)

    return doc_profile_lst

### Clean Up

In [5]:
def remove_temp_html(dir_name):
    for file in os.listdir(dir_name):
        # Delete only HTML files
        if file.endswith(".html"):
            file_path = os.path.join(dir_name, file)
            os.remove(file_path)

## Driver Code

In [6]:
# Load input file & read given doc names
if not os.path.exists(input_file_path):
    sys.exit(f"Error: Input file '{input_file_path}' does not exist!")

input_df = pd.read_excel(input_file_path)
input_df

Unnamed: 0,1,Local Data Provider ID ▼,Name,First Name,Last Nam,Individual Specialty,License Number
0,114,PHK 60000005833170,CHOR LUP CHU,CHOR LUP,CHU,Cardiology,M03269
1,115,PHK 60000005963540,LAP WING RONNIE WONG,LAP WING RONNIE,WONG,General Practitioner,M05995
2,116,PHK 60000016859821,WAI LING WONG,WAI LING,WONG,General Practitioner,M10074
3,117,PHK 60000004093445,SEE YUI Lam,SEE YUI,Lam,General Practitioner,M07365
4,118,PHK 9990000010061258,CHUN HON CHAN,CHUN HON,CHAN,General Practitioner,M02612
5,119,PHK 60000004093395,YAU SHING LAI,YAU SHING,LAI,General Practitioner,M07124
6,120,PHK 60000005834031,HOI CHUEN MICHAEL LEUNG,HOI CHUEN MICHAEL,LEUNG,General Practitioner,M06352
7,121,PHK 60000005833403,SAI FAI VICTOR HO,SAI FAI VICTOR,HO,Paediatrics,M02517
8,122,PHK 60000005963122,ON SANG TUET,ON SANG,TUET,General Practitioner,M11797
9,123,PHK 60000004096675,MING HO EDMOND WONG,MING HO EDMOND,WONG,Urology,M14674


In [7]:
# doc_names = input_df['Name'].tolist()
doc_names = ["CHING PONG Sin"]

In [8]:
result_profiles = []

for doc_name in doc_names:
    print("\n" + "=" * 30 + "\n")
    doc_url_lst = search_doc(doc_name)

    # Hande if no result found for name
    if len(doc_url_lst) == 0:
        print(f"Result not found for: {doc_name}")
        continue

    # Extract doc details from profile page
    doc_profile_lst = extract_doc_detail(doc_url_lst)
    for doc_profile in doc_profile_lst:
        doc_profile["Search Name"] = doc_name

    # Clean up temp files
    remove_temp_html(TEMP_DIR)

    result_profiles += doc_profile_lst



Doc to search: CHING PONG Sin
Search results saved as HTML file.

Parsing search results...
(1) Found doc name: 冼正邦, link: https://apps.pcdirectory.gov.hk/Public/TC/SearchResult/ToViewDetails?DPID=00827626&ProfID=RMP&PracticeID=1&SelectedResultRowID=1
(2) Found doc name: 冼正邦, link: https://apps.pcdirectory.gov.hk/Public/TC/SearchResult/ToViewDetails?DPID=00827626&ProfID=RMP&PracticeID=2&SelectedResultRowID=2
(3) Found doc name: 冼正邦, link: https://apps.pcdirectory.gov.hk/Public/TC/SearchResult/ToViewDetails?DPID=00827626&ProfID=RMP&PracticeID=3&SelectedResultRowID=3

Extracting doc info from profile...

------------------------------

Processing doc profile 1
Doc profile (1) saved as HTML file.
Required fields in doc profile extracted:

姓名 : 冼正邦
性別 : 男性
電郵 : 没有提供
基層醫療服務提供者類別 : 西醫
香港醫務委員會註冊號碼 : M13113
執業處所 : 名仕醫務中心
地址 : 香港北角英皇道425-431號安寧大廈3B舗
執業類別 : 私營
電話 : 26570078
政府基層醫療促進計劃 : 大腸癌篩查計劃 (CRCSP), 長者醫療券計劃 (HCVS), 疫苗資助計劃 (VSS)

------------------------------

Processing doc profile 2
Doc 

In [9]:
result_df = pd.DataFrame(result_profiles)
result_df = result_df[["Search Name", "姓名", "性別", "電郵", "基層醫療服務提供者類別", "科別", "香港醫務委員會註冊號碼", "執業處所", "地址", "執業類別", "電話", "應診時間", "政府基層醫療促進計劃"]]
result_df

KeyError: "['科別'] not in index"

In [None]:
# Save doc detail results as an Excel file
result_df.to_excel(output_file_path, index=False)
