In [1]:
import requests
from bs4 import BeautifulSoup
import time
import json

# 网站的基础 URL
BASE_URL = "https://www.hanyuguoxue.com"

# 起始的多音字列表页面
START_URL = f"{BASE_URL}/zidian/pinyin-duoyinzi"


POS_MAPPING = {
    "名词": {"ckip": "Na", "ltp": "n"},
    "动词": {"ckip": "V", "ltp": "v"},
    "形容词": {"ckip": "VH", "ltp": "a"},
    "副词": {"ckip": "D", "ltp": "d"},
    "介词": {"ckip": "P", "ltp": "p"},
    "连词": {"ckip": "C", "ltp": "c"},
    "助词": {"ckip": "T", "ltp": "u"},
    # 其他词性可以继续补充
}

raw_cixing_list = ["名词", "动词", "形容词", "副词", "介词", "连词", "助词"]

same_value = 0
def fetch_html(url):
    """发送 GET 请求并返回 HTML 内容"""
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return None

def extract_polyphones_from_list(html):
    """从多音字列表页面提取每个字的链接和信息"""
    soup = BeautifulSoup(html, "html.parser")
    polyphone_links = []

    # 找到所有多音字的 <a> 标签
    for a_tag in soup.find_all("a", class_="han"):
        char = a_tag.find("span").text.strip()  # 提取汉字
        href = a_tag.get("href")  # 提取链接
        if href:
            full_url = BASE_URL + href  # 构造完整 URL
            polyphone_links.append({"char": char, "url": full_url})
    return polyphone_links

def parse_pinyin_and_cixing(html):
    """解析拼音和词性信息"""
    soup = BeautifulSoup(html, "html.parser")
    results = []
    tag = [0,0,0,0,0,0,0]
    flag=0
    # 提取拼音和词性部分
    test1 = soup.find("div", class_ ="zi-contents")
    if test1:
        test2 = test1.find_all("div", class_="zi-content")  # 对该元素调用 find_all()
        for test in test2:
            #print(test)
            pinyin_tags = test.find("em", class_="py")
            cixing_tags = None
            if  pinyin_tags: # 拼音部分
                pinyin_tags = pinyin_tags.text
                #print(f'pinyin_tags:{pinyin_tags}')
                cixing_tags = test.find_all("p", class_="cixing")  # 词性部分
                #print(f'cixing_tags:{cixing_tags}')

                pinyin = pinyin_tags
                cixing = None
                cixing_list = []
                for cixing in cixing_tags:
                    raw_cixing = cixing.text.strip()
                    #for index, raw_cixing in enumerate(raw_cixing_list):
                    cixing = POS_MAPPING.get(raw_cixing, {"ckip": "未知", "ltp": "未知"})
                    #print(cixing)
                    cixing_list.append(cixing)
                    
                    ##判斷有無多音字有相同詞性
                    if raw_cixing in raw_cixing_list:
                        index = raw_cixing_list.index(raw_cixing)

                        if tag[index]>=1:
                            ##已經重複了!!
                            flag=1
                            tag[index] += 1
                        else:
                            tag[index] = 1

                # 保存结果
                result = {"pinyin": pinyin}
                if cixing:
                    result["cixing"] = cixing_list
                results.append(result)

    return results, flag, tag

def crawl_polyphones(start_url):
    """爬取多音字及其详细信息"""
    polyphone_data = []
    global same_value

    # 获取起始页面的 HTML
    start_html = fetch_html(start_url)
    if not start_html:
        return []

    # 提取多音字列表及其链接
    polyphone_links = extract_polyphones_from_list(start_html)

    # 遍历每个多音字的链接
    for polyphone in polyphone_links:
        char = polyphone["char"]
        url = polyphone["url"]
        print(f"正在处理: {char} ({url})")

        # 获取具体字页面的 HTML
        detail_html = fetch_html(url)
        if not detail_html:
            continue

        # 提取详情
        details, flag, tag = parse_pinyin_and_cixing(detail_html)
        print(details)
        if flag == 1:
            same_value+=1
            print(f"same_value:{tag}")

        #details["char"] = char
        #details["url"] = url
        polyphone_data.append(details)

        # 避免过快请求，设置延迟
        time.sleep(1)

    return polyphone_data

def save_to_json(data, filename):
    """将数据保存为 JSON 文件"""
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"数据已保存到 {filename}")

# 主程序
if __name__ == "__main__":
    print("开始爬取多音字数据...")
    polyphone_data = crawl_polyphones(START_URL)
    save_to_json(polyphone_data, "polyphones.json")
    print("爬取完成！")
    print(f"詞性有重複的多音字:{same_value}")


开始爬取多音字数据...
正在处理: 丁 (https://www.hanyuguoxue.com/zidian/zi-19969)
[{'pinyin': 'dīng', 'cixing': [{'ckip': 'Na', 'ltp': 'n'}, {'ckip': 'VH', 'ltp': 'a'}, {'ckip': 'V', 'ltp': 'v'}]}, {'pinyin': 'zhēng'}]
正在处理: 厂 (https://www.hanyuguoxue.com/zidian/zi-21378)
[{'pinyin': 'chǎng', 'cixing': [{'ckip': 'Na', 'ltp': 'n'}]}, {'pinyin': 'hǎn'}, {'pinyin': 'yăn'}, {'pinyin': 'ān', 'cixing': [{'ckip': 'Na', 'ltp': 'n'}]}]
same_value:[2, 0, 0, 0, 0, 0, 0]
正在处理: 卜 (https://www.hanyuguoxue.com/zidian/zi-21340)
[{'pinyin': 'bǔ', 'cixing': [{'ckip': 'V', 'ltp': 'v'}, {'ckip': '未知', 'ltp': '未知'}, {'ckip': 'Na', 'ltp': 'n'}]}, {'pinyin': 'bo'}]
正在处理: 几 (https://www.hanyuguoxue.com/zidian/zi-20960)
[{'pinyin': 'jī', 'cixing': [{'ckip': 'Na', 'ltp': 'n'}, {'ckip': 'Na', 'ltp': 'n'}, {'ckip': 'D', 'ltp': 'd'}, {'ckip': 'VH', 'ltp': 'a'}, {'ckip': 'V', 'ltp': 'v'}]}, {'pinyin': 'jǐ', 'cixing': [{'ckip': '未知', 'ltp': '未知'}]}]
same_value:[2, 1, 1, 1, 0, 0, 0]
正在处理: 了 (https://www.hanyuguoxue.com/zidian/zi-20