## 搜尋網路 Function
1. 過濾網域：未指定 `site` 時，預設限定台灣網域（以 f-string 在查詢字串後加上 `site:.tw`）

範例：
```python
search_web("空氣清淨機 使用心得", site="ptt.cc")
search_web("Switch 遊戲 評價", site="shopee.tw")
```

In [None]:
from typing import List
from googlesearch import search as _search
from bs4 import BeautifulSoup
from charset_normalizer import detect
import asyncio
from requests_html import AsyncHTMLSession
import urllib3
urllib3.disable_warnings()

async def worker(s: "AsyncHTMLSession", url: str):
    """Fetch ``url`` and return its body text, or ``None`` when it is unusable.

    A HEAD request is issued first; the page is only downloaded when the
    ``Content-Type`` response header contains ``text/html``. Each request is
    limited to 10 seconds and TLS verification is disabled.

    Args:
        s: Session whose ``head``/``get`` calls return awaitables.
        url: Address of the page to fetch.

    Returns:
        The response text, or ``None`` if the resource is not HTML or any
        ordinary error (timeout, connection failure, ...) occurs.
    """
    try:
        # Check the headers first so non-HTML resources are never downloaded.
        header_response = await asyncio.wait_for(s.head(url, verify=False), timeout=10)
        if 'text/html' not in header_response.headers.get('Content-Type', ''):
            return None
        r = await asyncio.wait_for(s.get(url, verify=False), timeout=10)
        return r.text
    except Exception:
        # Best-effort fetch: one bad URL must not fail the whole batch.
        # Only Exception is caught so that task cancellation
        # (asyncio.CancelledError) and KeyboardInterrupt still propagate.
        return None

async def get_htmls(urls):
    """Fetch every URL in ``urls`` concurrently through one shared session.

    Args:
        urls: Iterable of page addresses.

    Returns:
        A list with one entry per URL, in the same order as ``urls``: whatever
        ``worker`` returned for that URL (page text or ``None``).
    """
    session = AsyncHTMLSession()
    try:
        tasks = (worker(session, url) for url in urls)
        return await asyncio.gather(*tasks)
    finally:
        # Release the session's connections even if a fetch is cancelled.
        await session.close()

# keyword: the search query / n_results: number of pages to return
# (keep n_results small, otherwise you will get HTTP 429 errors)
async def search(keyword: str, n_results: int=3) -> List[str]:
    """Search for ``keyword`` and return the text of the first ``n_results`` pages.

    Warning: searching too often in a short period may trigger HTTP 429
    errors. Google does not publish the rate limit, so the only remedies are
    switching IP or waiting for the block to be lifted (for an unknown time).
    """
    # Only the first 100 characters of the keyword are used as the query.
    query = keyword[:100]
    # Ask the googlesearch `search` function for twice as many URLs as
    # needed, since some of them may turn out to be unusable.
    urls = list(_search(query, n_results * 2, lang="zh", unique=True))
    # Download the pages; the helper yields None for URLs it could not use.
    pages = await get_htmls(urls)
    # Parse every page that was actually fetched.
    soups = [BeautifulSoup(html, 'html.parser') for html in pages if html is not None]
    # Keep only documents whose encoded bytes are detected as utf-8, and strip
    # all whitespace from their text.
    texts = []
    for soup in soups:
        if detect(soup.encode()).get('encoding') != 'utf-8':
            continue
        texts.append(''.join(soup.get_text().split()))
    # Trim to the requested number of results.
    return texts[:n_results]

In [None]:
from serpapi import GoogleSearch

def search_web(query, site=None, lang='zh-TW', num=100, api_key=None):
    """Query Google through SerpApi and collect organic results with a snippet.

    Args:
        query: Search keywords.
        site: Domain for the ``site:`` filter; when falsy the search is
            restricted to ``.tw``.
        lang: Language restriction sent as the ``lr`` parameter. Bare codes
            such as ``"zh-TW"`` are normalised to ``"lang_zh-TW"``; several
            codes may be joined with ``|``. A falsy value omits ``lr``.
        num: Maximum number of results to return. Pages are requested in
            steps of 10 via the ``start`` offset.
        api_key: SerpApi key. When omitted, the ``SERPAPI_API_KEY``
            environment variable is used, then the ``"YOUR_SERPAPI_KEY"``
            placeholder.

    Returns:
        A list of at most ``num`` dicts with the keys ``"title"``, ``"link"``
        and ``"snippet"``. Results without a snippet are skipped.
    """
    import os

    if site:
        query = f"{query} site:{site}"
    else:
        query = f"{query} site:.tw"

    if api_key is None:
        api_key = os.environ.get("SERPAPI_API_KEY", "YOUR_SERPAPI_KEY")

    # SerpApi expects `lr` values in the form lang_{code}, joined with "|".
    lr = "|".join(
        code if code.startswith("lang_") else f"lang_{code}"
        for code in lang.split("|")
    ) if lang else None

    all_results = []
    for start in range(0, num, 10):
        params = {
            "q": query,
            "api_key": api_key,
            "hl": "zh-tw",  # interface language
            "gl": "tw",    # geographic location
            "start": start
        }
        if lr:
            params["lr"] = lr

        results = GoogleSearch(params).get_dict()

        organic = results.get("organic_results", [])
        if not organic:
            # No more results (or an error response): further pages would
            # only spend API calls.
            break
        all_results.extend([
            {
                "title": r.get("title", ""),
                "link": r.get("link", ""),
                "snippet": r.get("snippet", "")
            }
            for r in organic if 'snippet' in r
        ])

    # The last page may overshoot the requested amount.
    return all_results[:num]


1. 用`jieba`產生中文token
2. 移除冗言贅字 (`stopwords`)

In [None]:
import jieba
from collections import Counter

# Optional: extend this set with your own Chinese stopwords.
stopwords = {
    "的", "是", "我", "也", "很",
    "都", "在", "有", "和", "就",
    "不", "了", "還", "這", "好",
}

def extract_frequent_keywords(snippets, top_k=20):
    """Return the ``top_k`` most frequent tokens across ``snippets``.

    The snippets are joined with spaces and segmented with ``jieba.cut``.
    Tokens found in ``stopwords`` and tokens whose stripped length is at most
    one character are ignored.

    Args:
        snippets: Iterable of strings to analyse.
        top_k: Number of ``(token, count)`` pairs to return.

    Returns:
        A list of ``(token, count)`` tuples as produced by
        ``Counter.most_common``.
    """
    corpus = " ".join(snippets)

    tally = Counter()
    for token in jieba.cut(corpus):
        if token in stopwords:
            continue
        # Drop whitespace-only and single-character tokens.
        if len(token.strip()) <= 1:
            continue
        tally[token] += 1

    return tally.most_common(top_k)


In [None]:
# Run a search, keep only the snippet text of each result, and print the most
# frequent keywords with their counts.
# NOTE(review): the original cell left `snippets =` without a value (a
# SyntaxError). The query below is only an example - replace it with your own.
search_results = search_web("空氣清淨機 使用心得", site="ptt.cc")
snippets = [result["snippet"] for result in search_results]
top_keywords = extract_frequent_keywords(snippets)
for word, freq in top_keywords:
    print(f"{word}: {freq}")