In [17]:
""""
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE_URL = "https://zh.wikipedia.org"

def list_wiki_links(url, ndisplay=10):
    """Print the first ``ndisplay`` unique wiki links found on a page.

    Downloads ``url`` and scans every anchor inside ``#mw-content-text``.
    An anchor is kept when it has non-empty text and its href starts with
    ``/wiki/`` or ``/zh-tw/`` and contains neither ``:`` nor ``#``. Kept
    links are de-duplicated by absolute URL (the first anchor text wins) and
    printed as ``"<n>. <anchor text> → <absolute url>"``.

    Args:
        url: Address of the page to scan.
        ndisplay: Maximum number of links to print.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=5)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")

    # Resolve hrefs against the page that was actually served instead of a
    # fixed host, so links stay correct when ``url`` points at another site.
    page_url = res.url

    links = {}
    for a in soup.select("#mw-content-text a[href]"):
        href = a.get("href")
        title = a.get_text(strip=True)
        if not href or not title:
            continue
        if not href.startswith(("/wiki/", "/zh-tw/")):
            continue
        if ":" in href or "#" in href:
            continue
        # setdefault keeps the first title seen for a URL (no duplicates).
        links.setdefault(urljoin(page_url, href), title)

    # Show at most ndisplay entries, numbered from 1.
    for position, (link, title) in enumerate(links.items(), start=1):
        if position > ndisplay:
            break
        print(f"{position}. {title} → {link}")

if __name__ == "__main__":
    # Demo run: list up to ten unique links from the Tamkang University article.
    list_wiki_links(url="https://zh.wikipedia.org/zh-tw/淡江大學", ndisplay=10)

""""


1. 臺北縣 → https://zh.wikipedia.org/wiki/%E8%87%BA%E5%8C%97%E7%B8%A3
2. TKU → https://zh.wikipedia.org/wiki/TKU
3. 校訓 → https://zh.wikipedia.org/wiki/%E6%A0%A1%E8%AE%AD
4. 居正 → https://zh.wikipedia.org/wiki/%E5%B1%85%E6%AD%A3
5. 學校代碼 → https://zh.wikipedia.org/wiki/%E5%AD%B8%E6%A0%A1%E4%BB%A3%E7%A2%BC
6. 私立 → https://zh.wikipedia.org/wiki/%E7%A7%81%E7%AB%8B%E5%A4%A7%E5%AD%B8
7. 綜合大學 → https://zh.wikipedia.org/wiki/%E7%B6%9C%E5%90%88%E5%A4%A7%E5%AD%B8
8. 張家宜 → https://zh.wikipedia.org/wiki/%E5%BC%B5%E5%AE%B6%E5%AE%9C
9. 校長 → https://zh.wikipedia.org/wiki/%E6%A0%A1%E9%95%BF
10. 葛煥昭 → https://zh.wikipedia.org/wiki/%E8%91%9B%E7%85%A5%E6%98%AD


In [6]:
# Environment setup (commented out; run once if the packages are missing):
# conda install requests beautifulsoup4 -y


[1;33mJupyter detected[0m[1;33m...[0m
[1;32m2[0m[1;32m channel Terms of Service accepted[0m
doneieving notices: - 
Channels:
 - defaults
Platform: osx-arm64
doneecting package metadata (repodata.json): / 
doneing environment: | 

## Package Plan ##

  environment location: /opt/anaconda3

  added / updated specs:
    - beautifulsoup4
    - requests


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.13.5      |  py313hca03da5_0         264 KB
    certifi-2025.10.5          |  py313hca03da5_0         157 KB
    openssl-3.0.18             |       h9b4081a_0         3.1 MB
    ------------------------------------------------------------
                                           Total:         3.5 MB

The following packages will be UPDATED:

  beautifulsoup4                     4.12.3-py313hca03da5_0 --> 4.13.5-py313hca03da5_0 
  certifi                          2025.

In [40]:
"""""
User-Agent: *
Disallow: /m/
Disallow: /me/
Disallow: /@me$
Disallow: /@me/
Disallow: /*/edit$
Disallow: /*/*/edit$
Disallow: /media/
Disallow: /p/*/share
Disallow: /r/
Disallow: /trending
Disallow: /search?q$
Disallow: /search?q=
Disallow: /*/search?q=
Disallow: /*/search/*?q=
Disallow: /*/*source=
Allow: /_/api/users/*/meta
Allow: /_/api/users/*/profile/stream
Allow: /_/api/posts/*/responses
Allow: /_/api/posts/*/responsesStream
Allow: /_/api/posts/*/related
User-Agent: Amazonbot
User-Agent: Applebot-Extended
User-Agent: Bytespider
User-Agent: ClaudeBot
User-Agent: FacebookBot
User-Agent: GoogleOther
User-Agent: GPTBot
User-Agent: meta-externalagent
Disallow: /
Allow: /about
Allow: /business
Allow: /earn
Allow: /gift
Allow: /membership
Allow: /partner-program
Allow: /verified-authors
Sitemap: https://netflixtechblog.com/sitemap/sitemap.xml
License: https://medium.com/license.xml
"""""

import requests
from bs4 import BeautifulSoup
import urllib3
import pandas as pd

# Suppress the InsecureRequestWarning that urllib3 emits when verify=False is used.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Download the sitemap.
# NOTE(review): verify=False disables TLS certificate verification; it is kept
# to preserve the existing behaviour — confirm whether it is still required.
sitemap_url = "https://netflixtechblog.com/sitemap/sitemap.xml"
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status stops an error page from being parsed as an empty sitemap.
response = requests.get(sitemap_url, verify=False, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "lxml-xml")

# Collect one record per <url> element; children that are absent become None.
sitemap_fields = ["loc", "lastmod", "changefreq", "priority"]
urls = []
for url_tag in soup.find_all("url"):
    record = {}
    for field in sitemap_fields:
        child = url_tag.find(field)
        record[field] = child.text if child is not None else None
    # An entry without <loc> carries no address, so skip it instead of failing.
    if record["loc"] is None:
        continue
    urls.append(record)

# Build the table; explicit columns keep the header even when nothing was found.
df = pd.DataFrame(urls, columns=sitemap_fields)

# Save as CSV (utf-8-sig writes a BOM at the start of the file).
df.to_csv("netflix_techblog_sitemap.csv", index=False, encoding="utf-8-sig")

print("CSV 檔案已儲存！")
print(df.head(10))


CSV 檔案已儲存！
                                                 loc     lastmod changefreq  \
0       https://netflixtechblog.com/tagged/benchmark  2016-09-01    monthly   
1           https://netflixtechblog.com/tagged/build  2016-03-09    monthly   
2    https://netflixtechblog.com/tagged/chaos-monkey  2017-07-26    monthly   
3     https://netflixtechblog.com/tagged/simian-army  2017-03-13    monthly   
4  https://netflixtechblog.com/tagged/netflixsecu...  2019-11-22    monthly   
5             https://netflixtechblog.com/tagged/isp  2016-08-09    monthly   
6  https://netflixtechblog.com/tagged/streaming-v...  2017-12-20    monthly   
7           https://netflixtechblog.com/tagged/nosql  2015-01-27    monthly   
8           https://netflixtechblog.com/tagged/cloud  2023-08-29    monthly   
9             https://netflixtechblog.com/tagged/api  2021-09-15    monthly   

  priority  
0      1.0  
1      1.0  
2      1.0  
3      1.0  
4      1.0  
5      1.0  
6      1.0  
7      1.0  
8 

In [16]:
import pandas as pd

# Load the raw sitemap CSV.
df = pd.read_csv("netflix_techblog_sitemap.csv", encoding="utf-8-sig")

# 1. Strip the site prefix (with or without the trailing slash). The dots are
#    escaped so they match a literal "." rather than any character.
df["loc"] = df["loc"].str.replace(r"^https://netflixtechblog\.com/?", "", regex=True)

# 2. Drop rows whose path is missing or blank. .copy() gives an independent
#    frame so the column assignments below do not write into a filtered slice.
df = df[df["loc"].notna() & (df["loc"].str.strip() != "")].copy()

# 3. Count the "/" characters in each path to find the deepest level.
max_slash = int(df["loc"].str.count("/").max())
print(f"最大層級數: {max_slash}")

# 4. Split each path into level_1 .. level_N columns (N = max_slash + 1).
split_cols = [f"level_{i+1}" for i in range(max_slash + 1)]
df[split_cols] = df["loc"].str.split("/", expand=True)

# 5. Keep only the output columns, in order: loc, lastmod, level_1, level_2, ...
cols = ["loc", "lastmod"] + split_cols
df = df[cols]

# 6. Save as CSV.
df.to_csv("netflix_techblog_sitemap_split_clean.csv", index=False, encoding="utf-8-sig")

print("已清理並拆層級，存成 netflix_techblog_sitemap_split_clean.csv")
print(df.head(10))


最大層級數: 1
已清理並拆層級，存成 netflix_techblog_sitemap_split_clean.csv
                                                 loc     lastmod  \
0  fm-intent-predicting-user-session-intent-with-...  2025-05-21   
1  unbundling-data-science-workflows-with-metaflo...  2020-07-29   
2                      tagged/developer-productivity  2021-04-26   
3                             tagged/neural-networks  2022-11-14   
4  machine-learning-for-a-better-developer-experi...  2020-07-28   
5  rebuilding-netflix-video-processing-pipeline-w...  2024-01-10   
6                                     tagged/network  2024-04-24   
7                                         tagged/tcp  2024-04-24   
8                                      tagged/kernel  2024-04-24   
9  investigation-of-a-cross-regional-network-perf...  2024-04-24   

                                             level_1                 level_2  
0  fm-intent-predicting-user-session-intent-with-...                    None  
1  unbundling-data-science-workf

In [17]:
import pandas as pd

# Load the cleaned, level-split sitemap table.
df = pd.read_csv("netflix_techblog_sitemap_split_clean.csv", encoding="utf-8-sig")

# Show which columns are available.
print(df.columns)

# Partition the rows on whether the first path segment is "tagged".
is_tagged = df["level_1"] == "tagged"
df_tagged = df.loc[is_tagged].copy()
df_non_tagged = df.loc[~is_tagged].copy()

# Write each partition to its own CSV file.
df_tagged.to_csv("netflix_techblog_tagged.csv", index=False, encoding="utf-8-sig")
df_non_tagged.to_csv("netflix_techblog_non_tagged.csv", index=False, encoding="utf-8-sig")

# Report how many rows landed in each partition.
print("已拆分完成：")
print(f"  tagged 共有 {len(df_tagged)} 筆")
print(f"  非 tagged 共有 {len(df_non_tagged)} 筆")


Index(['loc', 'lastmod', 'level_1', 'level_2'], dtype='object')
已拆分完成：
  tagged 共有 742 筆
  非 tagged 共有 214 筆


In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import random
import urllib3

# Silence urllib3's InsecureRequestWarning (requests below use verify=False).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# === 1. Load the tag CSV and derive the full tag URL and the tag name ===
df_tags = pd.read_csv("netflix_techblog_tagged.csv").assign(
    tag_url=lambda frame: "https://netflixtechblog.com/" + frame["loc"],
    tag=lambda frame: frame["level_2"],
)

# Accumulator mapping article URL -> list of tags it was found under.
articles_dict = dict()

# === 2. Scraper helper ===
def fetch_tag_articles(tag_url, tag, timeout=10, delay=0.2):
    """Return the ``data-href`` value of every ``div[data-href]`` on a tag page.

    Args:
        tag_url: URL of the tag listing page to download.
        tag: Tag name for the page. Not used inside the function; kept so
            existing callers keep working.
        timeout: Seconds to wait for the HTTP response.
        delay: Seconds to pause after each request, whatever its outcome.

    Returns:
        Whitespace-stripped ``data-href`` strings in the order ``select``
        yields them. An empty list is returned when the request raises or the
        status code is not 200 (a message is printed in both cases).
    """
    headers = {
        # Pick one of two desktop browser User-Agent strings per request.
        "User-Agent": random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36"
        ]),
        "Accept-Language": "en-US,en;q=0.9"
    }

    print(f"爬取中: {tag_url}")
    try:
        # NOTE(review): verify=False skips TLS certificate validation; kept to
        # preserve existing behaviour — confirm whether it is still required.
        resp = requests.get(tag_url, headers=headers, verify=False, timeout=timeout)
    except requests.RequestException as e:
        print(f"❌ {tag_url} 請求失敗: {e}")
        return []
    finally:
        # Pause after every attempt, including failures, so a run of errors
        # does not turn into back-to-back requests against the server.
        time.sleep(delay)

    if resp.status_code != 200:
        print(f"⚠️ {tag_url} 狀態碼 {resp.status_code}")
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    # Skip cards whose data-href is empty; strip surrounding whitespace.
    return [
        card.get("data-href").strip()
        for card in soup.select("div[data-href]")
        if card.get("data-href")
    ]

# === 3. Main crawl loop: visit every tag page and record tag membership ===
for tag, tag_url in zip(df_tags["tag"], df_tags["tag_url"]):
    try:
        article_urls = fetch_tag_articles(tag_url, tag)
        for url in article_urls:
            # First sighting creates the list; later sightings extend it.
            articles_dict.setdefault(url, []).append(tag)
        print(f"✅ {tag} 共抓到 {len(article_urls)} 篇文章")
    except Exception as e:
        # Report the failure and continue with the next tag.
        print(f"⚠️ {tag} 發生錯誤: {e}")

# === 4. Build one row per article, with its tags joined by commas ===
article_rows = [
    {"article_url": article_url, "tags": ",".join(tag_list)}
    for article_url, tag_list in articles_dict.items()
]
articles_df = pd.DataFrame(article_rows)

# === 5. Save to CSV and report the number of article links ===
articles_df.to_csv("netflix_techblog_articles.csv", index=False, encoding="utf-8-sig")
print(f"✅ 完成！共 {len(articles_df)} 筆文章連結。")


爬取中: https://netflixtechblog.com/tagged/developer-productivity
✅ developer-productivity 共抓到 2 篇文章
爬取中: https://netflixtechblog.com/tagged/neural-networks
✅ neural-networks 共抓到 2 篇文章
爬取中: https://netflixtechblog.com/tagged/network
✅ network 共抓到 1 篇文章
爬取中: https://netflixtechblog.com/tagged/tcp
✅ tcp 共抓到 1 篇文章
爬取中: https://netflixtechblog.com/tagged/kernel
✅ kernel 共抓到 1 篇文章
爬取中: https://netflixtechblog.com/tagged/artificial-intelligence
✅ artificial-intelligence 共抓到 1 篇文章
爬取中: https://netflixtechblog.com/tagged/payment-processing
✅ payment-processing 共抓到 1 篇文章
爬取中: https://netflixtechblog.com/tagged/ml-explainability
✅ ml-explainability 共抓到 1 篇文章
爬取中: https://netflixtechblog.com/tagged/ml-observability
✅ ml-observability 共抓到 1 篇文章
爬取中: https://netflixtechblog.com/tagged/system-design-concepts
✅ system-design-concepts 共抓到 1 篇文章
爬取中: https://netflixtechblog.com/tagged/foundation-models
✅ foundation-models 共抓到 2 篇文章
爬取中: https://netflixtechblog.com/tagged/intelligibility
✅ intelligibility 