In [3]:
import re
import time
import sqlite3
from typing import List, Dict, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


# ====== 基本設定 ======

ORG_NAME = "google"
BASE = f"https://github.com/orgs/{ORG_NAME}/repositories?type=all"

HEAD = {
    "User-Agent": "Mozilla/5.0 (compatible; RepoScanner/2.0)",
    "Accept": "text/html",
}


# ====== ユーティリティ ======

def normalize_stars(raw: str) -> int:
    """
    GitHubのスター表記（例: 2.3k, 712, 1.1M）を整数に変換
    """
    txt = raw.strip().replace(",", "").lower()
    m = re.match(r"([0-9]+(?:\.[0-9]+)?)([km]?)$", txt)
    if not m:
        return 0

    val = float(m.group(1))
    unit = m.group(2)

    if unit == "k":
        val *= 1_000
    elif unit == "m":
        val *= 1_000_000

    return int(val)


def get_html(url: str) -> str:
    """
    HTTP GET → HTML 取得
    """
    r = requests.get(url, headers=HEAD)
    r.raise_for_status()
    return r.text


# ====== スクレイピング処理 ======

def parse_repo_list(html: str) -> List[Dict[str, Optional[str]]]:
    """
    リポジトリ一覧ページから:
      - リポジトリ名
      - 言語
      - スター数
    の情報だけ抜き出す
    """
    soup = BeautifulSoup(html, "html.parser")
    items: List[Dict[str, Optional[str]]] = []

    # GitHub の UI 変動に耐えるため複数セレクタを試す
    candidates = (
        soup.select("li[data-test-selector='repo-list-item']")
        or soup.select("div.org-repos li")
        or soup.select("div[data-test-selector='org-repositories'] li")
    )

    for block in candidates:
        data = {"name": None, "language": None, "stars": 0}

        # --- 名前取得 ---
        name_node = (
            block.select_one(f"a[href*='/{ORG_NAME}/']")
            or block.select_one("a[data-testid='repo-name-link']")
        )
        if name_node:
            name = name_node.get_text(strip=True)
            # "google / repo" のような書き方を整形
            if "/" in name:
                name = name.split("/")[-1].strip()
            data["name"] = name

        # --- 言語 ---
        lang_node = (
            block.select_one("[itemprop='programmingLanguage']")
            or block.select_one("span:has(svg[aria-label='Programming language'])")
        )
        if lang_node:
            lang_txt = lang_node.get_text(strip=True)
            # 明らかに言語でないテキストは排除
            if lang_txt and len(lang_txt) <= 30 and "Star" not in lang_txt:
                data["language"] = lang_txt

        # --- スター数 ---
        star_node = (
            block.select_one("a[href$='/stargazers']")
            or block.select_one("a.Link--muted[href$='/stargazers']")
        )
        if star_node:
            data["stars"] = normalize_stars(star_node.get_text(strip=True))

        if data["name"]:
            items.append(data)

    return items


def extract_next_url(html: str, current: str) -> Optional[str]:
    """
    ページネーションの Next リンクを取得
    """
    soup = BeautifulSoup(html, "html.parser")

    next_btn = soup.select_one("a.next_page") or soup.select_one("a[rel='next']")
    if next_btn and next_btn.get("href"):
        return urljoin(current, next_btn["href"])

    return None


# ====== DB 処理 ======

def prepare_db(path: str):
    conn = sqlite3.connect(path)
    cur = conn.cursor()
    cur.execute(
        """
        CREATE TABLE IF NOT EXISTS repos (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT UNIQUE,
            language TEXT,
            stars INTEGER
        )
        """
    )
    conn.commit()
    return conn


def save_repos(conn, rows: List[Dict[str, Optional[str]]]):
    cur = conn.cursor()

    for r in rows:
        cur.execute(
            """
            INSERT INTO repos (name, language, stars)
            VALUES (?, ?, ?)
            ON CONFLICT(name) DO UPDATE SET
                language=excluded.language,
                stars=excluded.stars
            """,
            (r["name"], r["language"], r["stars"]),
        )

    conn.commit()


# ====== メイン ======

def run(limit_pages: int = 5, db="google_repos.db"):
    conn = prepare_db(db)
    url = BASE
    count = 0

    while url and count < limit_pages:
        print(f"[Fetch] {url}")
        html = get_html(url)

        repos = parse_repo_list(html)
        print(f" → {len(repos)} repos found")

        save_repos(conn, repos)
        count += 1

        time.sleep(1)  # 連続アクセス回避

        url = extract_next_url(html, url)

    print("\n=== Saved Repositories ===")
    cur = conn.cursor()
    for name, lang, stars in cur.execute(
        "SELECT name, language, stars FROM repos ORDER BY stars DESC, name"
    ):
        print(f"{name}\t{lang or ''}\t{stars}")

    conn.close()


if __name__ == "__main__":
    run(limit_pages=1000)

[Fetch] https://github.com/orgs/google/repositories?type=all
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/repositories?type=all#2
 → 0 repos found
[Fetch] https://github.com/orgs/google/rep