Fetch candidate URLs for each software name from GitHub, PyPI, and CRAN, and finally from a general Google search that excludes results from those three sites.  

In [13]:
%pip install requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import requests
import xmlrpc.client
from time import sleep
from typing import List, Dict
import os

In [15]:
%pip install googlesearch-python beautifulsoup4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
from googlesearch import search
from bs4 import BeautifulSoup

# GitHub REST endpoint for repository search; queried by fetch_github_urls.
GITHUB_API_URL = "https://api.github.com/search/repositories"
# Template for a package's CRAN landing page; fetch_cran_url fills in `{name}`.
CRAN_BASE_URL = "https://cran.r-project.org/web/packages/{name}/index.html"
# PyPI HTML search page; fetch_pypi_urls sends the search term as the `q` parameter.
PYPI_SEARCH_URL = "https://pypi.org/search/"

In [None]:
# Read the GitHub token from the environment so it never appears in the
# notebook itself. Surrounding whitespace (for example a trailing newline
# picked up when the variable was exported from a file) is stripped, so a
# blank value counts as missing and the token can be placed in an HTTP
# header unchanged.
GITHUB_TOKEN = (os.getenv("GITHUB_TOKEN") or "").strip()
if not GITHUB_TOKEN:
    raise ValueError("Please set the GITHUB_TOKEN environment variable.")

ValueError: Please set the GITHUB_TOKEN environment variable.

In [None]:
def fetch_github_urls(name: str, per_page: int = 5) -> List[str]:
    """Return up to `per_page` GitHub repo URLs matching `name`, most-starred first.

    Args:
        name: Search term sent as the `q` parameter of the repository search
            endpoint. Leading and trailing whitespace is ignored.
        per_page: Maximum number of repositories to request.

    Returns:
        The `html_url` of each repository in the response, in the order the
        API returned them (requested sorted by stars, descending). A blank
        `name` or a non-positive `per_page` yields an empty list without
        issuing any HTTP request.

    Raises:
        RuntimeError: If GitHub answers 401, i.e. the token was rejected.
        requests.HTTPError: For any other non-success status code.
    """
    query = name.strip() if name else ""
    if not query or per_page <= 0:
        # The search endpoint needs a non-empty `q`, and asking for zero (or
        # fewer) results can only ever mean "nothing".
        return []

    params = {"q": query, "sort": "stars", "order": "desc", "per_page": per_page}
    headers = {}
    if GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"

    resp = requests.get(GITHUB_API_URL, params=params, headers=headers, timeout=10)
    if resp.status_code == 401:
        # Checked before raise_for_status() so the caller sees an actionable hint.
        raise RuntimeError("GitHub API Unauthorized (401). Check your GITHUB_TOKEN environment variable.")
    resp.raise_for_status()

    items = resp.json().get("items") or []
    # Skip any entry lacking `html_url` instead of failing the whole lookup.
    return [item["html_url"] for item in items if item.get("html_url")]

In [None]:
def fetch_pypi_urls(name: str, max_results: int = 5) -> List[str]:
    """Return up to `max_results` PyPI project URLs matching `name`, by scraping search results.

    Args:
        name: Search term sent as the `q` parameter of the PyPI search page.
            Leading and trailing whitespace is ignored.
        max_results: Maximum number of URLs to return.

    Returns:
        Absolute `https://pypi.org/...` URLs built from the `href` of each
        `a.package-snippet` element, in page order. A blank `name` or a
        non-positive `max_results` yields an empty list without issuing any
        HTTP request.

    Raises:
        requests.HTTPError: If the search page responds with an error status.
    """
    query = name.strip() if name else ""
    if not query or max_results <= 0:
        # A blank term is not a meaningful search, and a negative limit would
        # otherwise act as a slice-from-the-end rather than a cap.
        return []

    resp = requests.get(PYPI_SEARCH_URL, params={"q": query}, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    urls: List[str] = []
    for snippet in soup.select("a.package-snippet"):
        href = snippet.get("href")
        if not href:
            # Anchors without a link do not count towards the limit.
            continue
        urls.append(f"https://pypi.org{href}")
        if len(urls) >= max_results:
            break
    return urls


In [None]:
def fetch_cran_url(name: str) -> List[str]:
    """Return the CRAN package URL if it exists, else empty list.

    Args:
        name: Candidate package name. Leading and trailing whitespace is
            ignored.

    Returns:
        A one-element list holding the package's CRAN index page URL when a
        HEAD request for it (following redirects) ends in status 200;
        otherwise an empty list. Names that cannot be CRAN package names are
        rejected up front without any HTTP request.
    """
    package = name.strip() if name else ""

    # CRAN package names consist of ASCII letters, digits and dots, have at
    # least two characters, start with a letter and do not end with a dot.
    # Checking this first keeps arbitrary text (slashes, "..", spaces) out of
    # the URL path, where it could address a page other than a package page.
    is_valid_name = (
        len(package) >= 2
        and package.isascii()
        and package[0].isalpha()
        and package[-1] != "."
        and all(ch.isalnum() or ch == "." for ch in package)
    )
    if not is_valid_name:
        return []

    url = CRAN_BASE_URL.format(name=package)
    resp = requests.head(url, allow_redirects=True, timeout=5)
    if resp.status_code == 200:
        return [url]
    return []


In [None]:
def fetch_google_urls(name: str, num_results: int = 5, sleep_interval: float = 2.0) -> List[str]:
    """
    Use googlesearch to find other URLs, excluding GitHub, PyPI, and CRAN.

    Args:
        name: Search term. Leading and trailing whitespace is ignored.
        num_results: Number of results requested from `search`.
        sleep_interval: Forwarded to `search` as its `sleep_interval` argument.

    Returns:
        The absolute http(s) URLs yielded by `search`, in the order received.
        A blank `name` or a non-positive `num_results` yields an empty list
        without performing a search.
    """
    term = name.strip() if name else ""
    if not term or num_results <= 0:
        # Without a term the query would consist solely of `-site:` exclusions.
        return []

    query = f"{term} -site:github.com -site:pypi.org -site:cran.r-project.org"
    # `num_results` and `sleep_interval` are supported by googlesearch-python
    hits = search(query, num_results=num_results, sleep_interval=sleep_interval)
    # Defensive: keep only absolute http(s) links so callers never receive
    # blank or relative entries.
    return [
        hit
        for hit in hits
        if isinstance(hit, str) and hit.startswith(("http://", "https://"))
    ]


In [None]:
def fetch_candidate_urls(software_names: List[str]) -> Dict[str, List[str]]:
    """
    Collect candidate URLs for every entry of `software_names`.

    Sources are consulted in a fixed order:
      1. GitHub
      2. PyPI
      3. CRAN
      4. General Google search (excluding above domains)

    A source that raises is reported on stdout and skipped, so one failing
    lookup never prevents the remaining ones. The returned dict maps each
    name to its URLs in first-seen order with duplicates removed.
    """
    collected: Dict[str, List[str]] = {}

    for software in software_names:
        found: List[str] = []

        # GitHub
        try:
            found.extend(fetch_github_urls(software))
        except Exception as err:
            print(f"[!] GitHub fetch failed for '{software}': {err}")

        # PyPI
        try:
            found.extend(fetch_pypi_urls(software))
        except Exception as err:
            print(f"[!] PyPI fetch failed for '{software}': {err}")

        # CRAN
        try:
            found.extend(fetch_cran_url(software))
        except Exception as err:
            print(f"[!] CRAN check failed for '{software}': {err}")

        # Google, after a one-second pause
        try:
            sleep(1)
            found.extend(fetch_google_urls(software))
        except Exception as err:
            print(f"[!] Google search failed for '{software}': {err}")

        # dict keys keep insertion order, which drops repeats while
        # preserving the position of each URL's first occurrence.
        collected[software] = list(dict.fromkeys(found))

    return collected

In [None]:
# Demo run: look up candidate URLs for three sample software names.
software_list = ["tensorflow", "dplyr", "requests"]
candidates = fetch_candidate_urls(software_list)
# Print each name on its own line, followed by its candidate URLs as a dashed list.
for name, urls in candidates.items():
    print(f"\n{name}:")
    for u in urls:
        print("  -", u)

[!] GitHub fetch failed for 'tensorflow': 401 Client Error: Unauthorized for url: https://api.github.com/search/repositories?q=tensorflow&sort=stars&order=desc&per_page=5
[!] PyPI fetch failed for 'tensorflow': <Fault -32500: "RuntimeError: PyPI no longer supports 'pip search' (or XML-RPC search). Please use https://pypi.org/search (via a browser) instead. See https://warehouse.pypa.io/api-reference/xml-rpc.html#deprecated-methods for more information.">
[!] Google search failed for 'tensorflow': search() got an unexpected keyword argument 'pause'
[!] GitHub fetch failed for 'dplyr': 401 Client Error: Unauthorized for url: https://api.github.com/search/repositories?q=dplyr&sort=stars&order=desc&per_page=5
[!] PyPI fetch failed for 'dplyr': <Fault -32500: "RuntimeError: PyPI no longer supports 'pip search' (or XML-RPC search). Please use https://pypi.org/search (via a browser) instead. See https://warehouse.pypa.io/api-reference/xml-rpc.html#deprecated-methods for more information.">
[!