Fetching URL candidate from GitHub, PyPI, CRAN and then finally from Google, excluding results from the previous three.  

In [34]:
%pip install requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
import requests
import xmlrpc.client
from time import sleep
from typing import List, Dict
import os

In [36]:
%pip install googlesearch-python beautifulsoup4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
from googlesearch import search
from bs4 import BeautifulSoup

GITHUB_API_URL = "https://api.github.com/search/repositories"
CRAN_BASE_URL = "https://cran.r-project.org/web/packages/{name}/index.html"
PYPI_JSON_API = "https://pypi.org/pypi/{name}/json"
PYPI_PROJECT_URL = "https://pypi.org/project/{name}/"

In [38]:
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    raise ValueError("Please set the GITHUB_TOKEN environment variable.")

In [39]:
def fetch_github_urls(name: str, per_page: int = 5) -> List[str]:
    """Return up to `per_page` GitHub repo URLs matching `name`, using token if provided."""
    params = {"q": name, "sort": "stars", "order": "desc", "per_page": per_page}
    headers = {}
    if GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"

    resp = requests.get(GITHUB_API_URL, params=params, headers=headers, timeout=10)
    if resp.status_code == 401:
        raise RuntimeError("GitHub API Unauthorized (401). Check your GITHUB_TOKEN environment variable.")
    resp.raise_for_status()
    items = resp.json().get("items", [])
    return [item["html_url"] for item in items]

In [40]:
def fetch_pypi_urls(name: str, max_results: int = 5) -> List[str]:
    """Return up to `max_results` PyPI project URLs matching `name`, combining exact lookup and fuzzy search."""
    urls: List[str] = []
    # 1) Exact lookup via JSON API
    api_url = PYPI_JSON_API.format(name=name)
    try:
        resp = requests.get(api_url, timeout=10)
        if resp.status_code == 200:
            urls.append(PYPI_PROJECT_URL.format(name=name))
    except requests.RequestException:
        pass

    # 2) Fuzzy search via Google for site-specific PyPI project pages
    query = f"site:pypi.org/project {name}"
    try:
        results = list(search(query, num_results=max_results, sleep_interval=2.0))
        for url in results:
            if url.startswith("https://pypi.org/project/"):
                urls.append(url)
    except Exception:
        pass

    # Deduplicate and return
    seen = set()
    deduped = []
    for u in urls:
        if u not in seen:
            seen.add(u)
            deduped.append(u)
    return deduped

In [41]:
def fetch_cran_urls(name: str, max_results: int = 5) -> List[str]:
    """Return up to `max_results` CRAN package URLs matching `name`, combining exact lookup and fuzzy search."""
    urls: List[str] = []
    # 1) Exact lookup via GET
    exact_url = CRAN_BASE_URL.format(name=name)
    try:
        resp = requests.get(exact_url, allow_redirects=True, timeout=10)
        if resp.status_code == 200:
            urls.append(exact_url)
    except requests.RequestException:
        pass

    # 2) Fuzzy search via Google for site-specific CRAN package pages
    query = f"site:cran.r-project.org/web/packages {name}"
    try:
        results = list(search(query, num_results=max_results, sleep_interval=2.0))
        for url in results:
            if url.startswith("https://cran.r-project.org/web/packages/"):
                urls.append(url)
    except Exception:
        pass

    # Deduplicate and return
    seen = set()
    deduped = []
    for u in urls:
        if u not in seen:
            seen.add(u)
            deduped.append(u)
    return deduped


In [42]:
# Domains to exclude in general web search
EXCLUDE_SITES = [
    "github.com",
    "pypi.org",
    "cran.r-project.org",
    "youtube.com",
    "youtu.be",
    "medium.com",
    "stackoverflow.com",
    "reddit.com",
    "twitter.com",
    "facebook.com",
    "linkedin.com",
    "geeksforgeeks.org",
    "w3schools.com",
    "tutorialspoint.com"
]
def fetch_google_urls(name: str, num_results: int = 5, sleep_interval: float = 2.0) -> List[str]:
    """
    Use googlesearch to find other URLs for `name`, excluding known non-software domains.

    `EXCLUDE_SITES` contains domains to omit (e.g., YouTube, social media, Q&A sites).
    """
    # Build exclude portion of query
    exclude_queries = " ".join(f"-site:{domain}" for domain in EXCLUDE_SITES)
    query = f"{name} {exclude_queries}"
    # `num_results` and `sleep_interval` are supported parameters
    return list(search(query, num_results=num_results, sleep_interval=sleep_interval))


In [43]:
def fetch_candidate_urls(software_names: List[str]) -> Dict[str, List[str]]:
    """
    For each software name, fetch candidate URLs in this order:
      1. GitHub
      2. PyPI
      3. CRAN
      4. General Google search (excluding above domains)
    """
    all_results: Dict[str, List[str]] = {}
    for name in software_names:
        results = []

        # GitHub
        try:
            results += fetch_github_urls(name)
        except Exception as e:
            print(f"[!] GitHub fetch failed for '{name}': {e}")

        # PyPI
        try:
            results += fetch_pypi_urls(name)
        except Exception as e:
            print(f"[!] PyPI fetch failed for '{name}': {e}")

        # CRAN
        try:
            results += fetch_cran_urls(name)
        except Exception as e:
            print(f"[!] CRAN check failed for '{name}': {e}")

        # Google
        try:
            sleep(1)
            results += fetch_google_urls(name)
        except Exception as e:
            print(f"[!] Google search failed for '{name}': {e}")

        # dedupe, preserve order
        seen = set()
        deduped = []
        for url in results:
            if url not in seen:
                seen.add(url)
                deduped.append(url)
        all_results[name] = deduped

    return all_results

In [44]:
software_list = ["tensorflow", "dplyr", "requests"]
candidates = fetch_candidate_urls(software_list)
for name, urls in candidates.items():
    print(f"\n{name}:")
    for u in urls:
        print("  -", u)


tensorflow:
  - https://github.com/tensorflow/tensorflow
  - https://github.com/huggingface/transformers
  - https://github.com/tensorflow/models
  - https://github.com/fighting41love/funNLP
  - https://github.com/keras-team/keras
  - https://pypi.org/project/tensorflow/
  - https://pypi.org/project/tensorflow-tpu/
  - https://pypi.org/project/tensorflow/2.4.1/
  - https://pypi.org/project/types-tensorflow/
  - https://pypi.org/project/tensorflow/2.5.0/
  - https://cran.r-project.org/web/packages/tensorflow/index.html
  - https://cran.r-project.org/web/packages/tfhub/vignettes/hub-with-keras.html
  - https://cran.r-project.org/web/packages/tfestimators/vignettes/tensorflow_layers.html
  - https://cran.r-project.org/web/packages/tfdeploy/vignettes/introduction.html
  - https://cran.r-project.org/web/packages/tensorflow/tensorflow.pdf
  - https://cran.r-project.org/web/packages/tfaddons/readme/README.html
  - https://www.tensorflow.org/
  - https://en.wikipedia.org/wiki/TensorFlow
  - h