Fetching URL candidate from GitHub, PyPI, CRAN and then finally from Google, excluding results from the previous three.  

In [159]:
%pip install requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [160]:
import requests
from time import sleep
from typing import List, Dict
import os
import pandas as pd

In [161]:
%pip install googlesearch-python beautifulsoup4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [162]:
from googlesearch import search
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin


GITHUB_API_URL = "https://api.github.com/search/repositories"
CRAN_BASE_URL = "https://cran.r-project.org/web/packages/{name}/index.html"
PYPI_JSON_API    = "https://pypi.org/pypi/{name}/json"
PYPI_PROJECT_URL = "https://pypi.org/project/{name}/"
PYPI_SEARCH_URL  = "https://pypi.org/search/?q={query}"



In [163]:
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    raise ValueError("Please set the GITHUB_TOKEN environment variable.")

In [164]:
def fetch_github_urls(name: str, per_page: int = 5) -> List[str]:
    """Return up to `per_page` GitHub repo URLs matching `name`, using token if provided."""
    params = {"q": name, "sort": "stars", "order": "desc", "per_page": per_page}
    headers = {}
    if GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"

    resp = requests.get(GITHUB_API_URL, params=params, headers=headers, timeout=10)
    if resp.status_code == 401:
        raise RuntimeError("GitHub API Unauthorized (401). Check your GITHUB_TOKEN environment variable.")
    resp.raise_for_status()
    items = resp.json().get("items", [])
    return [item["html_url"] for item in items]

In [165]:

_pkg_re = re.compile(r"^/project/([^/]+)/?$")

def fetch_pypi_urls(name: str, max_results: int = 5) -> List[str]:
    """
    Return up to `max_results` *existing* PyPI project root URLs for `name`,
    preserving whatever final URL PyPI redirects you to.
    """
    raw_urls: List[str] = []

    # 1) Try the JSON API (gives you the canonical root if it exists)
    try:
        api_url = PYPI_JSON_API.format(name=name)
        resp = requests.get(api_url, timeout=10)
        if resp.status_code == 200:
            raw_urls.append(f"https://pypi.org/project/{name}/")
    except requests.RequestException:
        pass

    # 2) Scrape the official PyPI search page
    try:
        resp = requests.get(PYPI_SEARCH_URL.format(query=name), timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        snippets = soup.select("a.package-snippet")[:max_results]
        for a in snippets:
            href = a.get("href", "")
            if _pkg_re.match(href):
                raw_urls.append(urljoin("https://pypi.org", href))
    except Exception:
        pass

    # 3) Normalize + HEAD‐check + dedupe
    seen = set()
    valid = []
    for url in raw_urls:
        # ensure trailing slash
        if not url.endswith("/"):
            url += "/"
        try:
            head = requests.head(url, allow_redirects=True, timeout=5)
            if head.status_code != 200:
                continue
            real = head.url
            if not real.endswith("/"):
                real += "/"
        except requests.RequestException:
            continue

        if real not in seen:
            seen.add(real)
            valid.append(real)
            if len(valid) >= max_results:
                break

    return valid


In [166]:
import re
_pkg_re = re.compile(r"https?://cran\.r-project\.org/web/packages/([^/]+)/")


def fetch_cran_urls(name: str, max_results: int = 5) -> List[str]:
    """Return up to `max_results` CRAN package URLs matching `name`."""
    raw_urls: List[str] = []

    # 1) Exact lookup via GET
    exact_url = CRAN_BASE_URL.format(name=name)
    try:
        resp = requests.get(exact_url, allow_redirects=True, timeout=10)
        if resp.status_code == 200:
            raw_urls.append(exact_url)
    except requests.RequestException:
        pass

    # 2) Fuzzy search via Google for any CRAN/web/packages URLs
    query = f"site:cran.r-project.org/web/packages {name}"
    try:
        for url in search(query, num_results=max_results, sleep_interval=2.0):
            if "cran.r-project.org/web/packages/" in url:
                raw_urls.append(url)
    except Exception:
        pass

    # 3) Extract package names and rebuild canonical index URLs
    seen = set()
    canonical = []
    for u in raw_urls:
        m = _pkg_re.match(u)
        if m:
            pkg = m.group(1)
            if pkg not in seen:
                seen.add(pkg)
                canonical.append(CRAN_BASE_URL.format(name=pkg))
        # else: skip any URL that isn’t a top-level package path

    # 4) Limit to max_results
    return canonical[:max_results]

In [167]:
# Domains to exclude in general web search
EXCLUDE_SITES = [
    "github.com",
    "pypi.org",
    "cran.r-project.org",
    "youtube.com",
    "youtu.be",
    "medium.com",
    "stackoverflow.com",
    "reddit.com",
    "twitter.com",
    "facebook.com",
    "linkedin.com",
    "geeksforgeeks.org",
    "w3schools.com",
    "tutorialspoint.com"
]
def fetch_google_urls(name: str, num_results: int = 5, sleep_interval: float = 2.0) -> List[str]:
    """
    Use googlesearch to find other URLs for `name`, excluding known non-software domains.

    `EXCLUDE_SITES` contains domains to omit (e.g., YouTube, social media, Q&A sites).
    """
    # Build exclude portion of query
    exclude_queries = " ".join(f"-site:{domain}" for domain in EXCLUDE_SITES)
    query = f"{name} {exclude_queries}"
    # `num_results` and `sleep_interval` are supported parameters
    return list(search(query, num_results=num_results, sleep_interval=sleep_interval))


In [168]:
def fetch_candidate_urls(name: str) -> set[str]:
    """
    For each software name, fetch candidate URLs in this order:
      1. GitHub
      2. PyPI
      3. CRAN
      4. General Google search (excluding above domains)
    """
    results = []

    # GitHub
    try:
        results += fetch_github_urls(name)
    except Exception as e:
        print(f"[!] GitHub fetch failed for '{name}': {e}")

    # PyPI
    try:
        results += fetch_pypi_urls(name)
    except Exception as e:
        print(f"[!] PyPI fetch failed for '{name}': {e}")

    # CRAN
    try:
        results += fetch_cran_urls(name)
    except Exception as e:
        print(f"[!] CRAN check failed for '{name}': {e}")

    # Google
    try:
        sleep(1)
        results += fetch_google_urls(name)
    except Exception as e:
        print(f"[!] Google search failed for '{name}': {e}")

    # dedupe, preserve order
    return set(results)

In [None]:
corpus = pd.read_excel("../corpus_v2.xlsx")
unique_names = corpus['name'].unique()
candidates = {name: set() for name in unique_names}

for _, row in corpus.iterrows():
    name = row['name']
    urls_str = row.get('candidate_urls', '')
    if not isinstance(urls_str, str):
        continue
    for url in urls_str.split(','):
        url = url.strip()
        if url:
            candidates[name].add(url)
            candidates[name].update(fetch_candidate_urls(name))
print(candidates)

[!] Google search failed for 'BCEA': 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DBCEA%2B-site%253Agithub.com%2B-site%253Apypi.org%2B-site%253Acran.r-project.org%2B-site%253Ayoutube.com%2B-site%253Ayoutu.be%2B-site%253Amedium.com%2B-site%253Astackoverflow.com%2B-site%253Areddit.com%2B-site%253Atwitter.com%2B-site%253Afacebook.com%2B-site%253Alinkedin.com%2B-site%253Ageeksforgeeks.org%2B-site%253Aw3schools.com%2B-site%253Atutorialspoint.com%26num%3D7%26hl%3Den%26start%3D0%26safe%3Dactive&hl=en&q=EgSy3DRZGJjq0sAGIjC9gYUEqWlFO1tx0R-_pDv5HqHJPkeJiSSe957in8z_h50z8kk-tAec8a65_79LQxkyAnJSWgFD
[!] Google search failed for 'BCEA': 429 Client Error: Too Many Requests for url: https://www.google.com/sorry/index?continue=https://www.google.com/search%3Fq%3DBCEA%2B-site%253Agithub.com%2B-site%253Apypi.org%2B-site%253Acran.r-project.org%2B-site%253Ayoutube.com%2B-site%253Ayoutu.be%2B-site%253Amedium.com%2B-site%253Astack