Fetching URL candidate from GitHub, PyPI, CRAN and then finally from Google, excluding results from the previous three.  

In [218]:
%pip install requests

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [219]:
import requests
from time import sleep
from typing import List, Dict
import os
import pandas as pd
import difflib


In [220]:
%pip install googlesearch-python beautifulsoup4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [221]:
from googlesearch import search
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin
import xmlrpc.client
from functools import lru_cache





In [222]:
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    raise ValueError("Please set the GITHUB_TOKEN environment variable.")

In [223]:
GITHUB_API_URL = "https://api.github.com/search/repositories"

def fetch_github_urls(name: str, per_page: int = 5) -> List[str]:
    """Return up to `per_page` GitHub repo URLs matching `name`, using token if provided."""
    params = {"q": name, "sort": "stars", "order": "desc", "per_page": per_page}
    headers = {}
    if GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"

    resp = requests.get(GITHUB_API_URL, params=params, headers=headers, timeout=10)
    if resp.status_code == 401:
        raise RuntimeError("GitHub API Unauthorized (401). Check your GITHUB_TOKEN environment variable.")
    resp.raise_for_status()
    items = resp.json().get("items", [])
    return [item["html_url"] for item in items]

In [224]:
PYPI_JSON_URL    = "https://pypi.org/pypi/{pkg}/json"
PYPI_PROJECT_URL = "https://pypi.org/project/{pkg}/"

@lru_cache(maxsize=512)
def _get_pypi_info(pkg: str, timeout: float = 10.0) -> Dict:
    """
    Fetches the JSON info block for `pkg`, or returns {} on error.
    """
    try:
        r = requests.get(PYPI_JSON_URL.format(pkg=pkg), timeout=timeout)
        if r.status_code == 200:
            return r.json().get("info", {})
    except requests.RequestException:
        pass
    return {}

@lru_cache(maxsize=256)
def fetch_pypi_urls(
    pkg_name: str,
    max_results: int = 5,
    timeout: float = 10.0
) -> List[str]:
    """
    1) Exact lookup via JSON API → returns info['package_url'] (or info['project_url'])
    2) Fuzzy lookup via XML‐RPC + JSON API per hit
    """
    urls: List[str] = []

    # 1) Exact match
    info = _get_pypi_info(pkg_name, timeout)
    if info:
        url = info.get("package_url") or info.get("project_url")
        if url:
            urls.append(url)

    if len(urls) >= max_results:
        return urls[:max_results]

    # 2) Fuzzy search
    try:
        client = xmlrpc.client.ServerProxy("https://pypi.org/pypi")
        hits = client.search({"name": pkg_name}, "or")
        seen = set(pkg_name.lower())

        for hit in hits:
            name = hit.get("name")
            key  = name.lower() if name else None
            if not key or key in seen:
                continue
            seen.add(key)

            # pull its JSON info to get the true URL
            info = _get_pypi_info(name, timeout)
            if info:
                url = info.get("package_url") or info.get("project_url")
                if url:
                    urls.append(url)
                    if len(urls) >= max_results:
                        break
                    continue

            # fallback (should rarely be needed)
            urls.append(PYPI_PROJECT_URL.format(pkg=name))
            if len(urls) >= max_results:
                break

    except Exception:
        pass

    return urls[:max_results]

In [225]:
CRAN_PACKAGES_URL = "https://cran.r-project.org/src/contrib/PACKAGES"
CRAN_BASE_URL     = "https://cran.r-project.org/web/packages/{pkg}/index.html"

@lru_cache(maxsize=1)
def _load_cran_packages(timeout: float = 10.0) -> List[str]:
    """
    Fetch and parse the CRAN PACKAGES index into a list of package names.
    Cached in memory so we only download it once.
    """
    resp = requests.get(CRAN_PACKAGES_URL, timeout=timeout)
    resp.raise_for_status()
    pkgs = []
    for line in resp.text.splitlines():
        if line.startswith("Package:"):
            pkgs.append(line.split(":", 1)[1].strip())
    return pkgs

@lru_cache(maxsize=256)
def fetch_cran_urls(
    name: str,
    max_results: int = 5,
    timeout: float = 10.0
) -> List[str]:
    """
    Return up to `max_results` canonical CRAN URLs for packages matching `name`:
      1) exact match
      2) substring match
      3) fuzzy match via difflib
    """
    pkgs = _load_cran_packages(timeout)
    urls: List[str] = []
    name_lower = name.lower()

    # 1) Exact
    if name in pkgs:
        urls.append(CRAN_BASE_URL.format(pkg=name))

    # 2) Substring
    if len(urls) < max_results:
        subs = [p for p in pkgs if name_lower in p.lower() and p != name]
        for p in subs:
            if len(urls) >= max_results:
                break
            urls.append(CRAN_BASE_URL.format(pkg=p))

    # 3) Fuzzy
    if len(urls) < max_results:
        # cutoff=0.6 is a sensible default; tweak as needed
        fuzzy = difflib.get_close_matches(name, pkgs, n=max_results, cutoff=0.6)
        for p in fuzzy:
            if len(urls) >= max_results:
                break
            if p not in [u.split("/")[-2] for u in urls]:
                urls.append(CRAN_BASE_URL.format(pkg=p))

    return urls[:max_results]

In [None]:
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]
CSE_ID         = os.environ["GOOGLE_CSE_ID"]
EXCLUDE_SITES = [
    "github.com",
    "pypi.org",
    "cran.r-project.org",
    "youtube.com",
    "youtu.be",
    "medium.com",
    "stackoverflow.com",
    "reddit.com",
    "twitter.com",
    "facebook.com",
    "linkedin.com",
    "geeksforgeeks.org",
    "w3schools.com",
    "tutorialspoint.com"
]

@lru_cache(maxsize=128)
def fetch_google_urls(
    name: str,
    num_results: int = 10
) -> List[str]:
    """
    Fetch up to `num_results` web URLs for `name` via the
    Google Custom Search JSON API, excluding any in EXCLUDE_SITES.
    """
    # build the "-site:..." string
    exclude_query = " ".join(f"-site:{d}" for d in EXCLUDE_SITES)
    query = f"{name} {exclude_query}"

    urls: List[str] = []
    page_size = 10  # API max per request

    for start in range(1, num_results + 1, page_size):
        params = {
            "key":   GOOGLE_API_KEY,
            "cx":    CSE_ID,
            "q":     query,
            "start": start,
            "num":   min(page_size, num_results - len(urls)),
        }
        r = requests.get("https://www.googleapis.com/customsearch/v1", params=params, timeout=5)
        r.raise_for_status()
        data = r.json()
        for item in data.get("items", []):
            urls.append(item["link"])
            if len(urls) >= num_results:
                break
        if len(urls) >= num_results:
            break

    return urls

In [None]:
def fetch_candidate_urls(name: str) -> set[str]:
    """
    For each software name, fetch candidate URLs in this order:
      1. GitHub
      2. PyPI
      3. CRAN
      4. General Google search (excluding above domains)
    """
    results = []

    # GitHub
    try:
        results += fetch_github_urls(name)
    except Exception as e:
        print(f"[!] GitHub fetch failed for '{name}': {e}")

    # PyPI
    try:
        results += fetch_pypi_urls(name)
    except Exception as e:
        print(f"[!] PyPI fetch failed for '{name}': {e}")

    # CRAN
    try:
        results += fetch_cran_urls(name)
    except Exception as e:
        print(f"[!] CRAN check failed for '{name}': {e}")

    # Google
    try:
        sleep(1)
        results += fetch_google_urls(name)
    except Exception as e:
        print(f"[!] Google search failed for '{name}': {e}")

    # dedupe, preserve order
    return set(results)

In [228]:
"""names = ["TensOrflow",'tidyr','reQuests']
for name in names:
    print(f"Candidate URLs for '{name}':")
    urls = fetch_candidate_urls(name)
    for url in urls:
        print(f"  - {url}")
    print()"""

'names = ["TensOrflow",\'tidyr\',\'reQuests\']\nfor name in names:\n    print(f"Candidate URLs for \'{name}\':")\n    urls = fetch_candidate_urls(name)\n    for url in urls:\n        print(f"  - {url}")\n    print()'

In [229]:
corpus = pd.read_excel("../corpus_v2.xlsx")
unique_names = corpus['name'].unique()
candidates = {name: set() for name in unique_names}

for _, row in corpus.iterrows():
    name = row['name']
    urls_str = row.get('candidate_urls', '')
    if not isinstance(urls_str, str):
        continue
    for url in urls_str.split(','):
        url = url.strip()
        if url:
            candidates[name].add(url)
            candidates[name].update(fetch_candidate_urls(name))
print(candidates)

[!] GitHub fetch failed for 'BeautifulSoup': 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=BeautifulSoup&sort=stars&order=desc&per_page=5
[!] GitHub fetch failed for 'BeautifulSoup': 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=BeautifulSoup&sort=stars&order=desc&per_page=5
[!] GitHub fetch failed for 'BeautifulSoup': 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=BeautifulSoup&sort=stars&order=desc&per_page=5
[!] GitHub fetch failed for 'BeautifulSoup': 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=BeautifulSoup&sort=stars&order=desc&per_page=5
[!] GitHub fetch failed for 'BeautifulSoup': 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=BeautifulSoup&sort=stars&order=desc&per_page=5
[!] GitHub fetch failed for 'BeautifulSoup': 403 Client Error: Forbidden for url: https://api.github.com/search/repositories?q=Bea

KeyboardInterrupt: 