This notebook removes all generic website URLs from the ground-truth and candidate-URL columns of any version of the corpus.

In [1]:
import pandas as pd
from urllib.parse import urlparse
from urllib.parse import urlparse, parse_qs

In [2]:
from urllib.parse import urlparse

def is_website(url: str) -> bool:
    """
    Return True if `url` is *not* a GitHub, PyPI, or CRAN package link.

    The decision is made on the URL's host name (and, for CRAN, its path):

    * ``github.com`` or any of its subdomains           -> False
    * ``pypi.org`` / ``pypi.python.org``                -> False
    * ``cran.r-project.org`` with a ``/web/packages/``
      or ``/package=`` path                             -> False
    * anything else                                     -> True

    URLs written without a scheme (e.g. ``github.com/user/repo``) are
    recognised as well.

    Args:
        url: The URL to classify; surrounding whitespace is ignored.

    Returns:
        True for a "generic" website, False for a repository/package link.

    Raises:
        ValueError: if `url` is empty or not a string.
    """
    if not isinstance(url, str) or not url.strip():
        raise ValueError("Invalid URL")

    cleaned = url.strip()
    parsed = urlparse(cleaned)

    # Without a scheme or leading "//", urlparse puts the host into `path`
    # and leaves `netloc` empty. Re-parse as a network-path reference so the
    # host is recognised; keep the first parse if that attempt is malformed.
    if not parsed.scheme and not parsed.netloc:
        try:
            parsed = urlparse("//" + cleaned)
        except ValueError:
            pass

    # `hostname` excludes userinfo and port and is already lower-cased, so
    # "pypi.org:443" or "user@github.com" compare correctly.
    host = parsed.hostname or ""
    path = parsed.path or ""

    # GitHub: exact host or a real subdomain. A plain substring test would
    # also match look-alikes such as "notgithub.com" or "github.com.evil.org".
    if host == "github.com" or host.endswith(".github.com"):
        return False

    # PyPI packages
    if host in {"pypi.org", "pypi.python.org"}:
        return False

    # CRAN packages (only package pages, not the CRAN home page itself)
    if host == "cran.r-project.org" and path.startswith(
        ("/web/packages/", "/package=")
    ):
        return False

    # Anything else we treat as a "generic" website
    return True


In [3]:
def remove_websites(cell):
    """
    Filter generic websites out of a comma-separated URL string.

    The cell is split on commas; each piece is trimmed and empty pieces are
    discarded. Every remaining URL for which is_website(url) returns True is
    dropped, and the surviving URLs are joined back together with ", ".
    Missing values (as detected by pd.isna) are returned unchanged.
    """
    # Missing cells pass straight through
    if pd.isna(cell):
        return cell

    retained = []
    for piece in cell.split(","):
        candidate = piece.strip()
        if not candidate:
            # blank entry (e.g. trailing comma) - nothing to classify
            continue
        if is_website(candidate):
            # generic website - drop it
            continue
        retained.append(candidate)

    return ", ".join(retained)



In [None]:
# Load the previous corpus version.
df = pd.read_excel("../corpus_v2.xlsx")

# Strip generic-website URLs from both URL columns, keeping only the
# repository/package links that remove_websites retains.
for col in ["candidate_urls", "url (ground truth)"]:
    df[col] = df[col].apply(remove_websites)

# Normalise the ground-truth column once: NaN -> '' and trim whitespace.
df['url (ground truth)'] = df['url (ground truth)'].fillna('').str.strip()

# Keep only rows that still have a ground-truth URL. The explicit .copy()
# makes the result an independent frame, so the column assignment below does
# not write to a filtered view of the original.
df = df[df['url (ground truth)'].astype(bool)].copy()

# Renumber the surviving rows consecutively, starting at 1.
df['id'] = range(1, len(df) + 1)

# Save the modified DataFrame to a new Excel file
df.to_excel("../corpus_v3.xlsx", index=False)