# Download all the library members from documentation

In [1]:
# get list of libraries we need for the study

from src.constants import DOCUMENTED_LIBRARIES

print(
    f"Need documentation for {len(DOCUMENTED_LIBRARIES)} libraries: {DOCUMENTED_LIBRARIES}"
)

Need documentation for 30 libraries: ['bs4', 'chardet', 'cryptography', 'dateutil', 'django', 'folium', 'librosa', 'lxml', 'matplotlib', 'nltk', 'numpy', 'openpyxl', 'pandas', 'psutil', 'pytesseract', 'pytz', 'regex', 'requests', 'scipy', 'seaborn', 'sklearn', 'statsmodels', 'sympy', 'tensorflow', 'textblob', 'texttable', 'wordcloud', 'wordninja', 'xlwt', 'xmltodict']


## **1.** Manually download 6 smaller libraries without parsable documentation

In [2]:
# small libraries, scraped manually from the source code

manually_scraped = {
    "wordninja": {
        "url": "https://github.com/keredson/wordninja/blob/master/wordninja.py",
        "version": "2.0.0",
        "members": [
            "wordninja",
            "wordninja.LanguageModel",
            "wordninja.DEFAULT_LANGUAGE_MODEL",
            "wordninja.split",
        ],
    },
    "texttable": {
        "url": "https://github.com/foutaise/texttable/blob/master/texttable.py",
        "version": "1.7.0",
        "members": [
            "texttable",
            "texttable.Texttable",
            "texttable.ArraySizeError",
            "texttable.obj2unicode",
            "texttable.len",
        ],
    },
    "xmltodict": {
        "url": "https://github.com/martinblech/xmltodict/blob/master/xmltodict.py",
        "version": "0.14.2",
        "members": [
            "xmltodict",
            "xmltodict.parse",
            "xmltodict.unparse",
            "xmltodict.ParsingInterrupted",
        ],
    },
    "regex": {
        "url": "https://github.com/mrabarnett/mrab-regex/blob/hg/regex_3/regex.py",
        "version": "2.5.153",
        "members": [
            "regex",
            "regex.cache_all",
            "regex.compile",
            "regex.DEFAULT_VERSION",
            "regex.escape",
            "regex.findall",
            "regex.finditer",
            "regex.fullmatch",
            "regex.match",
            "regex.purge",
            "regex.search",
            "regex.split",
            "regex.splititer",
            "regex.sub",
            "regex.subf",
            "regex.subfn",
            "regex.subn",
            "regex.template",
            "regex.Scanner",
            "regex.A",
            "regex.ASCII",
            "regex.B",
            "regex.BESTMATCH",
            "regex.D",
            "regex.DEBUG",
            "regex.E",
            "regex.ENHANCEMATCH",
            "regex.S",
            "regex.DOTALL",
            "regex.F",
            "regex.FULLCASE",
            "regex.I",
            "regex.IGNORECASE",
            "regex.L",
            "regex.LOCALE",
            "regex.M",
            "regex.MULTILINE",
            "regex.P",
            "regex.POSIX",
            "regex.R",
            "regex.REVERSE",
            "regex.T",
            "regex.TEMPLATE",
            "regex.U",
            "regex.UNICODE",
            "regex.V0",
            "regex.VERSION0",
            "regex.V1",
            "regex.VERSION1",
            "regex.X",
            "regex.VERBOSE",
            "regex.W",
            "regex.WORD",
            "regex.error",
            "regex.Regex",
            "regex.__version__",
            "regex.__doc__",
            "regex.RegexFlag",
        ],
    },
    "pytz": {
        "url": "https://github.com/stub42/pytz/blob/master/src/pytz/__init__.py",
        "version": "2025.2",
        "members": [
            "pytz",
            "pytz.timezone",
            "pytz.utc",
            "pytz.country_timezones",
            "pytz.country_names",
            "pytz.all_timezones",
            "pytz.all_timezones_set",
            "pytz.common_timezones",
            "pytz.common_timezones_set",
            "pytz.BaseTzInfo",
            "pytz.FixedOffset",
            "pytz.AmbiguousTimeError",
            "pytz.InvalidTimeError",
            "pytz.NonExistentTimeError",
            "pytz.UnknownTimeZoneError",
            "pytz.exceptions",
            "pytz.exceptions.AmbiguousTimeError",
            "pytz.exceptions.InvalidTimeError",
            "pytz.exceptions.NonExistentTimeError",
            "pytz.exceptions.UnknownTimeZoneError",
            "pytz.tzinfo",
            "pytz.tzinfo.memorized_timedelta",
            "pytz.tzinfo.memorized_datetime",
            "pytz.tzinfo.memorized_ttinfo",
            "pytz.tzinfo.BaseTzInfo",
            "pytz.tzinfo.StaticTzInfo",
            "pytz.tzinfo.DstTzInfo",
            "pytz.tzinfo.unpickler",
            "pytz.tzfile",
            "pytz.tzfile.build_tzinfo",
            "pytz.reference",
            "pytz.reference.FixedOffset",
            "pytz.reference.LocalTimezone",
            "pytz.reference.USTimeZone",
            "pytz.reference.Eastern",
            "pytz.reference.Central",
            "pytz.reference.Mountain",
            "pytz.reference.Pacific",
            "pytz.reference.UTC",
            "pytz.lazy",
            "pytz.lazy.LazyDict",
            "pytz.lazy.LazyList",
            "pytz.lazy.LazySet",
        ],
    },
    "pytesseract": {
        "url": "https://github.com/madmaze/pytesseract",
        "version": "0.3.13",
        "members": [
            "pytesseract",
            "pytesseract.DEFAULT_ENCODING",
            "pytesseract.LANG_PATTERN",
            "pytesseract.RGB_MODE",
            "pytesseract.SUPPORTED_FORMATS",
            "pytesseract.OSD_KEYS",
            "pytesseract.EXTENTION_TO_CONFIG",
            "pytesseract.TESSERACT_MIN_VERSION",
            "pytesseract.TESSERACT_ALTO_VERSION",
            "pytesseract.Output",
            "pytesseract.PandasNotSupported",
            "pytesseract.TesseractError",
            "pytesseract.TesseractNotFoundError",
            "pytesseract.TSVNotSupported",
            "pytesseract.ALTONotSupported",
            "pytesseract.kill",
            "pytesseract.timeout_manager",
            "pytesseract.run_once",
            "pytesseract.get_errors",
            "pytesseract.cleanup",
            "pytesseract.prepare",
            "pytesseract.save",
            "pytesseract.subprocess_args",
            "pytesseract.run_tesseract",
            "pytesseract.run_and_get_multiple_output",
            "pytesseract.run_and_get_output",
            "pytesseract.file_to_dict",
            "pytesseract.is_valid",
            "pytesseract.osd_to_dict",
            "pytesseract.get_languages",
            "pytesseract.get_tesseract_version",
            "pytesseract.image_to_string",
            "pytesseract.image_to_pdf_or_hocr",
            "pytesseract.image_to_alto_xml",
            "pytesseract.image_to_boxes",
            "pytesseract.get_pandas_output",
            "pytesseract.image_to_data",
            "pytesseract.image_to_osd",
            "pytesseract.main",
        ],
    },
}

print(f"Have {len(manually_scraped)} manually scraped libraries.")

Have 6 manually scraped libraries.


## **2.** Automatically download the inventory of 21 libraries using Sphinx / readthedocs

In [3]:
# define the inventory urls for libraries with sphinx / readthedocs documentation

inventory_urls = {
    "openpyxl": "https://openpyxl.readthedocs.io/en/latest/objects.inv",
    "django": "https://docs.djangoproject.com/en/stable/objects.inv",
    "statsmodels": "https://www.statsmodels.org/stable/objects.inv",
    "wordcloud": "https://amueller.github.io/word_cloud/objects.inv",
    "librosa": "https://librosa.org/doc/latest/objects.inv",
    "psutil": "https://psutil.readthedocs.io/en/latest/objects.inv",
    "chardet": "https://chardet.readthedocs.io/en/latest/objects.inv",
    "textblob": "https://textblob.readthedocs.io/en/latest/objects.inv",
    "xlwt": "https://xlwt.readthedocs.io/en/latest/objects.inv",
    "dateutil": "https://dateutil.readthedocs.io/en/stable/objects.inv",
    "scipy": "https://docs.scipy.org/doc/scipy/objects.inv",
    "seaborn": "https://seaborn.pydata.org/objects.inv",
    "cryptography": "https://cryptography.io/en/latest/objects.inv",
    "pandas": "https://pandas.pydata.org/pandas-docs/stable/objects.inv",
    "numpy": "https://numpy.org/doc/stable/objects.inv",
    "sklearn": "https://scikit-learn.org/stable/objects.inv",
    "matplotlib": "https://matplotlib.org/stable/objects.inv",
    "sympy": "https://docs.sympy.org/latest/objects.inv",
    "requests": "https://docs.python-requests.org/en/latest/objects.inv",
    "bs4": "https://www.crummy.com/software/BeautifulSoup/bs4/doc/objects.inv",
    "folium": "https://python-visualization.github.io/folium/latest/objects.inv",
}

In [4]:
# define method to fetch the inventory and extract importable python objects

import sphobjinv as soi

python_objects = {
    "module",
    "class",
    "exception",
    "function",
    "data",
    # all of these are parts of classes, and not importable on their own, so ignored
    # "method", "attribute", "property",
}


def get_library_objects(library: str) -> dict:
    """
    Fetch the inventory from the given URL and return a dictionary with library info.
    """
    # download the inventory
    print(f"Scraping {library}...")
    inv = soi.Inventory(url=inventory_urls[library])

    # filter to only importable python objects
    members = set()
    for obj in inv.objects:
        if obj.domain == "py" and obj.role in python_objects:
            members.add(obj.name)

    print(f"Have {len(members)} members for {library}.")
    return {
        "url": inventory_urls[library],
        "version": inv.version,
        "toplevels": sorted(set(m.split(".")[0] for m in members)),
        "members": sorted(members),
    }

In [5]:
# download the members of the libraries

sphinx_scraped = {}
for library in inventory_urls:
    try:
        info = get_library_objects(library)
        sphinx_scraped[library] = info
    except Exception as e:
        print(f"Error scraping {library}: {e}")

Scraping openpyxl...
Have 799 members for openpyxl.
Scraping django...
Have 1163 members for django.
Scraping statsmodels...
Have 935 members for statsmodels.
Scraping wordcloud...
Have 5 members for wordcloud.
Scraping librosa...
Have 232 members for librosa.
Scraping psutil...
Have 118 members for psutil.
Scraping chardet...
Have 68 members for chardet.
Scraping textblob...
Have 55 members for textblob.
Scraping xlwt...
Have 15 members for xlwt.
Scraping dateutil...
Have 50 members for dateutil.
Scraping scipy...
Have 2626 members for scipy.
Scraping seaborn...
Have 133 members for seaborn.
Scraping cryptography...
Have 454 members for cryptography.
Scraping pandas...
Have 356 members for pandas.
Scraping numpy...
Have 1268 members for numpy.
Scraping sklearn...
Have 851 members for sklearn.
Scraping matplotlib...
Have 1188 members for matplotlib.
Scraping sympy...
Have 3030 members for sympy.
Scraping requests...
Have 48 members for requests.
Scraping bs4...
Have 133 members for bs4

## **3.** Scrape members for the 3 remaining libraries

In [6]:
remaining = {
    "nltk": {
        "url": "https://www.nltk.org/py-modindex.html",
        "version": "3.8.1",
    },
    "tensorflow": {
        "url": "https://www.tensorflow.org/api_docs/python/tf",
        "version": "2.6.0",
    },
    "lxml": {
        "urls": [
            "https://lxml.de/apidoc/index.html",
            "https://lxml.de/apidoc/lxml.html.html",
            "https://lxml.de/apidoc/lxml.isoschematron.html",
        ],
        "version": "4.6.3",
    },
}

In [7]:
# method to access the html content of documentation pages

from bs4 import BeautifulSoup
import requests


def get_html_soup(url: str) -> BeautifulSoup:
    """
    Fetch the HTML content from the given URL and return a BeautifulSoup object.
    """
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad responses
    return BeautifulSoup(response.text, "html.parser")

In [8]:
nltk_soup = get_html_soup(remaining["nltk"]["url"])

nltk_members = {"nltk"}
# extract all links from the nltk module index page
for a in nltk_soup.find_all("a", href=True):
    name = a.text.strip()

    # filter entries that look like nltk module paths
    if name.startswith("nltk."):
        nltk_members.add(name)

nltk_members = sorted(nltk_members)
remaining["nltk"]["toplevels"] = sorted(set(m.split(".")[0] for m in nltk_members))
remaining["nltk"]["members"] = nltk_members
print(f"Have {len(nltk_members)} members for nltk:\n\t{nltk_members[:5]}")

Have 355 members for nltk:
	['nltk', 'nltk.app', 'nltk.app.chartparser_app', 'nltk.app.chunkparser_app', 'nltk.app.collocations_app']


In [9]:
tf_soup = get_html_soup(remaining["tensorflow"]["url"])

tf_members = set()
# extract links from expandable navigation bar
for div in tf_soup.select("li.devsite-nav-expandable"):
    # get top level module name from the toggle title span
    reference = div.select_one("span.devsite-nav-text")
    if not reference or not reference.text.startswith("tf"):
        continue

    # clean up the module name
    title_text = reference.text.strip().replace("\u200b", "").replace(" ", "")
    tf_module = title_text.replace("tf", "tensorflow")

    # within the expandable div, find all links to members
    for a in div.select("li.devsite-nav-item a.devsite-nav-title"):
        member_text = a.select_one("span.devsite-nav-text").get_text().strip()
        member_text = member_text.replace("\u200b", "").replace(" ", "")

        # skip overview entries
        if member_text.lower() == "overview":
            continue

        # add full module name
        tf_members.add(f"{tf_module}.{member_text}")

tf_members = sorted(tf_members)
remaining["tensorflow"]["toplevels"] = sorted(set(m.split(".")[0] for m in tf_members))
remaining["tensorflow"]["members"] = tf_members
print(f"Have {len(tf_members)} members for tensorflow:\n\t{tf_members[:10]}")

Have 4377 members for tensorflow:
	['tensorflow.AggregationMethod', 'tensorflow.CriticalSection', 'tensorflow.DeviceSpec', 'tensorflow.GradientTape', 'tensorflow.Graph', 'tensorflow.IndexedSlices', 'tensorflow.IndexedSlicesSpec', 'tensorflow.Module', 'tensorflow.Operation', 'tensorflow.OptionalSpec']


In [33]:
# define method to scrape lxml documentation page
def get_lxml_members(url: str, package: str, top_level: int) -> set:
    _lxml_soup = get_html_soup(url=url)
    _lxml_members = set()

    # extract all top level modules
    for top in _lxml_soup.select(f"li.toctree-l{top_level}"):
        reference = top.select_one("a.reference")
        module = reference.text.strip() if reference else ""
        module = module.split(" ")[0]
        if (
            not module
            or not module.startswith(f"{package}.")
            or module.startswith(f"{package}._")
        ):
            continue

        _lxml_members.add(module)

        # extract all members of the top level modules
        for a in top.select(f"li.toctree-l{top_level + 1} > a"):
            name = a.text.strip()
            # Some names include trailing ' — description'; strip that
            if name.startswith("_") or name.lower() in [
                "submodules",
                "module contents",
            ]:
                continue

            name = name.split("—", 1)[0].strip().rstrip("()")
            _lxml_members.add(f"{module}.{name}")

    # extract all members of the module itself
    for dt in _lxml_soup.select("dt.sig.sig-object.py"):
        if (_id := dt.get("id")) and dt.select_one("span.descclassname"):
            if _id.startswith(f"{package}.") and not _id.startswith(f"{package}._"):
                _lxml_members.add(_id)

    return _lxml_members

In [34]:
print(
    get_lxml_members(
        url=remaining["lxml"]["urls"][0],
        package="lxml",
        top_level=3,
    )
)

{'lxml.etree.DocInfo', 'lxml.etree.XSLTExtensionError', 'lxml.etree.XMLDTDID', 'lxml.etree.strip_attributes', 'lxml.sax.SaxError', 'lxml.objectify.ObjectifyElementClassLookup', 'lxml.etree.ETXPath', 'lxml.etree.LxmlError', 'lxml.etree.AncestorsIterator', 'lxml.doctestcompare.strip', 'lxml.etree.PI', 'lxml.etree.NamespaceRegistryError', 'lxml.etree.strip_tags', 'lxml.etree.Element', 'lxml.etree.iterparse', 'lxml.html.ElementSoup', 'lxml.doctestcompare.LXMLOutputChecker', 'lxml.etree.LxmlRegistryError', 'lxml.etree.XSLTExtension', 'lxml.html.defs', 'lxml.etree.dump', 'lxml.etree.XPathEvaluator', 'lxml.etree.XPathEvalError', 'lxml.objectify.annotate', 'lxml.etree.ParserBasedElementClassLookup', 'lxml.builder.ElementMaker', 'lxml.etree.SiblingsIterator', 'lxml.etree.ETCompatXMLParser', 'lxml.etree.XMLID', 'lxml.etree.ElementDefaultClassLookup', 'lxml.objectify.ObjectPath', 'lxml.objectify.ObjectifiedElement', 'lxml.objectify.NoneElement', 'lxml.etree.ElementNamespaceClassLookup', 'lxml.etr

In [35]:
# some manually scraped members
lxml_members = {"lxml", "lxml.get_include"}

lxml_members.update(
    get_lxml_members(
        url=remaining["lxml"]["urls"][0],
        package="lxml",
        top_level=3,
    )
)
lxml_members.update(
    get_lxml_members(
        url=remaining["lxml"]["urls"][1],
        package="lxml.html",
        top_level=1,
    )
)
lxml_members.update(
    get_lxml_members(
        url=remaining["lxml"]["urls"][2],
        package="lxml.isoschematron",
        top_level=0,
    )
)

lxml_members = sorted(lxml_members)
remaining["lxml"]["toplevels"] = sorted(set(m.split(".")[0] for m in lxml_members))
remaining["lxml"]["members"] = lxml_members
print(f"Have {len(lxml_members)} members for lxml:\n\t{lxml_members[:10]}")

Have 391 members for lxml:
	['lxml', 'lxml.ElementInclude', 'lxml.ElementInclude.FatalIncludeError', 'lxml.ElementInclude.LimitedRecursiveIncludeError', 'lxml.ElementInclude.default_loader', 'lxml.ElementInclude.include', 'lxml.builder', 'lxml.builder.ElementMaker', 'lxml.cssselect', 'lxml.doctestcompare']


## **4.** Save all of the documentation!

In [36]:
from llm_cgr import save_json

final_data = {
    **manually_scraped,
    **sphinx_scraped,
    **remaining,
}

for library in DOCUMENTED_LIBRARIES:
    assert library in final_data, f"Missing documentation for {library}!"

print(
    f"Have data for all {len(DOCUMENTED_LIBRARIES)} libraries:\n\t{DOCUMENTED_LIBRARIES}"
)
save_json(final_data, "../data/libraries/members.json")

Have data for all 30 libraries:
	['bs4', 'chardet', 'cryptography', 'dateutil', 'django', 'folium', 'librosa', 'lxml', 'matplotlib', 'nltk', 'numpy', 'openpyxl', 'pandas', 'psutil', 'pytesseract', 'pytz', 'regex', 'requests', 'scipy', 'seaborn', 'sklearn', 'statsmodels', 'sympy', 'tensorflow', 'textblob', 'texttable', 'wordcloud', 'wordninja', 'xlwt', 'xmltodict']
