# Download library documentation *(RQ1, RQ2, RQ3)*

Download the full API reference documentation for each external library in our processed BigCodeBench dataset.

This documentation will provide the ground truth necessary for determining library member hallucinations.

We download the latest version of each libraries documentation.

In [1]:
# get list of libraries we need for the study

from src.constants import DOCUMENTED_LIBRARIES

print(
    f"Need documentation for {len(DOCUMENTED_LIBRARIES)} libraries:\n\t{DOCUMENTED_LIBRARIES}"
)

Need documentation for 30 libraries:
	['bs4', 'chardet', 'cryptography', 'dateutil', 'django', 'folium', 'librosa', 'lxml', 'matplotlib', 'nltk', 'numpy', 'openpyxl', 'pandas', 'psutil', 'pytesseract', 'pytz', 'regex', 'requests', 'scipy', 'seaborn', 'sklearn', 'statsmodels', 'sympy', 'tensorflow', 'textblob', 'texttable', 'wordcloud', 'wordninja', 'xlwt', 'xmltodict']


## **1.** Manually download 6 smaller libraries without parsable documentation

In [2]:
# small libraries, scraped manually from the source code

manually_scraped = {
    "wordninja": {
        "url": "https://github.com/keredson/wordninja/blob/master/wordninja.py",
        "version": "2.0.0",
        "modules": [
            "wordninja",
        ],
        "members": [
            "wordninja",
            "wordninja.LanguageModel",
            "wordninja.DEFAULT_LANGUAGE_MODEL",
            "wordninja.split",
        ],
    },
    "texttable": {
        "url": "https://github.com/foutaise/texttable/blob/master/texttable.py",
        "version": "1.7.0",
        "modules": [
            "texttable",
        ],
        "members": [
            "texttable",
            "texttable.Texttable",
            "texttable.ArraySizeError",
            "texttable.obj2unicode",
            "texttable.len",
        ],
    },
    "xmltodict": {
        "url": "https://github.com/martinblech/xmltodict/blob/master/xmltodict.py",
        "version": "0.14.2",
        "modules": [
            "xmltodict",
        ],
        "members": [
            "xmltodict",
            "xmltodict.parse",
            "xmltodict.unparse",
            "xmltodict.ParsingInterrupted",
        ],
    },
    "regex": {
        "url": "https://github.com/mrabarnett/mrab-regex/blob/hg/regex_3/regex.py",
        "version": "2.5.153",
        "modules": [
            "regex",
        ],
        "members": [
            "regex",
            "regex.cache_all",
            "regex.compile",
            "regex.DEFAULT_VERSION",
            "regex.escape",
            "regex.findall",
            "regex.finditer",
            "regex.fullmatch",
            "regex.match",
            "regex.purge",
            "regex.search",
            "regex.split",
            "regex.splititer",
            "regex.sub",
            "regex.subf",
            "regex.subfn",
            "regex.subn",
            "regex.template",
            "regex.Scanner",
            "regex.A",
            "regex.ASCII",
            "regex.B",
            "regex.BESTMATCH",
            "regex.D",
            "regex.DEBUG",
            "regex.E",
            "regex.ENHANCEMATCH",
            "regex.S",
            "regex.DOTALL",
            "regex.F",
            "regex.FULLCASE",
            "regex.I",
            "regex.IGNORECASE",
            "regex.L",
            "regex.LOCALE",
            "regex.M",
            "regex.MULTILINE",
            "regex.P",
            "regex.POSIX",
            "regex.R",
            "regex.REVERSE",
            "regex.T",
            "regex.TEMPLATE",
            "regex.U",
            "regex.UNICODE",
            "regex.V0",
            "regex.VERSION0",
            "regex.V1",
            "regex.VERSION1",
            "regex.X",
            "regex.VERBOSE",
            "regex.W",
            "regex.WORD",
            "regex.error",
            "regex.Regex",
            "regex.__version__",
            "regex.__doc__",
            "regex.RegexFlag",
        ],
    },
    "pytz": {
        "url": "https://github.com/stub42/pytz/blob/master/src/pytz/__init__.py",
        "version": "2025.2",
        "modules": [
            "pytz",
        ],
        "members": [
            "pytz",
            "pytz.timezone",
            "pytz.utc",
            "pytz.country_timezones",
            "pytz.country_names",
            "pytz.all_timezones",
            "pytz.all_timezones_set",
            "pytz.common_timezones",
            "pytz.common_timezones_set",
            "pytz.BaseTzInfo",
            "pytz.FixedOffset",
            "pytz.AmbiguousTimeError",
            "pytz.InvalidTimeError",
            "pytz.NonExistentTimeError",
            "pytz.UnknownTimeZoneError",
            "pytz.exceptions",
            "pytz.exceptions.AmbiguousTimeError",
            "pytz.exceptions.InvalidTimeError",
            "pytz.exceptions.NonExistentTimeError",
            "pytz.exceptions.UnknownTimeZoneError",
            "pytz.tzinfo",
            "pytz.tzinfo.memorized_timedelta",
            "pytz.tzinfo.memorized_datetime",
            "pytz.tzinfo.memorized_ttinfo",
            "pytz.tzinfo.BaseTzInfo",
            "pytz.tzinfo.StaticTzInfo",
            "pytz.tzinfo.DstTzInfo",
            "pytz.tzinfo.unpickler",
            "pytz.tzfile",
            "pytz.tzfile.build_tzinfo",
            "pytz.reference",
            "pytz.reference.FixedOffset",
            "pytz.reference.LocalTimezone",
            "pytz.reference.USTimeZone",
            "pytz.reference.Eastern",
            "pytz.reference.Central",
            "pytz.reference.Mountain",
            "pytz.reference.Pacific",
            "pytz.reference.UTC",
            "pytz.lazy",
            "pytz.lazy.LazyDict",
            "pytz.lazy.LazyList",
            "pytz.lazy.LazySet",
        ],
    },
    "pytesseract": {
        "url": "https://github.com/madmaze/pytesseract",
        "version": "0.3.13",
        "modules": [
            "pytesseract",
        ],
        "members": [
            "pytesseract",
            "pytesseract.DEFAULT_ENCODING",
            "pytesseract.LANG_PATTERN",
            "pytesseract.RGB_MODE",
            "pytesseract.SUPPORTED_FORMATS",
            "pytesseract.OSD_KEYS",
            "pytesseract.EXTENTION_TO_CONFIG",
            "pytesseract.TESSERACT_MIN_VERSION",
            "pytesseract.TESSERACT_ALTO_VERSION",
            "pytesseract.Output",
            "pytesseract.PandasNotSupported",
            "pytesseract.TesseractError",
            "pytesseract.TesseractNotFoundError",
            "pytesseract.TSVNotSupported",
            "pytesseract.ALTONotSupported",
            "pytesseract.kill",
            "pytesseract.timeout_manager",
            "pytesseract.run_once",
            "pytesseract.get_errors",
            "pytesseract.cleanup",
            "pytesseract.prepare",
            "pytesseract.save",
            "pytesseract.subprocess_args",
            "pytesseract.run_tesseract",
            "pytesseract.run_and_get_multiple_output",
            "pytesseract.run_and_get_output",
            "pytesseract.file_to_dict",
            "pytesseract.is_valid",
            "pytesseract.osd_to_dict",
            "pytesseract.get_languages",
            "pytesseract.get_tesseract_version",
            "pytesseract.image_to_string",
            "pytesseract.image_to_pdf_or_hocr",
            "pytesseract.image_to_alto_xml",
            "pytesseract.image_to_boxes",
            "pytesseract.get_pandas_output",
            "pytesseract.image_to_data",
            "pytesseract.image_to_osd",
            "pytesseract.main",
        ],
    },
}

print(f"Have {len(manually_scraped)} manually scraped libraries.")

Have 6 manually scraped libraries.


## **2.** Automatically download the inventory of 21 libraries using Sphinx / readthedocs

In [3]:
# define the inventory urls for libraries with sphinx / readthedocs documentation

inventory_urls = {
    "bs4": "https://www.crummy.com/software/BeautifulSoup/bs4/doc/objects.inv",
    "chardet": "https://chardet.readthedocs.io/en/latest/objects.inv",
    "cryptography": "https://cryptography.io/en/latest/objects.inv",
    "dateutil": "https://dateutil.readthedocs.io/en/stable/objects.inv",  #
    "django": "https://docs.djangoproject.com/en/stable/objects.inv",
    "folium": "https://python-visualization.github.io/folium/latest/objects.inv",
    "librosa": "https://librosa.org/doc/latest/objects.inv",
    "matplotlib": "https://matplotlib.org/stable/objects.inv",
    "numpy": "https://numpy.org/doc/stable/objects.inv",  #
    "openpyxl": "https://openpyxl.readthedocs.io/en/latest/objects.inv",  #
    "pandas": "https://pandas.pydata.org/pandas-docs/stable/objects.inv",  #
    "psutil": "https://psutil.readthedocs.io/en/latest/objects.inv",
    "requests": "https://docs.python-requests.org/en/latest/objects.inv",
    "scipy": "https://docs.scipy.org/doc/scipy/objects.inv",  #
    "seaborn": "https://seaborn.pydata.org/objects.inv",
    "sklearn": "https://scikit-learn.org/stable/objects.inv",  #
    "statsmodels": "https://www.statsmodels.org/stable/objects.inv",
    "sympy": "https://docs.sympy.org/latest/objects.inv",  #
    "textblob": "https://textblob.readthedocs.io/en/latest/objects.inv",  #
    "wordcloud": "https://amueller.github.io/word_cloud/objects.inv",
    "xlwt": "https://xlwt.readthedocs.io/en/latest/objects.inv",
}

In [4]:
# define method to fetch the inventory and extract importable python objects

import sphobjinv as soi

python_objects = {
    "module",
    "class",
    "exception",
    "function",
    "data",
    # all of these are parts of classes, and not importable on their own, so ignored
    # "method", "attribute", "property",
}


def get_library_objects(library: str) -> dict:
    """
    Fetch the inventory from the given URL and return a dictionary with library info.
    """
    # download the inventory
    print(f"Scraping {library}...")
    inv = soi.Inventory(url=inventory_urls[library])

    # filter to only importable python objects
    base_members = set()
    for obj in inv.objects:
        if obj.domain == "py" and obj.role in python_objects:
            base_members.add(obj.name)

    # filter the members to catch any excessive inclusions by the sphinx file
    final_members = {library}
    for member in base_members:
        # sometimes other library members are included in the inventory, which we want to ignore
        if any(
            member.startswith(f"{_m}.") for _m in set(DOCUMENTED_LIBRARIES) - {library}
        ):
            continue

        # ignore any private members
        if any(_section.startswith("_") for _section in member.split(".")):
            continue

        # check if member marked as a module should instead have a parent
        if "." not in member:
            if any((member in m) and m != member for m in base_members):
                final_members.add(f"{library}.{member}")
            else:
                final_members.add(member)
            continue

        # skip any members that are known errors
        if any(
            member.startswith(f"{_skip}") for _skip in ["asgiref", "object", "builtins"]
        ):
            continue

        # otherwise add the member as is
        final_members.add(member)

    print(f"Have {len(final_members)} members for {library}.")
    return {
        "url": inventory_urls[library],
        "version": inv.version,
        "modules": sorted(set(m.split(".")[0] for m in final_members)),
        "members": sorted(final_members),
    }

In [5]:
# download the members of the libraries

sphinx_scraped = {}
for library in inventory_urls:
    try:
        info = get_library_objects(library)
        sphinx_scraped[library] = info
    except Exception as e:
        print(f"Error scraping {library}: {e}")

Scraping bs4...
Have 94 members for bs4.
Scraping chardet...
Have 69 members for chardet.
Scraping cryptography...
Have 437 members for cryptography.
Scraping dateutil...
Have 48 members for dateutil.
Scraping django...
Have 1162 members for django.
Scraping folium...
Have 130 members for folium.
Scraping librosa...
Have 233 members for librosa.
Scraping matplotlib...
Have 1130 members for matplotlib.
Scraping numpy...
Have 1258 members for numpy.
Scraping openpyxl...
Have 800 members for openpyxl.
Scraping pandas...
Have 312 members for pandas.
Scraping psutil...
Have 119 members for psutil.
Scraping requests...
Have 49 members for requests.
Scraping scipy...
Have 2439 members for scipy.
Scraping seaborn...
Have 100 members for seaborn.
Scraping sklearn...
Have 633 members for sklearn.
Scraping statsmodels...
Have 909 members for statsmodels.
Scraping sympy...
Have 2948 members for sympy.
Scraping textblob...
Have 56 members for textblob.
Scraping wordcloud...
Have 6 members for wordc

In [None]:
# add some missing members to the sphinx scraped libraries

# textblob top level imports not in documentation
sphinx_scraped["textblob"]["members"].extend(
    [
        "textblob.TextBlob",
        "textblob.WordList",
        "textblob.Word",
        "textblob.Sentence",
        "textblob.Blobber",
    ]
)

# numpy has known alias shortcut
sphinx_scraped["numpy"]["members"].extend(
    [
        "numpy.abs",
    ]
)

# dateutil parser functions are missed
sphinx_scraped["dateutil"]["members"].extend(
    [
        "dateutil.parser.parse",
        "dateutil.parser.isoparse",
    ]
)

# openpyxl top level imports not in documentation
sphinx_scraped["openpyxl"]["members"].extend(
    [
        "openpyxl.NUMPY",
        "openpyxl.DEFUSEDXML",
        "openpyxl.LXML",
        "openpyxl.Workbook",
        "openpyxl.load_workbook",
        "openpyxl.open",
        "openpyxl.constants",
    ]
)

# sympy allows everything to be imported from the top level
# fmt: off
sphinx_scraped["sympy"]["members"].extend(
    [
        f"sympy.{m}"
        for m in [
            # sympy.core
            'sympify', 'SympifyError', 'cacheit', 'Basic', 'Atom',
            'preorder_traversal', 'S', 'Expr', 'AtomicExpr', 'UnevaluatedExpr',
            'Symbol', 'Wild', 'Dummy', 'symbols', 'var', 'Number', 'Float',
            'Rational', 'Integer', 'NumberSymbol', 'RealNumber', 'igcd', 'ilcm',
            'seterr', 'E', 'I', 'nan', 'oo', 'pi', 'zoo', 'AlgebraicNumber', 'comp',
            'mod_inverse', 'Pow', 'integer_nthroot', 'integer_log', 'trailing', 'Mul', 'prod',
            'Add', 'Mod', 'Rel', 'Eq', 'Ne', 'Lt', 'Le', 'Gt', 'Ge', 'Equality',
            'GreaterThan', 'LessThan', 'Unequality', 'StrictGreaterThan',
            'StrictLessThan', 'vectorize', 'Lambda', 'WildFunction', 'Derivative',
            'diff', 'FunctionClass', 'Function', 'Subs', 'expand', 'PoleError',
            'count_ops', 'expand_mul', 'expand_log', 'expand_func', 'expand_trig',
            'expand_complex', 'expand_multinomial', 'nfloat', 'expand_power_base',
            'expand_power_exp', 'arity', 'PrecisionExhausted', 'N', 'evalf', 'Tuple',
            'Dict', 'gcd_terms', 'factor_terms', 'factor_nc', 'evaluate', 'Catalan',
            'EulerGamma', 'GoldenRatio', 'TribonacciConstant', 'bottom_up', 'use',
            'postorder_traversal', 'default_sort_key', 'ordered', 'num_digits',

            # sympy.logic
            'to_cnf', 'to_dnf', 'to_nnf', 'And', 'Or', 'Not', 'Xor', 'Nand', 'Nor',
            'Implies', 'Equivalent', 'ITE', 'POSform', 'SOPform', 'simplify_logic',
            'bool_map', 'true', 'false', 'satisfiable',

            # sympy.assumptions
            'AppliedPredicate', 'Predicate', 'AssumptionsContext', 'assuming', 'Q',
            'ask', 'register_handler', 'remove_handler', 'refine',

            # sympy.polys
            'Poly', 'PurePoly', 'poly_from_expr', 'parallel_poly_from_expr', 'degree',
            'total_degree', 'degree_list', 'LC', 'LM', 'LT', 'pdiv', 'prem', 'pquo',
            'pexquo', 'div', 'rem', 'quo', 'exquo', 'half_gcdex', 'gcdex', 'invert',
            'subresultants', 'resultant', 'discriminant', 'cofactors', 'gcd_list',
            'gcd', 'lcm_list', 'lcm', 'terms_gcd', 'trunc', 'monic', 'content',
            'primitive', 'compose', 'decompose', 'sturm', 'gff_list', 'gff',
            'sqf_norm', 'sqf_part', 'sqf_list', 'sqf', 'factor_list', 'factor',
            'intervals', 'refine_root', 'count_roots', 'all_roots', 'real_roots',
            'nroots', 'ground_roots', 'nth_power_roots_poly', 'cancel', 'reduced',
            'groebner', 'is_zero_dimensional', 'GroebnerBasis', 'poly', 'symmetrize',
            'horner', 'interpolate', 'rational_interpolate', 'viete', 'together',
            'BasePolynomialError', 'ExactQuotientFailed', 'PolynomialDivisionFailed',
            'OperationNotSupported', 'HeuristicGCDFailed', 'HomomorphismFailed',
            'IsomorphismFailed', 'ExtraneousFactors', 'EvaluationFailed',
            'RefinementFailed', 'CoercionFailed', 'NotInvertible', 'NotReversible',
            'NotAlgebraic', 'DomainError', 'PolynomialError', 'UnificationFailed',
            'GeneratorsError', 'GeneratorsNeeded', 'ComputationFailed',
            'UnivariatePolynomialError', 'MultivariatePolynomialError',
            'PolificationFailed', 'OptionError', 'FlagError', 'minpoly',
            'minimal_polynomial', 'primitive_element', 'field_isomorphism',
            'to_number_field', 'isolate', 'round_two', 'prime_decomp',
            'prime_valuation', 'galois_group', 'itermonomials', 'Monomial', 'lex', 'grlex',
            'grevlex', 'ilex', 'igrlex', 'igrevlex', 'CRootOf', 'rootof', 'RootOf',
            'ComplexRootOf', 'RootSum', 'roots', 'Domain', 'FiniteField',
            'IntegerRing', 'RationalField', 'RealField', 'ComplexField',
            'PythonFiniteField', 'GMPYFiniteField', 'PythonIntegerRing',
            'GMPYIntegerRing', 'PythonRational', 'GMPYRationalField',
            'AlgebraicField', 'PolynomialRing', 'FractionField', 'ExpressionDomain',
            'FF_python', 'FF_gmpy', 'ZZ_python', 'ZZ_gmpy', 'QQ_python', 'QQ_gmpy',
            'GF', 'FF', 'ZZ', 'QQ', 'ZZ_I', 'QQ_I', 'RR', 'CC', 'EX', 'EXRAW',
            'construct_domain', 'swinnerton_dyer_poly', 'cyclotomic_poly',
            'symmetric_poly', 'random_poly', 'interpolating_poly', 'jacobi_poly',
            'chebyshevt_poly', 'chebyshevu_poly', 'hermite_poly', 'hermite_prob_poly',
            'legendre_poly', 'laguerre_poly', 'apart', 'apart_list', 'assemble_partfrac_list',
            'Options', 'ring', 'xring', 'vring', 'sring', 'field', 'xfield', 'vfield',
            'sfield',

            # sympy.series
            'Order', 'O', 'limit', 'Limit', 'gruntz', 'series', 'approximants',
            'pade_approximant', 'residue', 'EmptySequence', 'SeqPer', 'SeqFormula',
            'sequence', 'SeqAdd', 'SeqMul', 'fourier_series', 'fps', 'difference_delta',
            'limit_seq',

            # sympy.functions
            'factorial', 'factorial2', 'rf', 'ff', 'binomial', 'RisingFactorial',
            'FallingFactorial', 'subfactorial', 'carmichael', 'fibonacci', 'lucas',
            'motzkin', 'tribonacci', 'harmonic', 'bernoulli', 'bell', 'euler', 'catalan',
            'genocchi', 'andre', 'partition',  'divisor_sigma', 'legendre_symbol', 'jacobi_symbol',
            'kronecker_symbol', 'mobius', 'primenu', 'primeomega', 'totient', 'primepi',
            'reduced_totient', 'sqrt', 'root', 'Min', 'Max', 'Id', 'real_root',
            'Rem', 'cbrt', 're', 'im', 'sign', 'Abs', 'conjugate', 'arg', 'polar_lift',
            'periodic_argument', 'unbranched_argument', 'principal_branch',
            'transpose', 'adjoint', 'polarify', 'unpolarify', 'sin', 'cos', 'tan',
            'sec', 'csc', 'cot', 'sinc', 'asin', 'acos', 'atan', 'asec', 'acsc',
            'acot', 'atan2', 'exp_polar', 'exp', 'ln', 'log', 'LambertW', 'sinh',
            'cosh', 'tanh', 'coth', 'sech', 'csch', 'asinh', 'acosh', 'atanh',
            'acoth', 'asech', 'acsch', 'floor', 'ceiling', 'frac', 'Piecewise',
            'piecewise_fold', 'piecewise_exclusive', 'erf', 'erfc', 'erfi', 'erf2',
            'erfinv', 'erfcinv', 'erf2inv', 'Ei', 'expint', 'E1', 'li', 'Li', 'Si',
            'Ci', 'Shi', 'Chi', 'fresnels', 'fresnelc', 'gamma', 'lowergamma',
            'uppergamma', 'polygamma', 'loggamma', 'digamma', 'trigamma', 'multigamma',
            'dirichlet_eta', 'zeta', 'lerchphi', 'polylog', 'stieltjes', 'Eijk', 'LeviCivita',
            'KroneckerDelta', 'SingularityFunction', 'DiracDelta', 'Heaviside',
            'bspline_basis', 'bspline_basis_set', 'interpolating_spline', 'besselj',
            'bessely', 'besseli', 'besselk', 'hankel1', 'hankel2', 'jn', 'yn',
            'jn_zeros', 'hn1', 'hn2', 'airyai', 'airybi', 'airyaiprime',
            'airybiprime', 'marcumq', 'hyper', 'meijerg', 'appellf1', 'legendre',
            'assoc_legendre', 'hermite', 'hermite_prob', 'chebyshevt', 'chebyshevu',
            'chebyshevu_root', 'chebyshevt_root', 'laguerre', 'assoc_laguerre',
            'gegenbauer', 'jacobi', 'jacobi_normalized', 'Ynm', 'Ynm_c', 'Znm',
            'elliptic_k', 'elliptic_f', 'elliptic_e', 'elliptic_pi', 'beta',
            'mathieus', 'mathieuc', 'mathieusprime', 'mathieucprime', 'riemann_xi','betainc',
            'betainc_regularized',

            # sympy.ntheory
            'nextprime', 'prevprime', 'prime', 'primerange', 'randprime',
            'Sieve', 'sieve', 'primorial', 'cycle_length', 'composite', 'compositepi',
            'isprime', 'divisors', 'proper_divisors', 'factorint', 'multiplicity',
            'perfect_power', 'pollard_pm1', 'factor_cache', 'pollard_rho', 'primefactors',
            'divisor_count', 'proper_divisor_count',
            'factorrat',
            'mersenne_prime_exponent', 'is_perfect', 'is_mersenne_prime',
            'is_abundant', 'is_deficient', 'is_amicable', 'is_carmichael', 'abundance',
            'npartitions',
            'is_primitive_root', 'is_quad_residue',
            'n_order', 'sqrt_mod', 'quadratic_residues',
            'primitive_root', 'nthroot_mod', 'is_nthpow_residue', 'sqrt_mod_iter',
            'discrete_log', 'quadratic_congruence', 'binomial_coefficients',
            'binomial_coefficients_list', 'multinomial_coefficients',
            'continued_fraction_periodic', 'continued_fraction_iterator',
            'continued_fraction_reduce', 'continued_fraction_convergents',
            'continued_fraction', 'egyptian_fraction',

            # sympy.concrete
            'product', 'Product', 'summation', 'Sum',

            # sympy.discrete
            'fft', 'ifft', 'ntt', 'intt', 'fwht', 'ifwht', 'mobius_transform',
            'inverse_mobius_transform', 'convolution', 'covering_product',
            'intersecting_product',

            # sympy.simplify
            'simplify', 'hypersimp', 'hypersimilar', 'logcombine', 'separatevars',
            'posify', 'besselsimp', 'kroneckersimp', 'signsimp',
            'nsimplify', 'FU', 'fu', 'sqrtdenest', 'cse', 'epath', 'EPath',
            'hyperexpand', 'collect', 'rcollect', 'radsimp', 'collect_const',
            'fraction', 'numer', 'denom', 'trigsimp', 'exptrigsimp', 'powsimp',
            'powdenest', 'combsimp', 'gammasimp', 'ratsimp', 'ratsimpmodprime',

            # sympy.sets
            'Set', 'Interval', 'Union', 'EmptySet', 'FiniteSet', 'ProductSet',
            'Intersection', 'imageset', 'DisjointUnion', 'Complement', 'SymmetricDifference',
            'ImageSet', 'Range', 'ComplexRegion', 'Reals', 'Contains', 'ConditionSet',
            'Ordinal', 'OmegaPower', 'ord0', 'PowerSet', 'Naturals',
            'Naturals0', 'UniversalSet', 'Integers', 'Rationals', 'Complexes',

            # sympy.solvers
            'solve', 'solve_linear_system', 'solve_linear_system_LU',
            'solve_undetermined_coeffs', 'nsolve', 'solve_linear', 'checksol',
            'det_quick', 'inv_quick', 'check_assumptions', 'failing_assumptions',
            'diophantine', 'rsolve', 'rsolve_poly', 'rsolve_ratio', 'rsolve_hyper',
            'checkodesol', 'classify_ode', 'dsolve', 'homogeneous_order',
            'solve_poly_system', 'factor_system', 'solve_triangulated', 'pde_separate',
            'pde_separate_add', 'pde_separate_mul', 'pdsolve', 'classify_pde',
            'checkpdesol', 'ode_order', 'reduce_inequalities',
            'reduce_abs_inequality', 'reduce_abs_inequalities',
            'solve_poly_inequality', 'solve_rational_inequalities',
            'solve_univariate_inequality', 'decompogen', 'solveset', 'linsolve',
            'linear_eq_to_matrix', 'nonlinsolve', 'substitution',

            # sympy.matrices
            'ShapeError', 'NonSquareMatrixError', 'GramSchmidt', 'casoratian', 'diag',
            'eye', 'hessian', 'jordan_cell', 'list2numpy', 'matrix2numpy',
            'matrix_multiply_elementwise', 'ones', 'randMatrix', 'rot_axis1',
            'rot_axis2', 'rot_axis3', 'symarray', 'wronskian', 'zeros',
            'MutableDenseMatrix', 'DeferredVector', 'MatrixBase', 'Matrix',
            'MutableMatrix', 'MutableSparseMatrix', 'banded', 'ImmutableDenseMatrix',
            'ImmutableSparseMatrix', 'ImmutableMatrix', 'SparseMatrix', 'MatrixSlice',
            'BlockDiagMatrix', 'BlockMatrix', 'FunctionMatrix', 'Identity', 'Inverse',
            'MatAdd', 'MatMul', 'MatPow', 'MatrixExpr', 'MatrixSymbol', 'Trace',
            'Transpose', 'ZeroMatrix', 'OneMatrix', 'blockcut', 'block_collapse',
            'matrix_symbols', 'Adjoint', 'hadamard_product', 'HadamardProduct',
            'HadamardPower', 'Determinant', 'det', 'diagonalize_vector', 'DiagMatrix',
            'DiagonalMatrix', 'DiagonalOf', 'trace', 'DotProduct',
            'kronecker_product', 'KroneckerProduct', 'PermutationMatrix',
            'MatrixPermute', 'Permanent', 'per', 'rot_ccw_axis1', 'rot_ccw_axis2',
            'rot_ccw_axis3', 'rot_givens',

            # sympy.geometry
            'Point', 'Point2D', 'Point3D', 'Line', 'Ray', 'Segment', 'Line2D',
            'Segment2D', 'Ray2D', 'Line3D', 'Segment3D', 'Ray3D', 'Plane', 'Ellipse',
            'Circle', 'Polygon', 'RegularPolygon', 'Triangle', 'rad', 'deg',
            'are_similar', 'centroid', 'convex_hull', 'idiff', 'intersection',
            'closest_points', 'farthest_points', 'GeometryError', 'Curve', 'Parabola',

            # sympy.utilities
            'flatten', 'group', 'take', 'subsets', 'variations', 'numbered_symbols',
            'cartes', 'capture', 'dict_merge', 'prefixes', 'postfixes', 'sift',
            'topological_sort', 'unflatten', 'has_dups', 'has_variety', 'reshape',
            'rotations', 'filldedent', 'lambdify', 'threaded', 'xthreaded',
            'public', 'memoize_property', 'timed',

            # sympy.integrals
            'integrate', 'Integral', 'line_integrate', 'mellin_transform',
            'inverse_mellin_transform', 'MellinTransform', 'InverseMellinTransform',
            'laplace_transform', 'inverse_laplace_transform', 'LaplaceTransform',
            'laplace_correspondence', 'laplace_initial_conds',
            'InverseLaplaceTransform', 'fourier_transform',
            'inverse_fourier_transform', 'FourierTransform',
            'InverseFourierTransform', 'sine_transform', 'inverse_sine_transform',
            'SineTransform', 'InverseSineTransform', 'cosine_transform',
            'inverse_cosine_transform', 'CosineTransform', 'InverseCosineTransform',
            'hankel_transform', 'inverse_hankel_transform', 'HankelTransform',
            'InverseHankelTransform', 'singularityintegrate',

            # sympy.tensor
            'IndexedBase', 'Idx', 'Indexed', 'get_contraction_structure',
            'get_indices', 'shape', 'MutableDenseNDimArray', 'ImmutableDenseNDimArray',
            'MutableSparseNDimArray', 'ImmutableSparseNDimArray', 'NDimArray',
            'tensorproduct', 'tensorcontraction', 'tensordiagonal', 'derive_by_array',
            'permutedims', 'Array', 'DenseNDimArray', 'SparseNDimArray',

            # sympy.parsing
            'parse_expr',

            # sympy.calculus
            'euler_equations', 'singularities', 'is_increasing',
            'is_strictly_increasing', 'is_decreasing', 'is_strictly_decreasing',
            'is_monotonic', 'finite_diff_weights', 'apply_finite_diff',
            'differentiate_finite', 'periodicity', 'not_empty_in',
            'AccumBounds', 'is_convex', 'stationary_points', 'minimum', 'maximum',

            # sympy.algebras
            'Quaternion',

            # sympy.printing
            'pager_print', 'pretty', 'pretty_print', 'pprint', 'pprint_use_unicode',
            'pprint_try_use_unicode', 'latex', 'print_latex', 'multiline_latex',
            'mathml', 'print_mathml', 'python', 'print_python', 'pycode', 'ccode',
            'print_ccode', 'smtlib_code', 'glsl_code', 'print_glsl', 'cxxcode', 'fcode',
            'print_fcode', 'rcode', 'print_rcode', 'jscode', 'print_jscode',
            'julia_code', 'mathematica_code', 'octave_code', 'rust_code', 'print_gtk',
            'preview', 'srepr', 'print_tree', 'StrPrinter', 'sstr', 'sstrrepr',
            'TableForm', 'dotprint', 'maple_code', 'print_maple_code',

            # sympy.plotting
            'plot', 'textplot', 'plot_backends', 'plot_implicit', 'plot_parametric',

            # sympy.interactive
            'init_session', 'init_printing', 'interactive_traversal',

            # sympy.testing
            'test', 'doctest',
        ]
    ]
)
# fmt: off

## **3.** Custom scraping code to download documentation for the 3 remaining libraries

In [7]:
remaining = {
    "nltk": {
        "url": "https://www.nltk.org/api/nltk.html",
        "version": "3.8.1",
    },
    "tensorflow": {
        "url": "https://www.tensorflow.org/api_docs/python/tf",
        "version": "2.6.0",
    },
    "lxml": {
        "urls": [
            "https://lxml.de/apidoc/index.html",
            "https://lxml.de/apidoc/lxml.html.html",
            "https://lxml.de/apidoc/lxml.isoschematron.html",
        ],
        "version": "4.6.3",
    },
}

In [8]:
# method to access the html content of documentation pages

from bs4 import BeautifulSoup
import requests


def get_html_soup(url: str) -> BeautifulSoup:
    """
    Fetch the HTML content from the given URL and return a BeautifulSoup object.
    """
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad responses
    return BeautifulSoup(response.text, "html.parser")

In [9]:
# scrape the nltk documentation page
nltk_soup = get_html_soup(remaining["nltk"]["url"])
nltk_members = {"nltk"}

In [10]:
# find the subpackages of nltk, and their members
packages = nltk_soup.find("section", id="subpackages")
for l1 in packages.select("li.toctree-l1"):
    _a = l1.select_one("a")
    module = _a.text.strip().split(" ")[0]
    if not module.startswith("nltk."):
        continue

    nltk_members.add(module)

    for l3 in l1.select("li.toctree-l3"):
        _a = l3.select_one("a")
        submodule = _a.text.strip().strip("()").split(" ")[0]

        if submodule.startswith("nltk."):
            nltk_members.add(submodule)

            for l4 in l3.select("li.toctree-l4"):
                _a = l4.select_one("a")
                member = f"{submodule}.{_a.text.strip().rstrip('()').split(' ')[0]}"
                nltk_members.add(member)

        else:
            nltk_members.add(f"{module}.{submodule}")

print(f"Now have {len(nltk_members)} members for nltk.")

Now have 1258 members for nltk.


In [11]:
# find the submodules of nltk, and their members
packages = nltk_soup.find("section", id="submodules")
for l1 in packages.select("li.toctree-l1"):
    _a = l1.select_one("a")
    module = _a.text.strip().split(" ")[0]
    if not module.startswith("nltk."):
        continue

    nltk_members.add(module)

    for l2 in l1.select("li.toctree-l2"):
        _a = l2.select_one("a")
        member = f"{module}.{_a.text.strip().rstrip('()').split(' ')[0]}"
        nltk_members.add(member)

print(f"Now have {len(nltk_members)} members for nltk.")

Now have 1501 members for nltk.


In [12]:
# nltk seems to allow all members to be imported from the top level
top_level_members = {f"nltk.{m.split('.')[-1]}" for m in nltk_members if "." in m}
nltk_members.update(top_level_members)

In [13]:
# format the nltk members
nltk_members = sorted(nltk_members)
remaining["nltk"]["modules"] = sorted(set(m.split(".")[0] for m in nltk_members))
remaining["nltk"]["members"] = nltk_members
print(f"Have {len(nltk_members)} members for nltk:\n\t{nltk_members[:5]}")

Have 2783 members for nltk:
	['nltk', 'nltk.ALL', 'nltk.ARFF_Formatter', 'nltk.ARLSTem', 'nltk.ARLSTem2']


In [14]:
# find all modules and submodules, and the urls to their 'overview' pages

tf_members = set()
tf_module_urls = {}

tf_soup = get_html_soup(remaining["tensorflow"]["url"])

# extract links from expandable navigation bar
for div in tf_soup.select("li.devsite-nav-expandable"):
    # get top level module name from the toggle title span
    reference = div.select_one("span.devsite-nav-text")
    if not reference or not reference.text.startswith("tf"):
        continue

    # clean up the module name
    _title_text = reference.text.strip().replace("\u200b", "").replace(" ", "")
    tf_module = _title_text.replace("tf", "tensorflow")
    tf_members.add(tf_module)

    # find the module overview link
    for _a in div.select("li.devsite-nav-item a.devsite-nav-title"):
        member_text = _a.select_one("span.devsite-nav-text").get_text().strip()
        if member_text.lower() == "overview":
            tf_module_urls[tf_module] = _a["href"]
            break

    # find all submodules
    for _a in div.select(
        "li.devsite-nav-item li.devsite-nav-expandable:not(li.devsite-nav-deprecated)"
    ):
        # get top level module name from the toggle title span
        reference = _a.select_one("span.devsite-nav-text")
        if not reference:
            continue

        # clean up the module name
        _title_text = reference.text.strip().replace("\u200b", "").replace(" ", "")
        tf_submodule = f"{tf_module}.{_title_text}"
        tf_members.add(tf_submodule)

        # find the submodule overview link
        for b in _a.select("li.devsite-nav-item a.devsite-nav-title"):
            member_text = b.select_one("span.devsite-nav-text").get_text().strip()
            if member_text.lower() == "overview":
                tf_module_urls[tf_submodule] = b["href"]
                continue

print(f"Have {len(tf_module_urls)} modules to scrape for tensorflow.")

Have 217 modules to scrape for tensorflow.


In [16]:
# scrape docs for each tensorflow module

from tqdm import tqdm

BASE_URL = "https://www.tensorflow.org"

for tf_submodule, url in tqdm(list(tf_module_urls.items())):
    # fetch the overview page for the module
    _tf_sub_soup = get_html_soup(url=f"{BASE_URL}{url}")
    body = _tf_sub_soup.select_one("div.devsite-article-body")

    # extract all member names from the body
    for p in body.find_all("p"):
        _a = p.find("a")
        if not _a:
            continue

        member_text = _a.get_text().strip().rstrip("(...)").split(" ")[-1]
        if not member_text or member_text.startswith("tf."):
            continue

        tf_members.add(f"{tf_submodule}.{member_text}")

print(f"Have {len(tf_members)} members for tensorflow:\n\t{list(tf_members)[:10]}")

  0%|          | 0/217 [00:00<?, ?it/s]

100%|██████████| 217/217 [04:22<00:00,  1.21s/it]

Have 4716 members for tensorflow:
	['tensorflow.keras.reuters', 'tensorflow.lite.OpsSet', 'tensorflow.compat.raw_ops.MultiDeviceIteratorFromStringHandle', 'tensorflow.nn.approx_min_k', 'tensorflow.compat.raw_ops.GroupByWindowDataset', 'tensorflow.compat.raw_ops.SlidingWindowDataset', 'tensorflow.keras.losses.serialize', 'tensorflow.keras.numpy.all', 'tensorflow.compat.distributions.Laplace', 'tensorflow.compat.raw_ops.XlaSparseDenseMatmulGradWithAdamAndCsrInput']





In [17]:
# format the tensorflow members
tf_members = sorted(tf_members)
remaining["tensorflow"]["modules"] = sorted(set(m.split(".")[0] for m in tf_members))
remaining["tensorflow"]["members"] = tf_members

In [18]:
# define method to scrape lxml documentation page
def get_lxml_members(url: str, package: str, top_level: int) -> set:
    _lxml_soup = get_html_soup(url=url)
    _lxml_members = set()

    # extract all top level modules
    for top in _lxml_soup.select(f"li.toctree-l{top_level}"):
        reference = top.select_one("a.reference")
        module = reference.text.strip() if reference else ""
        module = module.split(" ")[0]
        if (
            not module
            or not module.startswith(f"{package}.")
            or module.startswith(f"{package}._")
        ):
            continue

        _lxml_members.add(module)

        # extract all members of the top level modules
        for a in top.select(f"li.toctree-l{top_level + 1} > a"):
            name = a.text.strip()
            # Some names include trailing ' — description'; strip that
            if name.startswith("_") or name.lower() in [
                "submodules",
                "module contents",
            ]:
                continue

            name = name.split("—", 1)[0].strip().rstrip("()")
            _lxml_members.add(f"{module}.{name}")

    # extract all members of the module itself
    for dt in _lxml_soup.select("dt.sig.sig-object.py"):
        if (_id := dt.get("id")) and dt.select_one("span.descclassname"):
            if _id.startswith(f"{package}.") and not _id.startswith(f"{package}._"):
                _lxml_members.add(_id)

    return _lxml_members

In [19]:
# some manually scraped members
lxml_members = {"lxml", "lxml.get_include"}

lxml_members.update(
    get_lxml_members(
        url=remaining["lxml"]["urls"][0],
        package="lxml",
        top_level=3,
    )
)
lxml_members.update(
    get_lxml_members(
        url=remaining["lxml"]["urls"][1],
        package="lxml.html",
        top_level=1,
    )
)
lxml_members.update(
    get_lxml_members(
        url=remaining["lxml"]["urls"][2],
        package="lxml.isoschematron",
        top_level=0,
    )
)

lxml_members = sorted(lxml_members)
remaining["lxml"]["modules"] = sorted(set(m.split(".")[0] for m in lxml_members))
remaining["lxml"]["members"] = lxml_members
print(f"Have {len(lxml_members)} members for lxml:\n\t{lxml_members[:10]}")

Have 391 members for lxml:
	['lxml', 'lxml.ElementInclude', 'lxml.ElementInclude.FatalIncludeError', 'lxml.ElementInclude.LimitedRecursiveIncludeError', 'lxml.ElementInclude.default_loader', 'lxml.ElementInclude.include', 'lxml.builder', 'lxml.builder.ElementMaker', 'lxml.cssselect', 'lxml.doctestcompare']


## **4.** Save all of the documentation!

In [20]:
from llm_cgr import save_json
from datetime import datetime

final_data = {
    **manually_scraped,
    **sphinx_scraped,
    **remaining,
}

for library in DOCUMENTED_LIBRARIES:
    assert library in final_data, f"Missing documentation for {library}!"

print(
    f"Have data for all {len(DOCUMENTED_LIBRARIES)} libraries:\n\t{DOCUMENTED_LIBRARIES}"
)
save_json(
    data={
        "datetime": datetime.now().isoformat(),
        "data": final_data,
    },
    file_path="../data/libraries/documentation.json",
)

Have data for all 30 libraries:
	['bs4', 'chardet', 'cryptography', 'dateutil', 'django', 'folium', 'librosa', 'lxml', 'matplotlib', 'nltk', 'numpy', 'openpyxl', 'pandas', 'psutil', 'pytesseract', 'pytz', 'regex', 'requests', 'scipy', 'seaborn', 'sklearn', 'statsmodels', 'sympy', 'tensorflow', 'textblob', 'texttable', 'wordcloud', 'wordninja', 'xlwt', 'xmltodict']
