# big query datasets PEP 508 analysis

We analyze the entirety of all `requires_dist` in pypi's big query dataset from mid december 2022

In [1]:
import json
from pathlib import Path

import packaging.requirements
from pep508_rs import Requirement, Pep508Error, MarkerEnvironment
from tqdm import tqdm

In [2]:
env = MarkerEnvironment.current()

packages = 0
valid = 0
invalid = 0

general_parse_errors = list()
rs_parse_errors = list()
all_warnings = list()


num_lines = sum(1 for line in Path("pipy_requires_dist.ndjson").open("r"))
with Path("pipy_requires_dist.ndjson").open("r") as fp:
    for line in tqdm(fp, total=num_lines):
        num_lines -= 1
        if num_lines == 0:
            break
        packages += 1
        data = json.loads(line)
        all_valid = True
        for requirement in data["requires_dist"]:
            try:
                requirement = Requirement(requirement)
            except Pep508Error as e:
                try:
                    packaging.requirements.Requirement(requirement)
                    if "===" in requirement:
                        general_parse_errors.append(e)
                    else:
                        rs_parse_errors.append(e)
                except packaging.requirements.InvalidRequirement as e:
                    general_parse_errors.append(e)
                all_valid = False
                continue
                # print(data["name"], data["version"], requirement, e)
            result, warnings = requirement.evaluate_markers_and_report(env, [])
            if warnings:
                all_warnings.extend(warnings)
                # print(warnings)
        if all_valid:
            valid += 1
        else:
            invalid += 1

100%|█████████▉| 8175705/8175706 [02:28<00:00, 54923.32it/s] 


In [3]:
# TODO: check what validation pypi currently does
# obviously they can't change old metadata though
print(
    f"packages: {packages} valid: {valid} invalid: {invalid} fraction invalid: {invalid / (valid + invalid):.2%}"
)

packages: 8175705 valid: 8152056 invalid: 23649 fraction invalid: 0.29%


In [4]:
# Let's look at all the things that look unintentional
for i in sorted(set(all_warnings)):
    print(i)



In [5]:
# Are there any cases our rust parser can't handle that the python parser (packaging.requirements) can handle?
print("errors in both rust and python", len(general_parse_errors))
print("rust only errors", len(rs_parse_errors))
print(*rs_parse_errors)

errors in both rust and python 37716
rust only errors 1
Expected a valid marker name, found 'python_implementation'
isal (>=0.3.0) ; sys_platform=="linux" and python_implementation != "PyPy"
                                           ^^^^^^^^^^^^^^^^^^^^^
