In [1]:
import os

import networkx as nx
import psycopg2
from tqdm import tqdm


# Open the PostgreSQL connection used by every query in this notebook.
# Settings are read from the standard libpq environment variables so that
# credentials do not have to live in the notebook; the fallbacks are the
# previous local-development values and should be overridden anywhere else.
conn = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "npm_data"),
    host=os.environ.get("PGHOST", "localhost"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "secret"),
)
cursor = conn.cursor()

In [2]:
# One row per package whose state is 'normal' and which has a latest dist-tag
# version. Columns, in order: package id, package name, latest version id,
# the version's parsed repository value, and the package's download total
# (the sum of every counter in its download_counts array).
latest_packages_sql = """SELECT packages.id, packages.name, packages.dist_tag_latest_version, versions.repository_parsed, (SELECT SUM(s.counter) FROM UNNEST(download_metrics.download_counts) s) as downloads
        FROM packages
        JOIN download_metrics ON packages.id = download_metrics.package_id
        JOIN versions ON packages.dist_tag_latest_version = versions.id
        WHERE 
            current_package_state_type='normal' AND
            dist_tag_latest_version IS NOT NULL"""
cursor.execute(latest_packages_sql)

# Materialise the full result set; later cells iterate over it more than once.
all_packages = cursor.fetchall()

In [3]:
graph = nx.DiGraph()


def _package_node(row):
    """Turn one `all_packages` row into a `(node_id, attributes)` pair."""
    package_id, package_name, _version_id, repository, downloads = row
    # Drop the first character of the parsed repository value and keep only
    # the text before the first comma; rows without a repository get None.
    repository_field = None
    if repository:
        repository_field = repository[1:].split(",")[0]
    return (
        package_id,
        {
            "name": package_name,
            "repository": repository_field,
            # The download total is stored as text rather than as a number.
            "downloads": str(downloads),
        },
    )


# Every fetched package becomes a node keyed by its package id.
graph.add_nodes_from([_package_node(row) for row in all_packages])
print(f"Graph with {graph.number_of_nodes()} nodes created")

Graph with 2423835 nodes created


In [4]:
# Toggle: when True, dev dependencies are linked in addition to prod ones.
get_dev_deps = False
if get_dev_deps:
    join_str = "JOIN dependencies ON dependencies.id = ANY(versions.prod_dependencies) OR dependencies.id = ANY(versions.dev_dependencies)"
else:
    join_str = "JOIN dependencies ON dependencies.id = ANY(versions.prod_dependencies)"

# Build the statement once. `join_str` is one of the two fixed fragments above,
# so interpolating it is safe; the version id is passed as a bound parameter
# instead of being formatted into the SQL text on every iteration.
dependency_query = f"""SELECT dependencies.dst_package_id_if_exists
                FROM versions
                {join_str}
                WHERE versions.id = %s AND
                    dependencies.dst_package_id_if_exists IS NOT NULL"""

# Add one edge package -> dependency for each resolvable dependency of the
# package's latest version. This issues one query per package, which is why
# the cell is slow. The loop variables are named explicitly so the builtin
# `id` and IPython's `_` are not rebound in the notebook namespace.
for package_id, _name, version_id, _repository, _downloads in tqdm(all_packages):
    cursor.execute(dependency_query, (version_id,))

    dependencies = cursor.fetchall()
    edges = [(package_id, dst) for (dst,) in dependencies]
    # NOTE: add_edges_from also creates a node for any dependency target that
    # was not loaded as a package above; such nodes carry no attributes
    # (no "name", "repository" or "downloads").
    graph.add_edges_from(edges)

print(
    f"Graph with {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges created"
)

100%|██████████| 2423835/2423835 [19:28<00:00, 2074.45it/s]


Graph with 2424918 nodes and 7928230 edges created


In [5]:
# Export the graph in Pajek format. The file name reflects which dependency
# kinds were linked, so a graph built with dev dependencies does not overwrite
# (or pass for) the prod-only export.
pajek_path = "npm_graph_prod_and_dev.net" if get_dev_deps else "npm_graph_prod.net"
nx.write_pajek(graph, pajek_path)



In [35]:
# Spot-check a single package: show its node attributes, then collect the
# names of the nodes it has outgoing edges to (i.e. its dependencies).
express_id = 2207340
print(graph.nodes[express_id])

s1 = set(graph.nodes[dep]["name"] for dep in graph.neighbors(express_id))

{'name': 'express', 'repository': 'https://github.com/expressjs/express', 'downloads': '2384861834'}


In [40]:
def _ten_highest(degree_view):
    """Return the ten `(node, degree)` pairs with the largest degree, highest first."""
    ranked = sorted(degree_view, key=lambda pair: pair[1], reverse=True)
    return ranked[:10]


# Rank nodes by how many packages depend on them (in-degree) and by how many
# packages they depend on (out-degree).
top10_in = _ten_highest(graph.in_degree)
top10_out = _ten_highest(graph.out_degree)

In [42]:
def _print_ranking(title, ranking):
    """Print `title`, then one `<name> - <degree>` line per `(node, degree)` pair."""
    print(title)
    for node, degree in ranking:
        # Nodes that exist only because an edge pointed at them have no
        # attributes, so fall back to a placeholder instead of raising KeyError.
        label = graph.nodes[node].get("name", f"<unnamed node {node}>")
        print(f"{label} - {degree}")


_print_ranking("Top 10 in-degree nodes:", top10_in)
print()
_print_ranking("Top 10 out-degree nodes:", top10_out)

Top 10 in-degree nodes:
typescript - 486570
eslint - 411491
@types/node - 254491
mocha - 247154
jest - 231394
prettier - 226916
react - 223797
webpack - 197805
@babel/core - 182003
react-dom - 173274

Top 10 out-degree nodes:
sindresorhus.js - 1000
potionseller - 1000
1000-packages - 1000
bloater - 998
m2m-chartjs-plugin-crosshair - 979
npm-all-packages - 977
digital-keyboard-demos - 976
u-library - 963
design-system-fitbank-450 - 962
dfeuk-frontend-manual - 961
