In [1]:
import subprocess

import numpy as np
import pandas as pd
import h5py
import networkx as nx
from ipysigma import Sigma

In [2]:
# Load the per-(actor, repo, event type) counts from the aggregated GH Archive
# HDF5 file into an in-memory DataFrame.
#
# The file is opened read-only inside a context manager so that the handle is
# released as soon as the columns have been copied out; `[...]` reads each
# dataset fully into a NumPy array, so `df` does not depend on the file
# staying open.
with h5py.File(
    "/home/jpivarski/storage/data/GHArchive/GHArchive-2022-aggregated.h5", "r"
) as file:
    df = pd.DataFrame(
        {
            "actor": file["actor_id"][...],
            "repo": file["repo_id"][...],
            "type": file["event_type_id"][...],
            "count": file["count"][...],
        }
    )

In [3]:
# Collect the ids of bot accounts: every line of the actor listing that
# contains the literal text "[bot]" contributes its first tab-separated field,
# parsed as an integer id.
# (presumably the remaining field(s) hold the account name — verify against
# the file.)
#
# The file is scanned directly in Python instead of shelling out to `fgrep`:
# a missing file now raises FileNotFoundError rather than silently producing
# an empty bot list, and the cell no longer depends on an external binary.
with open(
    "/home/jpivarski/storage/data/GHArchive/actor_id_name.txt", "rb"
) as actor_names:
    bot_actors = np.array(
        [
            int(line.split(b"\t", 1)[0])
            for line in actor_names
            if b"[bot]" in line
        ],
        # Keep an integer dtype even when no line matches (an empty list would
        # otherwise become a float64 array).
        dtype=np.int64,
    )

# Drop every row whose actor id belongs to a bot account.
df_no_bots = df[~df.actor.isin(bot_actors)]

In [4]:
# Event-type names paired with the integer ids used in the "type" column;
# only these event types are kept by the filtering step further down.
event_type_to_id = dict(
    PullRequestEvent=14,
    PullRequestReviewCommentEvent=15,
    PullRequestReviewEvent=16,
    PushEvent=17,
    ReleaseEvent=18,
)

In [5]:
# Keep only the event types listed in `event_type_to_id`.
df_imp_events = df_no_bots[df_no_bots.type.isin(list(event_type_to_id.values()))]

# Seed list of repositories; only its `repo_id` column is used here.
seed = pd.read_csv("list-of-scientific-python-repo_id.csv")

# Restrict to events on seed repositories, then collapse the per-event-type
# rows into one (actor, repo, total count) triple per pair.
df_seed = df_imp_events[df_imp_events.repo.isin(seed.repo_id)]

# `as_index=False` keeps "actor" and "repo" as ordinary columns, so no in-place
# `reset_index` is needed and re-running the cell is idempotent. The column is
# selected explicitly rather than passing "count" positionally to `sum()`,
# where it would be interpreted as the `numeric_only` flag.
df_seed_triples = df_seed.groupby(["actor", "repo"], as_index=False)["count"].sum()

df_seed_triples.head()

Unnamed: 0,actor,repo,count
0,1641,1349775,11
1,1796,858127,2
2,1875,2058,5
3,2071,65600975,1
4,3303,37287688,1


In [6]:
# Bipartite actor-repo graph: one edge per (actor, repo) pair, carrying the
# summed event count as the "count" edge attribute.
# NOTE(review): actor ids and repo ids are used directly as node labels, so an
# actor id that happens to equal a repo id would be merged into one node —
# confirm the two id spaces do not overlap in this data (or namespace them).
G = nx.from_pandas_edgelist(
    df_seed_triples, source="actor", target="repo", edge_attr="count"
)

# Restrict to the largest connected component and split it into its two sides.
# The repo side is taken from the "repo" column rather than from
# `nx.bipartite.sets`, which makes no promise about which of the two returned
# sets is which (it depends on node iteration order), so unpacking its result
# as `actor, repo` could silently swap the sides.
largest_cc = max(nx.connected_components(G), key=len)
repo = largest_cc & set(df_seed_triples["repo"])
actor = largest_cc - repo

# Project onto the repo nodes; per the networkx documentation the edge weight
# is the Jaccard index of the two repos' neighbour (actor) sets.
projection = nx.bipartite.overlap_weighted_projected_graph(G, repo)

In [7]:
# Lookup table from repo id to repo name, built from the seed list.
seed_map = {
    repo_id: repo_name
    for repo_id, repo_name in zip(seed.repo_id, seed.repo_name)
}

In [8]:
# Copy of the projection with repo ids replaced by their names from
# `seed_map`; `projection` itself is left untouched (copy=True).
temp = nx.relabel_nodes(
    projection,
    mapping=seed_map,
    copy=True,
)

In [9]:
# Interactive view of the relabelled projection: nodes are sized by degree and
# coloured by the "louvain" node metric requested via `node_metrics`.
Sigma(
    temp,
    node_size=temp.degree,
    node_metrics=["louvain"],
    node_color="louvain",
    label_density=3,
)

Sigma(nx.Graph with 158 nodes and 1,502 edges)